fs/btrfs/file.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  * Copyright (C) 2007 Oracle.  All rights reserved.
   4  */
   5
   6 #include <linux/fs.h>
   7 #include <linux/pagemap.h>
   8 #include <linux/time.h>
   9 #include <linux/init.h>
  10 #include <linux/string.h>
  11 #include <linux/backing-dev.h>
  12 #include <linux/falloc.h>
  13 #include <linux/writeback.h>
  14 #include <linux/compat.h>
  15 #include <linux/slab.h>
  16 #include <linux/btrfs.h>
  17 #include <linux/uio.h>
  18 #include <linux/iversion.h>
  19 #include <linux/fsverity.h>
  20 #include <linux/iomap.h>
  21 #include "ctree.h"
  22 #include "disk-io.h"
  23 #include "transaction.h"
  24 #include "btrfs_inode.h"
  25 #include "print-tree.h"
  26 #include "tree-log.h"
  27 #include "locking.h"
  28 #include "volumes.h"
  29 #include "qgroup.h"
  30 #include "compression.h"
  31 #include "delalloc-space.h"
  32 #include "reflink.h"
  33 #include "subpage.h"
  34 #include "fs.h"
  35 #include "accessors.h"
  36 #include "extent-tree.h"
  37 #include "file-item.h"
  38 #include "ioctl.h"
  39 #include "file.h"
  40 #include "super.h"
  41
  42 /* simple helper to fault in pages and copy.  This should go away
  43  * and be replaced with calls into generic code.
  44  */
  45 static noinline int btrfs_copy_from_user(loff_t pos, size_t write_bytes,
  46                                          struct page **prepared_pages,
  47                                          struct iov_iter *i)
  48 {
  49         size_t copied = 0;
  50         size_t total_copied = 0;
  51         int pg = 0;
  52         int offset = offset_in_page(pos);
  53
  54         while (write_bytes > 0) {
  55                 size_t count = min_t(size_t,
  56                                      PAGE_SIZE - offset, write_bytes);
  57                 struct page *page = prepared_pages[pg];
  58                 /*
  59                  * Copy data from userspace to the current page
  60                  */
  61                 copied = copy_page_from_iter_atomic(page, offset, count, i);
  62
  63                 /* Flush processor's dcache for this page */
  64                 flush_dcache_page(page);
  65
  66                 /*
  67                  * if we get a partial write, we can end up with
  68                  * partially up to date pages.  These add
  69                  * a lot of complexity, so make sure they don't
  70                  * happen by forcing this copy to be retried.
  71                  *
  72                  * The rest of the btrfs_file_write code will fall
  73                  * back to page at a time copies after we return 0.
  74                  */
  75                 if (unlikely(copied < count)) {
  76                         if (!PageUptodate(page)) {
  77                                 iov_iter_revert(i, copied);
  78                                 copied = 0;
  79                         }
  80                         if (!copied)
  81                                 break;
  82                 }
  83
  84                 write_bytes -= copied;
  85                 total_copied += copied;
  86                 offset += copied;
  87                 if (offset == PAGE_SIZE) {
  88                         pg++;
  89                         offset = 0;
  90                 }
  91         }
  92         return total_copied;
  93 }
  94
  95 /*
  96  * unlocks pages after btrfs_file_write is done with them
  97  */
  98 static void btrfs_drop_pages(struct btrfs_fs_info *fs_info,
  99                              struct page **pages, size_t num_pages,
 100                              u64 pos, u64 copied)
 101 {
 102         size_t i;
 103         u64 block_start = round_down(pos, fs_info->sectorsize);
 104         u64 block_len = round_up(pos + copied, fs_info->sectorsize) - block_start;
 105
 106         ASSERT(block_len <= U32_MAX);
 107         for (i = 0; i < num_pages; i++) {
 108                 /* page checked is some magic around finding pages that
 109                  * have been modified without going through btrfs_set_page_dirty
 110                  * clear it here. There should be no need to mark the pages
 111                  * accessed as prepare_pages should have marked them accessed
 112                  * in prepare_pages via find_or_create_page()
 113                  */
 114                 btrfs_page_clamp_clear_checked(fs_info, pages[i], block_start,
 115                                                block_len);
 116                 unlock_page(pages[i]);
 117                 put_page(pages[i]);
 118         }
 119 }
 120
 121 /*
 122  * After btrfs_copy_from_user(), update the following things for delalloc:
 123  * - Mark newly dirtied pages as DELALLOC in the io tree.
 124  *   Used to advise which range is to be written back.
 125  * - Mark modified pages as Uptodate/Dirty and not needing COW fixup
 126  * - Update inode size for past EOF write
 127  */
 128 int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages,
 129                       size_t num_pages, loff_t pos, size_t write_bytes,
 130                       struct extent_state **cached, bool noreserve)
 131 {
 132         struct btrfs_fs_info *fs_info = inode->root->fs_info;
 133         int err = 0;
 134         int i;
 135         u64 num_bytes;
 136         u64 start_pos;
 137         u64 end_of_last_block;
 138         u64 end_pos = pos + write_bytes;
 139         loff_t isize = i_size_read(&inode->vfs_inode);
 140         unsigned int extra_bits = 0;
 141
 142         if (write_bytes == 0)
 143                 return 0;
 144
 145         if (noreserve)
 146                 extra_bits |= EXTENT_NORESERVE;
 147
 148         start_pos = round_down(pos, fs_info->sectorsize);
 149         num_bytes = round_up(write_bytes + pos - start_pos,
 150                              fs_info->sectorsize);
 151         ASSERT(num_bytes <= U32_MAX);
 152
 153         end_of_last_block = start_pos + num_bytes - 1;
 154
 155         /*
 156          * The pages may have already been dirty, clear out old accounting so
 157          * we can set things up properly
 158          */
 159         clear_extent_bit(&inode->io_tree, start_pos, end_of_last_block,
 160                          EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
 161                          cached);
 162
 163         err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
 164                                         extra_bits, cached);
 165         if (err)
 166                 return err;
 167
 168         for (i = 0; i < num_pages; i++) {
 169                 struct page *p = pages[i];
 170
 171                 btrfs_page_clamp_set_uptodate(fs_info, p, start_pos, num_bytes);
 172                 btrfs_page_clamp_clear_checked(fs_info, p, start_pos, num_bytes);
 173                 btrfs_page_clamp_set_dirty(fs_info, p, start_pos, num_bytes);
 174         }
 175
 176         /*
 177          * we've only changed i_size in ram, and we haven't updated
 178          * the disk i_size.  There is no need to log the inode
 179          * at this time.
 180          */
 181         if (end_pos > isize)
 182                 i_size_write(&inode->vfs_inode, end_pos);
 183         return 0;
 184 }
 185
 186 /*
 187  * this is very complex, but the basic idea is to drop all extents
 188  * in the range start - end.  hint_block is filled in with a block number
 189  * that would be a good hint to the block allocator for this file.
 190  *
 191  * If an extent intersects the range but is not entirely inside the range
 192  * it is either truncated or split.  Anything entirely inside the range
 193  * is deleted from the tree.
 194  *
 195  * Note: the VFS' inode number of bytes is not updated, it's up to the caller
 196  * to deal with that. We set the field 'bytes_found' of the arguments structure
 197  * with the number of allocated bytes found in the target range, so that the
 198  * caller can update the inode's number of bytes in an atomic way when
 199  * replacing extents in a range to avoid races with stat(2).
 200  */
 201 int btrfs_drop_extents(struct btrfs_trans_handle *trans,
 202                        struct btrfs_root *root, struct btrfs_inode *inode,
 203                        struct btrfs_drop_extents_args *args)
 204 {
 205         struct btrfs_fs_info *fs_info = root->fs_info;
 206         struct extent_buffer *leaf;
 207         struct btrfs_file_extent_item *fi;
 208         struct btrfs_ref ref = { 0 };
 209         struct btrfs_key key;
 210         struct btrfs_key new_key;
 211         u64 ino = btrfs_ino(inode);
 212         u64 search_start = args->start;
 213         u64 disk_bytenr = 0;
 214         u64 num_bytes = 0;
 215         u64 extent_offset = 0;
 216         u64 extent_end = 0;
 217         u64 last_end = args->start;
 218         int del_nr = 0;
 219         int del_slot = 0;
 220         int extent_type;
 221         int recow;
 222         int ret;
 223         int modify_tree = -1;
 224         int update_refs;
 225         int found = 0;
 226         struct btrfs_path *path = args->path;
 227
 228         args->bytes_found = 0;
 229         args->extent_inserted = false;
 230
 231         /* Must always have a path if ->replace_extent is true */
 232         ASSERT(!(args->replace_extent && !args->path));
 233
 234         if (!path) {
 235                 path = btrfs_alloc_path();
 236                 if (!path) {
 237                         ret = -ENOMEM;
 238                         goto out;
 239                 }
 240         }
 241
 242         if (args->drop_cache)
 243                 btrfs_drop_extent_map_range(inode, args->start, args->end - 1, false);
 244
 245         if (args->start >= inode->disk_i_size && !args->replace_extent)
 246                 modify_tree = 0;
 247
 248         update_refs = (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID);
 249         while (1) {
 250                 recow = 0;
 251                 ret = btrfs_lookup_file_extent(trans, root, path, ino,
 252                                                search_start, modify_tree);
 253                 if (ret < 0)
 254                         break;
 255                 if (ret > 0 && path->slots[0] > 0 && search_start == args->start) {
 256                         leaf = path->nodes[0];
 257                         btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1);
 258                         if (key.objectid == ino &&
 259                             key.type == BTRFS_EXTENT_DATA_KEY)
 260                                 path->slots[0]--;
 261                 }
 262                 ret = 0;
 263 next_slot:
 264                 leaf = path->nodes[0];
 265                 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
 266                         BUG_ON(del_nr > 0);
 267                         ret = btrfs_next_leaf(root, path);
 268                         if (ret < 0)
 269                                 break;
 270                         if (ret > 0) {
 271                                 ret = 0;
 272                                 break;
 273                         }
 274                         leaf = path->nodes[0];
 275                         recow = 1;
 276                 }
 277
 278                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
 279
 280                 if (key.objectid > ino)
 281                         break;
 282                 if (WARN_ON_ONCE(key.objectid < ino) ||
 283                     key.type < BTRFS_EXTENT_DATA_KEY) {
 284                         ASSERT(del_nr == 0);
 285                         path->slots[0]++;
 286                         goto next_slot;
 287                 }
 288                 if (key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= args->end)
 289                         break;
 290
 291                 fi = btrfs_item_ptr(leaf, path->slots[0],
 292                                     struct btrfs_file_extent_item);
 293                 extent_type = btrfs_file_extent_type(leaf, fi);
 294
 295                 if (extent_type == BTRFS_FILE_EXTENT_REG ||
 296                     extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
 297                         disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
 298                         num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
 299                         extent_offset = btrfs_file_extent_offset(leaf, fi);
 300                         extent_end = key.offset +
 301                                 btrfs_file_extent_num_bytes(leaf, fi);
 302                 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
 303                         extent_end = key.offset +
 304                                 btrfs_file_extent_ram_bytes(leaf, fi);
 305                 } else {
 306                         /* can't happen */
 307                         BUG();
 308                 }
 309
 310                 /*
 311                  * Don't skip extent items representing 0 byte lengths. They
 312                  * used to be created (bug) if while punching holes we hit
 313                  * -ENOSPC condition. So if we find one here, just ensure we
 314                  * delete it, otherwise we would insert a new file extent item
 315                  * with the same key (offset) as that 0 bytes length file
 316                  * extent item in the call to setup_items_for_insert() later
 317                  * in this function.
 318                  */
 319                 if (extent_end == key.offset && extent_end >= search_start) {
 320                         last_end = extent_end;
 321                         goto delete_extent_item;
 322                 }
 323
 324                 if (extent_end <= search_start) {
 325                         path->slots[0]++;
 326                         goto next_slot;
 327                 }
 328
 329                 found = 1;
 330                 search_start = max(key.offset, args->start);
 331                 if (recow || !modify_tree) {
 332                         modify_tree = -1;
 333                         btrfs_release_path(path);
 334                         continue;
 335                 }
 336
 337                 /*
 338                  *     | - range to drop - |
 339                  *  | -------- extent -------- |
 340                  */
 341                 if (args->start > key.offset && args->end < extent_end) {
 342                         BUG_ON(del_nr > 0);
 343                         if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
 344                                 ret = -EOPNOTSUPP;
 345                                 break;
 346                         }
 347
 348                         memcpy(&new_key, &key, sizeof(new_key));
 349                         new_key.offset = args->start;
 350                         ret = btrfs_duplicate_item(trans, root, path,
 351                                                    &new_key);
 352                         if (ret == -EAGAIN) {
 353                                 btrfs_release_path(path);
 354                                 continue;
 355                         }
 356                         if (ret < 0)
 357                                 break;
 358
 359                         leaf = path->nodes[0];
 360                         fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
 361                                             struct btrfs_file_extent_item);
 362                         btrfs_set_file_extent_num_bytes(leaf, fi,
 363                                                         args->start - key.offset);
 364
 365                         fi = btrfs_item_ptr(leaf, path->slots[0],
 366                                             struct btrfs_file_extent_item);
 367
 368                         extent_offset += args->start - key.offset;
 369                         btrfs_set_file_extent_offset(leaf, fi, extent_offset);
 370                         btrfs_set_file_extent_num_bytes(leaf, fi,
 371                                                         extent_end - args->start);
 372                         btrfs_mark_buffer_dirty(trans, leaf);
 373
 374                         if (update_refs && disk_bytenr > 0) {
 375                                 btrfs_init_generic_ref(&ref,
 376                                                 BTRFS_ADD_DELAYED_REF,
 377                                                 disk_bytenr, num_bytes, 0,
 378                                                 root->root_key.objectid);
 379                                 btrfs_init_data_ref(&ref,
 380                                                 root->root_key.objectid,
 381                                                 new_key.objectid,
 382                                                 args->start - extent_offset,
 383                                                 0, false);
 384                                 ret = btrfs_inc_extent_ref(trans, &ref);
 385                                 if (ret) {
 386                                         btrfs_abort_transaction(trans, ret);
 387                                         break;
 388                                 }
 389                         }
 390                         key.offset = args->start;
 391                 }
 392                 /*
 393                  * From here on out we will have actually dropped something, so
 394                  * last_end can be updated.
 395                  */
 396                 last_end = extent_end;
 397
 398                 /*
 399                  *  | ---- range to drop ----- |
 400                  *      | -------- extent -------- |
 401                  */
 402                 if (args->start <= key.offset && args->end < extent_end) {
 403                         if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
 404                                 ret = -EOPNOTSUPP;
 405                                 break;
 406                         }
 407
 408                         memcpy(&new_key, &key, sizeof(new_key));
 409                         new_key.offset = args->end;
 410                         btrfs_set_item_key_safe(trans, path, &new_key);
 411
 412                         extent_offset += args->end - key.offset;
 413                         btrfs_set_file_extent_offset(leaf, fi, extent_offset);
 414                         btrfs_set_file_extent_num_bytes(leaf, fi,
 415                                                         extent_end - args->end);
 416                         btrfs_mark_buffer_dirty(trans, leaf);
 417                         if (update_refs && disk_bytenr > 0)
 418                                 args->bytes_found += args->end - key.offset;
 419                         break;
 420                 }
 421
 422                 search_start = extent_end;
 423                 /*
 424                  *       | ---- range to drop ----- |
 425                  *  | -------- extent -------- |
 426                  */
 427                 if (args->start > key.offset && args->end >= extent_end) {
 428                         BUG_ON(del_nr > 0);
 429                         if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
 430                                 ret = -EOPNOTSUPP;
 431                                 break;
 432                         }
 433
 434                         btrfs_set_file_extent_num_bytes(leaf, fi,
 435                                                         args->start - key.offset);
 436                         btrfs_mark_buffer_dirty(trans, leaf);
 437                         if (update_refs && disk_bytenr > 0)
 438                                 args->bytes_found += extent_end - args->start;
 439                         if (args->end == extent_end)
 440                                 break;
 441
 442                         path->slots[0]++;
 443                         goto next_slot;
 444                 }
 445
 446                 /*
 447                  *  | ---- range to drop ----- |
 448                  *    | ------ extent ------ |
 449                  */
 450                 if (args->start <= key.offset && args->end >= extent_end) {
 451 delete_extent_item:
 452                         if (del_nr == 0) {
 453                                 del_slot = path->slots[0];
 454                                 del_nr = 1;
 455                         } else {
 456                                 BUG_ON(del_slot + del_nr != path->slots[0]);
 457                                 del_nr++;
 458                         }
 459
 460                         if (update_refs &&
 461                             extent_type == BTRFS_FILE_EXTENT_INLINE) {
 462                                 args->bytes_found += extent_end - key.offset;
 463                                 extent_end = ALIGN(extent_end,
 464                                                    fs_info->sectorsize);
 465                         } else if (update_refs && disk_bytenr > 0) {
 466                                 btrfs_init_generic_ref(&ref,
 467                                                 BTRFS_DROP_DELAYED_REF,
 468                                                 disk_bytenr, num_bytes, 0,
 469                                                 root->root_key.objectid);
 470                                 btrfs_init_data_ref(&ref,
 471                                                 root->root_key.objectid,
 472                                                 key.objectid,
 473                                                 key.offset - extent_offset, 0,
 474                                                 false);
 475                                 ret = btrfs_free_extent(trans, &ref);
 476                                 if (ret) {
 477                                         btrfs_abort_transaction(trans, ret);
 478                                         break;
 479                                 }
 480                                 args->bytes_found += extent_end - key.offset;
 481                         }
 482
 483                         if (args->end == extent_end)
 484                                 break;
 485
 486                         if (path->slots[0] + 1 < btrfs_header_nritems(leaf)) {
 487                                 path->slots[0]++;
 488                                 goto next_slot;
 489                         }
 490
 491                         ret = btrfs_del_items(trans, root, path, del_slot,
 492                                               del_nr);
 493                         if (ret) {
 494                                 btrfs_abort_transaction(trans, ret);
 495                                 break;
 496                         }
 497
 498                         del_nr = 0;
 499                         del_slot = 0;
 500
 501                         btrfs_release_path(path);
 502                         continue;
 503                 }
 504
 505                 BUG();
 506         }
 507
 508         if (!ret && del_nr > 0) {
 509                 /*
 510                  * Set path->slots[0] to first slot, so that after the delete
 511                  * if items are move off from our leaf to its immediate left or
 512                  * right neighbor leafs, we end up with a correct and adjusted
 513                  * path->slots[0] for our insertion (if args->replace_extent).
 514                  */
 515                 path->slots[0] = del_slot;
 516                 ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
 517                 if (ret)
 518                         btrfs_abort_transaction(trans, ret);
 519         }
 520
 521         leaf = path->nodes[0];
 522         /*
 523          * If btrfs_del_items() was called, it might have deleted a leaf, in
 524          * which case it unlocked our path, so check path->locks[0] matches a
 525          * write lock.
 526          */
 527         if (!ret && args->replace_extent &&
 528             path->locks[0] == BTRFS_WRITE_LOCK &&
 529             btrfs_leaf_free_space(leaf) >=
 530             sizeof(struct btrfs_item) + args->extent_item_size) {
 531
 532                 key.objectid = ino;
 533                 key.type = BTRFS_EXTENT_DATA_KEY;
 534                 key.offset = args->start;
 535                 if (!del_nr && path->slots[0] < btrfs_header_nritems(leaf)) {
 536                         struct btrfs_key slot_key;
 537
 538                         btrfs_item_key_to_cpu(leaf, &slot_key, path->slots[0]);
 539                         if (btrfs_comp_cpu_keys(&key, &slot_key) > 0)
 540                                 path->slots[0]++;
 541                 }
 542                 btrfs_setup_item_for_insert(trans, root, path, &key,
 543                                             args->extent_item_size);
 544                 args->extent_inserted = true;
 545         }
 546
 547         if (!args->path)
 548                 btrfs_free_path(path);
 549         else if (!args->extent_inserted)
 550                 btrfs_release_path(path);
 551 out:
 552         args->drop_end = found ? min(args->end, last_end) : args->end;
 553
 554         return ret;
 555 }
 556
 557 static int extent_mergeable(struct extent_buffer *leaf, int slot,
 558                             u64 objectid, u64 bytenr, u64 orig_offset,
 559                             u64 *start, u64 *end)
 560 {
 561         struct btrfs_file_extent_item *fi;
 562         struct btrfs_key key;
 563         u64 extent_end;
 564
 565         if (slot < 0 || slot >= btrfs_header_nritems(leaf))
 566                 return 0;
 567
 568         btrfs_item_key_to_cpu(leaf, &key, slot);
 569         if (key.objectid != objectid || key.type != BTRFS_EXTENT_DATA_KEY)
 570                 return 0;
 571
 572         fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
 573         if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG ||
 574             btrfs_file_extent_disk_bytenr(leaf, fi) != bytenr ||
 575             btrfs_file_extent_offset(leaf, fi) != key.offset - orig_offset ||
 576             btrfs_file_extent_compression(leaf, fi) ||
 577             btrfs_file_extent_encryption(leaf, fi) ||
 578             btrfs_file_extent_other_encoding(leaf, fi))
 579                 return 0;
 580
 581         extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
 582         if ((*start && *start != key.offset) || (*end && *end != extent_end))
 583                 return 0;
 584
 585         *start = key.offset;
 586         *end = extent_end;
 587         return 1;
 588 }
 589
 590 /*
 591  * Mark extent in the range start - end as written.
 592  *
 593  * This changes extent type from 'pre-allocated' to 'regular'. If only
 594  * part of extent is marked as written, the extent will be split into
 595  * two or three.
 596  */
 597 int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
 598                               struct btrfs_inode *inode, u64 start, u64 end)
 599 {
 600         struct btrfs_root *root = inode->root;
 601         struct extent_buffer *leaf;
 602         struct btrfs_path *path;
 603         struct btrfs_file_extent_item *fi;
 604         struct btrfs_ref ref = { 0 };
 605         struct btrfs_key key;
 606         struct btrfs_key new_key;
 607         u64 bytenr;
 608         u64 num_bytes;
 609         u64 extent_end;
 610         u64 orig_offset;
 611         u64 other_start;
 612         u64 other_end;
 613         u64 split;
 614         int del_nr = 0;
 615         int del_slot = 0;
 616         int recow;
 617         int ret = 0;
 618         u64 ino = btrfs_ino(inode);
 619
 620         path = btrfs_alloc_path();
 621         if (!path)
 622                 return -ENOMEM;
 623 again:
 624         recow = 0;
 625         split = start;
 626         key.objectid = ino;
 627         key.type = BTRFS_EXTENT_DATA_KEY;
 628         key.offset = split;
 629
 630         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
 631         if (ret < 0)
 632                 goto out;
 633         if (ret > 0 && path->slots[0] > 0)
 634                 path->slots[0]--;
 635
 636         leaf = path->nodes[0];
 637         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
 638         if (key.objectid != ino ||
 639             key.type != BTRFS_EXTENT_DATA_KEY) {
 640                 ret = -EINVAL;
 641                 btrfs_abort_transaction(trans, ret);
 642                 goto out;
 643         }
 644         fi = btrfs_item_ptr(leaf, path->slots[0],
 645                             struct btrfs_file_extent_item);
 646         if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_PREALLOC) {
 647                 ret = -EINVAL;
 648                 btrfs_abort_transaction(trans, ret);
 649                 goto out;
 650         }
 651         extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
 652         if (key.offset > start || extent_end < end) {
 653                 ret = -EINVAL;
 654                 btrfs_abort_transaction(trans, ret);
 655                 goto out;
 656         }
 657
 658         bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
 659         num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
 660         orig_offset = key.offset - btrfs_file_extent_offset(leaf, fi);
 661         memcpy(&new_key, &key, sizeof(new_key));
 662
 663         if (start == key.offset && end < extent_end) {
 664                 other_start = 0;
 665                 other_end = start;
 666                 if (extent_mergeable(leaf, path->slots[0] - 1,
 667                                      ino, bytenr, orig_offset,
 668                                      &other_start, &other_end)) {
 669                         new_key.offset = end;
 670                         btrfs_set_item_key_safe(trans, path, &new_key);
 671                         fi = btrfs_item_ptr(leaf, path->slots[0],
 672                                             struct btrfs_file_extent_item);
 673                         btrfs_set_file_extent_generation(leaf, fi,
 674                                                          trans->transid);
 675                         btrfs_set_file_extent_num_bytes(leaf, fi,
 676                                                         extent_end - end);
 677                         btrfs_set_file_extent_offset(leaf, fi,
 678                                                      end - orig_offset);
 679                         fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
 680                                             struct btrfs_file_extent_item);
 681                         btrfs_set_file_extent_generation(leaf, fi,
 682                                                          trans->transid);
 683                         btrfs_set_file_extent_num_bytes(leaf, fi,
 684                                                         end - other_start);
 685                         btrfs_mark_buffer_dirty(trans, leaf);
 686                         goto out;
 687                 }
 688         }
 689
 690         if (start > key.offset && end == extent_end) {
 691                 other_start = end;
 692                 other_end = 0;
 693                 if (extent_mergeable(leaf, path->slots[0] + 1,
 694                                      ino, bytenr, orig_offset,
 695                                      &other_start, &other_end)) {
 696                         fi = btrfs_item_ptr(leaf, path->slots[0],
 697                                             struct btrfs_file_extent_item);
 698                         btrfs_set_file_extent_num_bytes(leaf, fi,
 699                                                         start - key.offset);
 700                         btrfs_set_file_extent_generation(leaf, fi,
 701                                                          trans->transid);
 702                         path->slots[0]++;
 703                         new_key.offset = start;
 704                         btrfs_set_item_key_safe(trans, path, &new_key);
 705
 706                         fi = btrfs_item_ptr(leaf, path->slots[0],
 707                                             struct btrfs_file_extent_item);
 708                         btrfs_set_file_extent_generation(leaf, fi,
 709                                                          trans->transid);
 710                         btrfs_set_file_extent_num_bytes(leaf, fi,
 711                                                         other_end - start);
 712                         btrfs_set_file_extent_offset(leaf, fi,
 713                                                      start - orig_offset);
 714                         btrfs_mark_buffer_dirty(trans, leaf);
 715                         goto out;
 716                 }
 717         }
 718
 719         while (start > key.offset || end < extent_end) {
 720                 if (key.offset == start)
 721                         split = end;
 722
 723                 new_key.offset = split;
 724                 ret = btrfs_duplicate_item(trans, root, path, &new_key);
 725                 if (ret == -EAGAIN) {
 726                         btrfs_release_path(path);
 727                         goto again;
 728                 }
 729                 if (ret < 0) {
 730                         btrfs_abort_transaction(trans, ret);
 731                         goto out;
 732                 }
 733
 734                 leaf = path->nodes[0];
 735                 fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
 736                                     struct btrfs_file_extent_item);
 737                 btrfs_set_file_extent_generation(leaf, fi, trans->transid);
 738                 btrfs_set_file_extent_num_bytes(leaf, fi,
 739                                                 split - key.offset);
 740
 741                 fi = btrfs_item_ptr(leaf, path->slots[0],
 742                                     struct btrfs_file_extent_item);
 743
 744                 btrfs_set_file_extent_generation(leaf, fi, trans->transid);
 745                 btrfs_set_file_extent_offset(leaf, fi, split - orig_offset);
 746                 btrfs_set_file_extent_num_bytes(leaf, fi,
 747                                                 extent_end - split);
 748                 btrfs_mark_buffer_dirty(trans, leaf);
 749
 750                 btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, bytenr,
 751                                        num_bytes, 0, root->root_key.objectid);
 752                 btrfs_init_data_ref(&ref, root->root_key.objectid, ino,
 753                                     orig_offset, 0, false);
 754                 ret = btrfs_inc_extent_ref(trans, &ref);
 755                 if (ret) {
 756                         btrfs_abort_transaction(trans, ret);
 757                         goto out;
 758                 }
 759
 760                 if (split == start) {
 761                         key.offset = start;
 762                 } else {
 763                         if (start != key.offset) {
 764                                 ret = -EINVAL;
 765                                 btrfs_abort_transaction(trans, ret);
 766                                 goto out;
 767                         }
 768                         path->slots[0]--;
 769                         extent_end = end;
 770                 }
 771                 recow = 1;
 772         }
 773
 774         other_start = end;
 775         other_end = 0;
 776         btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
 777                                num_bytes, 0, root->root_key.objectid);
 778         btrfs_init_data_ref(&ref, root->root_key.objectid, ino, orig_offset,
 779                             0, false);
 780         if (extent_mergeable(leaf, path->slots[0] + 1,
 781                              ino, bytenr, orig_offset,
 782                              &other_start, &other_end)) {
 783                 if (recow) {
 784                         btrfs_release_path(path);
 785                         goto again;
 786                 }
 787                 extent_end = other_end;
 788                 del_slot = path->slots[0] + 1;
 789                 del_nr++;
 790                 ret = btrfs_free_extent(trans, &ref);
 791                 if (ret) {
 792                         btrfs_abort_transaction(trans, ret);
 793                         goto out;
 794                 }
 795         }
 796         other_start = 0;
 797         other_end = start;
 798         if (extent_mergeable(leaf, path->slots[0] - 1,
 799                              ino, bytenr, orig_offset,
 800                              &other_start, &other_end)) {
 801                 if (recow) {
 802                         btrfs_release_path(path);
 803                         goto again;
 804                 }
 805                 key.offset = other_start;
 806                 del_slot = path->slots[0];
 807                 del_nr++;
 808                 ret = btrfs_free_extent(trans, &ref);
 809                 if (ret) {
 810                         btrfs_abort_transaction(trans, ret);
 811                         goto out;
 812                 }
 813         }
 814         if (del_nr == 0) {
 815                 fi = btrfs_item_ptr(leaf, path->slots[0],
 816                            struct btrfs_file_extent_item);
 817                 btrfs_set_file_extent_type(leaf, fi,
 818                                            BTRFS_FILE_EXTENT_REG);
 819                 btrfs_set_file_extent_generation(leaf, fi, trans->transid);
 820                 btrfs_mark_buffer_dirty(trans, leaf);
 821         } else {
 822                 fi = btrfs_item_ptr(leaf, del_slot - 1,
 823                            struct btrfs_file_extent_item);
 824                 btrfs_set_file_extent_type(leaf, fi,
 825                                            BTRFS_FILE_EXTENT_REG);
 826                 btrfs_set_file_extent_generation(leaf, fi, trans->transid);
 827                 btrfs_set_file_extent_num_bytes(leaf, fi,
 828                                                 extent_end - key.offset);
 829                 btrfs_mark_buffer_dirty(trans, leaf);
 830
 831                 ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
 832                 if (ret < 0) {
 833                         btrfs_abort_transaction(trans, ret);
 834                         goto out;
 835                 }
 836         }
 837 out:
 838         btrfs_free_path(path);
 839         return ret;
 840 }
 841
 842 /*
 843  * on error we return an unlocked page and the error value
 844  * on success we return a locked page and 0
 845  */
 846 static int prepare_uptodate_page(struct inode *inode,
 847                                  struct page *page, u64 pos,
 848                                  bool force_uptodate)
 849 {
 850         struct folio *folio = page_folio(page);
 851         int ret = 0;
 852
 853         if (((pos & (PAGE_SIZE - 1)) || force_uptodate) &&
 854             !PageUptodate(page)) {
 855                 ret = btrfs_read_folio(NULL, folio);
 856                 if (ret)
 857                         return ret;
 858                 lock_page(page);
 859                 if (!PageUptodate(page)) {
 860                         unlock_page(page);
 861                         return -EIO;
 862                 }
 863
 864                 /*
 865                  * Since btrfs_read_folio() will unlock the folio before it
 866                  * returns, there is a window where btrfs_release_folio() can be
 867                  * called to release the page.  Here we check both inode
 868                  * mapping and PagePrivate() to make sure the page was not
 869                  * released.
 870                  *
 871                  * The private flag check is essential for subpage as we need
 872                  * to store extra bitmap using page->private.
 873                  */
 874                 if (page->mapping != inode->i_mapping || !PagePrivate(page)) {
 875                         unlock_page(page);
 876                         return -EAGAIN;
 877                 }
 878         }
 879         return 0;
 880 }
 881
 882 static fgf_t get_prepare_fgp_flags(bool nowait)
 883 {
 884         fgf_t fgp_flags = FGP_LOCK | FGP_ACCESSED | FGP_CREAT;
 885
 886         if (nowait)
 887                 fgp_flags |= FGP_NOWAIT;
 888
 889         return fgp_flags;
 890 }
 891
 892 static gfp_t get_prepare_gfp_flags(struct inode *inode, bool nowait)
 893 {
 894         gfp_t gfp;
 895
 896         gfp = btrfs_alloc_write_mask(inode->i_mapping);
 897         if (nowait) {
 898                 gfp &= ~__GFP_DIRECT_RECLAIM;
 899                 gfp |= GFP_NOWAIT;
 900         }
 901
 902         return gfp;
 903 }
 904
 905 /*
 906  * this just gets pages into the page cache and locks them down.
 907  */
 908 static noinline int prepare_pages(struct inode *inode, struct page **pages,
 909                                   size_t num_pages, loff_t pos,
 910                                   size_t write_bytes, bool force_uptodate,
 911                                   bool nowait)
 912 {
 913         int i;
 914         unsigned long index = pos >> PAGE_SHIFT;
 915         gfp_t mask = get_prepare_gfp_flags(inode, nowait);
 916         fgf_t fgp_flags = get_prepare_fgp_flags(nowait);
 917         int err = 0;
 918         int faili;
 919
 920         for (i = 0; i < num_pages; i++) {
 921 again:
 922                 pages[i] = pagecache_get_page(inode->i_mapping, index + i,
 923                                               fgp_flags, mask | __GFP_WRITE);
 924                 if (!pages[i]) {
 925                         faili = i - 1;
 926                         if (nowait)
 927                                 err = -EAGAIN;
 928                         else
 929                                 err = -ENOMEM;
 930                         goto fail;
 931                 }
 932
 933                 err = set_page_extent_mapped(pages[i]);
 934                 if (err < 0) {
 935                         faili = i;
 936                         goto fail;
 937                 }
 938
 939                 if (i == 0)
 940                         err = prepare_uptodate_page(inode, pages[i], pos,
 941                                                     force_uptodate);
 942                 if (!err && i == num_pages - 1)
 943                         err = prepare_uptodate_page(inode, pages[i],
 944                                                     pos + write_bytes, false);
 945                 if (err) {
 946                         put_page(pages[i]);
 947                         if (!nowait && err == -EAGAIN) {
 948                                 err = 0;
 949                                 goto again;
 950                         }
 951                         faili = i - 1;
 952                         goto fail;
 953                 }
 954                 wait_on_page_writeback(pages[i]);
 955         }
 956
 957         return 0;
 958 fail:
 959         while (faili >= 0) {
 960                 unlock_page(pages[faili]);
 961                 put_page(pages[faili]);
 962                 faili--;
 963         }
 964         return err;
 965
 966 }
 967
 968 /*
 969  * This function locks the extent and properly waits for data=ordered extents
 970  * to finish before allowing the pages to be modified if need.
 971  *
 972  * The return value:
 973  * 1 - the extent is locked
 974  * 0 - the extent is not locked, and everything is OK
 975  * -EAGAIN - need re-prepare the pages
 976  * the other < 0 number - Something wrong happens
 977  */
 978 static noinline int
 979 lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages,
 980                                 size_t num_pages, loff_t pos,
 981                                 size_t write_bytes,
 982                                 u64 *lockstart, u64 *lockend, bool nowait,
 983                                 struct extent_state **cached_state)
 984 {
 985         struct btrfs_fs_info *fs_info = inode->root->fs_info;
 986         u64 start_pos;
 987         u64 last_pos;
 988         int i;
 989         int ret = 0;
 990
 991         start_pos = round_down(pos, fs_info->sectorsize);
 992         last_pos = round_up(pos + write_bytes, fs_info->sectorsize) - 1;
 993
 994         if (start_pos < inode->vfs_inode.i_size) {
 995                 struct btrfs_ordered_extent *ordered;
 996
 997                 if (nowait) {
 998                         if (!try_lock_extent(&inode->io_tree, start_pos, last_pos,
 999                                              cached_state)) {
1000                                 for (i = 0; i < num_pages; i++) {
1001                                         unlock_page(pages[i]);
1002                                         put_page(pages[i]);
1003                                         pages[i] = NULL;
1004                                 }
1005
1006                                 return -EAGAIN;
1007                         }
1008                 } else {
1009                         lock_extent(&inode->io_tree, start_pos, last_pos, cached_state);
1010                 }
1011
1012                 ordered = btrfs_lookup_ordered_range(inode, start_pos,
1013                                                      last_pos - start_pos + 1);
1014                 if (ordered &&
1015                     ordered->file_offset + ordered->num_bytes > start_pos &&
1016                     ordered->file_offset <= last_pos) {
1017                         unlock_extent(&inode->io_tree, start_pos, last_pos,
1018                                       cached_state);
1019                         for (i = 0; i < num_pages; i++) {
1020                                 unlock_page(pages[i]);
1021                                 put_page(pages[i]);
1022                         }
1023                         btrfs_start_ordered_extent(ordered);
1024                         btrfs_put_ordered_extent(ordered);
1025                         return -EAGAIN;
1026                 }
1027                 if (ordered)
1028                         btrfs_put_ordered_extent(ordered);
1029
1030                 *lockstart = start_pos;
1031                 *lockend = last_pos;
1032                 ret = 1;
1033         }
1034
1035         /*
1036          * We should be called after prepare_pages() which should have locked
1037          * all pages in the range.
1038          */
1039         for (i = 0; i < num_pages; i++)
1040                 WARN_ON(!PageLocked(pages[i]));
1041
1042         return ret;
1043 }
1044
1045 /*
1046  * Check if we can do nocow write into the range [@pos, @pos + @write_bytes)
1047  *
1048  * @pos:         File offset.
1049  * @write_bytes: The length to write, will be updated to the nocow writeable
1050  *               range.
1051  *
1052  * This function will flush ordered extents in the range to ensure proper
1053  * nocow checks.
1054  *
1055  * Return:
1056  * > 0          If we can nocow, and updates @write_bytes.
1057  *  0           If we can't do a nocow write.
1058  * -EAGAIN      If we can't do a nocow write because snapshoting of the inode's
1059  *              root is in progress.
1060  * < 0          If an error happened.
1061  *
1062  * NOTE: Callers need to call btrfs_check_nocow_unlock() if we return > 0.
1063  */
1064 int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos,
1065                            size_t *write_bytes, bool nowait)
1066 {
1067         struct btrfs_fs_info *fs_info = inode->root->fs_info;
1068         struct btrfs_root *root = inode->root;
1069         struct extent_state *cached_state = NULL;
1070         u64 lockstart, lockend;
1071         u64 num_bytes;
1072         int ret;
1073
1074         if (!(inode->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)))
1075                 return 0;
1076
1077         if (!btrfs_drew_try_write_lock(&root->snapshot_lock))
1078                 return -EAGAIN;
1079
1080         lockstart = round_down(pos, fs_info->sectorsize);
1081         lockend = round_up(pos + *write_bytes,
1082                            fs_info->sectorsize) - 1;
1083         num_bytes = lockend - lockstart + 1;
1084
1085         if (nowait) {
1086                 if (!btrfs_try_lock_ordered_range(inode, lockstart, lockend,
1087                                                   &cached_state)) {
1088                         btrfs_drew_write_unlock(&root->snapshot_lock);
1089                         return -EAGAIN;
1090                 }
1091         } else {
1092                 btrfs_lock_and_flush_ordered_range(inode, lockstart, lockend,
1093                                                    &cached_state);
1094         }
1095         ret = can_nocow_extent(&inode->vfs_inode, lockstart, &num_bytes,
1096                         NULL, NULL, NULL, nowait, false);
1097         if (ret <= 0)
1098                 btrfs_drew_write_unlock(&root->snapshot_lock);
1099         else
1100                 *write_bytes = min_t(size_t, *write_bytes ,
1101                                      num_bytes - pos + lockstart);
1102         unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
1103
1104         return ret;
1105 }
1106
1107 void btrfs_check_nocow_unlock(struct btrfs_inode *inode)
1108 {
1109         btrfs_drew_write_unlock(&inode->root->snapshot_lock);
1110 }
1111
1112 static void update_time_for_write(struct inode *inode)
1113 {
1114         struct timespec64 now, ts;
1115
1116         if (IS_NOCMTIME(inode))
1117                 return;
1118
1119         now = current_time(inode);
1120         ts = inode_get_mtime(inode);
1121         if (!timespec64_equal(&ts, &now))
1122                 inode_set_mtime_to_ts(inode, now);
1123
1124         ts = inode_get_ctime(inode);
1125         if (!timespec64_equal(&ts, &now))
1126                 inode_set_ctime_to_ts(inode, now);
1127
1128         if (IS_I_VERSION(inode))
1129                 inode_inc_iversion(inode);
1130 }
1131
1132 static int btrfs_write_check(struct kiocb *iocb, struct iov_iter *from,
1133                              size_t count)
1134 {
1135         struct file *file = iocb->ki_filp;
1136         struct inode *inode = file_inode(file);
1137         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1138         loff_t pos = iocb->ki_pos;
1139         int ret;
1140         loff_t oldsize;
1141         loff_t start_pos;
1142
1143         /*
1144          * Quickly bail out on NOWAIT writes if we don't have the nodatacow or
1145          * prealloc flags, as without those flags we always have to COW. We will
1146          * later check if we can really COW into the target range (using
1147          * can_nocow_extent() at btrfs_get_blocks_direct_write()).
1148          */
1149         if ((iocb->ki_flags & IOCB_NOWAIT) &&
1150             !(BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)))
1151                 return -EAGAIN;
1152
1153         ret = file_remove_privs(file);
1154         if (ret)
1155                 return ret;
1156
1157         /*
1158          * We reserve space for updating the inode when we reserve space for the
1159          * extent we are going to write, so we will enospc out there.  We don't
1160          * need to start yet another transaction to update the inode as we will
1161          * update the inode when we finish writing whatever data we write.
1162          */
1163         update_time_for_write(inode);
1164
1165         start_pos = round_down(pos, fs_info->sectorsize);
1166         oldsize = i_size_read(inode);
1167         if (start_pos > oldsize) {
1168                 /* Expand hole size to cover write data, preventing empty gap */
1169                 loff_t end_pos = round_up(pos + count, fs_info->sectorsize);
1170
1171                 ret = btrfs_cont_expand(BTRFS_I(inode), oldsize, end_pos);
1172                 if (ret)
1173                         return ret;
1174         }
1175
1176         return 0;
1177 }
1178
1179 static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
1180                                                struct iov_iter *i)
1181 {
1182         struct file *file = iocb->ki_filp;
1183         loff_t pos;
1184         struct inode *inode = file_inode(file);
1185         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1186         struct page **pages = NULL;
1187         struct extent_changeset *data_reserved = NULL;
1188         u64 release_bytes = 0;
1189         u64 lockstart;
1190         u64 lockend;
1191         size_t num_written = 0;
1192         int nrptrs;
1193         ssize_t ret;
1194         bool only_release_metadata = false;
1195         bool force_page_uptodate = false;
1196         loff_t old_isize = i_size_read(inode);
1197         unsigned int ilock_flags = 0;
1198         const bool nowait = (iocb->ki_flags & IOCB_NOWAIT);
1199         unsigned int bdp_flags = (nowait ? BDP_ASYNC : 0);
1200
1201         if (nowait)
1202                 ilock_flags |= BTRFS_ILOCK_TRY;
1203
1204         ret = btrfs_inode_lock(BTRFS_I(inode), ilock_flags);
1205         if (ret < 0)
1206                 return ret;
1207
1208         ret = generic_write_checks(iocb, i);
1209         if (ret <= 0)
1210                 goto out;
1211
1212         ret = btrfs_write_check(iocb, i, ret);
1213         if (ret < 0)
1214                 goto out;
1215
1216         pos = iocb->ki_pos;
1217         nrptrs = min(DIV_ROUND_UP(iov_iter_count(i), PAGE_SIZE),
1218                         PAGE_SIZE / (sizeof(struct page *)));
1219         nrptrs = min(nrptrs, current->nr_dirtied_pause - current->nr_dirtied);
1220         nrptrs = max(nrptrs, 8);
1221         pages = kmalloc_array(nrptrs, sizeof(struct page *), GFP_KERNEL);
1222         if (!pages) {
1223                 ret = -ENOMEM;
1224                 goto out;
1225         }
1226
1227         while (iov_iter_count(i) > 0) {
1228                 struct extent_state *cached_state = NULL;
1229                 size_t offset = offset_in_page(pos);
1230                 size_t sector_offset;
1231                 size_t write_bytes = min(iov_iter_count(i),
1232                                          nrptrs * (size_t)PAGE_SIZE -
1233                                          offset);
1234                 size_t num_pages;
1235                 size_t reserve_bytes;
1236                 size_t dirty_pages;
1237                 size_t copied;
1238                 size_t dirty_sectors;
1239                 size_t num_sectors;
1240                 int extents_locked;
1241
1242                 /*
1243                  * Fault pages before locking them in prepare_pages
1244                  * to avoid recursive lock
1245                  */
1246                 if (unlikely(fault_in_iov_iter_readable(i, write_bytes))) {
1247                         ret = -EFAULT;
1248                         break;
1249                 }
1250
1251                 only_release_metadata = false;
1252                 sector_offset = pos & (fs_info->sectorsize - 1);
1253
1254                 extent_changeset_release(data_reserved);
1255                 ret = btrfs_check_data_free_space(BTRFS_I(inode),
1256                                                   &data_reserved, pos,
1257                                                   write_bytes, nowait);
1258                 if (ret < 0) {
1259                         int can_nocow;
1260
1261                         if (nowait && (ret == -ENOSPC || ret == -EAGAIN)) {
1262                                 ret = -EAGAIN;
1263                                 break;
1264                         }
1265
1266                         /*
1267                          * If we don't have to COW at the offset, reserve
1268                          * metadata only. write_bytes may get smaller than
1269                          * requested here.
1270                          */
1271                         can_nocow = btrfs_check_nocow_lock(BTRFS_I(inode), pos,
1272                                                            &write_bytes, nowait);
1273                         if (can_nocow < 0)
1274                                 ret = can_nocow;
1275                         if (can_nocow > 0)
1276                                 ret = 0;
1277                         if (ret)
1278                                 break;
1279                         only_release_metadata = true;
1280                 }
1281
1282                 num_pages = DIV_ROUND_UP(write_bytes + offset, PAGE_SIZE);
1283                 WARN_ON(num_pages > nrptrs);
1284                 reserve_bytes = round_up(write_bytes + sector_offset,
1285                                          fs_info->sectorsize);
1286                 WARN_ON(reserve_bytes == 0);
1287                 ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode),
1288                                                       reserve_bytes,
1289                                                       reserve_bytes, nowait);
1290                 if (ret) {
1291                         if (!only_release_metadata)
1292                                 btrfs_free_reserved_data_space(BTRFS_I(inode),
1293                                                 data_reserved, pos,
1294                                                 write_bytes);
1295                         else
1296                                 btrfs_check_nocow_unlock(BTRFS_I(inode));
1297
1298                         if (nowait && ret == -ENOSPC)
1299                                 ret = -EAGAIN;
1300                         break;
1301                 }
1302
1303                 release_bytes = reserve_bytes;
1304 again:
1305                 ret = balance_dirty_pages_ratelimited_flags(inode->i_mapping, bdp_flags);
1306                 if (ret) {
1307                         btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes);
1308                         break;
1309                 }
1310
1311                 /*
1312                  * This is going to setup the pages array with the number of
1313                  * pages we want, so we don't really need to worry about the
1314                  * contents of pages from loop to loop
1315                  */
1316                 ret = prepare_pages(inode, pages, num_pages,
1317                                     pos, write_bytes, force_page_uptodate, false);
1318                 if (ret) {
1319                         btrfs_delalloc_release_extents(BTRFS_I(inode),
1320                                                        reserve_bytes);
1321                         break;
1322                 }
1323
1324                 extents_locked = lock_and_cleanup_extent_if_need(
1325                                 BTRFS_I(inode), pages,
1326                                 num_pages, pos, write_bytes, &lockstart,
1327                                 &lockend, nowait, &cached_state);
1328                 if (extents_locked < 0) {
1329                         if (!nowait && extents_locked == -EAGAIN)
1330                                 goto again;
1331
1332                         btrfs_delalloc_release_extents(BTRFS_I(inode),
1333                                                        reserve_bytes);
1334                         ret = extents_locked;
1335                         break;
1336                 }
1337
1338                 copied = btrfs_copy_from_user(pos, write_bytes, pages, i);
1339
1340                 num_sectors = BTRFS_BYTES_TO_BLKS(fs_info, reserve_bytes);
1341                 dirty_sectors = round_up(copied + sector_offset,
1342                                         fs_info->sectorsize);
1343                 dirty_sectors = BTRFS_BYTES_TO_BLKS(fs_info, dirty_sectors);
1344
1345                 /*
1346                  * if we have trouble faulting in the pages, fall
1347                  * back to one page at a time
1348                  */
1349                 if (copied < write_bytes)
1350                         nrptrs = 1;
1351
1352                 if (copied == 0) {
1353                         force_page_uptodate = true;
1354                         dirty_sectors = 0;
1355                         dirty_pages = 0;
1356                 } else {
1357                         force_page_uptodate = false;
1358                         dirty_pages = DIV_ROUND_UP(copied + offset,
1359                                                    PAGE_SIZE);
1360                 }
1361
1362                 if (num_sectors > dirty_sectors) {
1363                         /* release everything except the sectors we dirtied */
1364                         release_bytes -= dirty_sectors << fs_info->sectorsize_bits;
1365                         if (only_release_metadata) {
1366                                 btrfs_delalloc_release_metadata(BTRFS_I(inode),
1367                                                         release_bytes, true);
1368                         } else {
1369                                 u64 __pos;
1370
1371                                 __pos = round_down(pos,
1372                                                    fs_info->sectorsize) +
1373                                         (dirty_pages << PAGE_SHIFT);
1374                                 btrfs_delalloc_release_space(BTRFS_I(inode),
1375                                                 data_reserved, __pos,
1376                                                 release_bytes, true);
1377                         }
1378                 }
1379
1380                 release_bytes = round_up(copied + sector_offset,
1381                                         fs_info->sectorsize);
1382
1383                 ret = btrfs_dirty_pages(BTRFS_I(inode), pages,
1384                                         dirty_pages, pos, copied,
1385                                         &cached_state, only_release_metadata);
1386
1387                 /*
1388                  * If we have not locked the extent range, because the range's
1389                  * start offset is >= i_size, we might still have a non-NULL
1390                  * cached extent state, acquired while marking the extent range
1391                  * as delalloc through btrfs_dirty_pages(). Therefore free any
1392                  * possible cached extent state to avoid a memory leak.
1393                  */
1394                 if (extents_locked)
1395                         unlock_extent(&BTRFS_I(inode)->io_tree, lockstart,
1396                                       lockend, &cached_state);
1397                 else
1398                         free_extent_state(cached_state);
1399
1400                 btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes);
1401                 if (ret) {
1402                         btrfs_drop_pages(fs_info, pages, num_pages, pos, copied);
1403                         break;
1404                 }
1405
1406                 release_bytes = 0;
1407                 if (only_release_metadata)
1408                         btrfs_check_nocow_unlock(BTRFS_I(inode));
1409
1410                 btrfs_drop_pages(fs_info, pages, num_pages, pos, copied);
1411
1412                 cond_resched();
1413
1414                 pos += copied;
1415                 num_written += copied;
1416         }
1417
1418         kfree(pages);
1419
1420         if (release_bytes) {
1421                 if (only_release_metadata) {
1422                         btrfs_check_nocow_unlock(BTRFS_I(inode));
1423                         btrfs_delalloc_release_metadata(BTRFS_I(inode),
1424                                         release_bytes, true);
1425                 } else {
1426                         btrfs_delalloc_release_space(BTRFS_I(inode),
1427                                         data_reserved,
1428                                         round_down(pos, fs_info->sectorsize),
1429                                         release_bytes, true);
1430                 }
1431         }
1432
1433         extent_changeset_free(data_reserved);
1434         if (num_written > 0) {
1435                 pagecache_isize_extended(inode, old_isize, iocb->ki_pos);
1436                 iocb->ki_pos += num_written;
1437         }
1438 out:
1439         btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
1440         return num_written ? num_written : ret;
1441 }
1442
1443 static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info,
1444                                const struct iov_iter *iter, loff_t offset)
1445 {
1446         const u32 blocksize_mask = fs_info->sectorsize - 1;
1447
1448         if (offset & blocksize_mask)
1449                 return -EINVAL;
1450
1451         if (iov_iter_alignment(iter) & blocksize_mask)
1452                 return -EINVAL;
1453
1454         return 0;
1455 }
1456
1457 static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
1458 {
1459         struct file *file = iocb->ki_filp;
1460         struct inode *inode = file_inode(file);
1461         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1462         loff_t pos;
1463         ssize_t written = 0;
1464         ssize_t written_buffered;
1465         size_t prev_left = 0;
1466         loff_t endbyte;
1467         ssize_t err;
1468         unsigned int ilock_flags = 0;
1469         struct iomap_dio *dio;
1470
1471         if (iocb->ki_flags & IOCB_NOWAIT)
1472                 ilock_flags |= BTRFS_ILOCK_TRY;
1473
1474         /*
1475          * If the write DIO is within EOF, use a shared lock and also only if
1476          * security bits will likely not be dropped by file_remove_privs() called
1477          * from btrfs_write_check(). Either will need to be rechecked after the
1478          * lock was acquired.
1479          */
1480         if (iocb->ki_pos + iov_iter_count(from) <= i_size_read(inode) && IS_NOSEC(inode))
1481                 ilock_flags |= BTRFS_ILOCK_SHARED;
1482
1483 relock:
1484         err = btrfs_inode_lock(BTRFS_I(inode), ilock_flags);
1485         if (err < 0)
1486                 return err;
1487
1488         /* Shared lock cannot be used with security bits set. */
1489         if ((ilock_flags & BTRFS_ILOCK_SHARED) && !IS_NOSEC(inode)) {
1490                 btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
1491                 ilock_flags &= ~BTRFS_ILOCK_SHARED;
1492                 goto relock;
1493         }
1494
1495         err = generic_write_checks(iocb, from);
1496         if (err <= 0) {
1497                 btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
1498                 return err;
1499         }
1500
1501         err = btrfs_write_check(iocb, from, err);
1502         if (err < 0) {
1503                 btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
1504                 goto out;
1505         }
1506
1507         pos = iocb->ki_pos;
1508         /*
1509          * Re-check since file size may have changed just before taking the
1510          * lock or pos may have changed because of O_APPEND in generic_write_check()
1511          */
1512         if ((ilock_flags & BTRFS_ILOCK_SHARED) &&
1513             pos + iov_iter_count(from) > i_size_read(inode)) {
1514                 btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
1515                 ilock_flags &= ~BTRFS_ILOCK_SHARED;
1516                 goto relock;
1517         }
1518
1519         if (check_direct_IO(fs_info, from, pos)) {
1520                 btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
1521                 goto buffered;
1522         }
1523
1524         /*
1525          * The iov_iter can be mapped to the same file range we are writing to.
1526          * If that's the case, then we will deadlock in the iomap code, because
1527          * it first calls our callback btrfs_dio_iomap_begin(), which will create
1528          * an ordered extent, and after that it will fault in the pages that the
1529          * iov_iter refers to. During the fault in we end up in the readahead
1530          * pages code (starting at btrfs_readahead()), which will lock the range,
1531          * find that ordered extent and then wait for it to complete (at
1532          * btrfs_lock_and_flush_ordered_range()), resulting in a deadlock since
1533          * obviously the ordered extent can never complete as we didn't submit
1534          * yet the respective bio(s). This always happens when the buffer is
1535          * memory mapped to the same file range, since the iomap DIO code always
1536          * invalidates pages in the target file range (after starting and waiting
1537          * for any writeback).
1538          *
1539          * So here we disable page faults in the iov_iter and then retry if we
1540          * got -EFAULT, faulting in the pages before the retry.
1541          */
1542         from->nofault = true;
1543         dio = btrfs_dio_write(iocb, from, written);
1544         from->nofault = false;
1545
1546         /*
1547          * iomap_dio_complete() will call btrfs_sync_file() if we have a dsync
1548          * iocb, and that needs to lock the inode. So unlock it before calling
1549          * iomap_dio_complete() to avoid a deadlock.
1550          */
1551         btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
1552
1553         if (IS_ERR_OR_NULL(dio))
1554                 err = PTR_ERR_OR_ZERO(dio);
1555         else
1556                 err = iomap_dio_complete(dio);
1557
1558         /* No increment (+=) because iomap returns a cumulative value. */
1559         if (err > 0)
1560                 written = err;
1561
1562         if (iov_iter_count(from) > 0 && (err == -EFAULT || err > 0)) {
1563                 const size_t left = iov_iter_count(from);
1564                 /*
1565                  * We have more data left to write. Try to fault in as many as
1566                  * possible of the remainder pages and retry. We do this without
1567                  * releasing and locking again the inode, to prevent races with
1568                  * truncate.
1569                  *
1570                  * Also, in case the iov refers to pages in the file range of the
1571                  * file we want to write to (due to a mmap), we could enter an
1572                  * infinite loop if we retry after faulting the pages in, since
1573                  * iomap will invalidate any pages in the range early on, before
1574                  * it tries to fault in the pages of the iov. So we keep track of
1575                  * how much was left of iov in the previous EFAULT and fallback
1576                  * to buffered IO in case we haven't made any progress.
1577                  */
1578                 if (left == prev_left) {
1579                         err = -ENOTBLK;
1580                 } else {
1581                         fault_in_iov_iter_readable(from, left);
1582                         prev_left = left;
1583                         goto relock;
1584                 }
1585         }
1586
1587         /*
1588          * If 'err' is -ENOTBLK or we have not written all data, then it means
1589          * we must fallback to buffered IO.
1590          */
1591         if ((err < 0 && err != -ENOTBLK) || !iov_iter_count(from))
1592                 goto out;
1593
1594 buffered:
1595         /*
1596          * If we are in a NOWAIT context, then return -EAGAIN to signal the caller
1597          * it must retry the operation in a context where blocking is acceptable,
1598          * because even if we end up not blocking during the buffered IO attempt
1599          * below, we will block when flushing and waiting for the IO.
1600          */
1601         if (iocb->ki_flags & IOCB_NOWAIT) {
1602                 err = -EAGAIN;
1603                 goto out;
1604         }
1605
1606         pos = iocb->ki_pos;
1607         written_buffered = btrfs_buffered_write(iocb, from);
1608         if (written_buffered < 0) {
1609                 err = written_buffered;
1610                 goto out;
1611         }
1612         /*
1613          * Ensure all data is persisted. We want the next direct IO read to be
1614          * able to read what was just written.
1615          */
1616         endbyte = pos + written_buffered - 1;
1617         err = btrfs_fdatawrite_range(inode, pos, endbyte);
1618         if (err)
1619                 goto out;
1620         err = filemap_fdatawait_range(inode->i_mapping, pos, endbyte);
1621         if (err)
1622                 goto out;
1623         written += written_buffered;
1624         iocb->ki_pos = pos + written_buffered;
1625         invalidate_mapping_pages(file->f_mapping, pos >> PAGE_SHIFT,
1626                                  endbyte >> PAGE_SHIFT);
1627 out:
1628         return err < 0 ? err : written;
1629 }
1630
1631 static ssize_t btrfs_encoded_write(struct kiocb *iocb, struct iov_iter *from,
1632                         const struct btrfs_ioctl_encoded_io_args *encoded)
1633 {
1634         struct file *file = iocb->ki_filp;
1635         struct inode *inode = file_inode(file);
1636         loff_t count;
1637         ssize_t ret;
1638
1639         btrfs_inode_lock(BTRFS_I(inode), 0);
1640         count = encoded->len;
1641         ret = generic_write_checks_count(iocb, &count);
1642         if (ret == 0 && count != encoded->len) {
1643                 /*
1644                  * The write got truncated by generic_write_checks_count(). We
1645                  * can't do a partial encoded write.
1646                  */
1647                 ret = -EFBIG;
1648         }
1649         if (ret || encoded->len == 0)
1650                 goto out;
1651
1652         ret = btrfs_write_check(iocb, from, encoded->len);
1653         if (ret < 0)
1654                 goto out;
1655
1656         ret = btrfs_do_encoded_write(iocb, from, encoded);
1657 out:
1658         btrfs_inode_unlock(BTRFS_I(inode), 0);
1659         return ret;
1660 }
1661
1662 ssize_t btrfs_do_write_iter(struct kiocb *iocb, struct iov_iter *from,
1663                             const struct btrfs_ioctl_encoded_io_args *encoded)
1664 {
1665         struct file *file = iocb->ki_filp;
1666         struct btrfs_inode *inode = BTRFS_I(file_inode(file));
1667         ssize_t num_written, num_sync;
1668
1669         /*
1670          * If the fs flips readonly due to some impossible error, although we
1671          * have opened a file as writable, we have to stop this write operation
1672          * to ensure consistency.
1673          */
1674         if (BTRFS_FS_ERROR(inode->root->fs_info))
1675                 return -EROFS;
1676
1677         if (encoded && (iocb->ki_flags & IOCB_NOWAIT))
1678                 return -EOPNOTSUPP;
1679
1680         if (encoded) {
1681                 num_written = btrfs_encoded_write(iocb, from, encoded);
1682                 num_sync = encoded->len;
1683         } else if (iocb->ki_flags & IOCB_DIRECT) {
1684                 num_written = btrfs_direct_write(iocb, from);
1685                 num_sync = num_written;
1686         } else {
1687                 num_written = btrfs_buffered_write(iocb, from);
1688                 num_sync = num_written;
1689         }
1690
1691         btrfs_set_inode_last_sub_trans(inode);
1692
1693         if (num_sync > 0) {
1694                 num_sync = generic_write_sync(iocb, num_sync);
1695                 if (num_sync < 0)
1696                         num_written = num_sync;
1697         }
1698
1699         return num_written;
1700 }
1701
1702 static ssize_t btrfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1703 {
1704         return btrfs_do_write_iter(iocb, from, NULL);
1705 }
1706
1707 int btrfs_release_file(struct inode *inode, struct file *filp)
1708 {
1709         struct btrfs_file_private *private = filp->private_data;
1710
1711         if (private) {
1712                 kfree(private->filldir_buf);
1713                 free_extent_state(private->llseek_cached_state);
1714                 kfree(private);
1715                 filp->private_data = NULL;
1716         }
1717
1718         /*
1719          * Set by setattr when we are about to truncate a file from a non-zero
1720          * size to a zero size.  This tries to flush down new bytes that may
1721          * have been written if the application were using truncate to replace
1722          * a file in place.
1723          */
1724         if (test_and_clear_bit(BTRFS_INODE_FLUSH_ON_CLOSE,
1725                                &BTRFS_I(inode)->runtime_flags))
1726                         filemap_flush(inode->i_mapping);
1727         return 0;
1728 }
1729
1730 static int start_ordered_ops(struct inode *inode, loff_t start, loff_t end)
1731 {
1732         int ret;
1733         struct blk_plug plug;
1734
1735         /*
1736          * This is only called in fsync, which would do synchronous writes, so
1737          * a plug can merge adjacent IOs as much as possible.  Esp. in case of
1738          * multiple disks using raid profile, a large IO can be split to
1739          * several segments of stripe length (currently 64K).
1740          */
1741         blk_start_plug(&plug);
1742         ret = btrfs_fdatawrite_range(inode, start, end);
1743         blk_finish_plug(&plug);
1744
1745         return ret;
1746 }
1747
1748 static inline bool skip_inode_logging(const struct btrfs_log_ctx *ctx)
1749 {
1750         struct btrfs_inode *inode = BTRFS_I(ctx->inode);
1751         struct btrfs_fs_info *fs_info = inode->root->fs_info;
1752
1753         if (btrfs_inode_in_log(inode, btrfs_get_fs_generation(fs_info)) &&
1754             list_empty(&ctx->ordered_extents))
1755                 return true;
1756
1757         /*
1758          * If we are doing a fast fsync we can not bail out if the inode's
1759          * last_trans is <= then the last committed transaction, because we only
1760          * update the last_trans of the inode during ordered extent completion,
1761          * and for a fast fsync we don't wait for that, we only wait for the
1762          * writeback to complete.
1763          */
1764         if (inode->last_trans <= btrfs_get_last_trans_committed(fs_info) &&
1765             (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) ||
1766              list_empty(&ctx->ordered_extents)))
1767                 return true;
1768
1769         return false;
1770 }
1771
1772 /*
1773  * fsync call for both files and directories.  This logs the inode into
1774  * the tree log instead of forcing full commits whenever possible.
1775  *
1776  * It needs to call filemap_fdatawait so that all ordered extent updates are
1777  * in the metadata btree are up to date for copying to the log.
1778  *
1779  * It drops the inode mutex before doing the tree log commit.  This is an
1780  * important optimization for directories because holding the mutex prevents
1781  * new operations on the dir while we write to disk.
1782  */
1783 int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
1784 {
1785         struct dentry *dentry = file_dentry(file);
1786         struct inode *inode = d_inode(dentry);
1787         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1788         struct btrfs_root *root = BTRFS_I(inode)->root;
1789         struct btrfs_trans_handle *trans;
1790         struct btrfs_log_ctx ctx;
1791         int ret = 0, err;
1792         u64 len;
1793         bool full_sync;
1794
1795         trace_btrfs_sync_file(file, datasync);
1796
1797         btrfs_init_log_ctx(&ctx, inode);
1798
1799         /*
1800          * Always set the range to a full range, otherwise we can get into
1801          * several problems, from missing file extent items to represent holes
1802          * when not using the NO_HOLES feature, to log tree corruption due to
1803          * races between hole detection during logging and completion of ordered
1804          * extents outside the range, to missing checksums due to ordered extents
1805          * for which we flushed only a subset of their pages.
1806          */
1807         start = 0;
1808         end = LLONG_MAX;
1809         len = (u64)LLONG_MAX + 1;
1810
1811         /*
1812          * We write the dirty pages in the range and wait until they complete
1813          * out of the ->i_mutex. If so, we can flush the dirty pages by
1814          * multi-task, and make the performance up.  See
1815          * btrfs_wait_ordered_range for an explanation of the ASYNC check.
1816          */
1817         ret = start_ordered_ops(inode, start, end);
1818         if (ret)
1819                 goto out;
1820
1821         btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
1822
1823         atomic_inc(&root->log_batch);
1824
1825         /*
1826          * Before we acquired the inode's lock and the mmap lock, someone may
1827          * have dirtied more pages in the target range. We need to make sure
1828          * that writeback for any such pages does not start while we are logging
1829          * the inode, because if it does, any of the following might happen when
1830          * we are not doing a full inode sync:
1831          *
1832          * 1) We log an extent after its writeback finishes but before its
1833          *    checksums are added to the csum tree, leading to -EIO errors
1834          *    when attempting to read the extent after a log replay.
1835          *
1836          * 2) We can end up logging an extent before its writeback finishes.
1837          *    Therefore after the log replay we will have a file extent item
1838          *    pointing to an unwritten extent (and no data checksums as well).
1839          *
1840          * So trigger writeback for any eventual new dirty pages and then we
1841          * wait for all ordered extents to complete below.
1842          */
1843         ret = start_ordered_ops(inode, start, end);
1844         if (ret) {
1845                 btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
1846                 goto out;
1847         }
1848
1849         /*
1850          * Always check for the full sync flag while holding the inode's lock,
1851          * to avoid races with other tasks. The flag must be either set all the
1852          * time during logging or always off all the time while logging.
1853          * We check the flag here after starting delalloc above, because when
1854          * running delalloc the full sync flag may be set if we need to drop
1855          * extra extent map ranges due to temporary memory allocation failures.
1856          */
1857         full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
1858                              &BTRFS_I(inode)->runtime_flags);
1859
1860         /*
1861          * We have to do this here to avoid the priority inversion of waiting on
1862          * IO of a lower priority task while holding a transaction open.
1863          *
1864          * For a full fsync we wait for the ordered extents to complete while
1865          * for a fast fsync we wait just for writeback to complete, and then
1866          * attach the ordered extents to the transaction so that a transaction
1867          * commit waits for their completion, to avoid data loss if we fsync,
1868          * the current transaction commits before the ordered extents complete
1869          * and a power failure happens right after that.
1870          *
1871          * For zoned filesystem, if a write IO uses a ZONE_APPEND command, the
1872          * logical address recorded in the ordered extent may change. We need
1873          * to wait for the IO to stabilize the logical address.
1874          */
1875         if (full_sync || btrfs_is_zoned(fs_info)) {
1876                 ret = btrfs_wait_ordered_range(inode, start, len);
1877         } else {
1878                 /*
1879                  * Get our ordered extents as soon as possible to avoid doing
1880                  * checksum lookups in the csum tree, and use instead the
1881                  * checksums attached to the ordered extents.
1882                  */
1883                 btrfs_get_ordered_extents_for_logging(BTRFS_I(inode),
1884                                                       &ctx.ordered_extents);
1885                 ret = filemap_fdatawait_range(inode->i_mapping, start, end);
1886         }
1887
1888         if (ret)
1889                 goto out_release_extents;
1890
1891         atomic_inc(&root->log_batch);
1892
1893         if (skip_inode_logging(&ctx)) {
1894                 /*
1895                  * We've had everything committed since the last time we were
1896                  * modified so clear this flag in case it was set for whatever
1897                  * reason, it's no longer relevant.
1898                  */
1899                 clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
1900                           &BTRFS_I(inode)->runtime_flags);
1901                 /*
1902                  * An ordered extent might have started before and completed
1903                  * already with io errors, in which case the inode was not
1904                  * updated and we end up here. So check the inode's mapping
1905                  * for any errors that might have happened since we last
1906                  * checked called fsync.
1907                  */
1908                 ret = filemap_check_wb_err(inode->i_mapping, file->f_wb_err);
1909                 goto out_release_extents;
1910         }
1911
1912         /*
1913          * We use start here because we will need to wait on the IO to complete
1914          * in btrfs_sync_log, which could require joining a transaction (for
1915          * example checking cross references in the nocow path).  If we use join
1916          * here we could get into a situation where we're waiting on IO to
1917          * happen that is blocked on a transaction trying to commit.  With start
1918          * we inc the extwriter counter, so we wait for all extwriters to exit
1919          * before we start blocking joiners.  This comment is to keep somebody
1920          * from thinking they are super smart and changing this to
1921          * btrfs_join_transaction *cough*Josef*cough*.
1922          */
1923         trans = btrfs_start_transaction(root, 0);
1924         if (IS_ERR(trans)) {
1925                 ret = PTR_ERR(trans);
1926                 goto out_release_extents;
1927         }
1928         trans->in_fsync = true;
1929
1930         ret = btrfs_log_dentry_safe(trans, dentry, &ctx);
1931         btrfs_release_log_ctx_extents(&ctx);
1932         if (ret < 0) {
1933                 /* Fallthrough and commit/free transaction. */
1934                 ret = BTRFS_LOG_FORCE_COMMIT;
1935         }
1936
1937         /* we've logged all the items and now have a consistent
1938          * version of the file in the log.  It is possible that
1939          * someone will come in and modify the file, but that's
1940          * fine because the log is consistent on disk, and we
1941          * have references to all of the file's extents
1942          *
1943          * It is possible that someone will come in and log the
1944          * file again, but that will end up using the synchronization
1945          * inside btrfs_sync_log to keep things safe.
1946          */
1947         btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
1948
1949         if (ret == BTRFS_NO_LOG_SYNC) {
1950                 ret = btrfs_end_transaction(trans);
1951                 goto out;
1952         }
1953
1954         /* We successfully logged the inode, attempt to sync the log. */
1955         if (!ret) {
1956                 ret = btrfs_sync_log(trans, root, &ctx);
1957                 if (!ret) {
1958                         ret = btrfs_end_transaction(trans);
1959                         goto out;
1960                 }
1961         }
1962
1963         /*
1964          * At this point we need to commit the transaction because we had
1965          * btrfs_need_log_full_commit() or some other error.
1966          *
1967          * If we didn't do a full sync we have to stop the trans handle, wait on
1968          * the ordered extents, start it again and commit the transaction.  If
1969          * we attempt to wait on the ordered extents here we could deadlock with
1970          * something like fallocate() that is holding the extent lock trying to
1971          * start a transaction while some other thread is trying to commit the
1972          * transaction while we (fsync) are currently holding the transaction
1973          * open.
1974          */
1975         if (!full_sync) {
1976                 ret = btrfs_end_transaction(trans);
1977                 if (ret)
1978                         goto out;
1979                 ret = btrfs_wait_ordered_range(inode, start, len);
1980                 if (ret)
1981                         goto out;
1982
1983                 /*
1984                  * This is safe to use here because we're only interested in
1985                  * making sure the transaction that had the ordered extents is
1986                  * committed.  We aren't waiting on anything past this point,
1987                  * we're purely getting the transaction and committing it.
1988                  */
1989                 trans = btrfs_attach_transaction_barrier(root);
1990                 if (IS_ERR(trans)) {
1991                         ret = PTR_ERR(trans);
1992
1993                         /*
1994                          * We committed the transaction and there's no currently
1995                          * running transaction, this means everything we care
1996                          * about made it to disk and we are done.
1997                          */
1998                         if (ret == -ENOENT)
1999                                 ret = 0;
2000                         goto out;
2001                 }
2002         }
2003
2004         ret = btrfs_commit_transaction(trans);
2005 out:
2006         ASSERT(list_empty(&ctx.list));
2007         ASSERT(list_empty(&ctx.conflict_inodes));
2008         err = file_check_and_advance_wb_err(file);
2009         if (!ret)
2010                 ret = err;
2011         return ret > 0 ? -EIO : ret;
2012
2013 out_release_extents:
2014         btrfs_release_log_ctx_extents(&ctx);
2015         btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
2016         goto out;
2017 }
2018
2019 static const struct vm_operations_struct btrfs_file_vm_ops = {
2020         .fault          = filemap_fault,
2021         .map_pages      = filemap_map_pages,
2022         .page_mkwrite   = btrfs_page_mkwrite,
2023 };
2024
2025 static int btrfs_file_mmap(struct file  *filp, struct vm_area_struct *vma)
2026 {
2027         struct address_space *mapping = filp->f_mapping;
2028
2029         if (!mapping->a_ops->read_folio)
2030                 return -ENOEXEC;
2031
2032         file_accessed(filp);
2033         vma->vm_ops = &btrfs_file_vm_ops;
2034
2035         return 0;
2036 }
2037
2038 static int hole_mergeable(struct btrfs_inode *inode, struct extent_buffer *leaf,
2039                           int slot, u64 start, u64 end)
2040 {
2041         struct btrfs_file_extent_item *fi;
2042         struct btrfs_key key;
2043
2044         if (slot < 0 || slot >= btrfs_header_nritems(leaf))
2045                 return 0;
2046
2047         btrfs_item_key_to_cpu(leaf, &key, slot);
2048         if (key.objectid != btrfs_ino(inode) ||
2049             key.type != BTRFS_EXTENT_DATA_KEY)
2050                 return 0;
2051
2052         fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
2053
2054         if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG)
2055                 return 0;
2056
2057         if (btrfs_file_extent_disk_bytenr(leaf, fi))
2058                 return 0;
2059
2060         if (key.offset == end)
2061                 return 1;
2062         if (key.offset + btrfs_file_extent_num_bytes(leaf, fi) == start)
2063                 return 1;
2064         return 0;
2065 }
2066
2067 static int fill_holes(struct btrfs_trans_handle *trans,
2068                 struct btrfs_inode *inode,
2069                 struct btrfs_path *path, u64 offset, u64 end)
2070 {
2071         struct btrfs_fs_info *fs_info = trans->fs_info;
2072         struct btrfs_root *root = inode->root;
2073         struct extent_buffer *leaf;
2074         struct btrfs_file_extent_item *fi;
2075         struct extent_map *hole_em;
2076         struct btrfs_key key;
2077         int ret;
2078
2079         if (btrfs_fs_incompat(fs_info, NO_HOLES))
2080                 goto out;
2081
2082         key.objectid = btrfs_ino(inode);
2083         key.type = BTRFS_EXTENT_DATA_KEY;
2084         key.offset = offset;
2085
2086         ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2087         if (ret <= 0) {
2088                 /*
2089                  * We should have dropped this offset, so if we find it then
2090                  * something has gone horribly wrong.
2091                  */
2092                 if (ret == 0)
2093                         ret = -EINVAL;
2094                 return ret;
2095         }
2096
2097         leaf = path->nodes[0];
2098         if (hole_mergeable(inode, leaf, path->slots[0] - 1, offset, end)) {
2099                 u64 num_bytes;
2100
2101                 path->slots[0]--;
2102                 fi = btrfs_item_ptr(leaf, path->slots[0],
2103                                     struct btrfs_file_extent_item);
2104                 num_bytes = btrfs_file_extent_num_bytes(leaf, fi) +
2105                         end - offset;
2106                 btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
2107                 btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
2108                 btrfs_set_file_extent_offset(leaf, fi, 0);
2109                 btrfs_set_file_extent_generation(leaf, fi, trans->transid);
2110                 btrfs_mark_buffer_dirty(trans, leaf);
2111                 goto out;
2112         }
2113
2114         if (hole_mergeable(inode, leaf, path->slots[0], offset, end)) {
2115                 u64 num_bytes;
2116
2117                 key.offset = offset;
2118                 btrfs_set_item_key_safe(trans, path, &key);
2119                 fi = btrfs_item_ptr(leaf, path->slots[0],
2120                                     struct btrfs_file_extent_item);
2121                 num_bytes = btrfs_file_extent_num_bytes(leaf, fi) + end -
2122                         offset;
2123                 btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
2124                 btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
2125                 btrfs_set_file_extent_offset(leaf, fi, 0);
2126                 btrfs_set_file_extent_generation(leaf, fi, trans->transid);
2127                 btrfs_mark_buffer_dirty(trans, leaf);
2128                 goto out;
2129         }
2130         btrfs_release_path(path);
2131
2132         ret = btrfs_insert_hole_extent(trans, root, btrfs_ino(inode), offset,
2133                                        end - offset);
2134         if (ret)
2135                 return ret;
2136
2137 out:
2138         btrfs_release_path(path);
2139
2140         hole_em = alloc_extent_map();
2141         if (!hole_em) {
2142                 btrfs_drop_extent_map_range(inode, offset, end - 1, false);
2143                 btrfs_set_inode_full_sync(inode);
2144         } else {
2145                 hole_em->start = offset;
2146                 hole_em->len = end - offset;
2147                 hole_em->ram_bytes = hole_em->len;
2148                 hole_em->orig_start = offset;
2149
2150                 hole_em->block_start = EXTENT_MAP_HOLE;
2151                 hole_em->block_len = 0;
2152                 hole_em->orig_block_len = 0;
2153                 hole_em->compress_type = BTRFS_COMPRESS_NONE;
2154                 hole_em->generation = trans->transid;
2155
2156                 ret = btrfs_replace_extent_map_range(inode, hole_em, true);
2157                 free_extent_map(hole_em);
2158                 if (ret)
2159                         btrfs_set_inode_full_sync(inode);
2160         }
2161
2162         return 0;
2163 }
2164
2165 /*
2166  * Find a hole extent on given inode and change start/len to the end of hole
2167  * extent.(hole/vacuum extent whose em->start <= start &&
2168  *         em->start + em->len > start)
2169  * When a hole extent is found, return 1 and modify start/len.
2170  */
2171 static int find_first_non_hole(struct btrfs_inode *inode, u64 *start, u64 *len)
2172 {
2173         struct btrfs_fs_info *fs_info = inode->root->fs_info;
2174         struct extent_map *em;
2175         int ret = 0;
2176
2177         em = btrfs_get_extent(inode, NULL, 0,
2178                               round_down(*start, fs_info->sectorsize),
2179                               round_up(*len, fs_info->sectorsize));
2180         if (IS_ERR(em))
2181                 return PTR_ERR(em);
2182
2183         /* Hole or vacuum extent(only exists in no-hole mode) */
2184         if (em->block_start == EXTENT_MAP_HOLE) {
2185                 ret = 1;
2186                 *len = em->start + em->len > *start + *len ?
2187                        0 : *start + *len - em->start - em->len;
2188                 *start = em->start + em->len;
2189         }
2190         free_extent_map(em);
2191         return ret;
2192 }
2193
2194 static void btrfs_punch_hole_lock_range(struct inode *inode,
2195                                         const u64 lockstart,
2196                                         const u64 lockend,
2197                                         struct extent_state **cached_state)
2198 {
2199         /*
2200          * For subpage case, if the range is not at page boundary, we could
2201          * have pages at the leading/tailing part of the range.
2202          * This could lead to dead loop since filemap_range_has_page()
2203          * will always return true.
2204          * So here we need to do extra page alignment for
2205          * filemap_range_has_page().
2206          */
2207         const u64 page_lockstart = round_up(lockstart, PAGE_SIZE);
2208         const u64 page_lockend = round_down(lockend + 1, PAGE_SIZE) - 1;
2209
2210         while (1) {
2211                 truncate_pagecache_range(inode, lockstart, lockend);
2212
2213                 lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
2214                             cached_state);
2215                 /*
2216                  * We can't have ordered extents in the range, nor dirty/writeback
2217                  * pages, because we have locked the inode's VFS lock in exclusive
2218                  * mode, we have locked the inode's i_mmap_lock in exclusive mode,
2219                  * we have flushed all delalloc in the range and we have waited
2220                  * for any ordered extents in the range to complete.
2221                  * We can race with anyone reading pages from this range, so after
2222                  * locking the range check if we have pages in the range, and if
2223                  * we do, unlock the range and retry.
2224                  */
2225                 if (!filemap_range_has_page(inode->i_mapping, page_lockstart,
2226                                             page_lockend))
2227                         break;
2228
2229                 unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
2230                               cached_state);
2231         }
2232
2233         btrfs_assert_inode_range_clean(BTRFS_I(inode), lockstart, lockend);
2234 }
2235
2236 static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans,
2237                                      struct btrfs_inode *inode,
2238                                      struct btrfs_path *path,
2239                                      struct btrfs_replace_extent_info *extent_info,
2240                                      const u64 replace_len,
2241                                      const u64 bytes_to_drop)
2242 {
2243         struct btrfs_fs_info *fs_info = trans->fs_info;
2244         struct btrfs_root *root = inode->root;
2245         struct btrfs_file_extent_item *extent;
2246         struct extent_buffer *leaf;
2247         struct btrfs_key key;
2248         int slot;
2249         struct btrfs_ref ref = { 0 };
2250         int ret;
2251
2252         if (replace_len == 0)
2253                 return 0;
2254
2255         if (extent_info->disk_offset == 0 &&
2256             btrfs_fs_incompat(fs_info, NO_HOLES)) {
2257                 btrfs_update_inode_bytes(inode, 0, bytes_to_drop);
2258                 return 0;
2259         }
2260
2261         key.objectid = btrfs_ino(inode);
2262         key.type = BTRFS_EXTENT_DATA_KEY;
2263         key.offset = extent_info->file_offset;
2264         ret = btrfs_insert_empty_item(trans, root, path, &key,
2265                                       sizeof(struct btrfs_file_extent_item));
2266         if (ret)
2267                 return ret;
2268         leaf = path->nodes[0];
2269         slot = path->slots[0];
2270         write_extent_buffer(leaf, extent_info->extent_buf,
2271                             btrfs_item_ptr_offset(leaf, slot),
2272                             sizeof(struct btrfs_file_extent_item));
2273         extent = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
2274         ASSERT(btrfs_file_extent_type(leaf, extent) != BTRFS_FILE_EXTENT_INLINE);
2275         btrfs_set_file_extent_offset(leaf, extent, extent_info->data_offset);
2276         btrfs_set_file_extent_num_bytes(leaf, extent, replace_len);
2277         if (extent_info->is_new_extent)
2278                 btrfs_set_file_extent_generation(leaf, extent, trans->transid);
2279         btrfs_mark_buffer_dirty(trans, leaf);
2280         btrfs_release_path(path);
2281
2282         ret = btrfs_inode_set_file_extent_range(inode, extent_info->file_offset,
2283                                                 replace_len);
2284         if (ret)
2285                 return ret;
2286
2287         /* If it's a hole, nothing more needs to be done. */
2288         if (extent_info->disk_offset == 0) {
2289                 btrfs_update_inode_bytes(inode, 0, bytes_to_drop);
2290                 return 0;
2291         }
2292
2293         btrfs_update_inode_bytes(inode, replace_len, bytes_to_drop);
2294
2295         if (extent_info->is_new_extent && extent_info->insertions == 0) {
2296                 key.objectid = extent_info->disk_offset;
2297                 key.type = BTRFS_EXTENT_ITEM_KEY;
2298                 key.offset = extent_info->disk_len;
2299                 ret = btrfs_alloc_reserved_file_extent(trans, root,
2300                                                        btrfs_ino(inode),
2301                                                        extent_info->file_offset,
2302                                                        extent_info->qgroup_reserved,
2303                                                        &key);
2304         } else {
2305                 u64 ref_offset;
2306
2307                 btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF,
2308                                        extent_info->disk_offset,
2309                                        extent_info->disk_len, 0,
2310                                        root->root_key.objectid);
2311                 ref_offset = extent_info->file_offset - extent_info->data_offset;
2312                 btrfs_init_data_ref(&ref, root->root_key.objectid,
2313                                     btrfs_ino(inode), ref_offset, 0, false);
2314                 ret = btrfs_inc_extent_ref(trans, &ref);
2315         }
2316
2317         extent_info->insertions++;
2318
2319         return ret;
2320 }
2321
2322 /*
2323  * The respective range must have been previously locked, as well as the inode.
2324  * The end offset is inclusive (last byte of the range).
2325  * @extent_info is NULL for fallocate's hole punching and non-NULL when replacing
2326  * the file range with an extent.
2327  * When not punching a hole, we don't want to end up in a state where we dropped
2328  * extents without inserting a new one, so we must abort the transaction to avoid
2329  * a corruption.
2330  */
2331 int btrfs_replace_file_extents(struct btrfs_inode *inode,
2332                                struct btrfs_path *path, const u64 start,
2333                                const u64 end,
2334                                struct btrfs_replace_extent_info *extent_info,
2335                                struct btrfs_trans_handle **trans_out)
2336 {
2337         struct btrfs_drop_extents_args drop_args = { 0 };
2338         struct btrfs_root *root = inode->root;
2339         struct btrfs_fs_info *fs_info = root->fs_info;
2340         u64 min_size = btrfs_calc_insert_metadata_size(fs_info, 1);
2341         u64 ino_size = round_up(inode->vfs_inode.i_size, fs_info->sectorsize);
2342         struct btrfs_trans_handle *trans = NULL;
2343         struct btrfs_block_rsv *rsv;
2344         unsigned int rsv_count;
2345         u64 cur_offset;
2346         u64 len = end - start;
2347         int ret = 0;
2348
2349         if (end <= start)
2350                 return -EINVAL;
2351
2352         rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
2353         if (!rsv) {
2354                 ret = -ENOMEM;
2355                 goto out;
2356         }
2357         rsv->size = btrfs_calc_insert_metadata_size(fs_info, 1);
2358         rsv->failfast = true;
2359
2360         /*
2361          * 1 - update the inode
2362          * 1 - removing the extents in the range
2363          * 1 - adding the hole extent if no_holes isn't set or if we are
2364          *     replacing the range with a new extent
2365          */
2366         if (!btrfs_fs_incompat(fs_info, NO_HOLES) || extent_info)
2367                 rsv_count = 3;
2368         else
2369                 rsv_count = 2;
2370
2371         trans = btrfs_start_transaction(root, rsv_count);
2372         if (IS_ERR(trans)) {
2373                 ret = PTR_ERR(trans);
2374                 trans = NULL;
2375                 goto out_free;
2376         }
2377
2378         ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv,
2379                                       min_size, false);
2380         if (WARN_ON(ret))
2381                 goto out_trans;
2382         trans->block_rsv = rsv;
2383
2384         cur_offset = start;
2385         drop_args.path = path;
2386         drop_args.end = end + 1;
2387         drop_args.drop_cache = true;
2388         while (cur_offset < end) {
2389                 drop_args.start = cur_offset;
2390                 ret = btrfs_drop_extents(trans, root, inode, &drop_args);
2391                 /* If we are punching a hole decrement the inode's byte count */
2392                 if (!extent_info)
2393                         btrfs_update_inode_bytes(inode, 0,
2394                                                  drop_args.bytes_found);
2395                 if (ret != -ENOSPC) {
2396                         /*
2397                          * The only time we don't want to abort is if we are
2398                          * attempting to clone a partial inline extent, in which
2399                          * case we'll get EOPNOTSUPP.  However if we aren't
2400                          * clone we need to abort no matter what, because if we
2401                          * got EOPNOTSUPP via prealloc then we messed up and
2402                          * need to abort.
2403                          */
2404                         if (ret &&
2405                             (ret != -EOPNOTSUPP ||
2406                              (extent_info && extent_info->is_new_extent)))
2407                                 btrfs_abort_transaction(trans, ret);
2408                         break;
2409                 }
2410
2411                 trans->block_rsv = &fs_info->trans_block_rsv;
2412
2413                 if (!extent_info && cur_offset < drop_args.drop_end &&
2414                     cur_offset < ino_size) {
2415                         ret = fill_holes(trans, inode, path, cur_offset,
2416                                          drop_args.drop_end);
2417                         if (ret) {
2418                                 /*
2419                                  * If we failed then we didn't insert our hole
2420                                  * entries for the area we dropped, so now the
2421                                  * fs is corrupted, so we must abort the
2422                                  * transaction.
2423                                  */
2424                                 btrfs_abort_transaction(trans, ret);
2425                                 break;
2426                         }
2427                 } else if (!extent_info && cur_offset < drop_args.drop_end) {
2428                         /*
2429                          * We are past the i_size here, but since we didn't
2430                          * insert holes we need to clear the mapped area so we
2431                          * know to not set disk_i_size in this area until a new
2432                          * file extent is inserted here.
2433                          */
2434                         ret = btrfs_inode_clear_file_extent_range(inode,
2435                                         cur_offset,
2436                                         drop_args.drop_end - cur_offset);
2437                         if (ret) {
2438                                 /*
2439                                  * We couldn't clear our area, so we could
2440                                  * presumably adjust up and corrupt the fs, so
2441                                  * we need to abort.
2442                                  */
2443                                 btrfs_abort_transaction(trans, ret);
2444                                 break;
2445                         }
2446                 }
2447
2448                 if (extent_info &&
2449                     drop_args.drop_end > extent_info->file_offset) {
2450                         u64 replace_len = drop_args.drop_end -
2451                                           extent_info->file_offset;
2452
2453                         ret = btrfs_insert_replace_extent(trans, inode, path,
2454                                         extent_info, replace_len,
2455                                         drop_args.bytes_found);
2456                         if (ret) {
2457                                 btrfs_abort_transaction(trans, ret);
2458                                 break;
2459                         }
2460                         extent_info->data_len -= replace_len;
2461                         extent_info->data_offset += replace_len;
2462                         extent_info->file_offset += replace_len;
2463                 }
2464
2465                 /*
2466                  * We are releasing our handle on the transaction, balance the
2467                  * dirty pages of the btree inode and flush delayed items, and
2468                  * then get a new transaction handle, which may now point to a
2469                  * new transaction in case someone else may have committed the
2470                  * transaction we used to replace/drop file extent items. So
2471                  * bump the inode's iversion and update mtime and ctime except
2472                  * if we are called from a dedupe context. This is because a
2473                  * power failure/crash may happen after the transaction is
2474                  * committed and before we finish replacing/dropping all the
2475                  * file extent items we need.
2476                  */
2477                 inode_inc_iversion(&inode->vfs_inode);
2478
2479                 if (!extent_info || extent_info->update_times)
2480                         inode_set_mtime_to_ts(&inode->vfs_inode,
2481                                               inode_set_ctime_current(&inode->vfs_inode));
2482
2483                 ret = btrfs_update_inode(trans, inode);
2484                 if (ret)
2485                         break;
2486
2487                 btrfs_end_transaction(trans);
2488                 btrfs_btree_balance_dirty(fs_info);
2489
2490                 trans = btrfs_start_transaction(root, rsv_count);
2491                 if (IS_ERR(trans)) {
2492                         ret = PTR_ERR(trans);
2493                         trans = NULL;
2494                         break;
2495                 }
2496
2497                 ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv,
2498                                               rsv, min_size, false);
2499                 if (WARN_ON(ret))
2500                         break;
2501                 trans->block_rsv = rsv;
2502
2503                 cur_offset = drop_args.drop_end;
2504                 len = end - cur_offset;
2505                 if (!extent_info && len) {
2506                         ret = find_first_non_hole(inode, &cur_offset, &len);
2507                         if (unlikely(ret < 0))
2508                                 break;
2509                         if (ret && !len) {
2510                                 ret = 0;
2511                                 break;
2512                         }
2513                 }
2514         }
2515
2516         /*
2517          * If we were cloning, force the next fsync to be a full one since we
2518          * we replaced (or just dropped in the case of cloning holes when
2519          * NO_HOLES is enabled) file extent items and did not setup new extent
2520          * maps for the replacement extents (or holes).
2521          */
2522         if (extent_info && !extent_info->is_new_extent)
2523                 btrfs_set_inode_full_sync(inode);
2524
2525         if (ret)
2526                 goto out_trans;
2527
2528         trans->block_rsv = &fs_info->trans_block_rsv;
2529         /*
2530          * If we are using the NO_HOLES feature we might have had already an
2531          * hole that overlaps a part of the region [lockstart, lockend] and
2532          * ends at (or beyond) lockend. Since we have no file extent items to
2533          * represent holes, drop_end can be less than lockend and so we must
2534          * make sure we have an extent map representing the existing hole (the
2535          * call to __btrfs_drop_extents() might have dropped the existing extent
2536          * map representing the existing hole), otherwise the fast fsync path
2537          * will not record the existence of the hole region
2538          * [existing_hole_start, lockend].
2539          */
2540         if (drop_args.drop_end <= end)
2541                 drop_args.drop_end = end + 1;
2542         /*
2543          * Don't insert file hole extent item if it's for a range beyond eof
2544          * (because it's useless) or if it represents a 0 bytes range (when
2545          * cur_offset == drop_end).
2546          */
2547         if (!extent_info && cur_offset < ino_size &&
2548             cur_offset < drop_args.drop_end) {
2549                 ret = fill_holes(trans, inode, path, cur_offset,
2550                                  drop_args.drop_end);
2551                 if (ret) {
2552                         /* Same comment as above. */
2553                         btrfs_abort_transaction(trans, ret);
2554                         goto out_trans;
2555                 }
2556         } else if (!extent_info && cur_offset < drop_args.drop_end) {
2557                 /* See the comment in the loop above for the reasoning here. */
2558                 ret = btrfs_inode_clear_file_extent_range(inode, cur_offset,
2559                                         drop_args.drop_end - cur_offset);
2560                 if (ret) {
2561                         btrfs_abort_transaction(trans, ret);
2562                         goto out_trans;
2563                 }
2564
2565         }
2566         if (extent_info) {
2567                 ret = btrfs_insert_replace_extent(trans, inode, path,
2568                                 extent_info, extent_info->data_len,
2569                                 drop_args.bytes_found);
2570                 if (ret) {
2571                         btrfs_abort_transaction(trans, ret);
2572                         goto out_trans;
2573                 }
2574         }
2575
2576 out_trans:
2577         if (!trans)
2578                 goto out_free;
2579
2580         trans->block_rsv = &fs_info->trans_block_rsv;
2581         if (ret)
2582                 btrfs_end_transaction(trans);
2583         else
2584                 *trans_out = trans;
2585 out_free:
2586         btrfs_free_block_rsv(fs_info, rsv);
2587 out:
2588         return ret;
2589 }
2590
2591 static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len)
2592 {
2593         struct inode *inode = file_inode(file);
2594         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2595         struct btrfs_root *root = BTRFS_I(inode)->root;
2596         struct extent_state *cached_state = NULL;
2597         struct btrfs_path *path;
2598         struct btrfs_trans_handle *trans = NULL;
2599         u64 lockstart;
2600         u64 lockend;
2601         u64 tail_start;
2602         u64 tail_len;
2603         u64 orig_start = offset;
2604         int ret = 0;
2605         bool same_block;
2606         u64 ino_size;
2607         bool truncated_block = false;
2608         bool updated_inode = false;
2609
2610         btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
2611
2612         ret = btrfs_wait_ordered_range(inode, offset, len);
2613         if (ret)
2614                 goto out_only_mutex;
2615
2616         ino_size = round_up(inode->i_size, fs_info->sectorsize);
2617         ret = find_first_non_hole(BTRFS_I(inode), &offset, &len);
2618         if (ret < 0)
2619                 goto out_only_mutex;
2620         if (ret && !len) {
2621                 /* Already in a large hole */
2622                 ret = 0;
2623                 goto out_only_mutex;
2624         }
2625
2626         ret = file_modified(file);
2627         if (ret)
2628                 goto out_only_mutex;
2629
2630         lockstart = round_up(offset, fs_info->sectorsize);
2631         lockend = round_down(offset + len, fs_info->sectorsize) - 1;
2632         same_block = (BTRFS_BYTES_TO_BLKS(fs_info, offset))
2633                 == (BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1));
2634         /*
2635          * We needn't truncate any block which is beyond the end of the file
2636          * because we are sure there is no data there.
2637          */
2638         /*
2639          * Only do this if we are in the same block and we aren't doing the
2640          * entire block.
2641          */
2642         if (same_block && len < fs_info->sectorsize) {
2643                 if (offset < ino_size) {
2644                         truncated_block = true;
2645                         ret = btrfs_truncate_block(BTRFS_I(inode), offset, len,
2646                                                    0);
2647                 } else {
2648                         ret = 0;
2649                 }
2650                 goto out_only_mutex;
2651         }
2652
2653         /* zero back part of the first block */
2654         if (offset < ino_size) {
2655                 truncated_block = true;
2656                 ret = btrfs_truncate_block(BTRFS_I(inode), offset, 0, 0);
2657                 if (ret) {
2658                         btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
2659                         return ret;
2660                 }
2661         }
2662
2663         /* Check the aligned pages after the first unaligned page,
2664          * if offset != orig_start, which means the first unaligned page
2665          * including several following pages are already in holes,
2666          * the extra check can be skipped */
2667         if (offset == orig_start) {
2668                 /* after truncate page, check hole again */
2669                 len = offset + len - lockstart;
2670                 offset = lockstart;
2671                 ret = find_first_non_hole(BTRFS_I(inode), &offset, &len);
2672                 if (ret < 0)
2673                         goto out_only_mutex;
2674                 if (ret && !len) {
2675                         ret = 0;
2676                         goto out_only_mutex;
2677                 }
2678                 lockstart = offset;
2679         }
2680
2681         /* Check the tail unaligned part is in a hole */
2682         tail_start = lockend + 1;
2683         tail_len = offset + len - tail_start;
2684         if (tail_len) {
2685                 ret = find_first_non_hole(BTRFS_I(inode), &tail_start, &tail_len);
2686                 if (unlikely(ret < 0))
2687                         goto out_only_mutex;
2688                 if (!ret) {
2689                         /* zero the front end of the last page */
2690                         if (tail_start + tail_len < ino_size) {
2691                                 truncated_block = true;
2692                                 ret = btrfs_truncate_block(BTRFS_I(inode),
2693                                                         tail_start + tail_len,
2694                                                         0, 1);
2695                                 if (ret)
2696                                         goto out_only_mutex;
2697                         }
2698                 }
2699         }
2700
2701         if (lockend < lockstart) {
2702                 ret = 0;
2703                 goto out_only_mutex;
2704         }
2705
2706         btrfs_punch_hole_lock_range(inode, lockstart, lockend, &cached_state);
2707
2708         path = btrfs_alloc_path();
2709         if (!path) {
2710                 ret = -ENOMEM;
2711                 goto out;
2712         }
2713
2714         ret = btrfs_replace_file_extents(BTRFS_I(inode), path, lockstart,
2715                                          lockend, NULL, &trans);
2716         btrfs_free_path(path);
2717         if (ret)
2718                 goto out;
2719
2720         ASSERT(trans != NULL);
2721         inode_inc_iversion(inode);
2722         inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
2723         ret = btrfs_update_inode(trans, BTRFS_I(inode));
2724         updated_inode = true;
2725         btrfs_end_transaction(trans);
2726         btrfs_btree_balance_dirty(fs_info);
2727 out:
2728         unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
2729                       &cached_state);
2730 out_only_mutex:
2731         if (!updated_inode && truncated_block && !ret) {
2732                 /*
2733                  * If we only end up zeroing part of a page, we still need to
2734                  * update the inode item, so that all the time fields are
2735                  * updated as well as the necessary btrfs inode in memory fields
2736                  * for detecting, at fsync time, if the inode isn't yet in the
2737                  * log tree or it's there but not up to date.
2738                  */
2739                 struct timespec64 now = inode_set_ctime_current(inode);
2740
2741                 inode_inc_iversion(inode);
2742                 inode_set_mtime_to_ts(inode, now);
2743                 trans = btrfs_start_transaction(root, 1);
2744                 if (IS_ERR(trans)) {
2745                         ret = PTR_ERR(trans);
2746                 } else {
2747                         int ret2;
2748
2749                         ret = btrfs_update_inode(trans, BTRFS_I(inode));
2750                         ret2 = btrfs_end_transaction(trans);
2751                         if (!ret)
2752                                 ret = ret2;
2753                 }
2754         }
2755         btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
2756         return ret;
2757 }
2758
2759 /* Helper structure to record which range is already reserved */
2760 struct falloc_range {
2761         struct list_head list;
2762         u64 start;
2763         u64 len;
2764 };
2765
2766 /*
2767  * Helper function to add falloc range
2768  *
2769  * Caller should have locked the larger range of extent containing
2770  * [start, len)
2771  */
2772 static int add_falloc_range(struct list_head *head, u64 start, u64 len)
2773 {
2774         struct falloc_range *range = NULL;
2775
2776         if (!list_empty(head)) {
2777                 /*
2778                  * As fallocate iterates by bytenr order, we only need to check
2779                  * the last range.
2780                  */
2781                 range = list_last_entry(head, struct falloc_range, list);
2782                 if (range->start + range->len == start) {
2783                         range->len += len;
2784                         return 0;
2785                 }
2786         }
2787
2788         range = kmalloc(sizeof(*range), GFP_KERNEL);
2789         if (!range)
2790                 return -ENOMEM;
2791         range->start = start;
2792         range->len = len;
2793         list_add_tail(&range->list, head);
2794         return 0;
2795 }
2796
2797 static int btrfs_fallocate_update_isize(struct inode *inode,
2798                                         const u64 end,
2799                                         const int mode)
2800 {
2801         struct btrfs_trans_handle *trans;
2802         struct btrfs_root *root = BTRFS_I(inode)->root;
2803         int ret;
2804         int ret2;
2805
2806         if (mode & FALLOC_FL_KEEP_SIZE || end <= i_size_read(inode))
2807                 return 0;
2808
2809         trans = btrfs_start_transaction(root, 1);
2810         if (IS_ERR(trans))
2811                 return PTR_ERR(trans);
2812
2813         inode_set_ctime_current(inode);
2814         i_size_write(inode, end);
2815         btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
2816         ret = btrfs_update_inode(trans, BTRFS_I(inode));
2817         ret2 = btrfs_end_transaction(trans);
2818
2819         return ret ? ret : ret2;
2820 }
2821
2822 enum {
2823         RANGE_BOUNDARY_WRITTEN_EXTENT,
2824         RANGE_BOUNDARY_PREALLOC_EXTENT,
2825         RANGE_BOUNDARY_HOLE,
2826 };
2827
2828 static int btrfs_zero_range_check_range_boundary(struct btrfs_inode *inode,
2829                                                  u64 offset)
2830 {
2831         const u64 sectorsize = inode->root->fs_info->sectorsize;
2832         struct extent_map *em;
2833         int ret;
2834
2835         offset = round_down(offset, sectorsize);
2836         em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize);
2837         if (IS_ERR(em))
2838                 return PTR_ERR(em);
2839
2840         if (em->block_start == EXTENT_MAP_HOLE)
2841                 ret = RANGE_BOUNDARY_HOLE;
2842         else if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
2843                 ret = RANGE_BOUNDARY_PREALLOC_EXTENT;
2844         else
2845                 ret = RANGE_BOUNDARY_WRITTEN_EXTENT;
2846
2847         free_extent_map(em);
2848         return ret;
2849 }
2850
2851 static int btrfs_zero_range(struct inode *inode,
2852                             loff_t offset,
2853                             loff_t len,
2854                             const int mode)
2855 {
2856         struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
2857         struct extent_map *em;
2858         struct extent_changeset *data_reserved = NULL;
2859         int ret;
2860         u64 alloc_hint = 0;
2861         const u64 sectorsize = fs_info->sectorsize;
2862         u64 alloc_start = round_down(offset, sectorsize);
2863         u64 alloc_end = round_up(offset + len, sectorsize);
2864         u64 bytes_to_reserve = 0;
2865         bool space_reserved = false;
2866
2867         em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, alloc_start,
2868                               alloc_end - alloc_start);
2869         if (IS_ERR(em)) {
2870                 ret = PTR_ERR(em);
2871                 goto out;
2872         }
2873
2874         /*
2875          * Avoid hole punching and extent allocation for some cases. More cases
2876          * could be considered, but these are unlikely common and we keep things
2877          * as simple as possible for now. Also, intentionally, if the target
2878          * range contains one or more prealloc extents together with regular
2879          * extents and holes, we drop all the existing extents and allocate a
2880          * new prealloc extent, so that we get a larger contiguous disk extent.
2881          */
2882         if (em->start <= alloc_start &&
2883             test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
2884                 const u64 em_end = em->start + em->len;
2885
2886                 if (em_end >= offset + len) {
2887                         /*
2888                          * The whole range is already a prealloc extent,
2889                          * do nothing except updating the inode's i_size if
2890                          * needed.
2891                          */
2892                         free_extent_map(em);
2893                         ret = btrfs_fallocate_update_isize(inode, offset + len,
2894                                                            mode);
2895                         goto out;
2896                 }
2897                 /*
2898                  * Part of the range is already a prealloc extent, so operate
2899                  * only on the remaining part of the range.
2900                  */
2901                 alloc_start = em_end;
2902                 ASSERT(IS_ALIGNED(alloc_start, sectorsize));
2903                 len = offset + len - alloc_start;
2904                 offset = alloc_start;
2905                 alloc_hint = em->block_start + em->len;
2906         }
2907         free_extent_map(em);
2908
2909         if (BTRFS_BYTES_TO_BLKS(fs_info, offset) ==
2910             BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1)) {
2911                 em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, alloc_start,
2912                                       sectorsize);
2913                 if (IS_ERR(em)) {
2914                         ret = PTR_ERR(em);
2915                         goto out;
2916                 }
2917
2918                 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
2919                         free_extent_map(em);
2920                         ret = btrfs_fallocate_update_isize(inode, offset + len,
2921                                                            mode);
2922                         goto out;
2923                 }
2924                 if (len < sectorsize && em->block_start != EXTENT_MAP_HOLE) {
2925                         free_extent_map(em);
2926                         ret = btrfs_truncate_block(BTRFS_I(inode), offset, len,
2927                                                    0);
2928                         if (!ret)
2929                                 ret = btrfs_fallocate_update_isize(inode,
2930                                                                    offset + len,
2931                                                                    mode);
2932                         return ret;
2933                 }
2934                 free_extent_map(em);
2935                 alloc_start = round_down(offset, sectorsize);
2936                 alloc_end = alloc_start + sectorsize;
2937                 goto reserve_space;
2938         }
2939
2940         alloc_start = round_up(offset, sectorsize);
2941         alloc_end = round_down(offset + len, sectorsize);
2942
2943         /*
2944          * For unaligned ranges, check the pages at the boundaries, they might
2945          * map to an extent, in which case we need to partially zero them, or
2946          * they might map to a hole, in which case we need our allocation range
2947          * to cover them.
2948          */
2949         if (!IS_ALIGNED(offset, sectorsize)) {
2950                 ret = btrfs_zero_range_check_range_boundary(BTRFS_I(inode),
2951                                                             offset);
2952                 if (ret < 0)
2953                         goto out;
2954                 if (ret == RANGE_BOUNDARY_HOLE) {
2955                         alloc_start = round_down(offset, sectorsize);
2956                         ret = 0;
2957                 } else if (ret == RANGE_BOUNDARY_WRITTEN_EXTENT) {
2958                         ret = btrfs_truncate_block(BTRFS_I(inode), offset, 0, 0);
2959                         if (ret)
2960                                 goto out;
2961                 } else {
2962                         ret = 0;
2963                 }
2964         }
2965
2966         if (!IS_ALIGNED(offset + len, sectorsize)) {
2967                 ret = btrfs_zero_range_check_range_boundary(BTRFS_I(inode),
2968                                                             offset + len);
2969                 if (ret < 0)
2970                         goto out;
2971                 if (ret == RANGE_BOUNDARY_HOLE) {
2972                         alloc_end = round_up(offset + len, sectorsize);
2973                         ret = 0;
2974                 } else if (ret == RANGE_BOUNDARY_WRITTEN_EXTENT) {
2975                         ret = btrfs_truncate_block(BTRFS_I(inode), offset + len,
2976                                                    0, 1);
2977                         if (ret)
2978                                 goto out;
2979                 } else {
2980                         ret = 0;
2981                 }
2982         }
2983
2984 reserve_space:
2985         if (alloc_start < alloc_end) {
2986                 struct extent_state *cached_state = NULL;
2987                 const u64 lockstart = alloc_start;
2988                 const u64 lockend = alloc_end - 1;
2989
2990                 bytes_to_reserve = alloc_end - alloc_start;
2991                 ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
2992                                                       bytes_to_reserve);
2993                 if (ret < 0)
2994                         goto out;
2995                 space_reserved = true;
2996                 btrfs_punch_hole_lock_range(inode, lockstart, lockend,
2997                                             &cached_state);
2998                 ret = btrfs_qgroup_reserve_data(BTRFS_I(inode), &data_reserved,
2999                                                 alloc_start, bytes_to_reserve);
3000                 if (ret) {
3001                         unlock_extent(&BTRFS_I(inode)->io_tree, lockstart,
3002                                       lockend, &cached_state);
3003                         goto out;
3004                 }
3005                 ret = btrfs_prealloc_file_range(inode, mode, alloc_start,
3006                                                 alloc_end - alloc_start,
3007                                                 i_blocksize(inode),
3008                                                 offset + len, &alloc_hint);
3009                 unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
3010                               &cached_state);
3011                 /* btrfs_prealloc_file_range releases reserved space on error */
3012                 if (ret) {
3013                         space_reserved = false;
3014                         goto out;
3015                 }
3016         }
3017         ret = btrfs_fallocate_update_isize(inode, offset + len, mode);
3018  out:
3019         if (ret && space_reserved)
3020                 btrfs_free_reserved_data_space(BTRFS_I(inode), data_reserved,
3021                                                alloc_start, bytes_to_reserve);
3022         extent_changeset_free(data_reserved);
3023
3024         return ret;
3025 }
3026
3027 static long btrfs_fallocate(struct file *file, int mode,
3028                             loff_t offset, loff_t len)
3029 {
3030         struct inode *inode = file_inode(file);
3031         struct extent_state *cached_state = NULL;
3032         struct extent_changeset *data_reserved = NULL;
3033         struct falloc_range *range;
3034         struct falloc_range *tmp;
3035         LIST_HEAD(reserve_list);
3036         u64 cur_offset;
3037         u64 last_byte;
3038         u64 alloc_start;
3039         u64 alloc_end;
3040         u64 alloc_hint = 0;
3041         u64 locked_end;
3042         u64 actual_end = 0;
3043         u64 data_space_needed = 0;
3044         u64 data_space_reserved = 0;
3045         u64 qgroup_reserved = 0;
3046         struct extent_map *em;
3047         int blocksize = BTRFS_I(inode)->root->fs_info->sectorsize;
3048         int ret;
3049
3050         /* Do not allow fallocate in ZONED mode */
3051         if (btrfs_is_zoned(btrfs_sb(inode->i_sb)))
3052                 return -EOPNOTSUPP;
3053
3054         alloc_start = round_down(offset, blocksize);
3055         alloc_end = round_up(offset + len, blocksize);
3056         cur_offset = alloc_start;
3057
3058         /* Make sure we aren't being give some crap mode */
3059         if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
3060                      FALLOC_FL_ZERO_RANGE))
3061                 return -EOPNOTSUPP;
3062
3063         if (mode & FALLOC_FL_PUNCH_HOLE)
3064                 return btrfs_punch_hole(file, offset, len);
3065
3066         btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
3067
3068         if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) {
3069                 ret = inode_newsize_ok(inode, offset + len);
3070                 if (ret)
3071                         goto out;
3072         }
3073
3074         ret = file_modified(file);
3075         if (ret)
3076                 goto out;
3077
3078         /*
3079          * TODO: Move these two operations after we have checked
3080          * accurate reserved space, or fallocate can still fail but
3081          * with page truncated or size expanded.
3082          *
3083          * But that's a minor problem and won't do much harm BTW.
3084          */
3085         if (alloc_start > inode->i_size) {
3086                 ret = btrfs_cont_expand(BTRFS_I(inode), i_size_read(inode),
3087                                         alloc_start);
3088                 if (ret)
3089                         goto out;
3090         } else if (offset + len > inode->i_size) {
3091                 /*
3092                  * If we are fallocating from the end of the file onward we
3093                  * need to zero out the end of the block if i_size lands in the
3094                  * middle of a block.
3095                  */
3096                 ret = btrfs_truncate_block(BTRFS_I(inode), inode->i_size, 0, 0);
3097                 if (ret)
3098                         goto out;
3099         }
3100
3101         /*
3102          * We have locked the inode at the VFS level (in exclusive mode) and we
3103          * have locked the i_mmap_lock lock (in exclusive mode). Now before
3104          * locking the file range, flush all dealloc in the range and wait for
3105          * all ordered extents in the range to complete. After this we can lock
3106          * the file range and, due to the previous locking we did, we know there
3107          * can't be more delalloc or ordered extents in the range.
3108          */
3109         ret = btrfs_wait_ordered_range(inode, alloc_start,
3110                                        alloc_end - alloc_start);
3111         if (ret)
3112                 goto out;
3113
3114         if (mode & FALLOC_FL_ZERO_RANGE) {
3115                 ret = btrfs_zero_range(inode, offset, len, mode);
3116                 btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
3117                 return ret;
3118         }
3119
3120         locked_end = alloc_end - 1;
3121         lock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
3122                     &cached_state);
3123
3124         btrfs_assert_inode_range_clean(BTRFS_I(inode), alloc_start, locked_end);
3125
3126         /* First, check if we exceed the qgroup limit */
3127         while (cur_offset < alloc_end) {
3128                 em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, cur_offset,
3129                                       alloc_end - cur_offset);
3130                 if (IS_ERR(em)) {
3131                         ret = PTR_ERR(em);
3132                         break;
3133                 }
3134                 last_byte = min(extent_map_end(em), alloc_end);
3135                 actual_end = min_t(u64, extent_map_end(em), offset + len);
3136                 last_byte = ALIGN(last_byte, blocksize);
3137                 if (em->block_start == EXTENT_MAP_HOLE ||
3138                     (cur_offset >= inode->i_size &&
3139                      !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
3140                         const u64 range_len = last_byte - cur_offset;
3141
3142                         ret = add_falloc_range(&reserve_list, cur_offset, range_len);
3143                         if (ret < 0) {
3144                                 free_extent_map(em);
3145                                 break;
3146                         }
3147                         ret = btrfs_qgroup_reserve_data(BTRFS_I(inode),
3148                                         &data_reserved, cur_offset, range_len);
3149                         if (ret < 0) {
3150                                 free_extent_map(em);
3151                                 break;
3152                         }
3153                         qgroup_reserved += range_len;
3154                         data_space_needed += range_len;
3155                 }
3156                 free_extent_map(em);
3157                 cur_offset = last_byte;
3158         }
3159
3160         if (!ret && data_space_needed > 0) {
3161                 /*
3162                  * We are safe to reserve space here as we can't have delalloc
3163                  * in the range, see above.
3164                  */
3165                 ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
3166                                                       data_space_needed);
3167                 if (!ret)
3168                         data_space_reserved = data_space_needed;
3169         }
3170
3171         /*
3172          * If ret is still 0, means we're OK to fallocate.
3173          * Or just cleanup the list and exit.
3174          */
3175         list_for_each_entry_safe(range, tmp, &reserve_list, list) {
3176                 if (!ret) {
3177                         ret = btrfs_prealloc_file_range(inode, mode,
3178                                         range->start,
3179                                         range->len, i_blocksize(inode),
3180                                         offset + len, &alloc_hint);
3181                         /*
3182                          * btrfs_prealloc_file_range() releases space even
3183                          * if it returns an error.
3184                          */
3185                         data_space_reserved -= range->len;
3186                         qgroup_reserved -= range->len;
3187                 } else if (data_space_reserved > 0) {
3188                         btrfs_free_reserved_data_space(BTRFS_I(inode),
3189                                                data_reserved, range->start,
3190                                                range->len);
3191                         data_space_reserved -= range->len;
3192                         qgroup_reserved -= range->len;
3193                 } else if (qgroup_reserved > 0) {
3194                         btrfs_qgroup_free_data(BTRFS_I(inode), data_reserved,
3195                                                range->start, range->len, NULL);
3196                         qgroup_reserved -= range->len;
3197                 }
3198                 list_del(&range->list);
3199                 kfree(range);
3200         }
3201         if (ret < 0)
3202                 goto out_unlock;
3203
3204         /*
3205          * We didn't need to allocate any more space, but we still extended the
3206          * size of the file so we need to update i_size and the inode item.
3207          */
3208         ret = btrfs_fallocate_update_isize(inode, actual_end, mode);
3209 out_unlock:
3210         unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
3211                       &cached_state);
3212 out:
3213         btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
3214         extent_changeset_free(data_reserved);
3215         return ret;
3216 }
3217
3218 /*
3219  * Helper for btrfs_find_delalloc_in_range(). Find a subrange in a given range
3220  * that has unflushed and/or flushing delalloc. There might be other adjacent
3221  * subranges after the one it found, so btrfs_find_delalloc_in_range() keeps
3222  * looping while it gets adjacent subranges, and merging them together.
3223  */
3224 static bool find_delalloc_subrange(struct btrfs_inode *inode, u64 start, u64 end,
3225                                    struct extent_state **cached_state,
3226                                    bool *search_io_tree,
3227                                    u64 *delalloc_start_ret, u64 *delalloc_end_ret)
3228 {
3229         u64 len = end + 1 - start;
3230         u64 delalloc_len = 0;
3231         struct btrfs_ordered_extent *oe;
3232         u64 oe_start;
3233         u64 oe_end;
3234
3235         /*
3236          * Search the io tree first for EXTENT_DELALLOC. If we find any, it
3237          * means we have delalloc (dirty pages) for which writeback has not
3238          * started yet.
3239          */
3240         if (*search_io_tree) {
3241                 spin_lock(&inode->lock);
3242                 if (inode->delalloc_bytes > 0) {
3243                         spin_unlock(&inode->lock);
3244                         *delalloc_start_ret = start;
3245                         delalloc_len = count_range_bits(&inode->io_tree,
3246                                                         delalloc_start_ret, end,
3247                                                         len, EXTENT_DELALLOC, 1,
3248                                                         cached_state);
3249                 } else {
3250                         spin_unlock(&inode->lock);
3251                 }
3252         }
3253
3254         if (delalloc_len > 0) {
3255                 /*
3256                  * If delalloc was found then *delalloc_start_ret has a sector size
3257                  * aligned value (rounded down).
3258                  */
3259                 *delalloc_end_ret = *delalloc_start_ret + delalloc_len - 1;
3260
3261                 if (*delalloc_start_ret == start) {
3262                         /* Delalloc for the whole range, nothing more to do. */
3263                         if (*delalloc_end_ret == end)
3264                                 return true;
3265                         /* Else trim our search range for ordered extents. */
3266                         start = *delalloc_end_ret + 1;
3267                         len = end + 1 - start;
3268                 }
3269         } else {
3270                 /* No delalloc, future calls don't need to search again. */
3271                 *search_io_tree = false;
3272         }
3273
3274         /*
3275          * Now also check if there's any ordered extent in the range.
3276          * We do this because:
3277          *
3278          * 1) When delalloc is flushed, the file range is locked, we clear the
3279          *    EXTENT_DELALLOC bit from the io tree and create an extent map and
3280          *    an ordered extent for the write. So we might just have been called
3281          *    after delalloc is flushed and before the ordered extent completes
3282          *    and inserts the new file extent item in the subvolume's btree;
3283          *
3284          * 2) We may have an ordered extent created by flushing delalloc for a
3285          *    subrange that starts before the subrange we found marked with
3286          *    EXTENT_DELALLOC in the io tree.
3287          *
3288          * We could also use the extent map tree to find such delalloc that is
3289          * being flushed, but using the ordered extents tree is more efficient
3290          * because it's usually much smaller as ordered extents are removed from
3291          * the tree once they complete. With the extent maps, we mau have them
3292          * in the extent map tree for a very long time, and they were either
3293          * created by previous writes or loaded by read operations.
3294          */
3295         oe = btrfs_lookup_first_ordered_range(inode, start, len);
3296         if (!oe)
3297                 return (delalloc_len > 0);
3298
3299         /* The ordered extent may span beyond our search range. */
3300         oe_start = max(oe->file_offset, start);
3301         oe_end = min(oe->file_offset + oe->num_bytes - 1, end);
3302
3303         btrfs_put_ordered_extent(oe);
3304
3305         /* Don't have unflushed delalloc, return the ordered extent range. */
3306         if (delalloc_len == 0) {
3307                 *delalloc_start_ret = oe_start;
3308                 *delalloc_end_ret = oe_end;
3309                 return true;
3310         }
3311
3312         /*
3313          * We have both unflushed delalloc (io_tree) and an ordered extent.
3314          * If the ranges are adjacent returned a combined range, otherwise
3315          * return the leftmost range.
3316          */
3317         if (oe_start < *delalloc_start_ret) {
3318                 if (oe_end < *delalloc_start_ret)
3319                         *delalloc_end_ret = oe_end;
3320                 *delalloc_start_ret = oe_start;
3321         } else if (*delalloc_end_ret + 1 == oe_start) {
3322                 *delalloc_end_ret = oe_end;
3323         }
3324
3325         return true;
3326 }
3327
3328 /*
3329  * Check if there's delalloc in a given range.
3330  *
3331  * @inode:               The inode.
3332  * @start:               The start offset of the range. It does not need to be
3333  *                       sector size aligned.
3334  * @end:                 The end offset (inclusive value) of the search range.
3335  *                       It does not need to be sector size aligned.
3336  * @cached_state:        Extent state record used for speeding up delalloc
3337  *                       searches in the inode's io_tree. Can be NULL.
3338  * @delalloc_start_ret:  Output argument, set to the start offset of the
3339  *                       subrange found with delalloc (may not be sector size
3340  *                       aligned).
3341  * @delalloc_end_ret:    Output argument, set to he end offset (inclusive value)
3342  *                       of the subrange found with delalloc.
3343  *
3344  * Returns true if a subrange with delalloc is found within the given range, and
3345  * if so it sets @delalloc_start_ret and @delalloc_end_ret with the start and
3346  * end offsets of the subrange.
3347  */
3348 bool btrfs_find_delalloc_in_range(struct btrfs_inode *inode, u64 start, u64 end,
3349                                   struct extent_state **cached_state,
3350                                   u64 *delalloc_start_ret, u64 *delalloc_end_ret)
3351 {
3352         u64 cur_offset = round_down(start, inode->root->fs_info->sectorsize);
3353         u64 prev_delalloc_end = 0;
3354         bool search_io_tree = true;
3355         bool ret = false;
3356
3357         while (cur_offset <= end) {
3358                 u64 delalloc_start;
3359                 u64 delalloc_end;
3360                 bool delalloc;
3361
3362                 delalloc = find_delalloc_subrange(inode, cur_offset, end,
3363                                                   cached_state, &search_io_tree,
3364                                                   &delalloc_start,
3365                                                   &delalloc_end);
3366                 if (!delalloc)
3367                         break;
3368
3369                 if (prev_delalloc_end == 0) {
3370                         /* First subrange found. */
3371                         *delalloc_start_ret = max(delalloc_start, start);
3372                         *delalloc_end_ret = delalloc_end;
3373                         ret = true;
3374                 } else if (delalloc_start == prev_delalloc_end + 1) {
3375                         /* Subrange adjacent to the previous one, merge them. */
3376                         *delalloc_end_ret = delalloc_end;
3377                 } else {
3378                         /* Subrange not adjacent to the previous one, exit. */
3379                         break;
3380                 }
3381
3382                 prev_delalloc_end = delalloc_end;
3383                 cur_offset = delalloc_end + 1;
3384                 cond_resched();
3385         }
3386
3387         return ret;
3388 }
3389
3390 /*
3391  * Check if there's a hole or delalloc range in a range representing a hole (or
3392  * prealloc extent) found in the inode's subvolume btree.
3393  *
3394  * @inode:      The inode.
3395  * @whence:     Seek mode (SEEK_DATA or SEEK_HOLE).
3396  * @start:      Start offset of the hole region. It does not need to be sector
3397  *              size aligned.
3398  * @end:        End offset (inclusive value) of the hole region. It does not
3399  *              need to be sector size aligned.
3400  * @start_ret:  Return parameter, used to set the start of the subrange in the
3401  *              hole that matches the search criteria (seek mode), if such
3402  *              subrange is found (return value of the function is true).
3403  *              The value returned here may not be sector size aligned.
3404  *
3405  * Returns true if a subrange matching the given seek mode is found, and if one
3406  * is found, it updates @start_ret with the start of the subrange.
3407  */
3408 static bool find_desired_extent_in_hole(struct btrfs_inode *inode, int whence,
3409                                         struct extent_state **cached_state,
3410                                         u64 start, u64 end, u64 *start_ret)
3411 {
3412         u64 delalloc_start;
3413         u64 delalloc_end;
3414         bool delalloc;
3415
3416         delalloc = btrfs_find_delalloc_in_range(inode, start, end, cached_state,
3417                                                 &delalloc_start, &delalloc_end);
3418         if (delalloc && whence == SEEK_DATA) {
3419                 *start_ret = delalloc_start;
3420                 return true;
3421         }
3422
3423         if (delalloc && whence == SEEK_HOLE) {
3424                 /*
3425                  * We found delalloc but it starts after out start offset. So we
3426                  * have a hole between our start offset and the delalloc start.
3427                  */
3428                 if (start < delalloc_start) {
3429                         *start_ret = start;
3430                         return true;
3431                 }
3432                 /*
3433                  * Delalloc range starts at our start offset.
3434                  * If the delalloc range's length is smaller than our range,
3435                  * then it means we have a hole that starts where the delalloc
3436                  * subrange ends.
3437                  */
3438                 if (delalloc_end < end) {
3439                         *start_ret = delalloc_end + 1;
3440                         return true;
3441                 }
3442
3443                 /* There's delalloc for the whole range. */
3444                 return false;
3445         }
3446
3447         if (!delalloc && whence == SEEK_HOLE) {
3448                 *start_ret = start;
3449                 return true;
3450         }
3451
3452         /*
3453          * No delalloc in the range and we are seeking for data. The caller has
3454          * to iterate to the next extent item in the subvolume btree.
3455          */
3456         return false;
3457 }
3458
3459 static loff_t find_desired_extent(struct file *file, loff_t offset, int whence)
3460 {
3461         struct btrfs_inode *inode = BTRFS_I(file->f_mapping->host);
3462         struct btrfs_file_private *private = file->private_data;
3463         struct btrfs_fs_info *fs_info = inode->root->fs_info;
3464         struct extent_state *cached_state = NULL;
3465         struct extent_state **delalloc_cached_state;
3466         const loff_t i_size = i_size_read(&inode->vfs_inode);
3467         const u64 ino = btrfs_ino(inode);
3468         struct btrfs_root *root = inode->root;
3469         struct btrfs_path *path;
3470         struct btrfs_key key;
3471         u64 last_extent_end;
3472         u64 lockstart;
3473         u64 lockend;
3474         u64 start;
3475         int ret;
3476         bool found = false;
3477
3478         if (i_size == 0 || offset >= i_size)
3479                 return -ENXIO;
3480
3481         /*
3482          * Quick path. If the inode has no prealloc extents and its number of
3483          * bytes used matches its i_size, then it can not have holes.
3484          */
3485         if (whence == SEEK_HOLE &&
3486             !(inode->flags & BTRFS_INODE_PREALLOC) &&
3487             inode_get_bytes(&inode->vfs_inode) == i_size)
3488                 return i_size;
3489
3490         if (!private) {
3491                 private = kzalloc(sizeof(*private), GFP_KERNEL);
3492                 /*
3493                  * No worries if memory allocation failed.
3494                  * The private structure is used only for speeding up multiple
3495                  * lseek SEEK_HOLE/DATA calls to a file when there's delalloc,
3496                  * so everything will still be correct.
3497                  */
3498                 file->private_data = private;
3499         }
3500
3501         if (private)
3502                 delalloc_cached_state = &private->llseek_cached_state;
3503         else
3504                 delalloc_cached_state = NULL;
3505
3506         /*
3507          * offset can be negative, in this case we start finding DATA/HOLE from
3508          * the very start of the file.
3509          */
3510         start = max_t(loff_t, 0, offset);
3511
3512         lockstart = round_down(start, fs_info->sectorsize);
3513         lockend = round_up(i_size, fs_info->sectorsize);
3514         if (lockend <= lockstart)
3515                 lockend = lockstart + fs_info->sectorsize;
3516         lockend--;
3517
3518         path = btrfs_alloc_path();
3519         if (!path)
3520                 return -ENOMEM;
3521         path->reada = READA_FORWARD;
3522
3523         key.objectid = ino;
3524         key.type = BTRFS_EXTENT_DATA_KEY;
3525         key.offset = start;
3526
3527         last_extent_end = lockstart;
3528
3529         lock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
3530
3531         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3532         if (ret < 0) {
3533                 goto out;
3534         } else if (ret > 0 && path->slots[0] > 0) {
3535                 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1);
3536                 if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY)
3537                         path->slots[0]--;
3538         }
3539
3540         while (start < i_size) {
3541                 struct extent_buffer *leaf = path->nodes[0];
3542                 struct btrfs_file_extent_item *extent;
3543                 u64 extent_end;
3544                 u8 type;
3545
3546                 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
3547                         ret = btrfs_next_leaf(root, path);
3548                         if (ret < 0)
3549                                 goto out;
3550                         else if (ret > 0)
3551                                 break;
3552
3553                         leaf = path->nodes[0];
3554                 }
3555
3556                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3557                 if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)
3558                         break;
3559
3560                 extent_end = btrfs_file_extent_end(path);
3561
3562                 /*
3563                  * In the first iteration we may have a slot that points to an
3564                  * extent that ends before our start offset, so skip it.
3565                  */
3566                 if (extent_end <= start) {
3567                         path->slots[0]++;
3568                         continue;
3569                 }
3570
3571                 /* We have an implicit hole, NO_HOLES feature is likely set. */
3572                 if (last_extent_end < key.offset) {
3573                         u64 search_start = last_extent_end;
3574                         u64 found_start;
3575
3576                         /*
3577                          * First iteration, @start matches @offset and it's
3578                          * within the hole.
3579                          */
3580                         if (start == offset)
3581                                 search_start = offset;
3582
3583                         found = find_desired_extent_in_hole(inode, whence,
3584                                                             delalloc_cached_state,
3585                                                             search_start,
3586                                                             key.offset - 1,
3587                                                             &found_start);
3588                         if (found) {
3589                                 start = found_start;
3590                                 break;
3591                         }
3592                         /*
3593                          * Didn't find data or a hole (due to delalloc) in the
3594                          * implicit hole range, so need to analyze the extent.
3595                          */
3596                 }
3597
3598                 extent = btrfs_item_ptr(leaf, path->slots[0],
3599                                         struct btrfs_file_extent_item);
3600                 type = btrfs_file_extent_type(leaf, extent);
3601
3602                 /*
3603                  * Can't access the extent's disk_bytenr field if this is an
3604                  * inline extent, since at that offset, it's where the extent
3605                  * data starts.
3606                  */
3607                 if (type == BTRFS_FILE_EXTENT_PREALLOC ||
3608                     (type == BTRFS_FILE_EXTENT_REG &&
3609                      btrfs_file_extent_disk_bytenr(leaf, extent) == 0)) {
3610                         /*
3611                          * Explicit hole or prealloc extent, search for delalloc.
3612                          * A prealloc extent is treated like a hole.
3613                          */
3614                         u64 search_start = key.offset;
3615                         u64 found_start;
3616
3617                         /*
3618                          * First iteration, @start matches @offset and it's
3619                          * within the hole.
3620                          */
3621                         if (start == offset)
3622                                 search_start = offset;
3623
3624                         found = find_desired_extent_in_hole(inode, whence,
3625                                                             delalloc_cached_state,
3626                                                             search_start,
3627                                                             extent_end - 1,
3628                                                             &found_start);
3629                         if (found) {
3630                                 start = found_start;
3631                                 break;
3632                         }
3633                         /*
3634                          * Didn't find data or a hole (due to delalloc) in the
3635                          * implicit hole range, so need to analyze the next
3636                          * extent item.
3637                          */
3638                 } else {
3639                         /*
3640                          * Found a regular or inline extent.
3641                          * If we are seeking for data, adjust the start offset
3642                          * and stop, we're done.
3643                          */
3644                         if (whence == SEEK_DATA) {
3645                                 start = max_t(u64, key.offset, offset);
3646                                 found = true;
3647                                 break;
3648                         }
3649                         /*
3650                          * Else, we are seeking for a hole, check the next file
3651                          * extent item.
3652                          */
3653                 }
3654
3655                 start = extent_end;
3656                 last_extent_end = extent_end;
3657                 path->slots[0]++;
3658                 if (fatal_signal_pending(current)) {
3659                         ret = -EINTR;
3660                         goto out;
3661                 }
3662                 cond_resched();
3663         }
3664
3665         /* We have an implicit hole from the last extent found up to i_size. */
3666         if (!found && start < i_size) {
3667                 found = find_desired_extent_in_hole(inode, whence,
3668                                                     delalloc_cached_state, start,
3669                                                     i_size - 1, &start);
3670                 if (!found)
3671                         start = i_size;
3672         }
3673
3674 out:
3675         unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
3676         btrfs_free_path(path);
3677
3678         if (ret < 0)
3679                 return ret;
3680
3681         if (whence == SEEK_DATA && start >= i_size)
3682                 return -ENXIO;
3683
3684         return min_t(loff_t, start, i_size);
3685 }
3686
3687 static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence)
3688 {
3689         struct inode *inode = file->f_mapping->host;
3690
3691         switch (whence) {
3692         default:
3693                 return generic_file_llseek(file, offset, whence);
3694         case SEEK_DATA:
3695         case SEEK_HOLE:
3696                 btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
3697                 offset = find_desired_extent(file, offset, whence);
3698                 btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
3699                 break;
3700         }
3701
3702         if (offset < 0)
3703                 return offset;
3704
3705         return vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
3706 }
3707
3708 static int btrfs_file_open(struct inode *inode, struct file *filp)
3709 {
3710         int ret;
3711
3712         filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC | FMODE_BUF_WASYNC |
3713                         FMODE_CAN_ODIRECT;
3714
3715         ret = fsverity_file_open(inode, filp);
3716         if (ret)
3717                 return ret;
3718         return generic_file_open(inode, filp);
3719 }
3720
3721 static int check_direct_read(struct btrfs_fs_info *fs_info,
3722                              const struct iov_iter *iter, loff_t offset)
3723 {
3724         int ret;
3725         int i, seg;
3726
3727         ret = check_direct_IO(fs_info, iter, offset);
3728         if (ret < 0)
3729                 return ret;
3730
3731         if (!iter_is_iovec(iter))
3732                 return 0;
3733
3734         for (seg = 0; seg < iter->nr_segs; seg++) {
3735                 for (i = seg + 1; i < iter->nr_segs; i++) {
3736                         const struct iovec *iov1 = iter_iov(iter) + seg;
3737                         const struct iovec *iov2 = iter_iov(iter) + i;
3738
3739                         if (iov1->iov_base == iov2->iov_base)
3740                                 return -EINVAL;
3741                 }
3742         }
3743         return 0;
3744 }
3745
3746 static ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to)
3747 {
3748         struct inode *inode = file_inode(iocb->ki_filp);
3749         size_t prev_left = 0;
3750         ssize_t read = 0;
3751         ssize_t ret;
3752
3753         if (fsverity_active(inode))
3754                 return 0;
3755
3756         if (check_direct_read(btrfs_sb(inode->i_sb), to, iocb->ki_pos))
3757                 return 0;
3758
3759         btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
3760 again:
3761         /*
3762          * This is similar to what we do for direct IO writes, see the comment
3763          * at btrfs_direct_write(), but we also disable page faults in addition
3764          * to disabling them only at the iov_iter level. This is because when
3765          * reading from a hole or prealloc extent, iomap calls iov_iter_zero(),
3766          * which can still trigger page fault ins despite having set ->nofault
3767          * to true of our 'to' iov_iter.
3768          *
3769          * The difference to direct IO writes is that we deadlock when trying
3770          * to lock the extent range in the inode's tree during he page reads
3771          * triggered by the fault in (while for writes it is due to waiting for
3772          * our own ordered extent). This is because for direct IO reads,
3773          * btrfs_dio_iomap_begin() returns with the extent range locked, which
3774          * is only unlocked in the endio callback (end_bio_extent_readpage()).
3775          */
3776         pagefault_disable();
3777         to->nofault = true;
3778         ret = btrfs_dio_read(iocb, to, read);
3779         to->nofault = false;
3780         pagefault_enable();
3781
3782         /* No increment (+=) because iomap returns a cumulative value. */
3783         if (ret > 0)
3784                 read = ret;
3785
3786         if (iov_iter_count(to) > 0 && (ret == -EFAULT || ret > 0)) {
3787                 const size_t left = iov_iter_count(to);
3788
3789                 if (left == prev_left) {
3790                         /*
3791                          * We didn't make any progress since the last attempt,
3792                          * fallback to a buffered read for the remainder of the
3793                          * range. This is just to avoid any possibility of looping
3794                          * for too long.
3795                          */
3796                         ret = read;
3797                 } else {
3798                         /*
3799                          * We made some progress since the last retry or this is
3800                          * the first time we are retrying. Fault in as many pages
3801                          * as possible and retry.
3802                          */
3803                         fault_in_iov_iter_writeable(to, left);
3804                         prev_left = left;
3805                         goto again;
3806                 }
3807         }
3808         btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
3809         return ret < 0 ? ret : read;
3810 }
3811
3812 static ssize_t btrfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
3813 {
3814         ssize_t ret = 0;
3815
3816         if (iocb->ki_flags & IOCB_DIRECT) {
3817                 ret = btrfs_direct_read(iocb, to);
3818                 if (ret < 0 || !iov_iter_count(to) ||
3819                     iocb->ki_pos >= i_size_read(file_inode(iocb->ki_filp)))
3820                         return ret;
3821         }
3822
3823         return filemap_read(iocb, to, ret);
3824 }
3825
3826 const struct file_operations btrfs_file_operations = {
3827         .llseek         = btrfs_file_llseek,
3828         .read_iter      = btrfs_file_read_iter,
3829         .splice_read    = filemap_splice_read,
3830         .write_iter     = btrfs_file_write_iter,
3831         .splice_write   = iter_file_splice_write,
3832         .mmap           = btrfs_file_mmap,
3833         .open           = btrfs_file_open,
3834         .release        = btrfs_release_file,
3835         .get_unmapped_area = thp_get_unmapped_area,
3836         .fsync          = btrfs_sync_file,
3837         .fallocate      = btrfs_fallocate,
3838         .unlocked_ioctl = btrfs_ioctl,
3839 #ifdef CONFIG_COMPAT
3840         .compat_ioctl   = btrfs_compat_ioctl,
3841 #endif
3842         .remap_file_range = btrfs_remap_file_range,
3843 };
3844
3845 int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end)
3846 {
3847         int ret;
3848
3849         /*
3850          * So with compression we will find and lock a dirty page and clear the
3851          * first one as dirty, setup an async extent, and immediately return
3852          * with the entire range locked but with nobody actually marked with
3853          * writeback.  So we can't just filemap_write_and_wait_range() and
3854          * expect it to work since it will just kick off a thread to do the
3855          * actual work.  So we need to call filemap_fdatawrite_range _again_
3856          * since it will wait on the page lock, which won't be unlocked until
3857          * after the pages have been marked as writeback and so we're good to go
3858          * from there.  We have to do this otherwise we'll miss the ordered
3859          * extents and that results in badness.  Please Josef, do not think you
3860          * know better and pull this out at some point in the future, it is
3861          * right and you are wrong.
3862          */
3863         ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
3864         if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
3865                              &BTRFS_I(inode)->runtime_flags))
3866                 ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
3867
3868         return ret;
3869 }