fs/zonefs/file.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  * Simple file system for zoned block devices exposing zones as files.
   4  *
   5  * Copyright (C) 2022 Western Digital Corporation or its affiliates.
   6  */
   7 #include <linux/module.h>
   8 #include <linux/pagemap.h>
   9 #include <linux/iomap.h>
  10 #include <linux/init.h>
  11 #include <linux/slab.h>
  12 #include <linux/blkdev.h>
  13 #include <linux/statfs.h>
  14 #include <linux/writeback.h>
  15 #include <linux/quotaops.h>
  16 #include <linux/seq_file.h>
  17 #include <linux/parser.h>
  18 #include <linux/uio.h>
  19 #include <linux/mman.h>
  20 #include <linux/sched/mm.h>
  21 #include <linux/task_io_accounting_ops.h>
  22
  23 #include "zonefs.h"
  24
  25 #include "trace.h"
  26
  27 static int zonefs_read_iomap_begin(struct inode *inode, loff_t offset,
  28                                    loff_t length, unsigned int flags,
  29                                    struct iomap *iomap, struct iomap *srcmap)
  30 {
  31         struct zonefs_inode_info *zi = ZONEFS_I(inode);
  32         struct zonefs_zone *z = zonefs_inode_zone(inode);
  33         struct super_block *sb = inode->i_sb;
  34         loff_t isize;
  35
  36         /*
  37          * All blocks are always mapped below EOF. If reading past EOF,
  38          * act as if there is a hole up to the file maximum size.
  39          */
  40         mutex_lock(&zi->i_truncate_mutex);
  41         iomap->bdev = inode->i_sb->s_bdev;
  42         iomap->offset = ALIGN_DOWN(offset, sb->s_blocksize);
  43         isize = i_size_read(inode);
  44         if (iomap->offset >= isize) {
  45                 iomap->type = IOMAP_HOLE;
  46                 iomap->addr = IOMAP_NULL_ADDR;
  47                 iomap->length = length;
  48         } else {
  49                 iomap->type = IOMAP_MAPPED;
  50                 iomap->addr = (z->z_sector << SECTOR_SHIFT) + iomap->offset;
  51                 iomap->length = isize - iomap->offset;
  52         }
  53         mutex_unlock(&zi->i_truncate_mutex);
  54
  55         trace_zonefs_iomap_begin(inode, iomap);
  56
  57         return 0;
  58 }
  59
  60 static const struct iomap_ops zonefs_read_iomap_ops = {
  61         .iomap_begin    = zonefs_read_iomap_begin,
  62 };
  63
  64 static int zonefs_write_iomap_begin(struct inode *inode, loff_t offset,
  65                                     loff_t length, unsigned int flags,
  66                                     struct iomap *iomap, struct iomap *srcmap)
  67 {
  68         struct zonefs_inode_info *zi = ZONEFS_I(inode);
  69         struct zonefs_zone *z = zonefs_inode_zone(inode);
  70         struct super_block *sb = inode->i_sb;
  71         loff_t isize;
  72
  73         /* All write I/Os should always be within the file maximum size */
  74         if (WARN_ON_ONCE(offset + length > z->z_capacity))
  75                 return -EIO;
  76
  77         /*
  78          * Sequential zones can only accept direct writes. This is already
  79          * checked when writes are issued, so warn if we see a page writeback
  80          * operation.
  81          */
  82         if (WARN_ON_ONCE(zonefs_zone_is_seq(z) && !(flags & IOMAP_DIRECT)))
  83                 return -EIO;
  84
  85         /*
  86          * For conventional zones, all blocks are always mapped. For sequential
  87          * zones, all blocks after always mapped below the inode size (zone
  88          * write pointer) and unwriten beyond.
  89          */
  90         mutex_lock(&zi->i_truncate_mutex);
  91         iomap->bdev = inode->i_sb->s_bdev;
  92         iomap->offset = ALIGN_DOWN(offset, sb->s_blocksize);
  93         iomap->addr = (z->z_sector << SECTOR_SHIFT) + iomap->offset;
  94         isize = i_size_read(inode);
  95         if (iomap->offset >= isize) {
  96                 iomap->type = IOMAP_UNWRITTEN;
  97                 iomap->length = z->z_capacity - iomap->offset;
  98         } else {
  99                 iomap->type = IOMAP_MAPPED;
 100                 iomap->length = isize - iomap->offset;
 101         }
 102         mutex_unlock(&zi->i_truncate_mutex);
 103
 104         trace_zonefs_iomap_begin(inode, iomap);
 105
 106         return 0;
 107 }
 108
 109 static const struct iomap_ops zonefs_write_iomap_ops = {
 110         .iomap_begin    = zonefs_write_iomap_begin,
 111 };
 112
 113 static int zonefs_read_folio(struct file *unused, struct folio *folio)
 114 {
 115         return iomap_read_folio(folio, &zonefs_read_iomap_ops);
 116 }
 117
 118 static void zonefs_readahead(struct readahead_control *rac)
 119 {
 120         iomap_readahead(rac, &zonefs_read_iomap_ops);
 121 }
 122
 123 /*
 124  * Map blocks for page writeback. This is used only on conventional zone files,
 125  * which implies that the page range can only be within the fixed inode size.
 126  */
 127 static int zonefs_write_map_blocks(struct iomap_writepage_ctx *wpc,
 128                                    struct inode *inode, loff_t offset)
 129 {
 130         struct zonefs_zone *z = zonefs_inode_zone(inode);
 131
 132         if (WARN_ON_ONCE(zonefs_zone_is_seq(z)))
 133                 return -EIO;
 134         if (WARN_ON_ONCE(offset >= i_size_read(inode)))
 135                 return -EIO;
 136
 137         /* If the mapping is already OK, nothing needs to be done */
 138         if (offset >= wpc->iomap.offset &&
 139             offset < wpc->iomap.offset + wpc->iomap.length)
 140                 return 0;
 141
 142         return zonefs_write_iomap_begin(inode, offset,
 143                                         z->z_capacity - offset,
 144                                         IOMAP_WRITE, &wpc->iomap, NULL);
 145 }
 146
 147 static const struct iomap_writeback_ops zonefs_writeback_ops = {
 148         .map_blocks             = zonefs_write_map_blocks,
 149 };
 150
 151 static int zonefs_writepages(struct address_space *mapping,
 152                              struct writeback_control *wbc)
 153 {
 154         struct iomap_writepage_ctx wpc = { };
 155
 156         return iomap_writepages(mapping, wbc, &wpc, &zonefs_writeback_ops);
 157 }
 158
 159 static int zonefs_swap_activate(struct swap_info_struct *sis,
 160                                 struct file *swap_file, sector_t *span)
 161 {
 162         struct inode *inode = file_inode(swap_file);
 163
 164         if (zonefs_inode_is_seq(inode)) {
 165                 zonefs_err(inode->i_sb,
 166                            "swap file: not a conventional zone file\n");
 167                 return -EINVAL;
 168         }
 169
 170         return iomap_swapfile_activate(sis, swap_file, span,
 171                                        &zonefs_read_iomap_ops);
 172 }
 173
 174 const struct address_space_operations zonefs_file_aops = {
 175         .read_folio             = zonefs_read_folio,
 176         .readahead              = zonefs_readahead,
 177         .writepages             = zonefs_writepages,
 178         .dirty_folio            = filemap_dirty_folio,
 179         .release_folio          = iomap_release_folio,
 180         .invalidate_folio       = iomap_invalidate_folio,
 181         .migrate_folio          = filemap_migrate_folio,
 182         .is_partially_uptodate  = iomap_is_partially_uptodate,
 183         .error_remove_page      = generic_error_remove_page,
 184         .direct_IO              = noop_direct_IO,
 185         .swap_activate          = zonefs_swap_activate,
 186 };
 187
 188 int zonefs_file_truncate(struct inode *inode, loff_t isize)
 189 {
 190         struct zonefs_inode_info *zi = ZONEFS_I(inode);
 191         struct zonefs_zone *z = zonefs_inode_zone(inode);
 192         loff_t old_isize;
 193         enum req_op op;
 194         int ret = 0;
 195
 196         /*
 197          * Only sequential zone files can be truncated and truncation is allowed
 198          * only down to a 0 size, which is equivalent to a zone reset, and to
 199          * the maximum file size, which is equivalent to a zone finish.
 200          */
 201         if (!zonefs_zone_is_seq(z))
 202                 return -EPERM;
 203
 204         if (!isize)
 205                 op = REQ_OP_ZONE_RESET;
 206         else if (isize == z->z_capacity)
 207                 op = REQ_OP_ZONE_FINISH;
 208         else
 209                 return -EPERM;
 210
 211         inode_dio_wait(inode);
 212
 213         /* Serialize against page faults */
 214         filemap_invalidate_lock(inode->i_mapping);
 215
 216         /* Serialize against zonefs_iomap_begin() */
 217         mutex_lock(&zi->i_truncate_mutex);
 218
 219         old_isize = i_size_read(inode);
 220         if (isize == old_isize)
 221                 goto unlock;
 222
 223         ret = zonefs_inode_zone_mgmt(inode, op);
 224         if (ret)
 225                 goto unlock;
 226
 227         /*
 228          * If the mount option ZONEFS_MNTOPT_EXPLICIT_OPEN is set,
 229          * take care of open zones.
 230          */
 231         if (z->z_flags & ZONEFS_ZONE_OPEN) {
 232                 /*
 233                  * Truncating a zone to EMPTY or FULL is the equivalent of
 234                  * closing the zone. For a truncation to 0, we need to
 235                  * re-open the zone to ensure new writes can be processed.
 236                  * For a truncation to the maximum file size, the zone is
 237                  * closed and writes cannot be accepted anymore, so clear
 238                  * the open flag.
 239                  */
 240                 if (!isize)
 241                         ret = zonefs_inode_zone_mgmt(inode, REQ_OP_ZONE_OPEN);
 242                 else
 243                         z->z_flags &= ~ZONEFS_ZONE_OPEN;
 244         }
 245
 246         zonefs_update_stats(inode, isize);
 247         truncate_setsize(inode, isize);
 248         z->z_wpoffset = isize;
 249         zonefs_inode_account_active(inode);
 250
 251 unlock:
 252         mutex_unlock(&zi->i_truncate_mutex);
 253         filemap_invalidate_unlock(inode->i_mapping);
 254
 255         return ret;
 256 }
 257
 258 static int zonefs_file_fsync(struct file *file, loff_t start, loff_t end,
 259                              int datasync)
 260 {
 261         struct inode *inode = file_inode(file);
 262         int ret = 0;
 263
 264         if (unlikely(IS_IMMUTABLE(inode)))
 265                 return -EPERM;
 266
 267         /*
 268          * Since only direct writes are allowed in sequential files, page cache
 269          * flush is needed only for conventional zone files.
 270          */
 271         if (zonefs_inode_is_cnv(inode))
 272                 ret = file_write_and_wait_range(file, start, end);
 273         if (!ret)
 274                 ret = blkdev_issue_flush(inode->i_sb->s_bdev);
 275
 276         if (ret)
 277                 zonefs_io_error(inode, true);
 278
 279         return ret;
 280 }
 281
 282 static vm_fault_t zonefs_filemap_page_mkwrite(struct vm_fault *vmf)
 283 {
 284         struct inode *inode = file_inode(vmf->vma->vm_file);
 285         vm_fault_t ret;
 286
 287         if (unlikely(IS_IMMUTABLE(inode)))
 288                 return VM_FAULT_SIGBUS;
 289
 290         /*
 291          * Sanity check: only conventional zone files can have shared
 292          * writeable mappings.
 293          */
 294         if (zonefs_inode_is_seq(inode))
 295                 return VM_FAULT_NOPAGE;
 296
 297         sb_start_pagefault(inode->i_sb);
 298         file_update_time(vmf->vma->vm_file);
 299
 300         /* Serialize against truncates */
 301         filemap_invalidate_lock_shared(inode->i_mapping);
 302         ret = iomap_page_mkwrite(vmf, &zonefs_write_iomap_ops);
 303         filemap_invalidate_unlock_shared(inode->i_mapping);
 304
 305         sb_end_pagefault(inode->i_sb);
 306         return ret;
 307 }
 308
 309 static const struct vm_operations_struct zonefs_file_vm_ops = {
 310         .fault          = filemap_fault,
 311         .map_pages      = filemap_map_pages,
 312         .page_mkwrite   = zonefs_filemap_page_mkwrite,
 313 };
 314
 315 static int zonefs_file_mmap(struct file *file, struct vm_area_struct *vma)
 316 {
 317         /*
 318          * Conventional zones accept random writes, so their files can support
 319          * shared writable mappings. For sequential zone files, only read
 320          * mappings are possible since there are no guarantees for write
 321          * ordering between msync() and page cache writeback.
 322          */
 323         if (zonefs_inode_is_seq(file_inode(file)) &&
 324             (vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
 325                 return -EINVAL;
 326
 327         file_accessed(file);
 328         vma->vm_ops = &zonefs_file_vm_ops;
 329
 330         return 0;
 331 }
 332
 333 static loff_t zonefs_file_llseek(struct file *file, loff_t offset, int whence)
 334 {
 335         loff_t isize = i_size_read(file_inode(file));
 336
 337         /*
 338          * Seeks are limited to below the zone size for conventional zones
 339          * and below the zone write pointer for sequential zones. In both
 340          * cases, this limit is the inode size.
 341          */
 342         return generic_file_llseek_size(file, offset, whence, isize, isize);
 343 }
 344
 345 static int zonefs_file_write_dio_end_io(struct kiocb *iocb, ssize_t size,
 346                                         int error, unsigned int flags)
 347 {
 348         struct inode *inode = file_inode(iocb->ki_filp);
 349         struct zonefs_inode_info *zi = ZONEFS_I(inode);
 350
 351         if (error) {
 352                 /*
 353                  * For Sync IOs, error recovery is called from
 354                  * zonefs_file_dio_write().
 355                  */
 356                 if (!is_sync_kiocb(iocb))
 357                         zonefs_io_error(inode, true);
 358                 return error;
 359         }
 360
 361         if (size && zonefs_inode_is_seq(inode)) {
 362                 /*
 363                  * Note that we may be seeing completions out of order,
 364                  * but that is not a problem since a write completed
 365                  * successfully necessarily means that all preceding writes
 366                  * were also successful. So we can safely increase the inode
 367                  * size to the write end location.
 368                  */
 369                 mutex_lock(&zi->i_truncate_mutex);
 370                 if (i_size_read(inode) < iocb->ki_pos + size) {
 371                         zonefs_update_stats(inode, iocb->ki_pos + size);
 372                         zonefs_i_size_write(inode, iocb->ki_pos + size);
 373                 }
 374                 mutex_unlock(&zi->i_truncate_mutex);
 375         }
 376
 377         return 0;
 378 }
 379
 380 static const struct iomap_dio_ops zonefs_write_dio_ops = {
 381         .end_io                 = zonefs_file_write_dio_end_io,
 382 };
 383
 384 static ssize_t zonefs_file_dio_append(struct kiocb *iocb, struct iov_iter *from)
 385 {
 386         struct inode *inode = file_inode(iocb->ki_filp);
 387         struct zonefs_zone *z = zonefs_inode_zone(inode);
 388         struct block_device *bdev = inode->i_sb->s_bdev;
 389         unsigned int max = bdev_max_zone_append_sectors(bdev);
 390         pgoff_t start, end;
 391         struct bio *bio;
 392         ssize_t size;
 393         int nr_pages;
 394         ssize_t ret;
 395
 396         max = ALIGN_DOWN(max << SECTOR_SHIFT, inode->i_sb->s_blocksize);
 397         iov_iter_truncate(from, max);
 398
 399         /*
 400          * If the inode block size (zone write granularity) is smaller than the
 401          * page size, we may be appending data belonging to the last page of the
 402          * inode straddling inode->i_size, with that page already cached due to
 403          * a buffered read or readahead. So make sure to invalidate that page.
 404          * This will always be a no-op for the case where the block size is
 405          * equal to the page size.
 406          */
 407         start = iocb->ki_pos >> PAGE_SHIFT;
 408         end = (iocb->ki_pos + iov_iter_count(from) - 1) >> PAGE_SHIFT;
 409         if (invalidate_inode_pages2_range(inode->i_mapping, start, end))
 410                 return -EBUSY;
 411
 412         nr_pages = iov_iter_npages(from, BIO_MAX_VECS);
 413         if (!nr_pages)
 414                 return 0;
 415
 416         bio = bio_alloc(bdev, nr_pages,
 417                         REQ_OP_ZONE_APPEND | REQ_SYNC | REQ_IDLE, GFP_NOFS);
 418         bio->bi_iter.bi_sector = z->z_sector;
 419         bio->bi_ioprio = iocb->ki_ioprio;
 420         if (iocb_is_dsync(iocb))
 421                 bio->bi_opf |= REQ_FUA;
 422
 423         ret = bio_iov_iter_get_pages(bio, from);
 424         if (unlikely(ret))
 425                 goto out_release;
 426
 427         size = bio->bi_iter.bi_size;
 428         task_io_account_write(size);
 429
 430         if (iocb->ki_flags & IOCB_HIPRI)
 431                 bio_set_polled(bio, iocb);
 432
 433         ret = submit_bio_wait(bio);
 434
 435         /*
 436          * If the file zone was written underneath the file system, the zone
 437          * write pointer may not be where we expect it to be, but the zone
 438          * append write can still succeed. So check manually that we wrote where
 439          * we intended to, that is, at zi->i_wpoffset.
 440          */
 441         if (!ret) {
 442                 sector_t wpsector =
 443                         z->z_sector + (z->z_wpoffset >> SECTOR_SHIFT);
 444
 445                 if (bio->bi_iter.bi_sector != wpsector) {
 446                         zonefs_warn(inode->i_sb,
 447                                 "Corrupted write pointer %llu for zone at %llu\n",
 448                                 bio->bi_iter.bi_sector, z->z_sector);
 449                         ret = -EIO;
 450                 }
 451         }
 452
 453         zonefs_file_write_dio_end_io(iocb, size, ret, 0);
 454         trace_zonefs_file_dio_append(inode, size, ret);
 455
 456 out_release:
 457         bio_release_pages(bio, false);
 458         bio_put(bio);
 459
 460         if (ret >= 0) {
 461                 iocb->ki_pos += size;
 462                 return size;
 463         }
 464
 465         return ret;
 466 }
 467
 468 /*
 469  * Do not exceed the LFS limits nor the file zone size. If pos is under the
 470  * limit it becomes a short access. If it exceeds the limit, return -EFBIG.
 471  */
 472 static loff_t zonefs_write_check_limits(struct file *file, loff_t pos,
 473                                         loff_t count)
 474 {
 475         struct inode *inode = file_inode(file);
 476         struct zonefs_zone *z = zonefs_inode_zone(inode);
 477         loff_t limit = rlimit(RLIMIT_FSIZE);
 478         loff_t max_size = z->z_capacity;
 479
 480         if (limit != RLIM_INFINITY) {
 481                 if (pos >= limit) {
 482                         send_sig(SIGXFSZ, current, 0);
 483                         return -EFBIG;
 484                 }
 485                 count = min(count, limit - pos);
 486         }
 487
 488         if (!(file->f_flags & O_LARGEFILE))
 489                 max_size = min_t(loff_t, MAX_NON_LFS, max_size);
 490
 491         if (unlikely(pos >= max_size))
 492                 return -EFBIG;
 493
 494         return min(count, max_size - pos);
 495 }
 496
 497 static ssize_t zonefs_write_checks(struct kiocb *iocb, struct iov_iter *from)
 498 {
 499         struct file *file = iocb->ki_filp;
 500         struct inode *inode = file_inode(file);
 501         struct zonefs_inode_info *zi = ZONEFS_I(inode);
 502         struct zonefs_zone *z = zonefs_inode_zone(inode);
 503         loff_t count;
 504
 505         if (IS_SWAPFILE(inode))
 506                 return -ETXTBSY;
 507
 508         if (!iov_iter_count(from))
 509                 return 0;
 510
 511         if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT))
 512                 return -EINVAL;
 513
 514         if (iocb->ki_flags & IOCB_APPEND) {
 515                 if (zonefs_zone_is_cnv(z))
 516                         return -EINVAL;
 517                 mutex_lock(&zi->i_truncate_mutex);
 518                 iocb->ki_pos = z->z_wpoffset;
 519                 mutex_unlock(&zi->i_truncate_mutex);
 520         }
 521
 522         count = zonefs_write_check_limits(file, iocb->ki_pos,
 523                                           iov_iter_count(from));
 524         if (count < 0)
 525                 return count;
 526
 527         iov_iter_truncate(from, count);
 528         return iov_iter_count(from);
 529 }
 530
 531 /*
 532  * Handle direct writes. For sequential zone files, this is the only possible
 533  * write path. For these files, check that the user is issuing writes
 534  * sequentially from the end of the file. This code assumes that the block layer
 535  * delivers write requests to the device in sequential order. This is always the
 536  * case if a block IO scheduler implementing the ELEVATOR_F_ZBD_SEQ_WRITE
 537  * elevator feature is being used (e.g. mq-deadline). The block layer always
 538  * automatically select such an elevator for zoned block devices during the
 539  * device initialization.
 540  */
 541 static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from)
 542 {
 543         struct inode *inode = file_inode(iocb->ki_filp);
 544         struct zonefs_inode_info *zi = ZONEFS_I(inode);
 545         struct zonefs_zone *z = zonefs_inode_zone(inode);
 546         struct super_block *sb = inode->i_sb;
 547         bool sync = is_sync_kiocb(iocb);
 548         bool append = false;
 549         ssize_t ret, count;
 550
 551         /*
 552          * For async direct IOs to sequential zone files, refuse IOCB_NOWAIT
 553          * as this can cause write reordering (e.g. the first aio gets EAGAIN
 554          * on the inode lock but the second goes through but is now unaligned).
 555          */
 556         if (zonefs_zone_is_seq(z) && !sync && (iocb->ki_flags & IOCB_NOWAIT))
 557                 return -EOPNOTSUPP;
 558
 559         if (iocb->ki_flags & IOCB_NOWAIT) {
 560                 if (!inode_trylock(inode))
 561                         return -EAGAIN;
 562         } else {
 563                 inode_lock(inode);
 564         }
 565
 566         count = zonefs_write_checks(iocb, from);
 567         if (count <= 0) {
 568                 ret = count;
 569                 goto inode_unlock;
 570         }
 571
 572         if ((iocb->ki_pos | count) & (sb->s_blocksize - 1)) {
 573                 ret = -EINVAL;
 574                 goto inode_unlock;
 575         }
 576
 577         /* Enforce sequential writes (append only) in sequential zones */
 578         if (zonefs_zone_is_seq(z)) {
 579                 mutex_lock(&zi->i_truncate_mutex);
 580                 if (iocb->ki_pos != z->z_wpoffset) {
 581                         mutex_unlock(&zi->i_truncate_mutex);
 582                         ret = -EINVAL;
 583                         goto inode_unlock;
 584                 }
 585                 /*
 586                  * Advance the zone write pointer offset. This assumes that the
 587                  * IO will succeed, which is OK to do because we do not allow
 588                  * partial writes (IOMAP_DIO_PARTIAL is not set) and if the IO
 589                  * fails, the error path will correct the write pointer offset.
 590                  */
 591                 z->z_wpoffset += count;
 592                 zonefs_inode_account_active(inode);
 593                 mutex_unlock(&zi->i_truncate_mutex);
 594                 append = sync;
 595         }
 596
 597         if (append) {
 598                 ret = zonefs_file_dio_append(iocb, from);
 599         } else {
 600                 /*
 601                  * iomap_dio_rw() may return ENOTBLK if there was an issue with
 602                  * page invalidation. Overwrite that error code with EBUSY to
 603                  * be consistent with zonefs_file_dio_append() return value for
 604                  * similar issues.
 605                  */
 606                 ret = iomap_dio_rw(iocb, from, &zonefs_write_iomap_ops,
 607                                    &zonefs_write_dio_ops, 0, NULL, 0);
 608                 if (ret == -ENOTBLK)
 609                         ret = -EBUSY;
 610         }
 611
 612         /*
 613          * For a failed IO or partial completion, trigger error recovery
 614          * to update the zone write pointer offset to a correct value.
 615          * For asynchronous IOs, zonefs_file_write_dio_end_io() may already
 616          * have executed error recovery if the IO already completed when we
 617          * reach here. However, we cannot know that and execute error recovery
 618          * again (that will not change anything).
 619          */
 620         if (zonefs_zone_is_seq(z)) {
 621                 if (ret > 0 && ret != count)
 622                         ret = -EIO;
 623                 if (ret < 0 && ret != -EIOCBQUEUED)
 624                         zonefs_io_error(inode, true);
 625         }
 626
 627 inode_unlock:
 628         inode_unlock(inode);
 629
 630         return ret;
 631 }
 632
 633 static ssize_t zonefs_file_buffered_write(struct kiocb *iocb,
 634                                           struct iov_iter *from)
 635 {
 636         struct inode *inode = file_inode(iocb->ki_filp);
 637         ssize_t ret;
 638
 639         /*
 640          * Direct IO writes are mandatory for sequential zone files so that the
 641          * write IO issuing order is preserved.
 642          */
 643         if (zonefs_inode_is_seq(inode))
 644                 return -EIO;
 645
 646         if (iocb->ki_flags & IOCB_NOWAIT) {
 647                 if (!inode_trylock(inode))
 648                         return -EAGAIN;
 649         } else {
 650                 inode_lock(inode);
 651         }
 652
 653         ret = zonefs_write_checks(iocb, from);
 654         if (ret <= 0)
 655                 goto inode_unlock;
 656
 657         ret = iomap_file_buffered_write(iocb, from, &zonefs_write_iomap_ops);
 658         if (ret > 0)
 659                 iocb->ki_pos += ret;
 660         else if (ret == -EIO)
 661                 zonefs_io_error(inode, true);
 662
 663 inode_unlock:
 664         inode_unlock(inode);
 665         if (ret > 0)
 666                 ret = generic_write_sync(iocb, ret);
 667
 668         return ret;
 669 }
 670
 671 static ssize_t zonefs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 672 {
 673         struct inode *inode = file_inode(iocb->ki_filp);
 674         struct zonefs_zone *z = zonefs_inode_zone(inode);
 675
 676         if (unlikely(IS_IMMUTABLE(inode)))
 677                 return -EPERM;
 678
 679         if (sb_rdonly(inode->i_sb))
 680                 return -EROFS;
 681
 682         /* Write operations beyond the zone capacity are not allowed */
 683         if (iocb->ki_pos >= z->z_capacity)
 684                 return -EFBIG;
 685
 686         if (iocb->ki_flags & IOCB_DIRECT) {
 687                 ssize_t ret = zonefs_file_dio_write(iocb, from);
 688
 689                 if (ret != -ENOTBLK)
 690                         return ret;
 691         }
 692
 693         return zonefs_file_buffered_write(iocb, from);
 694 }
 695
 696 static int zonefs_file_read_dio_end_io(struct kiocb *iocb, ssize_t size,
 697                                        int error, unsigned int flags)
 698 {
 699         if (error) {
 700                 zonefs_io_error(file_inode(iocb->ki_filp), false);
 701                 return error;
 702         }
 703
 704         return 0;
 705 }
 706
 707 static const struct iomap_dio_ops zonefs_read_dio_ops = {
 708         .end_io                 = zonefs_file_read_dio_end_io,
 709 };
 710
 711 static ssize_t zonefs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
 712 {
 713         struct inode *inode = file_inode(iocb->ki_filp);
 714         struct zonefs_inode_info *zi = ZONEFS_I(inode);
 715         struct zonefs_zone *z = zonefs_inode_zone(inode);
 716         struct super_block *sb = inode->i_sb;
 717         loff_t isize;
 718         ssize_t ret;
 719
 720         /* Offline zones cannot be read */
 721         if (unlikely(IS_IMMUTABLE(inode) && !(inode->i_mode & 0777)))
 722                 return -EPERM;
 723
 724         if (iocb->ki_pos >= z->z_capacity)
 725                 return 0;
 726
 727         if (iocb->ki_flags & IOCB_NOWAIT) {
 728                 if (!inode_trylock_shared(inode))
 729                         return -EAGAIN;
 730         } else {
 731                 inode_lock_shared(inode);
 732         }
 733
 734         /* Limit read operations to written data */
 735         mutex_lock(&zi->i_truncate_mutex);
 736         isize = i_size_read(inode);
 737         if (iocb->ki_pos >= isize) {
 738                 mutex_unlock(&zi->i_truncate_mutex);
 739                 ret = 0;
 740                 goto inode_unlock;
 741         }
 742         iov_iter_truncate(to, isize - iocb->ki_pos);
 743         mutex_unlock(&zi->i_truncate_mutex);
 744
 745         if (iocb->ki_flags & IOCB_DIRECT) {
 746                 size_t count = iov_iter_count(to);
 747
 748                 if ((iocb->ki_pos | count) & (sb->s_blocksize - 1)) {
 749                         ret = -EINVAL;
 750                         goto inode_unlock;
 751                 }
 752                 file_accessed(iocb->ki_filp);
 753                 ret = iomap_dio_rw(iocb, to, &zonefs_read_iomap_ops,
 754                                    &zonefs_read_dio_ops, 0, NULL, 0);
 755         } else {
 756                 ret = generic_file_read_iter(iocb, to);
 757                 if (ret == -EIO)
 758                         zonefs_io_error(inode, false);
 759         }
 760
 761 inode_unlock:
 762         inode_unlock_shared(inode);
 763
 764         return ret;
 765 }
 766
 767 /*
 768  * Write open accounting is done only for sequential files.
 769  */
 770 static inline bool zonefs_seq_file_need_wro(struct inode *inode,
 771                                             struct file *file)
 772 {
 773         if (zonefs_inode_is_cnv(inode))
 774                 return false;
 775
 776         if (!(file->f_mode & FMODE_WRITE))
 777                 return false;
 778
 779         return true;
 780 }
 781
 782 static int zonefs_seq_file_write_open(struct inode *inode)
 783 {
 784         struct zonefs_inode_info *zi = ZONEFS_I(inode);
 785         struct zonefs_zone *z = zonefs_inode_zone(inode);
 786         int ret = 0;
 787
 788         mutex_lock(&zi->i_truncate_mutex);
 789
 790         if (!zi->i_wr_refcnt) {
 791                 struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb);
 792                 unsigned int wro = atomic_inc_return(&sbi->s_wro_seq_files);
 793
 794                 if (sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN) {
 795
 796                         if (sbi->s_max_wro_seq_files
 797                             && wro > sbi->s_max_wro_seq_files) {
 798                                 atomic_dec(&sbi->s_wro_seq_files);
 799                                 ret = -EBUSY;
 800                                 goto unlock;
 801                         }
 802
 803                         if (i_size_read(inode) < z->z_capacity) {
 804                                 ret = zonefs_inode_zone_mgmt(inode,
 805                                                              REQ_OP_ZONE_OPEN);
 806                                 if (ret) {
 807                                         atomic_dec(&sbi->s_wro_seq_files);
 808                                         goto unlock;
 809                                 }
 810                                 z->z_flags |= ZONEFS_ZONE_OPEN;
 811                                 zonefs_inode_account_active(inode);
 812                         }
 813                 }
 814         }
 815
 816         zi->i_wr_refcnt++;
 817
 818 unlock:
 819         mutex_unlock(&zi->i_truncate_mutex);
 820
 821         return ret;
 822 }
 823
 824 static int zonefs_file_open(struct inode *inode, struct file *file)
 825 {
 826         int ret;
 827
 828         ret = generic_file_open(inode, file);
 829         if (ret)
 830                 return ret;
 831
 832         if (zonefs_seq_file_need_wro(inode, file))
 833                 return zonefs_seq_file_write_open(inode);
 834
 835         return 0;
 836 }
 837
 838 static void zonefs_seq_file_write_close(struct inode *inode)
 839 {
 840         struct zonefs_inode_info *zi = ZONEFS_I(inode);
 841         struct zonefs_zone *z = zonefs_inode_zone(inode);
 842         struct super_block *sb = inode->i_sb;
 843         struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
 844         int ret = 0;
 845
 846         mutex_lock(&zi->i_truncate_mutex);
 847
 848         zi->i_wr_refcnt--;
 849         if (zi->i_wr_refcnt)
 850                 goto unlock;
 851
 852         /*
 853          * The file zone may not be open anymore (e.g. the file was truncated to
 854          * its maximum size or it was fully written). For this case, we only
 855          * need to decrement the write open count.
 856          */
 857         if (z->z_flags & ZONEFS_ZONE_OPEN) {
 858                 ret = zonefs_inode_zone_mgmt(inode, REQ_OP_ZONE_CLOSE);
 859                 if (ret) {
 860                         __zonefs_io_error(inode, false);
 861                         /*
 862                          * Leaving zones explicitly open may lead to a state
 863                          * where most zones cannot be written (zone resources
 864                          * exhausted). So take preventive action by remounting
 865                          * read-only.
 866                          */
 867                         if (z->z_flags & ZONEFS_ZONE_OPEN &&
 868                             !(sb->s_flags & SB_RDONLY)) {
 869                                 zonefs_warn(sb,
 870                                         "closing zone at %llu failed %d\n",
 871                                         z->z_sector, ret);
 872                                 zonefs_warn(sb,
 873                                         "remounting filesystem read-only\n");
 874                                 sb->s_flags |= SB_RDONLY;
 875                         }
 876                         goto unlock;
 877                 }
 878
 879                 z->z_flags &= ~ZONEFS_ZONE_OPEN;
 880                 zonefs_inode_account_active(inode);
 881         }
 882
 883         atomic_dec(&sbi->s_wro_seq_files);
 884
 885 unlock:
 886         mutex_unlock(&zi->i_truncate_mutex);
 887 }
 888
 889 static int zonefs_file_release(struct inode *inode, struct file *file)
 890 {
 891         /*
 892          * If we explicitly open a zone we must close it again as well, but the
 893          * zone management operation can fail (either due to an IO error or as
 894          * the zone has gone offline or read-only). Make sure we don't fail the
 895          * close(2) for user-space.
 896          */
 897         if (zonefs_seq_file_need_wro(inode, file))
 898                 zonefs_seq_file_write_close(inode);
 899
 900         return 0;
 901 }
 902
 903 const struct file_operations zonefs_file_operations = {
 904         .open           = zonefs_file_open,
 905         .release        = zonefs_file_release,
 906         .fsync          = zonefs_file_fsync,
 907         .mmap           = zonefs_file_mmap,
 908         .llseek         = zonefs_file_llseek,
 909         .read_iter      = zonefs_file_read_iter,
 910         .write_iter     = zonefs_file_write_iter,
 911         .splice_read    = generic_file_splice_read,
 912         .splice_write   = iter_file_splice_write,
 913         .iopoll         = iocb_bio_iopoll,
 914 };