fs/zonefs/file.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  * Simple file system for zoned block devices exposing zones as files.
   4  *
   5  * Copyright (C) 2022 Western Digital Corporation or its affiliates.
   6  */
   7 #include <linux/module.h>
   8 #include <linux/pagemap.h>
   9 #include <linux/iomap.h>
  10 #include <linux/init.h>
  11 #include <linux/slab.h>
  12 #include <linux/blkdev.h>
  13 #include <linux/statfs.h>
  14 #include <linux/writeback.h>
  15 #include <linux/quotaops.h>
  16 #include <linux/seq_file.h>
  17 #include <linux/parser.h>
  18 #include <linux/uio.h>
  19 #include <linux/mman.h>
  20 #include <linux/sched/mm.h>
  21 #include <linux/task_io_accounting_ops.h>
  22
  23 #include "zonefs.h"
  24
  25 #include "trace.h"
  26
  27 static int zonefs_read_iomap_begin(struct inode *inode, loff_t offset,
  28                                    loff_t length, unsigned int flags,
  29                                    struct iomap *iomap, struct iomap *srcmap)
  30 {
  31         struct zonefs_inode_info *zi = ZONEFS_I(inode);
  32         struct zonefs_zone *z = zonefs_inode_zone(inode);
  33         struct super_block *sb = inode->i_sb;
  34         loff_t isize;
  35
  36         /*
  37          * All blocks are always mapped below EOF. If reading past EOF,
  38          * act as if there is a hole up to the file maximum size.
  39          */
  40         mutex_lock(&zi->i_truncate_mutex);
  41         iomap->bdev = inode->i_sb->s_bdev;
  42         iomap->offset = ALIGN_DOWN(offset, sb->s_blocksize);
  43         isize = i_size_read(inode);
  44         if (iomap->offset >= isize) {
  45                 iomap->type = IOMAP_HOLE;
  46                 iomap->addr = IOMAP_NULL_ADDR;
  47                 iomap->length = length;
  48         } else {
  49                 iomap->type = IOMAP_MAPPED;
  50                 iomap->addr = (z->z_sector << SECTOR_SHIFT) + iomap->offset;
  51                 iomap->length = isize - iomap->offset;
  52         }
  53         mutex_unlock(&zi->i_truncate_mutex);
  54
  55         trace_zonefs_iomap_begin(inode, iomap);
  56
  57         return 0;
  58 }
  59
  60 static const struct iomap_ops zonefs_read_iomap_ops = {
  61         .iomap_begin    = zonefs_read_iomap_begin,
  62 };
  63
  64 static int zonefs_write_iomap_begin(struct inode *inode, loff_t offset,
  65                                     loff_t length, unsigned int flags,
  66                                     struct iomap *iomap, struct iomap *srcmap)
  67 {
  68         struct zonefs_inode_info *zi = ZONEFS_I(inode);
  69         struct zonefs_zone *z = zonefs_inode_zone(inode);
  70         struct super_block *sb = inode->i_sb;
  71         loff_t isize;
  72
  73         /* All write I/Os should always be within the file maximum size */
  74         if (WARN_ON_ONCE(offset + length > z->z_capacity))
  75                 return -EIO;
  76
  77         /*
  78          * Sequential zones can only accept direct writes. This is already
  79          * checked when writes are issued, so warn if we see a page writeback
  80          * operation.
  81          */
  82         if (WARN_ON_ONCE(zonefs_zone_is_seq(z) && !(flags & IOMAP_DIRECT)))
  83                 return -EIO;
  84
  85         /*
  86          * For conventional zones, all blocks are always mapped. For sequential
  87          * zones, all blocks after always mapped below the inode size (zone
  88          * write pointer) and unwriten beyond.
  89          */
  90         mutex_lock(&zi->i_truncate_mutex);
  91         iomap->bdev = inode->i_sb->s_bdev;
  92         iomap->offset = ALIGN_DOWN(offset, sb->s_blocksize);
  93         iomap->addr = (z->z_sector << SECTOR_SHIFT) + iomap->offset;
  94         isize = i_size_read(inode);
  95         if (iomap->offset >= isize) {
  96                 iomap->type = IOMAP_UNWRITTEN;
  97                 iomap->length = z->z_capacity - iomap->offset;
  98         } else {
  99                 iomap->type = IOMAP_MAPPED;
 100                 iomap->length = isize - iomap->offset;
 101         }
 102         mutex_unlock(&zi->i_truncate_mutex);
 103
 104         trace_zonefs_iomap_begin(inode, iomap);
 105
 106         return 0;
 107 }
 108
 109 static const struct iomap_ops zonefs_write_iomap_ops = {
 110         .iomap_begin    = zonefs_write_iomap_begin,
 111 };
 112
 113 static int zonefs_read_folio(struct file *unused, struct folio *folio)
 114 {
 115         return iomap_read_folio(folio, &zonefs_read_iomap_ops);
 116 }
 117
 118 static void zonefs_readahead(struct readahead_control *rac)
 119 {
 120         iomap_readahead(rac, &zonefs_read_iomap_ops);
 121 }
 122
 123 /*
 124  * Map blocks for page writeback. This is used only on conventional zone files,
 125  * which implies that the page range can only be within the fixed inode size.
 126  */
 127 static int zonefs_write_map_blocks(struct iomap_writepage_ctx *wpc,
 128                                    struct inode *inode, loff_t offset)
 129 {
 130         struct zonefs_zone *z = zonefs_inode_zone(inode);
 131
 132         if (WARN_ON_ONCE(zonefs_zone_is_seq(z)))
 133                 return -EIO;
 134         if (WARN_ON_ONCE(offset >= i_size_read(inode)))
 135                 return -EIO;
 136
 137         /* If the mapping is already OK, nothing needs to be done */
 138         if (offset >= wpc->iomap.offset &&
 139             offset < wpc->iomap.offset + wpc->iomap.length)
 140                 return 0;
 141
 142         return zonefs_write_iomap_begin(inode, offset,
 143                                         z->z_capacity - offset,
 144                                         IOMAP_WRITE, &wpc->iomap, NULL);
 145 }
 146
 147 static const struct iomap_writeback_ops zonefs_writeback_ops = {
 148         .map_blocks             = zonefs_write_map_blocks,
 149 };
 150
 151 static int zonefs_writepages(struct address_space *mapping,
 152                              struct writeback_control *wbc)
 153 {
 154         struct iomap_writepage_ctx wpc = { };
 155
 156         return iomap_writepages(mapping, wbc, &wpc, &zonefs_writeback_ops);
 157 }
 158
 159 static int zonefs_swap_activate(struct swap_info_struct *sis,
 160                                 struct file *swap_file, sector_t *span)
 161 {
 162         struct inode *inode = file_inode(swap_file);
 163
 164         if (zonefs_inode_is_seq(inode)) {
 165                 zonefs_err(inode->i_sb,
 166                            "swap file: not a conventional zone file\n");
 167                 return -EINVAL;
 168         }
 169
 170         return iomap_swapfile_activate(sis, swap_file, span,
 171                                        &zonefs_read_iomap_ops);
 172 }
 173
 174 const struct address_space_operations zonefs_file_aops = {
 175         .read_folio             = zonefs_read_folio,
 176         .readahead              = zonefs_readahead,
 177         .writepages             = zonefs_writepages,
 178         .dirty_folio            = filemap_dirty_folio,
 179         .release_folio          = iomap_release_folio,
 180         .invalidate_folio       = iomap_invalidate_folio,
 181         .migrate_folio          = filemap_migrate_folio,
 182         .is_partially_uptodate  = iomap_is_partially_uptodate,
 183         .error_remove_page      = generic_error_remove_page,
 184         .direct_IO              = noop_direct_IO,
 185         .swap_activate          = zonefs_swap_activate,
 186 };
 187
 188 int zonefs_file_truncate(struct inode *inode, loff_t isize)
 189 {
 190         struct zonefs_inode_info *zi = ZONEFS_I(inode);
 191         struct zonefs_zone *z = zonefs_inode_zone(inode);
 192         loff_t old_isize;
 193         enum req_op op;
 194         int ret = 0;
 195
 196         /*
 197          * Only sequential zone files can be truncated and truncation is allowed
 198          * only down to a 0 size, which is equivalent to a zone reset, and to
 199          * the maximum file size, which is equivalent to a zone finish.
 200          */
 201         if (!zonefs_zone_is_seq(z))
 202                 return -EPERM;
 203
 204         if (!isize)
 205                 op = REQ_OP_ZONE_RESET;
 206         else if (isize == z->z_capacity)
 207                 op = REQ_OP_ZONE_FINISH;
 208         else
 209                 return -EPERM;
 210
 211         inode_dio_wait(inode);
 212
 213         /* Serialize against page faults */
 214         filemap_invalidate_lock(inode->i_mapping);
 215
 216         /* Serialize against zonefs_iomap_begin() */
 217         mutex_lock(&zi->i_truncate_mutex);
 218
 219         old_isize = i_size_read(inode);
 220         if (isize == old_isize)
 221                 goto unlock;
 222
 223         ret = zonefs_inode_zone_mgmt(inode, op);
 224         if (ret)
 225                 goto unlock;
 226
 227         /*
 228          * If the mount option ZONEFS_MNTOPT_EXPLICIT_OPEN is set,
 229          * take care of open zones.
 230          */
 231         if (z->z_flags & ZONEFS_ZONE_OPEN) {
 232                 /*
 233                  * Truncating a zone to EMPTY or FULL is the equivalent of
 234                  * closing the zone. For a truncation to 0, we need to
 235                  * re-open the zone to ensure new writes can be processed.
 236                  * For a truncation to the maximum file size, the zone is
 237                  * closed and writes cannot be accepted anymore, so clear
 238                  * the open flag.
 239                  */
 240                 if (!isize)
 241                         ret = zonefs_inode_zone_mgmt(inode, REQ_OP_ZONE_OPEN);
 242                 else
 243                         z->z_flags &= ~ZONEFS_ZONE_OPEN;
 244         }
 245
 246         zonefs_update_stats(inode, isize);
 247         truncate_setsize(inode, isize);
 248         z->z_wpoffset = isize;
 249         zonefs_inode_account_active(inode);
 250
 251 unlock:
 252         mutex_unlock(&zi->i_truncate_mutex);
 253         filemap_invalidate_unlock(inode->i_mapping);
 254
 255         return ret;
 256 }
 257
 258 static int zonefs_file_fsync(struct file *file, loff_t start, loff_t end,
 259                              int datasync)
 260 {
 261         struct inode *inode = file_inode(file);
 262         int ret = 0;
 263
 264         if (unlikely(IS_IMMUTABLE(inode)))
 265                 return -EPERM;
 266
 267         /*
 268          * Since only direct writes are allowed in sequential files, page cache
 269          * flush is needed only for conventional zone files.
 270          */
 271         if (zonefs_inode_is_cnv(inode))
 272                 ret = file_write_and_wait_range(file, start, end);
 273         if (!ret)
 274                 ret = blkdev_issue_flush(inode->i_sb->s_bdev);
 275
 276         if (ret)
 277                 zonefs_io_error(inode, true);
 278
 279         return ret;
 280 }
 281
 282 static vm_fault_t zonefs_filemap_page_mkwrite(struct vm_fault *vmf)
 283 {
 284         struct inode *inode = file_inode(vmf->vma->vm_file);
 285         vm_fault_t ret;
 286
 287         if (unlikely(IS_IMMUTABLE(inode)))
 288                 return VM_FAULT_SIGBUS;
 289
 290         /*
 291          * Sanity check: only conventional zone files can have shared
 292          * writeable mappings.
 293          */
 294         if (zonefs_inode_is_seq(inode))
 295                 return VM_FAULT_NOPAGE;
 296
 297         sb_start_pagefault(inode->i_sb);
 298         file_update_time(vmf->vma->vm_file);
 299
 300         /* Serialize against truncates */
 301         filemap_invalidate_lock_shared(inode->i_mapping);
 302         ret = iomap_page_mkwrite(vmf, &zonefs_write_iomap_ops);
 303         filemap_invalidate_unlock_shared(inode->i_mapping);
 304
 305         sb_end_pagefault(inode->i_sb);
 306         return ret;
 307 }
 308
 309 static const struct vm_operations_struct zonefs_file_vm_ops = {
 310         .fault          = filemap_fault,
 311         .map_pages      = filemap_map_pages,
 312         .page_mkwrite   = zonefs_filemap_page_mkwrite,
 313 };
 314
 315 static int zonefs_file_mmap(struct file *file, struct vm_area_struct *vma)
 316 {
 317         /*
 318          * Conventional zones accept random writes, so their files can support
 319          * shared writable mappings. For sequential zone files, only read
 320          * mappings are possible since there are no guarantees for write
 321          * ordering between msync() and page cache writeback.
 322          */
 323         if (zonefs_inode_is_seq(file_inode(file)) &&
 324             (vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
 325                 return -EINVAL;
 326
 327         file_accessed(file);
 328         vma->vm_ops = &zonefs_file_vm_ops;
 329
 330         return 0;
 331 }
 332
 333 static loff_t zonefs_file_llseek(struct file *file, loff_t offset, int whence)
 334 {
 335         loff_t isize = i_size_read(file_inode(file));
 336
 337         /*
 338          * Seeks are limited to below the zone size for conventional zones
 339          * and below the zone write pointer for sequential zones. In both
 340          * cases, this limit is the inode size.
 341          */
 342         return generic_file_llseek_size(file, offset, whence, isize, isize);
 343 }
 344
 345 static int zonefs_file_write_dio_end_io(struct kiocb *iocb, ssize_t size,
 346                                         int error, unsigned int flags)
 347 {
 348         struct inode *inode = file_inode(iocb->ki_filp);
 349         struct zonefs_inode_info *zi = ZONEFS_I(inode);
 350
 351         if (error) {
 352                 zonefs_io_error(inode, true);
 353                 return error;
 354         }
 355
 356         if (size && zonefs_inode_is_seq(inode)) {
 357                 /*
 358                  * Note that we may be seeing completions out of order,
 359                  * but that is not a problem since a write completed
 360                  * successfully necessarily means that all preceding writes
 361                  * were also successful. So we can safely increase the inode
 362                  * size to the write end location.
 363                  */
 364                 mutex_lock(&zi->i_truncate_mutex);
 365                 if (i_size_read(inode) < iocb->ki_pos + size) {
 366                         zonefs_update_stats(inode, iocb->ki_pos + size);
 367                         zonefs_i_size_write(inode, iocb->ki_pos + size);
 368                 }
 369                 mutex_unlock(&zi->i_truncate_mutex);
 370         }
 371
 372         return 0;
 373 }
 374
 375 static const struct iomap_dio_ops zonefs_write_dio_ops = {
 376         .end_io                 = zonefs_file_write_dio_end_io,
 377 };
 378
 379 static ssize_t zonefs_file_dio_append(struct kiocb *iocb, struct iov_iter *from)
 380 {
 381         struct inode *inode = file_inode(iocb->ki_filp);
 382         struct zonefs_zone *z = zonefs_inode_zone(inode);
 383         struct block_device *bdev = inode->i_sb->s_bdev;
 384         unsigned int max = bdev_max_zone_append_sectors(bdev);
 385         pgoff_t start, end;
 386         struct bio *bio;
 387         ssize_t size;
 388         int nr_pages;
 389         ssize_t ret;
 390
 391         max = ALIGN_DOWN(max << SECTOR_SHIFT, inode->i_sb->s_blocksize);
 392         iov_iter_truncate(from, max);
 393
 394         /*
 395          * If the inode block size (zone write granularity) is smaller than the
 396          * page size, we may be appending data belonging to the last page of the
 397          * inode straddling inode->i_size, with that page already cached due to
 398          * a buffered read or readahead. So make sure to invalidate that page.
 399          * This will always be a no-op for the case where the block size is
 400          * equal to the page size.
 401          */
 402         start = iocb->ki_pos >> PAGE_SHIFT;
 403         end = (iocb->ki_pos + iov_iter_count(from) - 1) >> PAGE_SHIFT;
 404         if (invalidate_inode_pages2_range(inode->i_mapping, start, end))
 405                 return -EBUSY;
 406
 407         nr_pages = iov_iter_npages(from, BIO_MAX_VECS);
 408         if (!nr_pages)
 409                 return 0;
 410
 411         bio = bio_alloc(bdev, nr_pages,
 412                         REQ_OP_ZONE_APPEND | REQ_SYNC | REQ_IDLE, GFP_NOFS);
 413         bio->bi_iter.bi_sector = z->z_sector;
 414         bio->bi_ioprio = iocb->ki_ioprio;
 415         if (iocb_is_dsync(iocb))
 416                 bio->bi_opf |= REQ_FUA;
 417
 418         ret = bio_iov_iter_get_pages(bio, from);
 419         if (unlikely(ret))
 420                 goto out_release;
 421
 422         size = bio->bi_iter.bi_size;
 423         task_io_account_write(size);
 424
 425         if (iocb->ki_flags & IOCB_HIPRI)
 426                 bio_set_polled(bio, iocb);
 427
 428         ret = submit_bio_wait(bio);
 429
 430         /*
 431          * If the file zone was written underneath the file system, the zone
 432          * write pointer may not be where we expect it to be, but the zone
 433          * append write can still succeed. So check manually that we wrote where
 434          * we intended to, that is, at zi->i_wpoffset.
 435          */
 436         if (!ret) {
 437                 sector_t wpsector =
 438                         z->z_sector + (z->z_wpoffset >> SECTOR_SHIFT);
 439
 440                 if (bio->bi_iter.bi_sector != wpsector) {
 441                         zonefs_warn(inode->i_sb,
 442                                 "Corrupted write pointer %llu for zone at %llu\n",
 443                                 bio->bi_iter.bi_sector, z->z_sector);
 444                         ret = -EIO;
 445                 }
 446         }
 447
 448         zonefs_file_write_dio_end_io(iocb, size, ret, 0);
 449         trace_zonefs_file_dio_append(inode, size, ret);
 450
 451 out_release:
 452         bio_release_pages(bio, false);
 453         bio_put(bio);
 454
 455         if (ret >= 0) {
 456                 iocb->ki_pos += size;
 457                 return size;
 458         }
 459
 460         return ret;
 461 }
 462
 463 /*
 464  * Do not exceed the LFS limits nor the file zone size. If pos is under the
 465  * limit it becomes a short access. If it exceeds the limit, return -EFBIG.
 466  */
 467 static loff_t zonefs_write_check_limits(struct file *file, loff_t pos,
 468                                         loff_t count)
 469 {
 470         struct inode *inode = file_inode(file);
 471         struct zonefs_zone *z = zonefs_inode_zone(inode);
 472         loff_t limit = rlimit(RLIMIT_FSIZE);
 473         loff_t max_size = z->z_capacity;
 474
 475         if (limit != RLIM_INFINITY) {
 476                 if (pos >= limit) {
 477                         send_sig(SIGXFSZ, current, 0);
 478                         return -EFBIG;
 479                 }
 480                 count = min(count, limit - pos);
 481         }
 482
 483         if (!(file->f_flags & O_LARGEFILE))
 484                 max_size = min_t(loff_t, MAX_NON_LFS, max_size);
 485
 486         if (unlikely(pos >= max_size))
 487                 return -EFBIG;
 488
 489         return min(count, max_size - pos);
 490 }
 491
 492 static ssize_t zonefs_write_checks(struct kiocb *iocb, struct iov_iter *from)
 493 {
 494         struct file *file = iocb->ki_filp;
 495         struct inode *inode = file_inode(file);
 496         struct zonefs_inode_info *zi = ZONEFS_I(inode);
 497         struct zonefs_zone *z = zonefs_inode_zone(inode);
 498         loff_t count;
 499
 500         if (IS_SWAPFILE(inode))
 501                 return -ETXTBSY;
 502
 503         if (!iov_iter_count(from))
 504                 return 0;
 505
 506         if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT))
 507                 return -EINVAL;
 508
 509         if (iocb->ki_flags & IOCB_APPEND) {
 510                 if (zonefs_zone_is_cnv(z))
 511                         return -EINVAL;
 512                 mutex_lock(&zi->i_truncate_mutex);
 513                 iocb->ki_pos = z->z_wpoffset;
 514                 mutex_unlock(&zi->i_truncate_mutex);
 515         }
 516
 517         count = zonefs_write_check_limits(file, iocb->ki_pos,
 518                                           iov_iter_count(from));
 519         if (count < 0)
 520                 return count;
 521
 522         iov_iter_truncate(from, count);
 523         return iov_iter_count(from);
 524 }
 525
 526 /*
 527  * Handle direct writes. For sequential zone files, this is the only possible
 528  * write path. For these files, check that the user is issuing writes
 529  * sequentially from the end of the file. This code assumes that the block layer
 530  * delivers write requests to the device in sequential order. This is always the
 531  * case if a block IO scheduler implementing the ELEVATOR_F_ZBD_SEQ_WRITE
 532  * elevator feature is being used (e.g. mq-deadline). The block layer always
 533  * automatically select such an elevator for zoned block devices during the
 534  * device initialization.
 535  */
 536 static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from)
 537 {
 538         struct inode *inode = file_inode(iocb->ki_filp);
 539         struct zonefs_inode_info *zi = ZONEFS_I(inode);
 540         struct zonefs_zone *z = zonefs_inode_zone(inode);
 541         struct super_block *sb = inode->i_sb;
 542         bool sync = is_sync_kiocb(iocb);
 543         bool append = false;
 544         ssize_t ret, count;
 545
 546         /*
 547          * For async direct IOs to sequential zone files, refuse IOCB_NOWAIT
 548          * as this can cause write reordering (e.g. the first aio gets EAGAIN
 549          * on the inode lock but the second goes through but is now unaligned).
 550          */
 551         if (zonefs_zone_is_seq(z) && !sync && (iocb->ki_flags & IOCB_NOWAIT))
 552                 return -EOPNOTSUPP;
 553
 554         if (iocb->ki_flags & IOCB_NOWAIT) {
 555                 if (!inode_trylock(inode))
 556                         return -EAGAIN;
 557         } else {
 558                 inode_lock(inode);
 559         }
 560
 561         count = zonefs_write_checks(iocb, from);
 562         if (count <= 0) {
 563                 ret = count;
 564                 goto inode_unlock;
 565         }
 566
 567         if ((iocb->ki_pos | count) & (sb->s_blocksize - 1)) {
 568                 ret = -EINVAL;
 569                 goto inode_unlock;
 570         }
 571
 572         /* Enforce sequential writes (append only) in sequential zones */
 573         if (zonefs_zone_is_seq(z)) {
 574                 mutex_lock(&zi->i_truncate_mutex);
 575                 if (iocb->ki_pos != z->z_wpoffset) {
 576                         mutex_unlock(&zi->i_truncate_mutex);
 577                         ret = -EINVAL;
 578                         goto inode_unlock;
 579                 }
 580                 mutex_unlock(&zi->i_truncate_mutex);
 581                 append = sync;
 582         }
 583
 584         if (append) {
 585                 ret = zonefs_file_dio_append(iocb, from);
 586         } else {
 587                 /*
 588                  * iomap_dio_rw() may return ENOTBLK if there was an issue with
 589                  * page invalidation. Overwrite that error code with EBUSY to
 590                  * be consistent with zonefs_file_dio_append() return value for
 591                  * similar issues.
 592                  */
 593                 ret = iomap_dio_rw(iocb, from, &zonefs_write_iomap_ops,
 594                                    &zonefs_write_dio_ops, 0, NULL, 0);
 595                 if (ret == -ENOTBLK)
 596                         ret = -EBUSY;
 597         }
 598
 599         if (zonefs_zone_is_seq(z) &&
 600             (ret > 0 || ret == -EIOCBQUEUED)) {
 601                 if (ret > 0)
 602                         count = ret;
 603
 604                 /*
 605                  * Update the zone write pointer offset assuming the write
 606                  * operation succeeded. If it did not, the error recovery path
 607                  * will correct it. Also do active seq file accounting.
 608                  */
 609                 mutex_lock(&zi->i_truncate_mutex);
 610                 z->z_wpoffset += count;
 611                 zonefs_inode_account_active(inode);
 612                 mutex_unlock(&zi->i_truncate_mutex);
 613         }
 614
 615 inode_unlock:
 616         inode_unlock(inode);
 617
 618         return ret;
 619 }
 620
 621 static ssize_t zonefs_file_buffered_write(struct kiocb *iocb,
 622                                           struct iov_iter *from)
 623 {
 624         struct inode *inode = file_inode(iocb->ki_filp);
 625         ssize_t ret;
 626
 627         /*
 628          * Direct IO writes are mandatory for sequential zone files so that the
 629          * write IO issuing order is preserved.
 630          */
 631         if (zonefs_inode_is_seq(inode))
 632                 return -EIO;
 633
 634         if (iocb->ki_flags & IOCB_NOWAIT) {
 635                 if (!inode_trylock(inode))
 636                         return -EAGAIN;
 637         } else {
 638                 inode_lock(inode);
 639         }
 640
 641         ret = zonefs_write_checks(iocb, from);
 642         if (ret <= 0)
 643                 goto inode_unlock;
 644
 645         ret = iomap_file_buffered_write(iocb, from, &zonefs_write_iomap_ops);
 646         if (ret > 0)
 647                 iocb->ki_pos += ret;
 648         else if (ret == -EIO)
 649                 zonefs_io_error(inode, true);
 650
 651 inode_unlock:
 652         inode_unlock(inode);
 653         if (ret > 0)
 654                 ret = generic_write_sync(iocb, ret);
 655
 656         return ret;
 657 }
 658
 659 static ssize_t zonefs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 660 {
 661         struct inode *inode = file_inode(iocb->ki_filp);
 662         struct zonefs_zone *z = zonefs_inode_zone(inode);
 663
 664         if (unlikely(IS_IMMUTABLE(inode)))
 665                 return -EPERM;
 666
 667         if (sb_rdonly(inode->i_sb))
 668                 return -EROFS;
 669
 670         /* Write operations beyond the zone capacity are not allowed */
 671         if (iocb->ki_pos >= z->z_capacity)
 672                 return -EFBIG;
 673
 674         if (iocb->ki_flags & IOCB_DIRECT) {
 675                 ssize_t ret = zonefs_file_dio_write(iocb, from);
 676
 677                 if (ret != -ENOTBLK)
 678                         return ret;
 679         }
 680
 681         return zonefs_file_buffered_write(iocb, from);
 682 }
 683
 684 static int zonefs_file_read_dio_end_io(struct kiocb *iocb, ssize_t size,
 685                                        int error, unsigned int flags)
 686 {
 687         if (error) {
 688                 zonefs_io_error(file_inode(iocb->ki_filp), false);
 689                 return error;
 690         }
 691
 692         return 0;
 693 }
 694
 695 static const struct iomap_dio_ops zonefs_read_dio_ops = {
 696         .end_io                 = zonefs_file_read_dio_end_io,
 697 };
 698
 699 static ssize_t zonefs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
 700 {
 701         struct inode *inode = file_inode(iocb->ki_filp);
 702         struct zonefs_inode_info *zi = ZONEFS_I(inode);
 703         struct zonefs_zone *z = zonefs_inode_zone(inode);
 704         struct super_block *sb = inode->i_sb;
 705         loff_t isize;
 706         ssize_t ret;
 707
 708         /* Offline zones cannot be read */
 709         if (unlikely(IS_IMMUTABLE(inode) && !(inode->i_mode & 0777)))
 710                 return -EPERM;
 711
 712         if (iocb->ki_pos >= z->z_capacity)
 713                 return 0;
 714
 715         if (iocb->ki_flags & IOCB_NOWAIT) {
 716                 if (!inode_trylock_shared(inode))
 717                         return -EAGAIN;
 718         } else {
 719                 inode_lock_shared(inode);
 720         }
 721
 722         /* Limit read operations to written data */
 723         mutex_lock(&zi->i_truncate_mutex);
 724         isize = i_size_read(inode);
 725         if (iocb->ki_pos >= isize) {
 726                 mutex_unlock(&zi->i_truncate_mutex);
 727                 ret = 0;
 728                 goto inode_unlock;
 729         }
 730         iov_iter_truncate(to, isize - iocb->ki_pos);
 731         mutex_unlock(&zi->i_truncate_mutex);
 732
 733         if (iocb->ki_flags & IOCB_DIRECT) {
 734                 size_t count = iov_iter_count(to);
 735
 736                 if ((iocb->ki_pos | count) & (sb->s_blocksize - 1)) {
 737                         ret = -EINVAL;
 738                         goto inode_unlock;
 739                 }
 740                 file_accessed(iocb->ki_filp);
 741                 ret = iomap_dio_rw(iocb, to, &zonefs_read_iomap_ops,
 742                                    &zonefs_read_dio_ops, 0, NULL, 0);
 743         } else {
 744                 ret = generic_file_read_iter(iocb, to);
 745                 if (ret == -EIO)
 746                         zonefs_io_error(inode, false);
 747         }
 748
 749 inode_unlock:
 750         inode_unlock_shared(inode);
 751
 752         return ret;
 753 }
 754
 755 /*
 756  * Write open accounting is done only for sequential files.
 757  */
 758 static inline bool zonefs_seq_file_need_wro(struct inode *inode,
 759                                             struct file *file)
 760 {
 761         if (zonefs_inode_is_cnv(inode))
 762                 return false;
 763
 764         if (!(file->f_mode & FMODE_WRITE))
 765                 return false;
 766
 767         return true;
 768 }
 769
 770 static int zonefs_seq_file_write_open(struct inode *inode)
 771 {
 772         struct zonefs_inode_info *zi = ZONEFS_I(inode);
 773         struct zonefs_zone *z = zonefs_inode_zone(inode);
 774         int ret = 0;
 775
 776         mutex_lock(&zi->i_truncate_mutex);
 777
 778         if (!zi->i_wr_refcnt) {
 779                 struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb);
 780                 unsigned int wro = atomic_inc_return(&sbi->s_wro_seq_files);
 781
 782                 if (sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN) {
 783
 784                         if (sbi->s_max_wro_seq_files
 785                             && wro > sbi->s_max_wro_seq_files) {
 786                                 atomic_dec(&sbi->s_wro_seq_files);
 787                                 ret = -EBUSY;
 788                                 goto unlock;
 789                         }
 790
 791                         if (i_size_read(inode) < z->z_capacity) {
 792                                 ret = zonefs_inode_zone_mgmt(inode,
 793                                                              REQ_OP_ZONE_OPEN);
 794                                 if (ret) {
 795                                         atomic_dec(&sbi->s_wro_seq_files);
 796                                         goto unlock;
 797                                 }
 798                                 z->z_flags |= ZONEFS_ZONE_OPEN;
 799                                 zonefs_inode_account_active(inode);
 800                         }
 801                 }
 802         }
 803
 804         zi->i_wr_refcnt++;
 805
 806 unlock:
 807         mutex_unlock(&zi->i_truncate_mutex);
 808
 809         return ret;
 810 }
 811
 812 static int zonefs_file_open(struct inode *inode, struct file *file)
 813 {
 814         int ret;
 815
 816         ret = generic_file_open(inode, file);
 817         if (ret)
 818                 return ret;
 819
 820         if (zonefs_seq_file_need_wro(inode, file))
 821                 return zonefs_seq_file_write_open(inode);
 822
 823         return 0;
 824 }
 825
 826 static void zonefs_seq_file_write_close(struct inode *inode)
 827 {
 828         struct zonefs_inode_info *zi = ZONEFS_I(inode);
 829         struct zonefs_zone *z = zonefs_inode_zone(inode);
 830         struct super_block *sb = inode->i_sb;
 831         struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
 832         int ret = 0;
 833
 834         mutex_lock(&zi->i_truncate_mutex);
 835
 836         zi->i_wr_refcnt--;
 837         if (zi->i_wr_refcnt)
 838                 goto unlock;
 839
 840         /*
 841          * The file zone may not be open anymore (e.g. the file was truncated to
 842          * its maximum size or it was fully written). For this case, we only
 843          * need to decrement the write open count.
 844          */
 845         if (z->z_flags & ZONEFS_ZONE_OPEN) {
 846                 ret = zonefs_inode_zone_mgmt(inode, REQ_OP_ZONE_CLOSE);
 847                 if (ret) {
 848                         __zonefs_io_error(inode, false);
 849                         /*
 850                          * Leaving zones explicitly open may lead to a state
 851                          * where most zones cannot be written (zone resources
 852                          * exhausted). So take preventive action by remounting
 853                          * read-only.
 854                          */
 855                         if (z->z_flags & ZONEFS_ZONE_OPEN &&
 856                             !(sb->s_flags & SB_RDONLY)) {
 857                                 zonefs_warn(sb,
 858                                         "closing zone at %llu failed %d\n",
 859                                         z->z_sector, ret);
 860                                 zonefs_warn(sb,
 861                                         "remounting filesystem read-only\n");
 862                                 sb->s_flags |= SB_RDONLY;
 863                         }
 864                         goto unlock;
 865                 }
 866
 867                 z->z_flags &= ~ZONEFS_ZONE_OPEN;
 868                 zonefs_inode_account_active(inode);
 869         }
 870
 871         atomic_dec(&sbi->s_wro_seq_files);
 872
 873 unlock:
 874         mutex_unlock(&zi->i_truncate_mutex);
 875 }
 876
 877 static int zonefs_file_release(struct inode *inode, struct file *file)
 878 {
 879         /*
 880          * If we explicitly open a zone we must close it again as well, but the
 881          * zone management operation can fail (either due to an IO error or as
 882          * the zone has gone offline or read-only). Make sure we don't fail the
 883          * close(2) for user-space.
 884          */
 885         if (zonefs_seq_file_need_wro(inode, file))
 886                 zonefs_seq_file_write_close(inode);
 887
 888         return 0;
 889 }
 890
 891 const struct file_operations zonefs_file_operations = {
 892         .open           = zonefs_file_open,
 893         .release        = zonefs_file_release,
 894         .fsync          = zonefs_file_fsync,
 895         .mmap           = zonefs_file_mmap,
 896         .llseek         = zonefs_file_llseek,
 897         .read_iter      = zonefs_file_read_iter,
 898         .write_iter     = zonefs_file_write_iter,
 899         .splice_read    = generic_file_splice_read,
 900         .splice_write   = iter_file_splice_write,
 901         .iopoll         = iocb_bio_iopoll,
 902 };