fs/netfs/read_helper.c

   1 // SPDX-License-Identifier: GPL-2.0-or-later
   2 /* Network filesystem high-level read support.
   3  *
   4  * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved.
   5  * Written by David Howells (dhowells@redhat.com)
   6  */
   7
   8 #include <linux/module.h>
   9 #include <linux/export.h>
  10 #include <linux/fs.h>
  11 #include <linux/mm.h>
  12 #include <linux/pagemap.h>
  13 #include <linux/slab.h>
  14 #include <linux/uio.h>
  15 #include <linux/sched/mm.h>
  16 #include <linux/task_io_accounting_ops.h>
  17 #include <linux/netfs.h>
  18 #include "internal.h"
  19 #define CREATE_TRACE_POINTS
  20 #include <trace/events/netfs.h>
  21
  22 MODULE_DESCRIPTION("Network fs support");
  23 MODULE_AUTHOR("Red Hat, Inc.");
  24 MODULE_LICENSE("GPL");
  25
  26 unsigned netfs_debug;
  27 module_param_named(debug, netfs_debug, uint, S_IWUSR | S_IRUGO);
  28 MODULE_PARM_DESC(netfs_debug, "Netfs support debugging mask");
  29
  30 static void netfs_rreq_work(struct work_struct *);
  31 static void __netfs_put_subrequest(struct netfs_read_subrequest *, bool);
  32
  33 static void netfs_put_subrequest(struct netfs_read_subrequest *subreq,
  34                                  bool was_async)
  35 {
  36         if (refcount_dec_and_test(&subreq->usage))
  37                 __netfs_put_subrequest(subreq, was_async);
  38 }
  39
  40 static struct netfs_read_request *netfs_alloc_read_request(
  41         const struct netfs_read_request_ops *ops, void *netfs_priv,
  42         struct file *file)
  43 {
  44         static atomic_t debug_ids;
  45         struct netfs_read_request *rreq;
  46
  47         rreq = kzalloc(sizeof(struct netfs_read_request), GFP_KERNEL);
  48         if (rreq) {
  49                 rreq->netfs_ops = ops;
  50                 rreq->netfs_priv = netfs_priv;
  51                 rreq->inode     = file_inode(file);
  52                 rreq->i_size    = i_size_read(rreq->inode);
  53                 rreq->debug_id  = atomic_inc_return(&debug_ids);
  54                 INIT_LIST_HEAD(&rreq->subrequests);
  55                 INIT_WORK(&rreq->work, netfs_rreq_work);
  56                 refcount_set(&rreq->usage, 1);
  57                 __set_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags);
  58                 ops->init_rreq(rreq, file);
  59                 netfs_stat(&netfs_n_rh_rreq);
  60         }
  61
  62         return rreq;
  63 }
  64
  65 static void netfs_get_read_request(struct netfs_read_request *rreq)
  66 {
  67         refcount_inc(&rreq->usage);
  68 }
  69
  70 static void netfs_rreq_clear_subreqs(struct netfs_read_request *rreq,
  71                                      bool was_async)
  72 {
  73         struct netfs_read_subrequest *subreq;
  74
  75         while (!list_empty(&rreq->subrequests)) {
  76                 subreq = list_first_entry(&rreq->subrequests,
  77                                           struct netfs_read_subrequest, rreq_link);
  78                 list_del(&subreq->rreq_link);
  79                 netfs_put_subrequest(subreq, was_async);
  80         }
  81 }
  82
  83 static void netfs_free_read_request(struct work_struct *work)
  84 {
  85         struct netfs_read_request *rreq =
  86                 container_of(work, struct netfs_read_request, work);
  87         netfs_rreq_clear_subreqs(rreq, false);
  88         if (rreq->netfs_priv)
  89                 rreq->netfs_ops->cleanup(rreq->mapping, rreq->netfs_priv);
  90         trace_netfs_rreq(rreq, netfs_rreq_trace_free);
  91         if (rreq->cache_resources.ops)
  92                 rreq->cache_resources.ops->end_operation(&rreq->cache_resources);
  93         kfree(rreq);
  94         netfs_stat_d(&netfs_n_rh_rreq);
  95 }
  96
  97 static void netfs_put_read_request(struct netfs_read_request *rreq, bool was_async)
  98 {
  99         if (refcount_dec_and_test(&rreq->usage)) {
 100                 if (was_async) {
 101                         rreq->work.func = netfs_free_read_request;
 102                         if (!queue_work(system_unbound_wq, &rreq->work))
 103                                 BUG();
 104                 } else {
 105                         netfs_free_read_request(&rreq->work);
 106                 }
 107         }
 108 }
 109
 110 /*
 111  * Allocate and partially initialise an I/O request structure.
 112  */
 113 static struct netfs_read_subrequest *netfs_alloc_subrequest(
 114         struct netfs_read_request *rreq)
 115 {
 116         struct netfs_read_subrequest *subreq;
 117
 118         subreq = kzalloc(sizeof(struct netfs_read_subrequest), GFP_KERNEL);
 119         if (subreq) {
 120                 INIT_LIST_HEAD(&subreq->rreq_link);
 121                 refcount_set(&subreq->usage, 2);
 122                 subreq->rreq = rreq;
 123                 netfs_get_read_request(rreq);
 124                 netfs_stat(&netfs_n_rh_sreq);
 125         }
 126
 127         return subreq;
 128 }
 129
 130 static void netfs_get_read_subrequest(struct netfs_read_subrequest *subreq)
 131 {
 132         refcount_inc(&subreq->usage);
 133 }
 134
 135 static void __netfs_put_subrequest(struct netfs_read_subrequest *subreq,
 136                                    bool was_async)
 137 {
 138         struct netfs_read_request *rreq = subreq->rreq;
 139
 140         trace_netfs_sreq(subreq, netfs_sreq_trace_free);
 141         kfree(subreq);
 142         netfs_stat_d(&netfs_n_rh_sreq);
 143         netfs_put_read_request(rreq, was_async);
 144 }
 145
 146 /*
 147  * Clear the unread part of an I/O request.
 148  */
 149 static void netfs_clear_unread(struct netfs_read_subrequest *subreq)
 150 {
 151         struct iov_iter iter;
 152
 153         iov_iter_xarray(&iter, READ, &subreq->rreq->mapping->i_pages,
 154                         subreq->start + subreq->transferred,
 155                         subreq->len   - subreq->transferred);
 156         iov_iter_zero(iov_iter_count(&iter), &iter);
 157 }
 158
 159 static void netfs_cache_read_terminated(void *priv, ssize_t transferred_or_error,
 160                                         bool was_async)
 161 {
 162         struct netfs_read_subrequest *subreq = priv;
 163
 164         netfs_subreq_terminated(subreq, transferred_or_error, was_async);
 165 }
 166
 167 /*
 168  * Issue a read against the cache.
 169  * - Eats the caller's ref on subreq.
 170  */
 171 static void netfs_read_from_cache(struct netfs_read_request *rreq,
 172                                   struct netfs_read_subrequest *subreq,
 173                                   bool seek_data)
 174 {
 175         struct netfs_cache_resources *cres = &rreq->cache_resources;
 176         struct iov_iter iter;
 177
 178         netfs_stat(&netfs_n_rh_read);
 179         iov_iter_xarray(&iter, READ, &rreq->mapping->i_pages,
 180                         subreq->start + subreq->transferred,
 181                         subreq->len   - subreq->transferred);
 182
 183         cres->ops->read(cres, subreq->start, &iter, seek_data,
 184                         netfs_cache_read_terminated, subreq);
 185 }
 186
 187 /*
 188  * Fill a subrequest region with zeroes.
 189  */
 190 static void netfs_fill_with_zeroes(struct netfs_read_request *rreq,
 191                                    struct netfs_read_subrequest *subreq)
 192 {
 193         netfs_stat(&netfs_n_rh_zero);
 194         __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);
 195         netfs_subreq_terminated(subreq, 0, false);
 196 }
 197
 198 /*
 199  * Ask the netfs to issue a read request to the server for us.
 200  *
 201  * The netfs is expected to read from subreq->pos + subreq->transferred to
 202  * subreq->pos + subreq->len - 1.  It may not backtrack and write data into the
 203  * buffer prior to the transferred point as it might clobber dirty data
 204  * obtained from the cache.
 205  *
 206  * Alternatively, the netfs is allowed to indicate one of two things:
 207  *
 208  * - NETFS_SREQ_SHORT_READ: A short read - it will get called again to try and
 209  *   make progress.
 210  *
 211  * - NETFS_SREQ_CLEAR_TAIL: A short read - the rest of the buffer will be
 212  *   cleared.
 213  */
 214 static void netfs_read_from_server(struct netfs_read_request *rreq,
 215                                    struct netfs_read_subrequest *subreq)
 216 {
 217         netfs_stat(&netfs_n_rh_download);
 218         rreq->netfs_ops->issue_op(subreq);
 219 }
 220
 221 /*
 222  * Release those waiting.
 223  */
 224 static void netfs_rreq_completed(struct netfs_read_request *rreq, bool was_async)
 225 {
 226         trace_netfs_rreq(rreq, netfs_rreq_trace_done);
 227         netfs_rreq_clear_subreqs(rreq, was_async);
 228         netfs_put_read_request(rreq, was_async);
 229 }
 230
 231 /*
 232  * Deal with the completion of writing the data to the cache.  We have to clear
 233  * the PG_fscache bits on the pages involved and release the caller's ref.
 234  *
 235  * May be called in softirq mode and we inherit a ref from the caller.
 236  */
 237 static void netfs_rreq_unmark_after_write(struct netfs_read_request *rreq,
 238                                           bool was_async)
 239 {
 240         struct netfs_read_subrequest *subreq;
 241         struct page *page;
 242         pgoff_t unlocked = 0;
 243         bool have_unlocked = false;
 244
 245         rcu_read_lock();
 246
 247         list_for_each_entry(subreq, &rreq->subrequests, rreq_link) {
 248                 XA_STATE(xas, &rreq->mapping->i_pages, subreq->start / PAGE_SIZE);
 249
 250                 xas_for_each(&xas, page, (subreq->start + subreq->len - 1) / PAGE_SIZE) {
 251                         /* We might have multiple writes from the same huge
 252                          * page, but we mustn't unlock a page more than once.
 253                          */
 254                         if (have_unlocked && page->index <= unlocked)
 255                                 continue;
 256                         unlocked = page->index;
 257                         end_page_fscache(page);
 258                         have_unlocked = true;
 259                 }
 260         }
 261
 262         rcu_read_unlock();
 263         netfs_rreq_completed(rreq, was_async);
 264 }
 265
 266 static void netfs_rreq_copy_terminated(void *priv, ssize_t transferred_or_error,
 267                                        bool was_async)
 268 {
 269         struct netfs_read_subrequest *subreq = priv;
 270         struct netfs_read_request *rreq = subreq->rreq;
 271
 272         if (IS_ERR_VALUE(transferred_or_error)) {
 273                 netfs_stat(&netfs_n_rh_write_failed);
 274                 trace_netfs_failure(rreq, subreq, transferred_or_error,
 275                                     netfs_fail_copy_to_cache);
 276         } else {
 277                 netfs_stat(&netfs_n_rh_write_done);
 278         }
 279
 280         trace_netfs_sreq(subreq, netfs_sreq_trace_write_term);
 281
 282         /* If we decrement nr_wr_ops to 0, the ref belongs to us. */
 283         if (atomic_dec_and_test(&rreq->nr_wr_ops))
 284                 netfs_rreq_unmark_after_write(rreq, was_async);
 285
 286         netfs_put_subrequest(subreq, was_async);
 287 }
 288
 289 /*
 290  * Perform any outstanding writes to the cache.  We inherit a ref from the
 291  * caller.
 292  */
 293 static void netfs_rreq_do_write_to_cache(struct netfs_read_request *rreq)
 294 {
 295         struct netfs_cache_resources *cres = &rreq->cache_resources;
 296         struct netfs_read_subrequest *subreq, *next, *p;
 297         struct iov_iter iter;
 298         int ret;
 299
 300         trace_netfs_rreq(rreq, netfs_rreq_trace_write);
 301
 302         /* We don't want terminating writes trying to wake us up whilst we're
 303          * still going through the list.
 304          */
 305         atomic_inc(&rreq->nr_wr_ops);
 306
 307         list_for_each_entry_safe(subreq, p, &rreq->subrequests, rreq_link) {
 308                 if (!test_bit(NETFS_SREQ_WRITE_TO_CACHE, &subreq->flags)) {
 309                         list_del_init(&subreq->rreq_link);
 310                         netfs_put_subrequest(subreq, false);
 311                 }
 312         }
 313
 314         list_for_each_entry(subreq, &rreq->subrequests, rreq_link) {
 315                 /* Amalgamate adjacent writes */
 316                 while (!list_is_last(&subreq->rreq_link, &rreq->subrequests)) {
 317                         next = list_next_entry(subreq, rreq_link);
 318                         if (next->start != subreq->start + subreq->len)
 319                                 break;
 320                         subreq->len += next->len;
 321                         list_del_init(&next->rreq_link);
 322                         netfs_put_subrequest(next, false);
 323                 }
 324
 325                 ret = cres->ops->prepare_write(cres, &subreq->start, &subreq->len,
 326                                                rreq->i_size);
 327                 if (ret < 0) {
 328                         trace_netfs_failure(rreq, subreq, ret, netfs_fail_prepare_write);
 329                         trace_netfs_sreq(subreq, netfs_sreq_trace_write_skip);
 330                         continue;
 331                 }
 332
 333                 iov_iter_xarray(&iter, WRITE, &rreq->mapping->i_pages,
 334                                 subreq->start, subreq->len);
 335
 336                 atomic_inc(&rreq->nr_wr_ops);
 337                 netfs_stat(&netfs_n_rh_write);
 338                 netfs_get_read_subrequest(subreq);
 339                 trace_netfs_sreq(subreq, netfs_sreq_trace_write);
 340                 cres->ops->write(cres, subreq->start, &iter,
 341                                  netfs_rreq_copy_terminated, subreq);
 342         }
 343
 344         /* If we decrement nr_wr_ops to 0, the usage ref belongs to us. */
 345         if (atomic_dec_and_test(&rreq->nr_wr_ops))
 346                 netfs_rreq_unmark_after_write(rreq, false);
 347 }
 348
 349 static void netfs_rreq_write_to_cache_work(struct work_struct *work)
 350 {
 351         struct netfs_read_request *rreq =
 352                 container_of(work, struct netfs_read_request, work);
 353
 354         netfs_rreq_do_write_to_cache(rreq);
 355 }
 356
 357 static void netfs_rreq_write_to_cache(struct netfs_read_request *rreq)
 358 {
 359         rreq->work.func = netfs_rreq_write_to_cache_work;
 360         if (!queue_work(system_unbound_wq, &rreq->work))
 361                 BUG();
 362 }
 363
 364 /*
 365  * Unlock the pages in a read operation.  We need to set PG_fscache on any
 366  * pages we're going to write back before we unlock them.
 367  */
 368 static void netfs_rreq_unlock(struct netfs_read_request *rreq)
 369 {
 370         struct netfs_read_subrequest *subreq;
 371         struct page *page;
 372         unsigned int iopos, account = 0;
 373         pgoff_t start_page = rreq->start / PAGE_SIZE;
 374         pgoff_t last_page = ((rreq->start + rreq->len) / PAGE_SIZE) - 1;
 375         bool subreq_failed = false;
 376         int i;
 377
 378         XA_STATE(xas, &rreq->mapping->i_pages, start_page);
 379
 380         if (test_bit(NETFS_RREQ_FAILED, &rreq->flags)) {
 381                 __clear_bit(NETFS_RREQ_WRITE_TO_CACHE, &rreq->flags);
 382                 list_for_each_entry(subreq, &rreq->subrequests, rreq_link) {
 383                         __clear_bit(NETFS_SREQ_WRITE_TO_CACHE, &subreq->flags);
 384                 }
 385         }
 386
 387         /* Walk through the pagecache and the I/O request lists simultaneously.
 388          * We may have a mixture of cached and uncached sections and we only
 389          * really want to write out the uncached sections.  This is slightly
 390          * complicated by the possibility that we might have huge pages with a
 391          * mixture inside.
 392          */
 393         subreq = list_first_entry(&rreq->subrequests,
 394                                   struct netfs_read_subrequest, rreq_link);
 395         iopos = 0;
 396         subreq_failed = (subreq->error < 0);
 397
 398         trace_netfs_rreq(rreq, netfs_rreq_trace_unlock);
 399
 400         rcu_read_lock();
 401         xas_for_each(&xas, page, last_page) {
 402                 unsigned int pgpos = (page->index - start_page) * PAGE_SIZE;
 403                 unsigned int pgend = pgpos + thp_size(page);
 404                 bool pg_failed = false;
 405
 406                 for (;;) {
 407                         if (!subreq) {
 408                                 pg_failed = true;
 409                                 break;
 410                         }
 411                         if (test_bit(NETFS_SREQ_WRITE_TO_CACHE, &subreq->flags))
 412                                 set_page_fscache(page);
 413                         pg_failed |= subreq_failed;
 414                         if (pgend < iopos + subreq->len)
 415                                 break;
 416
 417                         account += subreq->transferred;
 418                         iopos += subreq->len;
 419                         if (!list_is_last(&subreq->rreq_link, &rreq->subrequests)) {
 420                                 subreq = list_next_entry(subreq, rreq_link);
 421                                 subreq_failed = (subreq->error < 0);
 422                         } else {
 423                                 subreq = NULL;
 424                                 subreq_failed = false;
 425                         }
 426                         if (pgend == iopos)
 427                                 break;
 428                 }
 429
 430                 if (!pg_failed) {
 431                         for (i = 0; i < thp_nr_pages(page); i++)
 432                                 flush_dcache_page(page);
 433                         SetPageUptodate(page);
 434                 }
 435
 436                 if (!test_bit(NETFS_RREQ_DONT_UNLOCK_PAGES, &rreq->flags)) {
 437                         if (page->index == rreq->no_unlock_page &&
 438                             test_bit(NETFS_RREQ_NO_UNLOCK_PAGE, &rreq->flags))
 439                                 _debug("no unlock");
 440                         else
 441                                 unlock_page(page);
 442                 }
 443         }
 444         rcu_read_unlock();
 445
 446         task_io_account_read(account);
 447         if (rreq->netfs_ops->done)
 448                 rreq->netfs_ops->done(rreq);
 449 }
 450
 451 /*
 452  * Handle a short read.
 453  */
 454 static void netfs_rreq_short_read(struct netfs_read_request *rreq,
 455                                   struct netfs_read_subrequest *subreq)
 456 {
 457         __clear_bit(NETFS_SREQ_SHORT_READ, &subreq->flags);
 458         __set_bit(NETFS_SREQ_SEEK_DATA_READ, &subreq->flags);
 459
 460         netfs_stat(&netfs_n_rh_short_read);
 461         trace_netfs_sreq(subreq, netfs_sreq_trace_resubmit_short);
 462
 463         netfs_get_read_subrequest(subreq);
 464         atomic_inc(&rreq->nr_rd_ops);
 465         if (subreq->source == NETFS_READ_FROM_CACHE)
 466                 netfs_read_from_cache(rreq, subreq, true);
 467         else
 468                 netfs_read_from_server(rreq, subreq);
 469 }
 470
 471 /*
 472  * Resubmit any short or failed operations.  Returns true if we got the rreq
 473  * ref back.
 474  */
 475 static bool netfs_rreq_perform_resubmissions(struct netfs_read_request *rreq)
 476 {
 477         struct netfs_read_subrequest *subreq;
 478
 479         WARN_ON(in_interrupt());
 480
 481         trace_netfs_rreq(rreq, netfs_rreq_trace_resubmit);
 482
 483         /* We don't want terminating submissions trying to wake us up whilst
 484          * we're still going through the list.
 485          */
 486         atomic_inc(&rreq->nr_rd_ops);
 487
 488         __clear_bit(NETFS_RREQ_INCOMPLETE_IO, &rreq->flags);
 489         list_for_each_entry(subreq, &rreq->subrequests, rreq_link) {
 490                 if (subreq->error) {
 491                         if (subreq->source != NETFS_READ_FROM_CACHE)
 492                                 break;
 493                         subreq->source = NETFS_DOWNLOAD_FROM_SERVER;
 494                         subreq->error = 0;
 495                         netfs_stat(&netfs_n_rh_download_instead);
 496                         trace_netfs_sreq(subreq, netfs_sreq_trace_download_instead);
 497                         netfs_get_read_subrequest(subreq);
 498                         atomic_inc(&rreq->nr_rd_ops);
 499                         netfs_read_from_server(rreq, subreq);
 500                 } else if (test_bit(NETFS_SREQ_SHORT_READ, &subreq->flags)) {
 501                         netfs_rreq_short_read(rreq, subreq);
 502                 }
 503         }
 504
 505         /* If we decrement nr_rd_ops to 0, the usage ref belongs to us. */
 506         if (atomic_dec_and_test(&rreq->nr_rd_ops))
 507                 return true;
 508
 509         wake_up_var(&rreq->nr_rd_ops);
 510         return false;
 511 }
 512
 513 /*
 514  * Check to see if the data read is still valid.
 515  */
 516 static void netfs_rreq_is_still_valid(struct netfs_read_request *rreq)
 517 {
 518         struct netfs_read_subrequest *subreq;
 519
 520         if (!rreq->netfs_ops->is_still_valid ||
 521             rreq->netfs_ops->is_still_valid(rreq))
 522                 return;
 523
 524         list_for_each_entry(subreq, &rreq->subrequests, rreq_link) {
 525                 if (subreq->source == NETFS_READ_FROM_CACHE) {
 526                         subreq->error = -ESTALE;
 527                         __set_bit(NETFS_RREQ_INCOMPLETE_IO, &rreq->flags);
 528                 }
 529         }
 530 }
 531
 532 /*
 533  * Assess the state of a read request and decide what to do next.
 534  *
 535  * Note that we could be in an ordinary kernel thread, on a workqueue or in
 536  * softirq context at this point.  We inherit a ref from the caller.
 537  */
 538 static void netfs_rreq_assess(struct netfs_read_request *rreq, bool was_async)
 539 {
 540         trace_netfs_rreq(rreq, netfs_rreq_trace_assess);
 541
 542 again:
 543         netfs_rreq_is_still_valid(rreq);
 544
 545         if (!test_bit(NETFS_RREQ_FAILED, &rreq->flags) &&
 546             test_bit(NETFS_RREQ_INCOMPLETE_IO, &rreq->flags)) {
 547                 if (netfs_rreq_perform_resubmissions(rreq))
 548                         goto again;
 549                 return;
 550         }
 551
 552         netfs_rreq_unlock(rreq);
 553
 554         clear_bit_unlock(NETFS_RREQ_IN_PROGRESS, &rreq->flags);
 555         wake_up_bit(&rreq->flags, NETFS_RREQ_IN_PROGRESS);
 556
 557         if (test_bit(NETFS_RREQ_WRITE_TO_CACHE, &rreq->flags))
 558                 return netfs_rreq_write_to_cache(rreq);
 559
 560         netfs_rreq_completed(rreq, was_async);
 561 }
 562
 563 static void netfs_rreq_work(struct work_struct *work)
 564 {
 565         struct netfs_read_request *rreq =
 566                 container_of(work, struct netfs_read_request, work);
 567         netfs_rreq_assess(rreq, false);
 568 }
 569
 570 /*
 571  * Handle the completion of all outstanding I/O operations on a read request.
 572  * We inherit a ref from the caller.
 573  */
 574 static void netfs_rreq_terminated(struct netfs_read_request *rreq,
 575                                   bool was_async)
 576 {
 577         if (test_bit(NETFS_RREQ_INCOMPLETE_IO, &rreq->flags) &&
 578             was_async) {
 579                 if (!queue_work(system_unbound_wq, &rreq->work))
 580                         BUG();
 581         } else {
 582                 netfs_rreq_assess(rreq, was_async);
 583         }
 584 }
 585
 586 /**
 587  * netfs_subreq_terminated - Note the termination of an I/O operation.
 588  * @subreq: The I/O request that has terminated.
 589  * @transferred_or_error: The amount of data transferred or an error code.
 590  * @was_async: The termination was asynchronous
 591  *
 592  * This tells the read helper that a contributory I/O operation has terminated,
 593  * one way or another, and that it should integrate the results.
 594  *
 595  * The caller indicates in @transferred_or_error the outcome of the operation,
 596  * supplying a positive value to indicate the number of bytes transferred, 0 to
 597  * indicate a failure to transfer anything that should be retried or a negative
 598  * error code.  The helper will look after reissuing I/O operations as
 599  * appropriate and writing downloaded data to the cache.
 600  *
 601  * If @was_async is true, the caller might be running in softirq or interrupt
 602  * context and we can't sleep.
 603  */
 604 void netfs_subreq_terminated(struct netfs_read_subrequest *subreq,
 605                              ssize_t transferred_or_error,
 606                              bool was_async)
 607 {
 608         struct netfs_read_request *rreq = subreq->rreq;
 609         int u;
 610
 611         _enter("[%u]{%llx,%lx},%zd",
 612                subreq->debug_index, subreq->start, subreq->flags,
 613                transferred_or_error);
 614
 615         switch (subreq->source) {
 616         case NETFS_READ_FROM_CACHE:
 617                 netfs_stat(&netfs_n_rh_read_done);
 618                 break;
 619         case NETFS_DOWNLOAD_FROM_SERVER:
 620                 netfs_stat(&netfs_n_rh_download_done);
 621                 break;
 622         default:
 623                 break;
 624         }
 625
 626         if (IS_ERR_VALUE(transferred_or_error)) {
 627                 subreq->error = transferred_or_error;
 628                 trace_netfs_failure(rreq, subreq, transferred_or_error,
 629                                     netfs_fail_read);
 630                 goto failed;
 631         }
 632
 633         if (WARN(transferred_or_error > subreq->len - subreq->transferred,
 634                  "Subreq overread: R%x[%x] %zd > %zu - %zu",
 635                  rreq->debug_id, subreq->debug_index,
 636                  transferred_or_error, subreq->len, subreq->transferred))
 637                 transferred_or_error = subreq->len - subreq->transferred;
 638
 639         subreq->error = 0;
 640         subreq->transferred += transferred_or_error;
 641         if (subreq->transferred < subreq->len)
 642                 goto incomplete;
 643
 644 complete:
 645         __clear_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags);
 646         if (test_bit(NETFS_SREQ_WRITE_TO_CACHE, &subreq->flags))
 647                 set_bit(NETFS_RREQ_WRITE_TO_CACHE, &rreq->flags);
 648
 649 out:
 650         trace_netfs_sreq(subreq, netfs_sreq_trace_terminated);
 651
 652         /* If we decrement nr_rd_ops to 0, the ref belongs to us. */
 653         u = atomic_dec_return(&rreq->nr_rd_ops);
 654         if (u == 0)
 655                 netfs_rreq_terminated(rreq, was_async);
 656         else if (u == 1)
 657                 wake_up_var(&rreq->nr_rd_ops);
 658
 659         netfs_put_subrequest(subreq, was_async);
 660         return;
 661
 662 incomplete:
 663         if (test_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags)) {
 664                 netfs_clear_unread(subreq);
 665                 subreq->transferred = subreq->len;
 666                 goto complete;
 667         }
 668
 669         if (transferred_or_error == 0) {
 670                 if (__test_and_set_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags)) {
 671                         subreq->error = -ENODATA;
 672                         goto failed;
 673                 }
 674         } else {
 675                 __clear_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags);
 676         }
 677
 678         __set_bit(NETFS_SREQ_SHORT_READ, &subreq->flags);
 679         set_bit(NETFS_RREQ_INCOMPLETE_IO, &rreq->flags);
 680         goto out;
 681
 682 failed:
 683         if (subreq->source == NETFS_READ_FROM_CACHE) {
 684                 netfs_stat(&netfs_n_rh_read_failed);
 685                 set_bit(NETFS_RREQ_INCOMPLETE_IO, &rreq->flags);
 686         } else {
 687                 netfs_stat(&netfs_n_rh_download_failed);
 688                 set_bit(NETFS_RREQ_FAILED, &rreq->flags);
 689                 rreq->error = subreq->error;
 690         }
 691         goto out;
 692 }
 693 EXPORT_SYMBOL(netfs_subreq_terminated);
 694
 695 static enum netfs_read_source netfs_cache_prepare_read(struct netfs_read_subrequest *subreq,
 696                                                        loff_t i_size)
 697 {
 698         struct netfs_read_request *rreq = subreq->rreq;
 699         struct netfs_cache_resources *cres = &rreq->cache_resources;
 700
 701         if (cres->ops)
 702                 return cres->ops->prepare_read(subreq, i_size);
 703         if (subreq->start >= rreq->i_size)
 704                 return NETFS_FILL_WITH_ZEROES;
 705         return NETFS_DOWNLOAD_FROM_SERVER;
 706 }
 707
 708 /*
 709  * Work out what sort of subrequest the next one will be.
 710  */
 711 static enum netfs_read_source
 712 netfs_rreq_prepare_read(struct netfs_read_request *rreq,
 713                         struct netfs_read_subrequest *subreq)
 714 {
 715         enum netfs_read_source source;
 716
 717         _enter("%llx-%llx,%llx", subreq->start, subreq->start + subreq->len, rreq->i_size);
 718
 719         source = netfs_cache_prepare_read(subreq, rreq->i_size);
 720         if (source == NETFS_INVALID_READ)
 721                 goto out;
 722
 723         if (source == NETFS_DOWNLOAD_FROM_SERVER) {
 724                 /* Call out to the netfs to let it shrink the request to fit
 725                  * its own I/O sizes and boundaries.  If it shinks it here, it
 726                  * will be called again to make simultaneous calls; if it wants
 727                  * to make serial calls, it can indicate a short read and then
 728                  * we will call it again.
 729                  */
 730                 if (subreq->len > rreq->i_size - subreq->start)
 731                         subreq->len = rreq->i_size - subreq->start;
 732
 733                 if (rreq->netfs_ops->clamp_length &&
 734                     !rreq->netfs_ops->clamp_length(subreq)) {
 735                         source = NETFS_INVALID_READ;
 736                         goto out;
 737                 }
 738         }
 739
 740         if (WARN_ON(subreq->len == 0))
 741                 source = NETFS_INVALID_READ;
 742
 743 out:
 744         subreq->source = source;
 745         trace_netfs_sreq(subreq, netfs_sreq_trace_prepare);
 746         return source;
 747 }
 748
 749 /*
 750  * Slice off a piece of a read request and submit an I/O request for it.
 751  */
 752 static bool netfs_rreq_submit_slice(struct netfs_read_request *rreq,
 753                                     unsigned int *_debug_index)
 754 {
 755         struct netfs_read_subrequest *subreq;
 756         enum netfs_read_source source;
 757
 758         subreq = netfs_alloc_subrequest(rreq);
 759         if (!subreq)
 760                 return false;
 761
 762         subreq->debug_index     = (*_debug_index)++;
 763         subreq->start           = rreq->start + rreq->submitted;
 764         subreq->len             = rreq->len   - rreq->submitted;
 765
 766         _debug("slice %llx,%zx,%zx", subreq->start, subreq->len, rreq->submitted);
 767         list_add_tail(&subreq->rreq_link, &rreq->subrequests);
 768
 769         /* Call out to the cache to find out what it can do with the remaining
 770          * subset.  It tells us in subreq->flags what it decided should be done
 771          * and adjusts subreq->len down if the subset crosses a cache boundary.
 772          *
 773          * Then when we hand the subset, it can choose to take a subset of that
 774          * (the starts must coincide), in which case, we go around the loop
 775          * again and ask it to download the next piece.
 776          */
 777         source = netfs_rreq_prepare_read(rreq, subreq);
 778         if (source == NETFS_INVALID_READ)
 779                 goto subreq_failed;
 780
 781         atomic_inc(&rreq->nr_rd_ops);
 782
 783         rreq->submitted += subreq->len;
 784
 785         trace_netfs_sreq(subreq, netfs_sreq_trace_submit);
 786         switch (source) {
 787         case NETFS_FILL_WITH_ZEROES:
 788                 netfs_fill_with_zeroes(rreq, subreq);
 789                 break;
 790         case NETFS_DOWNLOAD_FROM_SERVER:
 791                 netfs_read_from_server(rreq, subreq);
 792                 break;
 793         case NETFS_READ_FROM_CACHE:
 794                 netfs_read_from_cache(rreq, subreq, false);
 795                 break;
 796         default:
 797                 BUG();
 798         }
 799
 800         return true;
 801
 802 subreq_failed:
 803         rreq->error = subreq->error;
 804         netfs_put_subrequest(subreq, false);
 805         return false;
 806 }
 807
 808 static void netfs_cache_expand_readahead(struct netfs_read_request *rreq,
 809                                          loff_t *_start, size_t *_len, loff_t i_size)
 810 {
 811         struct netfs_cache_resources *cres = &rreq->cache_resources;
 812
 813         if (cres->ops && cres->ops->expand_readahead)
 814                 cres->ops->expand_readahead(cres, _start, _len, i_size);
 815 }
 816
 817 static void netfs_rreq_expand(struct netfs_read_request *rreq,
 818                               struct readahead_control *ractl)
 819 {
 820         /* Give the cache a chance to change the request parameters.  The
 821          * resultant request must contain the original region.
 822          */
 823         netfs_cache_expand_readahead(rreq, &rreq->start, &rreq->len, rreq->i_size);
 824
 825         /* Give the netfs a chance to change the request parameters.  The
 826          * resultant request must contain the original region.
 827          */
 828         if (rreq->netfs_ops->expand_readahead)
 829                 rreq->netfs_ops->expand_readahead(rreq);
 830
 831         /* Expand the request if the cache wants it to start earlier.  Note
 832          * that the expansion may get further extended if the VM wishes to
 833          * insert THPs and the preferred start and/or end wind up in the middle
 834          * of THPs.
 835          *
 836          * If this is the case, however, the THP size should be an integer
 837          * multiple of the cache granule size, so we get a whole number of
 838          * granules to deal with.
 839          */
 840         if (rreq->start  != readahead_pos(ractl) ||
 841             rreq->len != readahead_length(ractl)) {
 842                 readahead_expand(ractl, rreq->start, rreq->len);
 843                 rreq->start  = readahead_pos(ractl);
 844                 rreq->len = readahead_length(ractl);
 845
 846                 trace_netfs_read(rreq, readahead_pos(ractl), readahead_length(ractl),
 847                                  netfs_read_trace_expanded);
 848         }
 849 }
 850
 851 /**
 852  * netfs_readahead - Helper to manage a read request
 853  * @ractl: The description of the readahead request
 854  * @ops: The network filesystem's operations for the helper to use
 855  * @netfs_priv: Private netfs data to be retained in the request
 856  *
 857  * Fulfil a readahead request by drawing data from the cache if possible, or
 858  * the netfs if not.  Space beyond the EOF is zero-filled.  Multiple I/O
 859  * requests from different sources will get munged together.  If necessary, the
 860  * readahead window can be expanded in either direction to a more convenient
 861  * alighment for RPC efficiency or to make storage in the cache feasible.
 862  *
 863  * The calling netfs must provide a table of operations, only one of which,
 864  * issue_op, is mandatory.  It may also be passed a private token, which will
 865  * be retained in rreq->netfs_priv and will be cleaned up by ops->cleanup().
 866  *
 867  * This is usable whether or not caching is enabled.
 868  */
 869 void netfs_readahead(struct readahead_control *ractl,
 870                      const struct netfs_read_request_ops *ops,
 871                      void *netfs_priv)
 872 {
 873         struct netfs_read_request *rreq;
 874         struct page *page;
 875         unsigned int debug_index = 0;
 876         int ret;
 877
 878         _enter("%lx,%x", readahead_index(ractl), readahead_count(ractl));
 879
 880         if (readahead_count(ractl) == 0)
 881                 goto cleanup;
 882
 883         rreq = netfs_alloc_read_request(ops, netfs_priv, ractl->file);
 884         if (!rreq)
 885                 goto cleanup;
 886         rreq->mapping   = ractl->mapping;
 887         rreq->start     = readahead_pos(ractl);
 888         rreq->len       = readahead_length(ractl);
 889
 890         if (ops->begin_cache_operation) {
 891                 ret = ops->begin_cache_operation(rreq);
 892                 if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
 893                         goto cleanup_free;
 894         }
 895
 896         netfs_stat(&netfs_n_rh_readahead);
 897         trace_netfs_read(rreq, readahead_pos(ractl), readahead_length(ractl),
 898                          netfs_read_trace_readahead);
 899
 900         netfs_rreq_expand(rreq, ractl);
 901
 902         atomic_set(&rreq->nr_rd_ops, 1);
 903         do {
 904                 if (!netfs_rreq_submit_slice(rreq, &debug_index))
 905                         break;
 906
 907         } while (rreq->submitted < rreq->len);
 908
 909         /* Drop the refs on the pages here rather than in the cache or
 910          * filesystem.  The locks will be dropped in netfs_rreq_unlock().
 911          */
 912         while ((page = readahead_page(ractl)))
 913                 put_page(page);
 914
 915         /* If we decrement nr_rd_ops to 0, the ref belongs to us. */
 916         if (atomic_dec_and_test(&rreq->nr_rd_ops))
 917                 netfs_rreq_assess(rreq, false);
 918         return;
 919
 920 cleanup_free:
 921         netfs_put_read_request(rreq, false);
 922         return;
 923 cleanup:
 924         if (netfs_priv)
 925                 ops->cleanup(ractl->mapping, netfs_priv);
 926         return;
 927 }
 928 EXPORT_SYMBOL(netfs_readahead);
 929
 930 /**
 931  * netfs_readpage - Helper to manage a readpage request
 932  * @file: The file to read from
 933  * @page: The page to read
 934  * @ops: The network filesystem's operations for the helper to use
 935  * @netfs_priv: Private netfs data to be retained in the request
 936  *
 937  * Fulfil a readpage request by drawing data from the cache if possible, or the
 938  * netfs if not.  Space beyond the EOF is zero-filled.  Multiple I/O requests
 939  * from different sources will get munged together.
 940  *
 941  * The calling netfs must provide a table of operations, only one of which,
 942  * issue_op, is mandatory.  It may also be passed a private token, which will
 943  * be retained in rreq->netfs_priv and will be cleaned up by ops->cleanup().
 944  *
 945  * This is usable whether or not caching is enabled.
 946  */
 947 int netfs_readpage(struct file *file,
 948                    struct page *page,
 949                    const struct netfs_read_request_ops *ops,
 950                    void *netfs_priv)
 951 {
 952         struct netfs_read_request *rreq;
 953         unsigned int debug_index = 0;
 954         int ret;
 955
 956         _enter("%lx", page_index(page));
 957
 958         rreq = netfs_alloc_read_request(ops, netfs_priv, file);
 959         if (!rreq) {
 960                 if (netfs_priv)
 961                         ops->cleanup(page_file_mapping(page), netfs_priv);
 962                 unlock_page(page);
 963                 return -ENOMEM;
 964         }
 965         rreq->mapping   = page_file_mapping(page);
 966         rreq->start     = page_file_offset(page);
 967         rreq->len       = thp_size(page);
 968
 969         if (ops->begin_cache_operation) {
 970                 ret = ops->begin_cache_operation(rreq);
 971                 if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS) {
 972                         unlock_page(page);
 973                         goto out;
 974                 }
 975         }
 976
 977         netfs_stat(&netfs_n_rh_readpage);
 978         trace_netfs_read(rreq, rreq->start, rreq->len, netfs_read_trace_readpage);
 979
 980         netfs_get_read_request(rreq);
 981
 982         atomic_set(&rreq->nr_rd_ops, 1);
 983         do {
 984                 if (!netfs_rreq_submit_slice(rreq, &debug_index))
 985                         break;
 986
 987         } while (rreq->submitted < rreq->len);
 988
 989         /* Keep nr_rd_ops incremented so that the ref always belongs to us, and
 990          * the service code isn't punted off to a random thread pool to
 991          * process.
 992          */
 993         do {
 994                 wait_var_event(&rreq->nr_rd_ops, atomic_read(&rreq->nr_rd_ops) == 1);
 995                 netfs_rreq_assess(rreq, false);
 996         } while (test_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags));
 997
 998         ret = rreq->error;
 999         if (ret == 0 && rreq->submitted < rreq->len) {
1000                 trace_netfs_failure(rreq, NULL, ret, netfs_fail_short_readpage);
1001                 ret = -EIO;
1002         }
1003 out:
1004         netfs_put_read_request(rreq, false);
1005         return ret;
1006 }
1007 EXPORT_SYMBOL(netfs_readpage);
1008
1009 /**
1010  * netfs_skip_page_read - prep a page for writing without reading first
1011  * @page: page being prepared
1012  * @pos: starting position for the write
1013  * @len: length of write
1014  *
1015  * In some cases, write_begin doesn't need to read at all:
1016  * - full page write
1017  * - write that lies in a page that is completely beyond EOF
1018  * - write that covers the the page from start to EOF or beyond it
1019  *
1020  * If any of these criteria are met, then zero out the unwritten parts
1021  * of the page and return true. Otherwise, return false.
1022  */
1023 static bool netfs_skip_page_read(struct page *page, loff_t pos, size_t len)
1024 {
1025         struct inode *inode = page->mapping->host;
1026         loff_t i_size = i_size_read(inode);
1027         size_t offset = offset_in_thp(page, pos);
1028
1029         /* Full page write */
1030         if (offset == 0 && len >= thp_size(page))
1031                 return true;
1032
1033         /* pos beyond last page in the file */
1034         if (pos - offset >= i_size)
1035                 goto zero_out;
1036
1037         /* Write that covers from the start of the page to EOF or beyond */
1038         if (offset == 0 && (pos + len) >= i_size)
1039                 goto zero_out;
1040
1041         return false;
1042 zero_out:
1043         zero_user_segments(page, 0, offset, offset + len, thp_size(page));
1044         return true;
1045 }
1046
1047 /**
1048  * netfs_write_begin - Helper to prepare for writing
1049  * @file: The file to read from
1050  * @mapping: The mapping to read from
1051  * @pos: File position at which the write will begin
1052  * @len: The length of the write (may extend beyond the end of the page chosen)
1053  * @flags: AOP_* flags
1054  * @_page: Where to put the resultant page
1055  * @_fsdata: Place for the netfs to store a cookie
1056  * @ops: The network filesystem's operations for the helper to use
1057  * @netfs_priv: Private netfs data to be retained in the request
1058  *
1059  * Pre-read data for a write-begin request by drawing data from the cache if
1060  * possible, or the netfs if not.  Space beyond the EOF is zero-filled.
1061  * Multiple I/O requests from different sources will get munged together.  If
1062  * necessary, the readahead window can be expanded in either direction to a
1063  * more convenient alighment for RPC efficiency or to make storage in the cache
1064  * feasible.
1065  *
1066  * The calling netfs must provide a table of operations, only one of which,
1067  * issue_op, is mandatory.
1068  *
1069  * The check_write_begin() operation can be provided to check for and flush
1070  * conflicting writes once the page is grabbed and locked.  It is passed a
1071  * pointer to the fsdata cookie that gets returned to the VM to be passed to
1072  * write_end.  It is permitted to sleep.  It should return 0 if the request
1073  * should go ahead; unlock the page and return -EAGAIN to cause the page to be
1074  * regot; or return an error.
1075  *
1076  * This is usable whether or not caching is enabled.
1077  */
1078 int netfs_write_begin(struct file *file, struct address_space *mapping,
1079                       loff_t pos, unsigned int len, unsigned int flags,
1080                       struct page **_page, void **_fsdata,
1081                       const struct netfs_read_request_ops *ops,
1082                       void *netfs_priv)
1083 {
1084         struct netfs_read_request *rreq;
1085         struct page *page, *xpage;
1086         struct inode *inode = file_inode(file);
1087         unsigned int debug_index = 0;
1088         pgoff_t index = pos >> PAGE_SHIFT;
1089         int ret;
1090
1091         DEFINE_READAHEAD(ractl, file, NULL, mapping, index);
1092
1093 retry:
1094         page = grab_cache_page_write_begin(mapping, index, flags);
1095         if (!page)
1096                 return -ENOMEM;
1097
1098         if (ops->check_write_begin) {
1099                 /* Allow the netfs (eg. ceph) to flush conflicts. */
1100                 ret = ops->check_write_begin(file, pos, len, page, _fsdata);
1101                 if (ret < 0) {
1102                         trace_netfs_failure(NULL, NULL, ret, netfs_fail_check_write_begin);
1103                         if (ret == -EAGAIN)
1104                                 goto retry;
1105                         goto error;
1106                 }
1107         }
1108
1109         if (PageUptodate(page))
1110                 goto have_page;
1111
1112         /* If the page is beyond the EOF, we want to clear it - unless it's
1113          * within the cache granule containing the EOF, in which case we need
1114          * to preload the granule.
1115          */
1116         if (!ops->is_cache_enabled(inode) &&
1117             netfs_skip_page_read(page, pos, len)) {
1118                 netfs_stat(&netfs_n_rh_write_zskip);
1119                 goto have_page_no_wait;
1120         }
1121
1122         ret = -ENOMEM;
1123         rreq = netfs_alloc_read_request(ops, netfs_priv, file);
1124         if (!rreq)
1125                 goto error;
1126         rreq->mapping           = page->mapping;
1127         rreq->start             = page_offset(page);
1128         rreq->len               = thp_size(page);
1129         rreq->no_unlock_page    = page->index;
1130         __set_bit(NETFS_RREQ_NO_UNLOCK_PAGE, &rreq->flags);
1131         netfs_priv = NULL;
1132
1133         if (ops->begin_cache_operation) {
1134                 ret = ops->begin_cache_operation(rreq);
1135                 if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
1136                         goto error_put;
1137         }
1138
1139         netfs_stat(&netfs_n_rh_write_begin);
1140         trace_netfs_read(rreq, pos, len, netfs_read_trace_write_begin);
1141
1142         /* Expand the request to meet caching requirements and download
1143          * preferences.
1144          */
1145         ractl._nr_pages = thp_nr_pages(page);
1146         netfs_rreq_expand(rreq, &ractl);
1147         netfs_get_read_request(rreq);
1148
1149         /* We hold the page locks, so we can drop the references */
1150         while ((xpage = readahead_page(&ractl)))
1151                 if (xpage != page)
1152                         put_page(xpage);
1153
1154         atomic_set(&rreq->nr_rd_ops, 1);
1155         do {
1156                 if (!netfs_rreq_submit_slice(rreq, &debug_index))
1157                         break;
1158
1159         } while (rreq->submitted < rreq->len);
1160
1161         /* Keep nr_rd_ops incremented so that the ref always belongs to us, and
1162          * the service code isn't punted off to a random thread pool to
1163          * process.
1164          */
1165         for (;;) {
1166                 wait_var_event(&rreq->nr_rd_ops, atomic_read(&rreq->nr_rd_ops) == 1);
1167                 netfs_rreq_assess(rreq, false);
1168                 if (!test_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags))
1169                         break;
1170                 cond_resched();
1171         }
1172
1173         ret = rreq->error;
1174         if (ret == 0 && rreq->submitted < rreq->len) {
1175                 trace_netfs_failure(rreq, NULL, ret, netfs_fail_short_write_begin);
1176                 ret = -EIO;
1177         }
1178         netfs_put_read_request(rreq, false);
1179         if (ret < 0)
1180                 goto error;
1181
1182 have_page:
1183         ret = wait_on_page_fscache_killable(page);
1184         if (ret < 0)
1185                 goto error;
1186 have_page_no_wait:
1187         if (netfs_priv)
1188                 ops->cleanup(mapping, netfs_priv);
1189         *_page = page;
1190         _leave(" = 0");
1191         return 0;
1192
1193 error_put:
1194         netfs_put_read_request(rreq, false);
1195 error:
1196         unlock_page(page);
1197         put_page(page);
1198         if (netfs_priv)
1199                 ops->cleanup(mapping, netfs_priv);
1200         _leave(" = %d", ret);
1201         return ret;
1202 }
1203 EXPORT_SYMBOL(netfs_write_begin);