// SPDX-License-Identifier: GPL-2.0-or-later
/* Network filesystem high-level buffered read support.
 *
 * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 */

#include <linux/export.h>
#include <linux/task_io_accounting_ops.h>
#include "internal.h"
/*
 * Unlock the folios in a read operation.  We need to set PG_fscache on any
 * folios we're going to write back before we unlock them.
 */
void netfs_rreq_unlock_folios(struct netfs_io_request *rreq)
{
	struct netfs_io_subrequest *subreq;
	struct netfs_folio *finfo;
	struct folio *folio;
	pgoff_t start_page = rreq->start / PAGE_SIZE;
	pgoff_t last_page = ((rreq->start + rreq->len) / PAGE_SIZE) - 1;
	size_t account = 0;
	bool subreq_failed = false;

	XA_STATE(xas, &rreq->mapping->i_pages, start_page);

	if (test_bit(NETFS_RREQ_FAILED, &rreq->flags)) {
		__clear_bit(NETFS_RREQ_COPY_TO_CACHE, &rreq->flags);
		list_for_each_entry(subreq, &rreq->subrequests, rreq_link) {
			__clear_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags);
		}
	}

	/* Walk through the pagecache and the I/O request lists simultaneously.
	 * We may have a mixture of cached and uncached sections and we only
	 * really want to write out the uncached sections.  This is slightly
	 * complicated by the possibility that we might have huge pages with a
	 * mixture inside.
	 */
	subreq = list_first_entry(&rreq->subrequests,
				  struct netfs_io_subrequest, rreq_link);
	subreq_failed = (subreq->error < 0);

	trace_netfs_rreq(rreq, netfs_rreq_trace_unlock);

	rcu_read_lock();
	xas_for_each(&xas, folio, last_page) {
		loff_t pg_end;
		bool pg_failed = false;
		bool folio_started;

		if (xas_retry(&xas, folio))
			continue;

		pg_end = folio_pos(folio) + folio_size(folio) - 1;

		folio_started = false;
		for (;;) {
			loff_t sreq_end;

			if (!subreq) {
				pg_failed = true;
				break;
			}
			if (!folio_started && test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags)) {
				trace_netfs_folio(folio, netfs_folio_trace_copy_to_cache);
				folio_start_fscache(folio);
				folio_started = true;
			}
			pg_failed |= subreq_failed;
			sreq_end = subreq->start + subreq->len - 1;
			if (pg_end < sreq_end)
				break;

			account += subreq->transferred;
			if (!list_is_last(&subreq->rreq_link, &rreq->subrequests)) {
				subreq = list_next_entry(subreq, rreq_link);
				subreq_failed = (subreq->error < 0);
			} else {
				subreq = NULL;
				subreq_failed = false;
			}

			if (pg_end == sreq_end)
				break;
		}

		if (!pg_failed) {
			flush_dcache_folio(folio);
			finfo = netfs_folio_info(folio);
			if (finfo) {
				trace_netfs_folio(folio, netfs_folio_trace_filled_gaps);
				if (finfo->netfs_group)
					folio_change_private(folio, finfo->netfs_group);
				else
					folio_detach_private(folio);
				kfree(finfo);
			}
			folio_mark_uptodate(folio);
		}

		if (!test_bit(NETFS_RREQ_DONT_UNLOCK_FOLIOS, &rreq->flags)) {
			if (folio->index == rreq->no_unlock_folio &&
			    test_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags))
				_debug("no unlock");
			else
				folio_unlock(folio);
		}
	}
	rcu_read_unlock();

	task_io_account_read(account);
	if (rreq->netfs_ops->done)
		rreq->netfs_ops->done(rreq);
}
static void netfs_cache_expand_readahead(struct netfs_io_request *rreq,
					 loff_t *_start, size_t *_len, loff_t i_size)
{
	struct netfs_cache_resources *cres = &rreq->cache_resources;

	if (cres->ops && cres->ops->expand_readahead)
		cres->ops->expand_readahead(cres, _start, _len, i_size);
}
static void netfs_rreq_expand(struct netfs_io_request *rreq,
			      struct readahead_control *ractl)
{
	/* Give the cache a chance to change the request parameters.  The
	 * resultant request must contain the original region.
	 */
	netfs_cache_expand_readahead(rreq, &rreq->start, &rreq->len, rreq->i_size);

	/* Give the netfs a chance to change the request parameters.  The
	 * resultant request must contain the original region.
	 */
	if (rreq->netfs_ops->expand_readahead)
		rreq->netfs_ops->expand_readahead(rreq);

	/* Expand the request if the cache wants it to start earlier.  Note
	 * that the expansion may get further extended if the VM wishes to
	 * insert THPs and the preferred start and/or end wind up in the middle
	 * of THPs.
	 *
	 * If this is the case, however, the THP size should be an integer
	 * multiple of the cache granule size, so we get a whole number of
	 * granules to deal with.
	 */
	if (rreq->start != readahead_pos(ractl) ||
	    rreq->len != readahead_length(ractl)) {
		readahead_expand(ractl, rreq->start, rreq->len);
		rreq->start = readahead_pos(ractl);
		rreq->len = readahead_length(ractl);

		trace_netfs_read(rreq, readahead_pos(ractl), readahead_length(ractl),
				 netfs_read_trace_expanded);
	}
}
/*
 * Begin an operation, and fetch the stored zero point value from the cookie if
 * available.
 */
static int netfs_begin_cache_read(struct netfs_io_request *rreq, struct netfs_inode *ctx)
{
	return fscache_begin_read_operation(&rreq->cache_resources, netfs_i_cookie(ctx));
}
/**
 * netfs_readahead - Helper to manage a read request
 * @ractl: The description of the readahead request
 *
 * Fulfil a readahead request by drawing data from the cache if possible, or
 * the netfs if not.  Space beyond the EOF is zero-filled.  Multiple I/O
 * requests from different sources will get munged together.  If necessary, the
 * readahead window can be expanded in either direction to a more convenient
 * alignment for RPC efficiency or to make storage in the cache feasible.
 *
 * The calling netfs must initialise a netfs context contiguous to the vfs
 * inode before calling this.
 *
 * This is usable whether or not caching is enabled.
 */
void netfs_readahead(struct readahead_control *ractl)
{
	struct netfs_io_request *rreq;
	struct netfs_inode *ctx = netfs_inode(ractl->mapping->host);
	int ret;

	_enter("%lx,%x", readahead_index(ractl), readahead_count(ractl));

	if (readahead_count(ractl) == 0)
		return;

	rreq = netfs_alloc_request(ractl->mapping, ractl->file,
				   readahead_pos(ractl),
				   readahead_length(ractl),
				   NETFS_READAHEAD);
	if (IS_ERR(rreq))
		return;

	ret = netfs_begin_cache_read(rreq, ctx);
	if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
		goto cleanup_free;

	netfs_stat(&netfs_n_rh_readahead);
	trace_netfs_read(rreq, readahead_pos(ractl), readahead_length(ractl),
			 netfs_read_trace_readahead);

	netfs_rreq_expand(rreq, ractl);

	/* Set up the output buffer */
	iov_iter_xarray(&rreq->iter, ITER_DEST, &ractl->mapping->i_pages,
			rreq->start, rreq->len);

	/* Drop the refs on the folios here rather than in the cache or
	 * filesystem.  The locks will be dropped in netfs_rreq_unlock_folios().
	 */
	while (readahead_folio(ractl))
		;

	netfs_begin_read(rreq, false);
	netfs_put_request(rreq, false, netfs_rreq_trace_put_return);
	return;

cleanup_free:
	netfs_put_request(rreq, false, netfs_rreq_trace_put_failed);
	return;
}
EXPORT_SYMBOL(netfs_readahead);
/**
 * netfs_read_folio - Helper to manage a read_folio request
 * @file: The file to read from
 * @folio: The folio to read
 *
 * Fulfil a read_folio request by drawing data from the cache if
 * possible, or the netfs if not.  Space beyond the EOF is zero-filled.
 * Multiple I/O requests from different sources will get munged together.
 *
 * The calling netfs must initialise a netfs context contiguous to the vfs
 * inode before calling this.
 *
 * This is usable whether or not caching is enabled.
 */
int netfs_read_folio(struct file *file, struct folio *folio)
{
	struct address_space *mapping = folio->mapping;
	struct netfs_io_request *rreq;
	struct netfs_inode *ctx = netfs_inode(mapping->host);
	struct folio *sink = NULL;
	int ret;

	_enter("%lx", folio->index);

	rreq = netfs_alloc_request(mapping, file,
				   folio_file_pos(folio), folio_size(folio),
				   NETFS_READPAGE);
	if (IS_ERR(rreq)) {
		ret = PTR_ERR(rreq);
		goto alloc_error;
	}

	ret = netfs_begin_cache_read(rreq, ctx);
	if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
		goto discard;

	netfs_stat(&netfs_n_rh_readpage);
	trace_netfs_read(rreq, rreq->start, rreq->len, netfs_read_trace_readpage);

	/* Set up the output buffer */
	if (folio_test_dirty(folio)) {
		/* Handle someone trying to read from an unflushed streaming
		 * write.  We fiddle the buffer so that a gap at the beginning
		 * and/or a gap at the end get copied to, but the middle is
		 * discarded.
		 */
		struct netfs_folio *finfo = netfs_folio_info(folio);
		struct bio_vec *bvec;
		unsigned int from = finfo->dirty_offset;
		unsigned int to = from + finfo->dirty_len;
		unsigned int off = 0, i = 0;
		size_t flen = folio_size(folio);
		size_t nr_bvec = flen / PAGE_SIZE + 2;
		size_t part;

		ret = -ENOMEM;
		bvec = kmalloc_array(nr_bvec, sizeof(*bvec), GFP_KERNEL);
		if (!bvec)
			goto discard;

		sink = folio_alloc(GFP_KERNEL, 0);
		if (!sink)
			goto discard;

		trace_netfs_folio(folio, netfs_folio_trace_read_gaps);

		rreq->direct_bv = bvec;
		rreq->direct_bv_count = nr_bvec;
		if (from > 0) {
			bvec_set_folio(&bvec[i++], folio, from, 0);
			off = from;
		}
		while (off < to) {
			part = min_t(size_t, to - off, PAGE_SIZE);
			bvec_set_folio(&bvec[i++], sink, part, 0);
			off += part;
		}
		if (to < flen)
			bvec_set_folio(&bvec[i++], folio, flen - to, to);
		iov_iter_bvec(&rreq->iter, ITER_DEST, bvec, i, rreq->len);
	} else {
		iov_iter_xarray(&rreq->iter, ITER_DEST, &mapping->i_pages,
				rreq->start, rreq->len);
	}

	ret = netfs_begin_read(rreq, true);
	if (sink)
		folio_put(sink);
	netfs_put_request(rreq, false, netfs_rreq_trace_put_return);
	return ret < 0 ? ret : 0;

discard:
	netfs_put_request(rreq, false, netfs_rreq_trace_put_discard);
alloc_error:
	folio_unlock(folio);
	return ret;
}
EXPORT_SYMBOL(netfs_read_folio);
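/*
 * Example usage (an illustrative sketch, not part of the netfs core): a
 * network filesystem typically points its address_space_operations straight
 * at the two helpers above.  The "myfs" name is hypothetical.
 */
static const struct address_space_operations myfs_aops __maybe_unused = {
	.read_folio	= netfs_read_folio,
	.readahead	= netfs_readahead,
	/* The write side (dirty_folio, writepages, etc.) is filesystem-specific. */
};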
/*
 * Prepare a folio for writing without reading first
 * @folio: The folio being prepared
 * @pos: starting position for the write
 * @len: length of write
 * @always_fill: T if the folio should always be completely filled/cleared
 *
 * In some cases, write_begin doesn't need to read at all:
 * - full folio write
 * - write that lies in a folio that is completely beyond EOF
 * - write that covers the folio from start to EOF or beyond it
 *
 * If any of these criteria are met, then zero out the unwritten parts
 * of the folio and return true.  Otherwise, return false.
 */
static bool netfs_skip_folio_read(struct folio *folio, loff_t pos, size_t len,
				  bool always_fill)
{
	struct inode *inode = folio_inode(folio);
	loff_t i_size = i_size_read(inode);
	size_t offset = offset_in_folio(folio, pos);
	size_t plen = folio_size(folio);

	if (unlikely(always_fill)) {
		if (pos - offset + len <= i_size)
			return false; /* Page entirely before EOF */
		zero_user_segment(&folio->page, 0, plen);
		folio_mark_uptodate(folio);
		return true;
	}

	/* Full folio write */
	if (offset == 0 && len >= plen)
		return true;

	/* Page entirely beyond the end of the file */
	if (pos - offset >= i_size)
		goto zero_out;

	/* Write that covers from the start of the folio to EOF or beyond */
	if (offset == 0 && (pos + len) >= i_size)
		goto zero_out;

	return false;
zero_out:
	zero_user_segments(&folio->page, 0, offset, offset + len, plen);
	return true;
}
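/*
 * Worked example (illustrative numbers): with a 4096-byte folio that sits
 * entirely at or beyond i_size, any write lets us skip the read - the folio
 * is just zeroed around the region about to be written.  By contrast, a
 * 512-byte write into the middle of a folio overlapping existing file data
 * fails all three tests above and must go through the read path to fill the
 * surrounding bytes.
 */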
/**
 * netfs_write_begin - Helper to prepare for writing
 * @ctx: The netfs context
 * @file: The file to read from
 * @mapping: The mapping to read from
 * @pos: File position at which the write will begin
 * @len: The length of the write (may extend beyond the end of the folio chosen)
 * @_folio: Where to put the resultant folio
 * @_fsdata: Place for the netfs to store a cookie
 *
 * Pre-read data for a write-begin request by drawing data from the cache if
 * possible, or the netfs if not.  Space beyond the EOF is zero-filled.
 * Multiple I/O requests from different sources will get munged together.  If
 * necessary, the readahead window can be expanded in either direction to a
 * more convenient alignment for RPC efficiency or to make storage in the cache
 * feasible.
 *
 * The calling netfs must provide a table of operations, only one of which,
 * issue_read, is mandatory.
 *
 * The check_write_begin() operation can be provided to check for and flush
 * conflicting writes once the folio is grabbed and locked.  It is passed a
 * pointer to the fsdata cookie that gets returned to the VM to be passed to
 * write_end.  It is permitted to sleep.  It should return 0 if the request
 * should go ahead or it may return an error.  It may also unlock and put the
 * folio, provided it sets ``*_folio`` to NULL, in which case a return of 0
 * will cause the folio to be re-got and the process to be retried.
 *
 * The calling netfs must initialise a netfs context contiguous to the vfs
 * inode before calling this.
 *
 * This is usable whether or not caching is enabled.
 */
int netfs_write_begin(struct netfs_inode *ctx,
		      struct file *file, struct address_space *mapping,
		      loff_t pos, unsigned int len, struct folio **_folio,
		      void **_fsdata)
{
	struct netfs_io_request *rreq;
	struct folio *folio;
	pgoff_t index = pos >> PAGE_SHIFT;
	int ret;

	DEFINE_READAHEAD(ractl, file, NULL, mapping, index);

retry:
	folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN,
				    mapping_gfp_mask(mapping));
	if (IS_ERR(folio))
		return PTR_ERR(folio);

	if (ctx->ops->check_write_begin) {
		/* Allow the netfs (eg. ceph) to flush conflicts. */
		ret = ctx->ops->check_write_begin(file, pos, len, &folio, _fsdata);
		if (ret < 0) {
			trace_netfs_failure(NULL, NULL, ret, netfs_fail_check_write_begin);
			goto error;
		}
		if (!folio)
			goto retry;
	}

	if (folio_test_uptodate(folio))
		goto have_folio;

	/* If the page is beyond the EOF, we want to clear it - unless it's
	 * within the cache granule containing the EOF, in which case we need
	 * to preload the granule.
	 */
	if (!netfs_is_cache_enabled(ctx) &&
	    netfs_skip_folio_read(folio, pos, len, false)) {
		netfs_stat(&netfs_n_rh_write_zskip);
		goto have_folio_no_wait;
	}

	rreq = netfs_alloc_request(mapping, file,
				   folio_file_pos(folio), folio_size(folio),
				   NETFS_READ_FOR_WRITE);
	if (IS_ERR(rreq)) {
		ret = PTR_ERR(rreq);
		goto error;
	}
	rreq->no_unlock_folio = folio->index;
	__set_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags);

	ret = netfs_begin_cache_read(rreq, ctx);
	if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
		goto error_put;

	netfs_stat(&netfs_n_rh_write_begin);
	trace_netfs_read(rreq, pos, len, netfs_read_trace_write_begin);

	/* Expand the request to meet caching requirements and download
	 * preferences.
	 */
	ractl._nr_pages = folio_nr_pages(folio);
	netfs_rreq_expand(rreq, &ractl);

	/* Set up the output buffer */
	iov_iter_xarray(&rreq->iter, ITER_DEST, &mapping->i_pages,
			rreq->start, rreq->len);

	/* We hold the folio locks, so we can drop the references */
	folio_get(folio);
	while (readahead_folio(&ractl))
		;

	ret = netfs_begin_read(rreq, true);
	if (ret < 0)
		goto error;
	netfs_put_request(rreq, false, netfs_rreq_trace_put_return);

have_folio:
	ret = folio_wait_fscache_killable(folio);
	if (ret < 0)
		goto error;
have_folio_no_wait:
	*_folio = folio;
	_leave(" = 0");
	return 0;

error_put:
	netfs_put_request(rreq, false, netfs_rreq_trace_put_failed);
error:
	if (folio) {
		folio_unlock(folio);
		folio_put(folio);
	}
	_leave(" = %d", ret);
	return ret;
}
EXPORT_SYMBOL(netfs_write_begin);
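/*
 * Example usage (an illustrative sketch; the "myfs" names are hypothetical
 * and the exact ->write_begin prototype varies by kernel version): a
 * filesystem's write path defers to netfs_write_begin() for the grab, lock
 * and pre-read handling, then unwraps the folio for the VM.
 */
static int __maybe_unused myfs_write_begin(struct file *file,
					   struct address_space *mapping,
					   loff_t pos, unsigned int len,
					   struct page **pagep, void **fsdata)
{
	struct folio *folio;
	int ret;

	/* Let netfs get, lock and, if necessary, pre-read the folio. */
	ret = netfs_write_begin(netfs_inode(mapping->host), file, mapping,
				pos, len, &folio, fsdata);
	if (ret < 0)
		return ret;

	*pagep = &folio->page;
	return 0;
}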
/*
 * Preload the data into a page we're proposing to write into.
 */
int netfs_prefetch_for_write(struct file *file, struct folio *folio,
			     size_t offset, size_t len)
{
	struct netfs_io_request *rreq;
	struct address_space *mapping = folio->mapping;
	struct netfs_inode *ctx = netfs_inode(mapping->host);
	unsigned long long start = folio_pos(folio);
	size_t flen = folio_size(folio);
	int ret;

	_enter("%zx @%llx", flen, start);

	ret = -ENOMEM;

	rreq = netfs_alloc_request(mapping, file, start, flen,
				   NETFS_READ_FOR_WRITE);
	if (IS_ERR(rreq)) {
		ret = PTR_ERR(rreq);
		goto error;
	}

	rreq->no_unlock_folio = folio->index;
	__set_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags);
	ret = netfs_begin_cache_read(rreq, ctx);
	if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
		goto error_put;

	netfs_stat(&netfs_n_rh_write_begin);
	trace_netfs_read(rreq, start, flen, netfs_read_trace_prefetch_for_write);

	/* Set up the output buffer */
	iov_iter_xarray(&rreq->iter, ITER_DEST, &mapping->i_pages,
			rreq->start, rreq->len);

	ret = netfs_begin_read(rreq, true);
	netfs_put_request(rreq, false, netfs_rreq_trace_put_return);
	return ret;

error_put:
	netfs_put_request(rreq, false, netfs_rreq_trace_put_discard);
error:
	_leave(" = %d", ret);
	return ret;
}
/**
 * netfs_buffered_read_iter - Filesystem buffered I/O read routine
 * @iocb: kernel I/O control block
 * @iter: destination for the data read
 *
 * This is the ->read_iter() routine for all filesystems that can use the page
 * cache directly.
 *
 * The IOCB_NOWAIT flag in iocb->ki_flags indicates that -EAGAIN shall be
 * returned when no data can be read without waiting for I/O requests to
 * complete; it doesn't prevent readahead.
 *
 * The IOCB_NOIO flag in iocb->ki_flags indicates that no new I/O requests
 * shall be made for the read or for readahead.  When no data can be read,
 * -EAGAIN shall be returned.  When readahead would be triggered, a partial,
 * possibly empty read shall be returned.
 *
 * Return:
 * * number of bytes copied, even for partial reads
 * * negative error code (or 0 if IOCB_NOIO) if nothing was read
 */
ssize_t netfs_buffered_read_iter(struct kiocb *iocb, struct iov_iter *iter)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	struct netfs_inode *ictx = netfs_inode(inode);
	ssize_t ret;

	if (WARN_ON_ONCE((iocb->ki_flags & IOCB_DIRECT) ||
			 test_bit(NETFS_ICTX_UNBUFFERED, &ictx->flags)))
		return -EINVAL;

	ret = netfs_start_io_read(inode);
	if (ret == 0) {
		ret = filemap_read(iocb, iter, 0);
		netfs_end_io_read(inode);
	}
	return ret;
}
EXPORT_SYMBOL(netfs_buffered_read_iter);
/**
 * netfs_file_read_iter - Generic filesystem read routine
 * @iocb: kernel I/O control block
 * @iter: destination for the data read
 *
 * This is the ->read_iter() routine for all filesystems that can use the page
 * cache directly.
 *
 * The IOCB_NOWAIT flag in iocb->ki_flags indicates that -EAGAIN shall be
 * returned when no data can be read without waiting for I/O requests to
 * complete; it doesn't prevent readahead.
 *
 * The IOCB_NOIO flag in iocb->ki_flags indicates that no new I/O requests
 * shall be made for the read or for readahead.  When no data can be read,
 * -EAGAIN shall be returned.  When readahead would be triggered, a partial,
 * possibly empty read shall be returned.
 *
 * Return:
 * * number of bytes copied, even for partial reads
 * * negative error code (or 0 if IOCB_NOIO) if nothing was read
 */
ssize_t netfs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
{
	struct netfs_inode *ictx = netfs_inode(iocb->ki_filp->f_mapping->host);

	if ((iocb->ki_flags & IOCB_DIRECT) ||
	    test_bit(NETFS_ICTX_UNBUFFERED, &ictx->flags))
		return netfs_unbuffered_read_iter(iocb, iter);

	return netfs_buffered_read_iter(iocb, iter);
}
EXPORT_SYMBOL(netfs_file_read_iter);
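/*
 * Example usage (an illustrative sketch; "myfs" is hypothetical): a
 * filesystem can plug netfs_file_read_iter() straight into its
 * file_operations, and reads on O_DIRECT or unbuffered inodes are then
 * routed to the unbuffered path automatically.
 */
static const struct file_operations myfs_file_ops __maybe_unused = {
	.read_iter	= netfs_file_read_iter,
	/* write_iter, mmap, etc. are filesystem-specific. */
};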