GNU Linux-libre 5.15.54-gnu
[releases.git] / lib / iov_iter.c
1 // SPDX-License-Identifier: GPL-2.0-only
2 #include <crypto/hash.h>
3 #include <linux/export.h>
4 #include <linux/bvec.h>
5 #include <linux/fault-inject-usercopy.h>
6 #include <linux/uio.h>
7 #include <linux/pagemap.h>
8 #include <linux/highmem.h>
9 #include <linux/slab.h>
10 #include <linux/vmalloc.h>
11 #include <linux/splice.h>
12 #include <linux/compat.h>
13 #include <net/checksum.h>
14 #include <linux/scatterlist.h>
15 #include <linux/instrumented.h>
16
17 #define PIPE_PARANOIA /* for now */
18
19 /* covers iovec and kvec alike */
20 #define iterate_iovec(i, n, base, len, off, __p, STEP) {        \
21         size_t off = 0;                                         \
22         size_t skip = i->iov_offset;                            \
23         do {                                                    \
24                 len = min(n, __p->iov_len - skip);              \
25                 if (likely(len)) {                              \
26                         base = __p->iov_base + skip;            \
27                         len -= (STEP);                          \
28                         off += len;                             \
29                         skip += len;                            \
30                         n -= len;                               \
31                         if (skip < __p->iov_len)                \
32                                 break;                          \
33                 }                                               \
34                 __p++;                                          \
35                 skip = 0;                                       \
36         } while (n);                                            \
37         i->iov_offset = skip;                                   \
38         n = off;                                                \
39 }
40
41 #define iterate_bvec(i, n, base, len, off, p, STEP) {           \
42         size_t off = 0;                                         \
43         unsigned skip = i->iov_offset;                          \
44         while (n) {                                             \
45                 unsigned offset = p->bv_offset + skip;          \
46                 unsigned left;                                  \
47                 void *kaddr = kmap_local_page(p->bv_page +      \
48                                         offset / PAGE_SIZE);    \
49                 base = kaddr + offset % PAGE_SIZE;              \
50                 len = min(min(n, (size_t)(p->bv_len - skip)),   \
51                      (size_t)(PAGE_SIZE - offset % PAGE_SIZE)); \
52                 left = (STEP);                                  \
53                 kunmap_local(kaddr);                            \
54                 len -= left;                                    \
55                 off += len;                                     \
56                 skip += len;                                    \
57                 if (skip == p->bv_len) {                        \
58                         skip = 0;                               \
59                         p++;                                    \
60                 }                                               \
61                 n -= len;                                       \
62                 if (left)                                       \
63                         break;                                  \
64         }                                                       \
65         i->iov_offset = skip;                                   \
66         n = off;                                                \
67 }
68
69 #define iterate_xarray(i, n, base, len, __off, STEP) {          \
70         __label__ __out;                                        \
71         size_t __off = 0;                                       \
72         struct page *head = NULL;                               \
73         loff_t start = i->xarray_start + i->iov_offset;         \
74         unsigned offset = start % PAGE_SIZE;                    \
75         pgoff_t index = start / PAGE_SIZE;                      \
76         int j;                                                  \
77                                                                 \
78         XA_STATE(xas, i->xarray, index);                        \
79                                                                 \
80         rcu_read_lock();                                        \
81         xas_for_each(&xas, head, ULONG_MAX) {                   \
82                 unsigned left;                                  \
83                 if (xas_retry(&xas, head))                      \
84                         continue;                               \
85                 if (WARN_ON(xa_is_value(head)))                 \
86                         break;                                  \
87                 if (WARN_ON(PageHuge(head)))                    \
88                         break;                                  \
89                 for (j = (head->index < index) ? index - head->index : 0; \
90                      j < thp_nr_pages(head); j++) {             \
91                         void *kaddr = kmap_local_page(head + j);        \
92                         base = kaddr + offset;                  \
93                         len = PAGE_SIZE - offset;               \
94                         len = min(n, len);                      \
95                         left = (STEP);                          \
96                         kunmap_local(kaddr);                    \
97                         len -= left;                            \
98                         __off += len;                           \
99                         n -= len;                               \
100                         if (left || n == 0)                     \
101                                 goto __out;                     \
102                         offset = 0;                             \
103                 }                                               \
104         }                                                       \
105 __out:                                                          \
106         rcu_read_unlock();                                      \
107         i->iov_offset += __off;                                         \
108         n = __off;                                              \
109 }
110
111 #define __iterate_and_advance(i, n, base, len, off, I, K) {     \
112         if (unlikely(i->count < n))                             \
113                 n = i->count;                                   \
114         if (likely(n)) {                                        \
115                 if (likely(iter_is_iovec(i))) {                 \
116                         const struct iovec *iov = i->iov;       \
117                         void __user *base;                      \
118                         size_t len;                             \
119                         iterate_iovec(i, n, base, len, off,     \
120                                                 iov, (I))       \
121                         i->nr_segs -= iov - i->iov;             \
122                         i->iov = iov;                           \
123                 } else if (iov_iter_is_bvec(i)) {               \
124                         const struct bio_vec *bvec = i->bvec;   \
125                         void *base;                             \
126                         size_t len;                             \
127                         iterate_bvec(i, n, base, len, off,      \
128                                                 bvec, (K))      \
129                         i->nr_segs -= bvec - i->bvec;           \
130                         i->bvec = bvec;                         \
131                 } else if (iov_iter_is_kvec(i)) {               \
132                         const struct kvec *kvec = i->kvec;      \
133                         void *base;                             \
134                         size_t len;                             \
135                         iterate_iovec(i, n, base, len, off,     \
136                                                 kvec, (K))      \
137                         i->nr_segs -= kvec - i->kvec;           \
138                         i->kvec = kvec;                         \
139                 } else if (iov_iter_is_xarray(i)) {             \
140                         void *base;                             \
141                         size_t len;                             \
142                         iterate_xarray(i, n, base, len, off,    \
143                                                         (K))    \
144                 }                                               \
145                 i->count -= n;                                  \
146         }                                                       \
147 }
148 #define iterate_and_advance(i, n, base, len, off, I, K) \
149         __iterate_and_advance(i, n, base, len, off, I, ((void)(K),0))
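
/*
 * The iterate_* helpers above hand each contiguous chunk of the iterator
 * to the STEP expression with "base" and "len" set up for that chunk.
 * STEP must evaluate to the number of bytes it failed to process; the
 * macros trim "len" accordingly, stop early on a short step, update
 * i->iov_offset and finally rewrite "n" to the number of bytes actually
 * handled.
 *
 * __iterate_and_advance() dispatches on the iterator type, using the
 * user-space step (I) for ITER_IOVEC and the kernel step (K) for kvec,
 * bvec and xarray backed iterators, then subtracts the processed amount
 * from i->count.  iterate_and_advance() wraps K as ((void)(K),0), i.e.
 * kernel-memory steps are assumed to always complete; only callers that
 * can see a short kernel copy (such as _copy_mc_to_iter()) use
 * __iterate_and_advance() directly.
 */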
150
151 static int copyout(void __user *to, const void *from, size_t n)
152 {
153         if (should_fail_usercopy())
154                 return n;
155         if (access_ok(to, n)) {
156                 instrument_copy_to_user(to, from, n);
157                 n = raw_copy_to_user(to, from, n);
158         }
159         return n;
160 }
161
162 static int copyin(void *to, const void __user *from, size_t n)
163 {
164         if (should_fail_usercopy())
165                 return n;
166         if (access_ok(from, n)) {
167                 instrument_copy_from_user(to, from, n);
168                 n = raw_copy_from_user(to, from, n);
169         }
170         return n;
171 }
172
173 static size_t copy_page_to_iter_iovec(struct page *page, size_t offset, size_t bytes,
174                          struct iov_iter *i)
175 {
176         size_t skip, copy, left, wanted;
177         const struct iovec *iov;
178         char __user *buf;
179         void *kaddr, *from;
180
181         if (unlikely(bytes > i->count))
182                 bytes = i->count;
183
184         if (unlikely(!bytes))
185                 return 0;
186
187         might_fault();
188         wanted = bytes;
189         iov = i->iov;
190         skip = i->iov_offset;
191         buf = iov->iov_base + skip;
192         copy = min(bytes, iov->iov_len - skip);
193
194         if (IS_ENABLED(CONFIG_HIGHMEM) && !fault_in_writeable(buf, copy)) {
195                 kaddr = kmap_atomic(page);
196                 from = kaddr + offset;
197
198                 /* first chunk, usually the only one */
199                 left = copyout(buf, from, copy);
200                 copy -= left;
201                 skip += copy;
202                 from += copy;
203                 bytes -= copy;
204
205                 while (unlikely(!left && bytes)) {
206                         iov++;
207                         buf = iov->iov_base;
208                         copy = min(bytes, iov->iov_len);
209                         left = copyout(buf, from, copy);
210                         copy -= left;
211                         skip = copy;
212                         from += copy;
213                         bytes -= copy;
214                 }
215                 if (likely(!bytes)) {
216                         kunmap_atomic(kaddr);
217                         goto done;
218                 }
219                 offset = from - kaddr;
220                 buf += copy;
221                 kunmap_atomic(kaddr);
222                 copy = min(bytes, iov->iov_len - skip);
223         }
224         /* Too bad - revert to non-atomic kmap */
225
226         kaddr = kmap(page);
227         from = kaddr + offset;
228         left = copyout(buf, from, copy);
229         copy -= left;
230         skip += copy;
231         from += copy;
232         bytes -= copy;
233         while (unlikely(!left && bytes)) {
234                 iov++;
235                 buf = iov->iov_base;
236                 copy = min(bytes, iov->iov_len);
237                 left = copyout(buf, from, copy);
238                 copy -= left;
239                 skip = copy;
240                 from += copy;
241                 bytes -= copy;
242         }
243         kunmap(page);
244
245 done:
246         if (skip == iov->iov_len) {
247                 iov++;
248                 skip = 0;
249         }
250         i->count -= wanted - bytes;
251         i->nr_segs -= iov - i->iov;
252         i->iov = iov;
253         i->iov_offset = skip;
254         return wanted - bytes;
255 }
256
257 static size_t copy_page_from_iter_iovec(struct page *page, size_t offset, size_t bytes,
258                          struct iov_iter *i)
259 {
260         size_t skip, copy, left, wanted;
261         const struct iovec *iov;
262         char __user *buf;
263         void *kaddr, *to;
264
265         if (unlikely(bytes > i->count))
266                 bytes = i->count;
267
268         if (unlikely(!bytes))
269                 return 0;
270
271         might_fault();
272         wanted = bytes;
273         iov = i->iov;
274         skip = i->iov_offset;
275         buf = iov->iov_base + skip;
276         copy = min(bytes, iov->iov_len - skip);
277
278         if (IS_ENABLED(CONFIG_HIGHMEM) && !fault_in_readable(buf, copy)) {
279                 kaddr = kmap_atomic(page);
280                 to = kaddr + offset;
281
282                 /* first chunk, usually the only one */
283                 left = copyin(to, buf, copy);
284                 copy -= left;
285                 skip += copy;
286                 to += copy;
287                 bytes -= copy;
288
289                 while (unlikely(!left && bytes)) {
290                         iov++;
291                         buf = iov->iov_base;
292                         copy = min(bytes, iov->iov_len);
293                         left = copyin(to, buf, copy);
294                         copy -= left;
295                         skip = copy;
296                         to += copy;
297                         bytes -= copy;
298                 }
299                 if (likely(!bytes)) {
300                         kunmap_atomic(kaddr);
301                         goto done;
302                 }
303                 offset = to - kaddr;
304                 buf += copy;
305                 kunmap_atomic(kaddr);
306                 copy = min(bytes, iov->iov_len - skip);
307         }
308         /* Too bad - revert to non-atomic kmap */
309
310         kaddr = kmap(page);
311         to = kaddr + offset;
312         left = copyin(to, buf, copy);
313         copy -= left;
314         skip += copy;
315         to += copy;
316         bytes -= copy;
317         while (unlikely(!left && bytes)) {
318                 iov++;
319                 buf = iov->iov_base;
320                 copy = min(bytes, iov->iov_len);
321                 left = copyin(to, buf, copy);
322                 copy -= left;
323                 skip = copy;
324                 to += copy;
325                 bytes -= copy;
326         }
327         kunmap(page);
328
329 done:
330         if (skip == iov->iov_len) {
331                 iov++;
332                 skip = 0;
333         }
334         i->count -= wanted - bytes;
335         i->nr_segs -= iov - i->iov;
336         i->iov = iov;
337         i->iov_offset = skip;
338         return wanted - bytes;
339 }
340
341 #ifdef PIPE_PARANOIA
342 static bool sanity(const struct iov_iter *i)
343 {
344         struct pipe_inode_info *pipe = i->pipe;
345         unsigned int p_head = pipe->head;
346         unsigned int p_tail = pipe->tail;
347         unsigned int p_mask = pipe->ring_size - 1;
348         unsigned int p_occupancy = pipe_occupancy(p_head, p_tail);
349         unsigned int i_head = i->head;
350         unsigned int idx;
351
352         if (i->iov_offset) {
353                 struct pipe_buffer *p;
354                 if (unlikely(p_occupancy == 0))
355                         goto Bad;       // pipe must be non-empty
356                 if (unlikely(i_head != p_head - 1))
357                         goto Bad;       // must be at the last buffer...
358
359                 p = &pipe->bufs[i_head & p_mask];
360                 if (unlikely(p->offset + p->len != i->iov_offset))
361                         goto Bad;       // ... at the end of segment
362         } else {
363                 if (i_head != p_head)
364                         goto Bad;       // must be right after the last buffer
365         }
366         return true;
367 Bad:
368         printk(KERN_ERR "idx = %d, offset = %zd\n", i_head, i->iov_offset);
369         printk(KERN_ERR "head = %d, tail = %d, buffers = %d\n",
370                         p_head, p_tail, pipe->ring_size);
371         for (idx = 0; idx < pipe->ring_size; idx++)
372                 printk(KERN_ERR "[%p %p %d %d]\n",
373                         pipe->bufs[idx].ops,
374                         pipe->bufs[idx].page,
375                         pipe->bufs[idx].offset,
376                         pipe->bufs[idx].len);
377         WARN_ON(1);
378         return false;
379 }
380 #else
381 #define sanity(i) true
382 #endif
383
384 static size_t copy_page_to_iter_pipe(struct page *page, size_t offset, size_t bytes,
385                          struct iov_iter *i)
386 {
387         struct pipe_inode_info *pipe = i->pipe;
388         struct pipe_buffer *buf;
389         unsigned int p_tail = pipe->tail;
390         unsigned int p_mask = pipe->ring_size - 1;
391         unsigned int i_head = i->head;
392         size_t off;
393
394         if (unlikely(bytes > i->count))
395                 bytes = i->count;
396
397         if (unlikely(!bytes))
398                 return 0;
399
400         if (!sanity(i))
401                 return 0;
402
403         off = i->iov_offset;
404         buf = &pipe->bufs[i_head & p_mask];
405         if (off) {
406                 if (offset == off && buf->page == page) {
407                         /* merge with the last one */
408                         buf->len += bytes;
409                         i->iov_offset += bytes;
410                         goto out;
411                 }
412                 i_head++;
413                 buf = &pipe->bufs[i_head & p_mask];
414         }
415         if (pipe_full(i_head, p_tail, pipe->max_usage))
416                 return 0;
417
418         buf->ops = &page_cache_pipe_buf_ops;
419         buf->flags = 0;
420         get_page(page);
421         buf->page = page;
422         buf->offset = offset;
423         buf->len = bytes;
424
425         pipe->head = i_head + 1;
426         i->iov_offset = offset + bytes;
427         i->head = i_head;
428 out:
429         i->count -= bytes;
430         return bytes;
431 }
432
433 /*
434  * fault_in_iov_iter_readable - fault in iov iterator for reading
435  * @i: iterator
436  * @size: maximum length
437  *
438  * Fault in one or more iovecs of the given iov_iter, to a maximum length of
439  * @size.  For each iovec, fault in each page that constitutes the iovec.
440  *
441  * Returns the number of bytes not faulted in (like copy_to_user() and
442  * copy_from_user()).
443  *
444  * Always returns 0 for non-userspace iterators.
445  */
446 size_t fault_in_iov_iter_readable(const struct iov_iter *i, size_t size)
447 {
448         if (iter_is_iovec(i)) {
449                 size_t count = min(size, iov_iter_count(i));
450                 const struct iovec *p;
451                 size_t skip;
452
453                 size -= count;
454                 for (p = i->iov, skip = i->iov_offset; count; p++, skip = 0) {
455                         size_t len = min(count, p->iov_len - skip);
456                         size_t ret;
457
458                         if (unlikely(!len))
459                                 continue;
460                         ret = fault_in_readable(p->iov_base + skip, len);
461                         count -= len - ret;
462                         if (ret)
463                                 break;
464                 }
465                 return count + size;
466         }
467         return 0;
468 }
469 EXPORT_SYMBOL(fault_in_iov_iter_readable);
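
/*
 * A minimal sketch of the usual caller pattern (illustrative only; names
 * such as "page", "offset", "bytes" and "copied" are placeholders):
 *
 *	prefault the user memory, bail out if none of it can be reached:
 *		if (unlikely(fault_in_iov_iter_readable(i, bytes) == bytes))
 *			return -EFAULT;
 *	then copy with page faults disabled, shrinking "bytes" and retrying
 *	if the copy comes up short:
 *		copied = copy_page_from_iter_atomic(page, offset, bytes, i);
 */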
470
471 /*
472  * fault_in_iov_iter_writeable - fault in iov iterator for writing
473  * @i: iterator
474  * @size: maximum length
475  *
476  * Faults in the iterator using get_user_pages(), i.e., without triggering
477  * hardware page faults.  This is primarily useful when we already know that
478  * some or all of the pages in @i aren't in memory.
479  *
480  * Returns the number of bytes not faulted in, like copy_to_user() and
481  * copy_from_user().
482  *
483  * Always returns 0 for non-user-space iterators.
484  */
485 size_t fault_in_iov_iter_writeable(const struct iov_iter *i, size_t size)
486 {
487         if (iter_is_iovec(i)) {
488                 size_t count = min(size, iov_iter_count(i));
489                 const struct iovec *p;
490                 size_t skip;
491
492                 size -= count;
493                 for (p = i->iov, skip = i->iov_offset; count; p++, skip = 0) {
494                         size_t len = min(count, p->iov_len - skip);
495                         size_t ret;
496
497                         if (unlikely(!len))
498                                 continue;
499                         ret = fault_in_safe_writeable(p->iov_base + skip, len);
500                         count -= len - ret;
501                         if (ret)
502                                 break;
503                 }
504                 return count + size;
505         }
506         return 0;
507 }
508 EXPORT_SYMBOL(fault_in_iov_iter_writeable);
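
/*
 * Sketch of the pattern used by read paths that must not take page faults
 * while holding filesystem locks (illustrative only, placeholder names):
 *
 *	pagefault_disable();
 *	copied = copy_page_to_iter(page, offset, bytes, i);
 *	pagefault_enable();
 *	if (copied < bytes) {
 *		drop the locks, then:
 *		if (fault_in_iov_iter_writeable(i, bytes - copied) ==
 *		    bytes - copied)
 *			give up - nothing more can be faulted in;
 *		otherwise retake the locks and retry the remainder
 *	}
 */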
509
510 void iov_iter_init(struct iov_iter *i, unsigned int direction,
511                         const struct iovec *iov, unsigned long nr_segs,
512                         size_t count)
513 {
514         WARN_ON(direction & ~(READ | WRITE));
515         *i = (struct iov_iter) {
516                 .iter_type = ITER_IOVEC,
517                 .nofault = false,
518                 .data_source = direction,
519                 .iov = iov,
520                 .nr_segs = nr_segs,
521                 .iov_offset = 0,
522                 .count = count
523         };
524 }
525 EXPORT_SYMBOL(iov_iter_init);
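
/*
 * Illustrative example (all names are placeholders): exposing a single
 * user buffer to kernel code that consumes data through the iov_iter API.
 * WRITE marks the iterator as a data source, READ as a destination.
 *
 *	struct iovec iov = { .iov_base = ubuf, .iov_len = len };
 *	struct iov_iter iter;
 *
 *	iov_iter_init(&iter, WRITE, &iov, 1, len);
 *	copied = copy_from_iter(kbuf, len, &iter);
 */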
526
527 static inline bool allocated(struct pipe_buffer *buf)
528 {
529         return buf->ops == &default_pipe_buf_ops;
530 }
531
532 static inline void data_start(const struct iov_iter *i,
533                               unsigned int *iter_headp, size_t *offp)
534 {
535         unsigned int p_mask = i->pipe->ring_size - 1;
536         unsigned int iter_head = i->head;
537         size_t off = i->iov_offset;
538
539         if (off && (!allocated(&i->pipe->bufs[iter_head & p_mask]) ||
540                     off == PAGE_SIZE)) {
541                 iter_head++;
542                 off = 0;
543         }
544         *iter_headp = iter_head;
545         *offp = off;
546 }
547
548 static size_t push_pipe(struct iov_iter *i, size_t size,
549                         int *iter_headp, size_t *offp)
550 {
551         struct pipe_inode_info *pipe = i->pipe;
552         unsigned int p_tail = pipe->tail;
553         unsigned int p_mask = pipe->ring_size - 1;
554         unsigned int iter_head;
555         size_t off;
556         ssize_t left;
557
558         if (unlikely(size > i->count))
559                 size = i->count;
560         if (unlikely(!size))
561                 return 0;
562
563         left = size;
564         data_start(i, &iter_head, &off);
565         *iter_headp = iter_head;
566         *offp = off;
567         if (off) {
568                 left -= PAGE_SIZE - off;
569                 if (left <= 0) {
570                         pipe->bufs[iter_head & p_mask].len += size;
571                         return size;
572                 }
573                 pipe->bufs[iter_head & p_mask].len = PAGE_SIZE;
574                 iter_head++;
575         }
576         while (!pipe_full(iter_head, p_tail, pipe->max_usage)) {
577                 struct pipe_buffer *buf = &pipe->bufs[iter_head & p_mask];
578                 struct page *page = alloc_page(GFP_USER);
579                 if (!page)
580                         break;
581
582                 buf->ops = &default_pipe_buf_ops;
583                 buf->flags = 0;
584                 buf->page = page;
585                 buf->offset = 0;
586                 buf->len = min_t(ssize_t, left, PAGE_SIZE);
587                 left -= buf->len;
588                 iter_head++;
589                 pipe->head = iter_head;
590
591                 if (left == 0)
592                         return size;
593         }
594         return size - left;
595 }
596
597 static size_t copy_pipe_to_iter(const void *addr, size_t bytes,
598                                 struct iov_iter *i)
599 {
600         struct pipe_inode_info *pipe = i->pipe;
601         unsigned int p_mask = pipe->ring_size - 1;
602         unsigned int i_head;
603         size_t n, off;
604
605         if (!sanity(i))
606                 return 0;
607
608         bytes = n = push_pipe(i, bytes, &i_head, &off);
609         if (unlikely(!n))
610                 return 0;
611         do {
612                 size_t chunk = min_t(size_t, n, PAGE_SIZE - off);
613                 memcpy_to_page(pipe->bufs[i_head & p_mask].page, off, addr, chunk);
614                 i->head = i_head;
615                 i->iov_offset = off + chunk;
616                 n -= chunk;
617                 addr += chunk;
618                 off = 0;
619                 i_head++;
620         } while (n);
621         i->count -= bytes;
622         return bytes;
623 }
624
625 static __wsum csum_and_memcpy(void *to, const void *from, size_t len,
626                               __wsum sum, size_t off)
627 {
628         __wsum next = csum_partial_copy_nocheck(from, to, len);
629         return csum_block_add(sum, next, off);
630 }
631
632 static size_t csum_and_copy_to_pipe_iter(const void *addr, size_t bytes,
633                                          struct iov_iter *i, __wsum *sump)
634 {
635         struct pipe_inode_info *pipe = i->pipe;
636         unsigned int p_mask = pipe->ring_size - 1;
637         __wsum sum = *sump;
638         size_t off = 0;
639         unsigned int i_head;
640         size_t r;
641
642         if (!sanity(i))
643                 return 0;
644
645         bytes = push_pipe(i, bytes, &i_head, &r);
646         while (bytes) {
647                 size_t chunk = min_t(size_t, bytes, PAGE_SIZE - r);
648                 char *p = kmap_local_page(pipe->bufs[i_head & p_mask].page);
649                 sum = csum_and_memcpy(p + r, addr + off, chunk, sum, off);
650                 kunmap_local(p);
651                 i->head = i_head;
652                 i->iov_offset = r + chunk;
653                 bytes -= chunk;
654                 off += chunk;
655                 r = 0;
656                 i_head++;
657         }
658         *sump = sum;
659         i->count -= off;
660         return off;
661 }
662
663 size_t _copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
664 {
665         if (unlikely(iov_iter_is_pipe(i)))
666                 return copy_pipe_to_iter(addr, bytes, i);
667         if (iter_is_iovec(i))
668                 might_fault();
669         iterate_and_advance(i, bytes, base, len, off,
670                 copyout(base, addr + off, len),
671                 memcpy(base, addr + off, len)
672         )
673
674         return bytes;
675 }
676 EXPORT_SYMBOL(_copy_to_iter);
677
678 #ifdef CONFIG_ARCH_HAS_COPY_MC
679 static int copyout_mc(void __user *to, const void *from, size_t n)
680 {
681         if (access_ok(to, n)) {
682                 instrument_copy_to_user(to, from, n);
683                 n = copy_mc_to_user((__force void *) to, from, n);
684         }
685         return n;
686 }
687
688 static size_t copy_mc_pipe_to_iter(const void *addr, size_t bytes,
689                                 struct iov_iter *i)
690 {
691         struct pipe_inode_info *pipe = i->pipe;
692         unsigned int p_mask = pipe->ring_size - 1;
693         unsigned int i_head;
694         size_t n, off, xfer = 0;
695
696         if (!sanity(i))
697                 return 0;
698
699         n = push_pipe(i, bytes, &i_head, &off);
700         while (n) {
701                 size_t chunk = min_t(size_t, n, PAGE_SIZE - off);
702                 char *p = kmap_local_page(pipe->bufs[i_head & p_mask].page);
703                 unsigned long rem;
704                 rem = copy_mc_to_kernel(p + off, addr + xfer, chunk);
705                 chunk -= rem;
706                 kunmap_local(p);
707                 i->head = i_head;
708                 i->iov_offset = off + chunk;
709                 xfer += chunk;
710                 if (rem)
711                         break;
712                 n -= chunk;
713                 off = 0;
714                 i_head++;
715         }
716         i->count -= xfer;
717         return xfer;
718 }
719
720 /**
721  * _copy_mc_to_iter - copy to iter with source memory error exception handling
722  * @addr: source kernel address
723  * @bytes: total transfer length
724  * @i: destination iterator
725  *
726  * The pmem driver deploys this for the dax operation
727  * (dax_copy_to_iter()) for dax reads (bypass page-cache and the
728  * block-layer). Upon #MC read(2) aborts and returns EIO or the bytes
729  * successfully copied.
730  *
731  * The main differences between this and typical _copy_to_iter() are:
732  *
733  * * Typical tail/residue handling after a fault retries the copy
734  *   byte-by-byte until the fault happens again. Re-triggering machine
735  *   checks is potentially fatal so the implementation uses source
736  *   alignment and poison alignment assumptions to avoid re-triggering
737  *   hardware exceptions.
738  *
739  * * ITER_KVEC, ITER_PIPE, and ITER_BVEC can return short copies.
740  *   Compare to copy_to_iter() where only ITER_IOVEC attempts might return
741  *   a short copy.
742  *
743  * Return: number of bytes copied (may be %0)
744  */
745 size_t _copy_mc_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
746 {
747         if (unlikely(iov_iter_is_pipe(i)))
748                 return copy_mc_pipe_to_iter(addr, bytes, i);
749         if (iter_is_iovec(i))
750                 might_fault();
751         __iterate_and_advance(i, bytes, base, len, off,
752                 copyout_mc(base, addr + off, len),
753                 copy_mc_to_kernel(base, addr + off, len)
754         )
755
756         return bytes;
757 }
758 EXPORT_SYMBOL_GPL(_copy_mc_to_iter);
759 #endif /* CONFIG_ARCH_HAS_COPY_MC */
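
/*
 * Illustrative handling of the return value (placeholder names): a short
 * copy from _copy_mc_to_iter() means poison (or a usercopy fault) was hit
 * and the tail must not be retried byte by byte:
 *
 *	copied = _copy_mc_to_iter(src, len, i);
 *	if (copied != len)
 *		return -EIO;
 *
 * (or report the short byte count to the caller, as read(2) does).
 */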
760
761 size_t _copy_from_iter(void *addr, size_t bytes, struct iov_iter *i)
762 {
763         if (unlikely(iov_iter_is_pipe(i))) {
764                 WARN_ON(1);
765                 return 0;
766         }
767         if (iter_is_iovec(i))
768                 might_fault();
769         iterate_and_advance(i, bytes, base, len, off,
770                 copyin(addr + off, base, len),
771                 memcpy(addr + off, base, len)
772         )
773
774         return bytes;
775 }
776 EXPORT_SYMBOL(_copy_from_iter);
777
778 size_t _copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i)
779 {
780         if (unlikely(iov_iter_is_pipe(i))) {
781                 WARN_ON(1);
782                 return 0;
783         }
784         iterate_and_advance(i, bytes, base, len, off,
785                 __copy_from_user_inatomic_nocache(addr + off, base, len),
786                 memcpy(addr + off, base, len)
787         )
788
789         return bytes;
790 }
791 EXPORT_SYMBOL(_copy_from_iter_nocache);
792
793 #ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE
794 /**
795  * _copy_from_iter_flushcache - write destination through cpu cache
796  * @addr: destination kernel address
797  * @bytes: total transfer length
798  * @i: source iterator
799  *
800  * The pmem driver arranges for filesystem-dax to use this facility via
801  * dax_copy_from_iter() for ensuring that writes to persistent memory
802  * are flushed through the CPU cache. It is differentiated from
803  * _copy_from_iter_nocache() in that it guarantees all data is flushed for
804  * all iterator types. _copy_from_iter_nocache() only attempts to
805  * bypass the cache for the ITER_IOVEC case, and on some archs may use
806  * instructions that strand dirty-data in the cache.
807  *
808  * Return: number of bytes copied (may be %0)
809  */
810 size_t _copy_from_iter_flushcache(void *addr, size_t bytes, struct iov_iter *i)
811 {
812         if (unlikely(iov_iter_is_pipe(i))) {
813                 WARN_ON(1);
814                 return 0;
815         }
816         iterate_and_advance(i, bytes, base, len, off,
817                 __copy_from_user_flushcache(addr + off, base, len),
818                 memcpy_flushcache(addr + off, base, len)
819         )
820
821         return bytes;
822 }
823 EXPORT_SYMBOL_GPL(_copy_from_iter_flushcache);
824 #endif
825
826 static inline bool page_copy_sane(struct page *page, size_t offset, size_t n)
827 {
828         struct page *head;
829         size_t v = n + offset;
830
831         /*
832          * The general case needs to access the page order to compute the
833          * page size.  However, we mostly deal with order-0 pages and can
834          * thus avoid a possible cache line miss for requests that fit in
835          * a single page regardless of its order.  Note that "n <= v" also
836          * rejects the case where n + offset has overflowed.
837          */
838         if (n <= v && v <= PAGE_SIZE)
839                 return true;
840
841         head = compound_head(page);
842         v += (page - head) << PAGE_SHIFT;
843
844         if (likely(n <= v && v <= (page_size(head))))
845                 return true;
846         WARN_ON(1);
847         return false;
848 }
849
850 static size_t __copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
851                          struct iov_iter *i)
852 {
853         if (likely(iter_is_iovec(i)))
854                 return copy_page_to_iter_iovec(page, offset, bytes, i);
855         if (iov_iter_is_bvec(i) || iov_iter_is_kvec(i) || iov_iter_is_xarray(i)) {
856                 void *kaddr = kmap_local_page(page);
857                 size_t wanted = _copy_to_iter(kaddr + offset, bytes, i);
858                 kunmap_local(kaddr);
859                 return wanted;
860         }
861         if (iov_iter_is_pipe(i))
862                 return copy_page_to_iter_pipe(page, offset, bytes, i);
863         if (unlikely(iov_iter_is_discard(i))) {
864                 if (unlikely(i->count < bytes))
865                         bytes = i->count;
866                 i->count -= bytes;
867                 return bytes;
868         }
869         WARN_ON(1);
870         return 0;
871 }
872
873 size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
874                          struct iov_iter *i)
875 {
876         size_t res = 0;
877         if (unlikely(!page_copy_sane(page, offset, bytes)))
878                 return 0;
879         page += offset / PAGE_SIZE; // first subpage
880         offset %= PAGE_SIZE;
881         while (1) {
882                 size_t n = __copy_page_to_iter(page, offset,
883                                 min(bytes, (size_t)PAGE_SIZE - offset), i);
884                 res += n;
885                 bytes -= n;
886                 if (!bytes || !n)
887                         break;
888                 offset += n;
889                 if (offset == PAGE_SIZE) {
890                         page++;
891                         offset = 0;
892                 }
893         }
894         return res;
895 }
896 EXPORT_SYMBOL(copy_page_to_iter);
897
898 size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes,
899                          struct iov_iter *i)
900 {
901         if (unlikely(!page_copy_sane(page, offset, bytes)))
902                 return 0;
903         if (likely(iter_is_iovec(i)))
904                 return copy_page_from_iter_iovec(page, offset, bytes, i);
905         if (iov_iter_is_bvec(i) || iov_iter_is_kvec(i) || iov_iter_is_xarray(i)) {
906                 void *kaddr = kmap_local_page(page);
907                 size_t wanted = _copy_from_iter(kaddr + offset, bytes, i);
908                 kunmap_local(kaddr);
909                 return wanted;
910         }
911         WARN_ON(1);
912         return 0;
913 }
914 EXPORT_SYMBOL(copy_page_from_iter);
915
916 static size_t pipe_zero(size_t bytes, struct iov_iter *i)
917 {
918         struct pipe_inode_info *pipe = i->pipe;
919         unsigned int p_mask = pipe->ring_size - 1;
920         unsigned int i_head;
921         size_t n, off;
922
923         if (!sanity(i))
924                 return 0;
925
926         bytes = n = push_pipe(i, bytes, &i_head, &off);
927         if (unlikely(!n))
928                 return 0;
929
930         do {
931                 size_t chunk = min_t(size_t, n, PAGE_SIZE - off);
932                 char *p = kmap_local_page(pipe->bufs[i_head & p_mask].page);
933                 memset(p + off, 0, chunk);
934                 kunmap_local(p);
935                 i->head = i_head;
936                 i->iov_offset = off + chunk;
937                 n -= chunk;
938                 off = 0;
939                 i_head++;
940         } while (n);
941         i->count -= bytes;
942         return bytes;
943 }
944
945 size_t iov_iter_zero(size_t bytes, struct iov_iter *i)
946 {
947         if (unlikely(iov_iter_is_pipe(i)))
948                 return pipe_zero(bytes, i);
949         iterate_and_advance(i, bytes, base, len, count,
950                 clear_user(base, len),
951                 memset(base, 0, len)
952         )
953
954         return bytes;
955 }
956 EXPORT_SYMBOL(iov_iter_zero);
957
958 size_t copy_page_from_iter_atomic(struct page *page, unsigned offset, size_t bytes,
959                                   struct iov_iter *i)
960 {
961         char *kaddr = kmap_atomic(page), *p = kaddr + offset;
962         if (unlikely(!page_copy_sane(page, offset, bytes))) {
963                 kunmap_atomic(kaddr);
964                 return 0;
965         }
966         if (unlikely(iov_iter_is_pipe(i) || iov_iter_is_discard(i))) {
967                 kunmap_atomic(kaddr);
968                 WARN_ON(1);
969                 return 0;
970         }
971         iterate_and_advance(i, bytes, base, len, off,
972                 copyin(p + off, base, len),
973                 memcpy(p + off, base, len)
974         )
975         kunmap_atomic(kaddr);
976         return bytes;
977 }
978 EXPORT_SYMBOL(copy_page_from_iter_atomic);
979
980 static inline void pipe_truncate(struct iov_iter *i)
981 {
982         struct pipe_inode_info *pipe = i->pipe;
983         unsigned int p_tail = pipe->tail;
984         unsigned int p_head = pipe->head;
985         unsigned int p_mask = pipe->ring_size - 1;
986
987         if (!pipe_empty(p_head, p_tail)) {
988                 struct pipe_buffer *buf;
989                 unsigned int i_head = i->head;
990                 size_t off = i->iov_offset;
991
992                 if (off) {
993                         buf = &pipe->bufs[i_head & p_mask];
994                         buf->len = off - buf->offset;
995                         i_head++;
996                 }
997                 while (p_head != i_head) {
998                         p_head--;
999                         pipe_buf_release(pipe, &pipe->bufs[p_head & p_mask]);
1000                 }
1001
1002                 pipe->head = p_head;
1003         }
1004 }
1005
1006 static void pipe_advance(struct iov_iter *i, size_t size)
1007 {
1008         struct pipe_inode_info *pipe = i->pipe;
1009         if (size) {
1010                 struct pipe_buffer *buf;
1011                 unsigned int p_mask = pipe->ring_size - 1;
1012                 unsigned int i_head = i->head;
1013                 size_t off = i->iov_offset, left = size;
1014
1015                 if (off) /* make it relative to the beginning of buffer */
1016                         left += off - pipe->bufs[i_head & p_mask].offset;
1017                 while (1) {
1018                         buf = &pipe->bufs[i_head & p_mask];
1019                         if (left <= buf->len)
1020                                 break;
1021                         left -= buf->len;
1022                         i_head++;
1023                 }
1024                 i->head = i_head;
1025                 i->iov_offset = buf->offset + left;
1026         }
1027         i->count -= size;
1028         /* ... and discard everything past that point */
1029         pipe_truncate(i);
1030 }
1031
1032 static void iov_iter_bvec_advance(struct iov_iter *i, size_t size)
1033 {
1034         struct bvec_iter bi;
1035
1036         bi.bi_size = i->count;
1037         bi.bi_bvec_done = i->iov_offset;
1038         bi.bi_idx = 0;
1039         bvec_iter_advance(i->bvec, &bi, size);
1040
1041         i->bvec += bi.bi_idx;
1042         i->nr_segs -= bi.bi_idx;
1043         i->count = bi.bi_size;
1044         i->iov_offset = bi.bi_bvec_done;
1045 }
1046
1047 static void iov_iter_iovec_advance(struct iov_iter *i, size_t size)
1048 {
1049         const struct iovec *iov, *end;
1050
1051         if (!i->count)
1052                 return;
1053         i->count -= size;
1054
1055         size += i->iov_offset; // from beginning of current segment
1056         for (iov = i->iov, end = iov + i->nr_segs; iov < end; iov++) {
1057                 if (likely(size < iov->iov_len))
1058                         break;
1059                 size -= iov->iov_len;
1060         }
1061         i->iov_offset = size;
1062         i->nr_segs -= iov - i->iov;
1063         i->iov = iov;
1064 }
1065
1066 void iov_iter_advance(struct iov_iter *i, size_t size)
1067 {
1068         if (unlikely(i->count < size))
1069                 size = i->count;
1070         if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i))) {
1071                 /* iovec and kvec have identical layouts */
1072                 iov_iter_iovec_advance(i, size);
1073         } else if (iov_iter_is_bvec(i)) {
1074                 iov_iter_bvec_advance(i, size);
1075         } else if (iov_iter_is_pipe(i)) {
1076                 pipe_advance(i, size);
1077         } else if (unlikely(iov_iter_is_xarray(i))) {
1078                 i->iov_offset += size;
1079                 i->count -= size;
1080         } else if (iov_iter_is_discard(i)) {
1081                 i->count -= size;
1082         }
1083 }
1084 EXPORT_SYMBOL(iov_iter_advance);
1085
1086 void iov_iter_revert(struct iov_iter *i, size_t unroll)
1087 {
1088         if (!unroll)
1089                 return;
1090         if (WARN_ON(unroll > MAX_RW_COUNT))
1091                 return;
1092         i->count += unroll;
1093         if (unlikely(iov_iter_is_pipe(i))) {
1094                 struct pipe_inode_info *pipe = i->pipe;
1095                 unsigned int p_mask = pipe->ring_size - 1;
1096                 unsigned int i_head = i->head;
1097                 size_t off = i->iov_offset;
1098                 while (1) {
1099                         struct pipe_buffer *b = &pipe->bufs[i_head & p_mask];
1100                         size_t n = off - b->offset;
1101                         if (unroll < n) {
1102                                 off -= unroll;
1103                                 break;
1104                         }
1105                         unroll -= n;
1106                         if (!unroll && i_head == i->start_head) {
1107                                 off = 0;
1108                                 break;
1109                         }
1110                         i_head--;
1111                         b = &pipe->bufs[i_head & p_mask];
1112                         off = b->offset + b->len;
1113                 }
1114                 i->iov_offset = off;
1115                 i->head = i_head;
1116                 pipe_truncate(i);
1117                 return;
1118         }
1119         if (unlikely(iov_iter_is_discard(i)))
1120                 return;
1121         if (unroll <= i->iov_offset) {
1122                 i->iov_offset -= unroll;
1123                 return;
1124         }
1125         unroll -= i->iov_offset;
1126         if (iov_iter_is_xarray(i)) {
1127                 BUG(); /* We should never go beyond the start of the specified
1128                         * range since we might then be straying into pages that
1129                         * aren't pinned.
1130                         */
1131         } else if (iov_iter_is_bvec(i)) {
1132                 const struct bio_vec *bvec = i->bvec;
1133                 while (1) {
1134                         size_t n = (--bvec)->bv_len;
1135                         i->nr_segs++;
1136                         if (unroll <= n) {
1137                                 i->bvec = bvec;
1138                                 i->iov_offset = n - unroll;
1139                                 return;
1140                         }
1141                         unroll -= n;
1142                 }
1143         } else { /* same logic for iovec and kvec */
1144                 const struct iovec *iov = i->iov;
1145                 while (1) {
1146                         size_t n = (--iov)->iov_len;
1147                         i->nr_segs++;
1148                         if (unroll <= n) {
1149                                 i->iov = iov;
1150                                 i->iov_offset = n - unroll;
1151                                 return;
1152                         }
1153                         unroll -= n;
1154                 }
1155         }
1156 }
1157 EXPORT_SYMBOL(iov_iter_revert);
1158
1159 /*
1160  * Return the count of just the current iov_iter segment.
1161  */
1162 size_t iov_iter_single_seg_count(const struct iov_iter *i)
1163 {
1164         if (i->nr_segs > 1) {
1165                 if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i)))
1166                         return min(i->count, i->iov->iov_len - i->iov_offset);
1167                 if (iov_iter_is_bvec(i))
1168                         return min(i->count, i->bvec->bv_len - i->iov_offset);
1169         }
1170         return i->count;
1171 }
1172 EXPORT_SYMBOL(iov_iter_single_seg_count);
1173
1174 void iov_iter_kvec(struct iov_iter *i, unsigned int direction,
1175                         const struct kvec *kvec, unsigned long nr_segs,
1176                         size_t count)
1177 {
1178         WARN_ON(direction & ~(READ | WRITE));
1179         *i = (struct iov_iter){
1180                 .iter_type = ITER_KVEC,
1181                 .data_source = direction,
1182                 .kvec = kvec,
1183                 .nr_segs = nr_segs,
1184                 .iov_offset = 0,
1185                 .count = count
1186         };
1187 }
1188 EXPORT_SYMBOL(iov_iter_kvec);
1189
1190 void iov_iter_bvec(struct iov_iter *i, unsigned int direction,
1191                         const struct bio_vec *bvec, unsigned long nr_segs,
1192                         size_t count)
1193 {
1194         WARN_ON(direction & ~(READ | WRITE));
1195         *i = (struct iov_iter){
1196                 .iter_type = ITER_BVEC,
1197                 .data_source = direction,
1198                 .bvec = bvec,
1199                 .nr_segs = nr_segs,
1200                 .iov_offset = 0,
1201                 .count = count
1202         };
1203 }
1204 EXPORT_SYMBOL(iov_iter_bvec);
1205
1206 void iov_iter_pipe(struct iov_iter *i, unsigned int direction,
1207                         struct pipe_inode_info *pipe,
1208                         size_t count)
1209 {
1210         BUG_ON(direction != READ);
1211         WARN_ON(pipe_full(pipe->head, pipe->tail, pipe->ring_size));
1212         *i = (struct iov_iter){
1213                 .iter_type = ITER_PIPE,
1214                 .data_source = false,
1215                 .pipe = pipe,
1216                 .head = pipe->head,
1217                 .start_head = pipe->head,
1218                 .iov_offset = 0,
1219                 .count = count
1220         };
1221 }
1222 EXPORT_SYMBOL(iov_iter_pipe);
1223
1224 /**
1225  * iov_iter_xarray - Initialise an I/O iterator to use the pages in an xarray
1226  * @i: The iterator to initialise.
1227  * @direction: The direction of the transfer.
1228  * @xarray: The xarray to access.
1229  * @start: The start file position.
1230  * @count: The size of the I/O buffer in bytes.
1231  *
1232  * Set up an I/O iterator to either draw data out of the pages attached to an
1233  * inode or to inject data into those pages.  The caller *must* prevent the
1234  * pages from being evicted or freed while the iterator is in use, either by
1235  * taking a ref on them or by locking them.
1236  */
1237 void iov_iter_xarray(struct iov_iter *i, unsigned int direction,
1238                      struct xarray *xarray, loff_t start, size_t count)
1239 {
1240         BUG_ON(direction & ~1);
1241         *i = (struct iov_iter) {
1242                 .iter_type = ITER_XARRAY,
1243                 .data_source = direction,
1244                 .xarray = xarray,
1245                 .xarray_start = start,
1246                 .count = count,
1247                 .iov_offset = 0
1248         };
1249 }
1250 EXPORT_SYMBOL(iov_iter_xarray);
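
/*
 * Illustrative only (placeholder names): a filesystem that wants incoming
 * data written straight into the inode's page cache, with the pages
 * already attached to the mapping and held by the caller:
 *
 *	struct iov_iter iter;
 *
 *	iov_iter_xarray(&iter, READ, &mapping->i_pages, pos, count);
 *	then hand &iter to whatever fills it, e.g. copy_to_iter()
 */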
1251
1252 /**
1253  * iov_iter_discard - Initialise an I/O iterator that discards data
1254  * @i: The iterator to initialise.
1255  * @direction: The direction of the transfer.
1256  * @count: The size of the I/O buffer in bytes.
1257  *
1258  * Set up an I/O iterator that just discards everything that's written to it.
1259  * It's only available as a READ iterator.
1260  */
1261 void iov_iter_discard(struct iov_iter *i, unsigned int direction, size_t count)
1262 {
1263         BUG_ON(direction != READ);
1264         *i = (struct iov_iter){
1265                 .iter_type = ITER_DISCARD,
1266                 .data_source = false,
1267                 .count = count,
1268                 .iov_offset = 0
1269         };
1270 }
1271 EXPORT_SYMBOL(iov_iter_discard);
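
/*
 * Illustrative only: throwing away @count bytes from a producer that can
 * only deliver data through an iterator:
 *
 *	struct iov_iter iter;
 *
 *	iov_iter_discard(&iter, READ, count);
 *	anything subsequently copy_to_iter()'d into &iter is dropped,
 *	while the iterator still advances as if the data had been consumed
 */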
1272
1273 static unsigned long iov_iter_alignment_iovec(const struct iov_iter *i)
1274 {
1275         unsigned long res = 0;
1276         size_t size = i->count;
1277         size_t skip = i->iov_offset;
1278         unsigned k;
1279
1280         for (k = 0; k < i->nr_segs; k++, skip = 0) {
1281                 size_t len = i->iov[k].iov_len - skip;
1282                 if (len) {
1283                         res |= (unsigned long)i->iov[k].iov_base + skip;
1284                         if (len > size)
1285                                 len = size;
1286                         res |= len;
1287                         size -= len;
1288                         if (!size)
1289                                 break;
1290                 }
1291         }
1292         return res;
1293 }
1294
1295 static unsigned long iov_iter_alignment_bvec(const struct iov_iter *i)
1296 {
1297         unsigned res = 0;
1298         size_t size = i->count;
1299         unsigned skip = i->iov_offset;
1300         unsigned k;
1301
1302         for (k = 0; k < i->nr_segs; k++, skip = 0) {
1303                 size_t len = i->bvec[k].bv_len - skip;
1304                 res |= (unsigned long)i->bvec[k].bv_offset + skip;
1305                 if (len > size)
1306                         len = size;
1307                 res |= len;
1308                 size -= len;
1309                 if (!size)
1310                         break;
1311         }
1312         return res;
1313 }
1314
1315 unsigned long iov_iter_alignment(const struct iov_iter *i)
1316 {
1317         /* iovec and kvec have identical layouts */
1318         if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i)))
1319                 return iov_iter_alignment_iovec(i);
1320
1321         if (iov_iter_is_bvec(i))
1322                 return iov_iter_alignment_bvec(i);
1323
1324         if (iov_iter_is_pipe(i)) {
1325                 unsigned int p_mask = i->pipe->ring_size - 1;
1326                 size_t size = i->count;
1327
1328                 if (size && i->iov_offset && allocated(&i->pipe->bufs[i->head & p_mask]))
1329                         return size | i->iov_offset;
1330                 return size;
1331         }
1332
1333         if (iov_iter_is_xarray(i))
1334                 return (i->xarray_start + i->iov_offset) | i->count;
1335
1336         return 0;
1337 }
1338 EXPORT_SYMBOL(iov_iter_alignment);
1339
1340 unsigned long iov_iter_gap_alignment(const struct iov_iter *i)
1341 {
1342         unsigned long res = 0;
1343         unsigned long v = 0;
1344         size_t size = i->count;
1345         unsigned k;
1346
1347         if (WARN_ON(!iter_is_iovec(i)))
1348                 return ~0U;
1349
1350         for (k = 0; k < i->nr_segs; k++) {
1351                 if (i->iov[k].iov_len) {
1352                         unsigned long base = (unsigned long)i->iov[k].iov_base;
1353                         if (v) // if not the first one
1354                                 res |= base | v; // this start | previous end
1355                         v = base + i->iov[k].iov_len;
1356                         if (size <= i->iov[k].iov_len)
1357                                 break;
1358                         size -= i->iov[k].iov_len;
1359                 }
1360         }
1361         return res;
1362 }
1363 EXPORT_SYMBOL(iov_iter_gap_alignment);
1364
1365 static inline ssize_t __pipe_get_pages(struct iov_iter *i,
1366                                 size_t maxsize,
1367                                 struct page **pages,
1368                                 int iter_head,
1369                                 size_t *start)
1370 {
1371         struct pipe_inode_info *pipe = i->pipe;
1372         unsigned int p_mask = pipe->ring_size - 1;
1373         ssize_t n = push_pipe(i, maxsize, &iter_head, start);
1374         if (!n)
1375                 return -EFAULT;
1376
1377         maxsize = n;
1378         n += *start;
1379         while (n > 0) {
1380                 get_page(*pages++ = pipe->bufs[iter_head & p_mask].page);
1381                 iter_head++;
1382                 n -= PAGE_SIZE;
1383         }
1384
1385         return maxsize;
1386 }
1387
1388 static ssize_t pipe_get_pages(struct iov_iter *i,
1389                    struct page **pages, size_t maxsize, unsigned maxpages,
1390                    size_t *start)
1391 {
1392         unsigned int iter_head, npages;
1393         size_t capacity;
1394
1395         if (!sanity(i))
1396                 return -EFAULT;
1397
1398         data_start(i, &iter_head, start);
1399         /* Amount of free space: some of this one + all after this one */
1400         npages = pipe_space_for_user(iter_head, i->pipe->tail, i->pipe);
1401         capacity = min(npages, maxpages) * PAGE_SIZE - *start;
1402
1403         return __pipe_get_pages(i, min(maxsize, capacity), pages, iter_head, start);
1404 }
1405
1406 static ssize_t iter_xarray_populate_pages(struct page **pages, struct xarray *xa,
1407                                           pgoff_t index, unsigned int nr_pages)
1408 {
1409         XA_STATE(xas, xa, index);
1410         struct page *page;
1411         unsigned int ret = 0;
1412
1413         rcu_read_lock();
1414         for (page = xas_load(&xas); page; page = xas_next(&xas)) {
1415                 if (xas_retry(&xas, page))
1416                         continue;
1417
1418                 /* Has the page moved or been split? */
1419                 if (unlikely(page != xas_reload(&xas))) {
1420                         xas_reset(&xas);
1421                         continue;
1422                 }
1423
1424                 pages[ret] = find_subpage(page, xas.xa_index);
1425                 get_page(pages[ret]);
1426                 if (++ret == nr_pages)
1427                         break;
1428         }
1429         rcu_read_unlock();
1430         return ret;
1431 }
1432
1433 static ssize_t iter_xarray_get_pages(struct iov_iter *i,
1434                                      struct page **pages, size_t maxsize,
1435                                      unsigned maxpages, size_t *_start_offset)
1436 {
1437         unsigned nr, offset;
1438         pgoff_t index, count;
1439         size_t size = maxsize;
1440         loff_t pos;
1441
1442         if (!size || !maxpages)
1443                 return 0;
1444
1445         pos = i->xarray_start + i->iov_offset;
1446         index = pos >> PAGE_SHIFT;
1447         offset = pos & ~PAGE_MASK;
1448         *_start_offset = offset;
1449
1450         count = 1;
1451         if (size > PAGE_SIZE - offset) {
1452                 size -= PAGE_SIZE - offset;
1453                 count += size >> PAGE_SHIFT;
1454                 size &= ~PAGE_MASK;
1455                 if (size)
1456                         count++;
1457         }
1458
1459         if (count > maxpages)
1460                 count = maxpages;
1461
1462         nr = iter_xarray_populate_pages(pages, i->xarray, index, count);
1463         if (nr == 0)
1464                 return 0;
1465
1466         return min_t(size_t, nr * PAGE_SIZE - offset, maxsize);
1467 }
1468
1469 /* must be done on a non-empty ITER_IOVEC iterator */
1470 static unsigned long first_iovec_segment(const struct iov_iter *i,
1471                                          size_t *size, size_t *start,
1472                                          size_t maxsize, unsigned maxpages)
1473 {
1474         size_t skip;
1475         long k;
1476
1477         for (k = 0, skip = i->iov_offset; k < i->nr_segs; k++, skip = 0) {
1478                 unsigned long addr = (unsigned long)i->iov[k].iov_base + skip;
1479                 size_t len = i->iov[k].iov_len - skip;
1480
1481                 if (unlikely(!len))
1482                         continue;
1483                 if (len > maxsize)
1484                         len = maxsize;
1485                 len += (*start = addr % PAGE_SIZE);
1486                 if (len > maxpages * PAGE_SIZE)
1487                         len = maxpages * PAGE_SIZE;
1488                 *size = len;
1489                 return addr & PAGE_MASK;
1490         }
1491         BUG(); // if it had been empty, we wouldn't get called
1492 }
1493
1494 /* must be done on a non-empty ITER_BVEC iterator */
1495 static struct page *first_bvec_segment(const struct iov_iter *i,
1496                                        size_t *size, size_t *start,
1497                                        size_t maxsize, unsigned maxpages)
1498 {
1499         struct page *page;
1500         size_t skip = i->iov_offset, len;
1501
1502         len = i->bvec->bv_len - skip;
1503         if (len > maxsize)
1504                 len = maxsize;
1505         skip += i->bvec->bv_offset;
1506         page = i->bvec->bv_page + skip / PAGE_SIZE;
1507         len += (*start = skip % PAGE_SIZE);
1508         if (len > maxpages * PAGE_SIZE)
1509                 len = maxpages * PAGE_SIZE;
1510         *size = len;
1511         return page;
1512 }
1513
1514 ssize_t iov_iter_get_pages(struct iov_iter *i,
1515                    struct page **pages, size_t maxsize, unsigned maxpages,
1516                    size_t *start)
1517 {
1518         size_t len;
1519         int n, res;
1520
1521         if (maxsize > i->count)
1522                 maxsize = i->count;
1523         if (!maxsize)
1524                 return 0;
1525
1526         if (likely(iter_is_iovec(i))) {
1527                 unsigned int gup_flags = 0;
1528                 unsigned long addr;
1529
1530                 if (iov_iter_rw(i) != WRITE)
1531                         gup_flags |= FOLL_WRITE;
1532                 if (i->nofault)
1533                         gup_flags |= FOLL_NOFAULT;
1534
1535                 addr = first_iovec_segment(i, &len, start, maxsize, maxpages);
1536                 n = DIV_ROUND_UP(len, PAGE_SIZE);
1537                 res = get_user_pages_fast(addr, n, gup_flags, pages);
1538                 if (unlikely(res <= 0))
1539                         return res;
1540                 return (res == n ? len : res * PAGE_SIZE) - *start;
1541         }
1542         if (iov_iter_is_bvec(i)) {
1543                 struct page *page;
1544
1545                 page = first_bvec_segment(i, &len, start, maxsize, maxpages);
1546                 n = DIV_ROUND_UP(len, PAGE_SIZE);
1547                 while (n--)
1548                         get_page(*pages++ = page++);
1549                 return len - *start;
1550         }
1551         if (iov_iter_is_pipe(i))
1552                 return pipe_get_pages(i, pages, maxsize, maxpages, start);
1553         if (iov_iter_is_xarray(i))
1554                 return iter_xarray_get_pages(i, pages, maxsize, maxpages, start);
1555         return -EFAULT;
1556 }
1557 EXPORT_SYMBOL(iov_iter_get_pages);
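/*
 * Illustrative sketch (editorial addition, not part of the original source):
 * a typical iov_iter_get_pages() caller pins a batch of pages, uses the
 * returned byte count starting at offset "start" within the first page,
 * advances the iterator and drops the page references.  consume_pages() is
 * a hypothetical helper standing in for whatever the caller does with the
 * data.
 *
 *	struct page *pages[16];
 *	size_t start;
 *	ssize_t n;
 *	int k, npages;
 *
 *	n = iov_iter_get_pages(iter, pages, iov_iter_count(iter),
 *			       ARRAY_SIZE(pages), &start);
 *	if (n <= 0)
 *		return n;
 *	npages = DIV_ROUND_UP(n + start, PAGE_SIZE);
 *	consume_pages(pages, npages, start, n);
 *	iov_iter_advance(iter, n);
 *	for (k = 0; k < npages; k++)
 *		put_page(pages[k]);	// drop the references taken above
 */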
1558
1559 static struct page **get_pages_array(size_t n)
1560 {
1561         return kvmalloc_array(n, sizeof(struct page *), GFP_KERNEL);
1562 }
1563
1564 static ssize_t pipe_get_pages_alloc(struct iov_iter *i,
1565                    struct page ***pages, size_t maxsize,
1566                    size_t *start)
1567 {
1568         struct page **p;
1569         unsigned int iter_head, npages;
1570         ssize_t n;
1571
1572         if (!sanity(i))
1573                 return -EFAULT;
1574
1575         data_start(i, &iter_head, start);
1576         /* Amount of free space: some of this one + all after this one */
1577         npages = pipe_space_for_user(iter_head, i->pipe->tail, i->pipe);
1578         n = npages * PAGE_SIZE - *start;
1579         if (maxsize > n)
1580                 maxsize = n;
1581         else
1582                 npages = DIV_ROUND_UP(maxsize + *start, PAGE_SIZE);
1583         p = get_pages_array(npages);
1584         if (!p)
1585                 return -ENOMEM;
1586         n = __pipe_get_pages(i, maxsize, p, iter_head, start);
1587         if (n > 0)
1588                 *pages = p;
1589         else
1590                 kvfree(p);
1591         return n;
1592 }
1593
1594 static ssize_t iter_xarray_get_pages_alloc(struct iov_iter *i,
1595                                            struct page ***pages, size_t maxsize,
1596                                            size_t *_start_offset)
1597 {
1598         struct page **p;
1599         unsigned nr, offset;
1600         pgoff_t index, count;
1601         size_t size = maxsize;
1602         loff_t pos;
1603
1604         if (!size)
1605                 return 0;
1606
1607         pos = i->xarray_start + i->iov_offset;
1608         index = pos >> PAGE_SHIFT;
1609         offset = pos & ~PAGE_MASK;
1610         *_start_offset = offset;
1611
1612         count = 1;
1613         if (size > PAGE_SIZE - offset) {
1614                 size -= PAGE_SIZE - offset;
1615                 count += size >> PAGE_SHIFT;
1616                 size &= ~PAGE_MASK;
1617                 if (size)
1618                         count++;
1619         }
1620
1621         p = get_pages_array(count);
1622         if (!p)
1623                 return -ENOMEM;
1624         *pages = p;
1625
1626         nr = iter_xarray_populate_pages(p, i->xarray, index, count);
1627         if (nr == 0)
1628                 return 0;
1629
1630         return min_t(size_t, nr * PAGE_SIZE - offset, maxsize);
1631 }
1632
1633 ssize_t iov_iter_get_pages_alloc(struct iov_iter *i,
1634                    struct page ***pages, size_t maxsize,
1635                    size_t *start)
1636 {
1637         struct page **p;
1638         size_t len;
1639         int n, res;
1640
1641         if (maxsize > i->count)
1642                 maxsize = i->count;
1643         if (!maxsize)
1644                 return 0;
1645
1646         if (likely(iter_is_iovec(i))) {
1647                 unsigned int gup_flags = 0;
1648                 unsigned long addr;
1649
1650                 if (iov_iter_rw(i) != WRITE)
1651                         gup_flags |= FOLL_WRITE;
1652                 if (i->nofault)
1653                         gup_flags |= FOLL_NOFAULT;
1654
1655                 addr = first_iovec_segment(i, &len, start, maxsize, ~0U);
1656                 n = DIV_ROUND_UP(len, PAGE_SIZE);
1657                 p = get_pages_array(n);
1658                 if (!p)
1659                         return -ENOMEM;
1660                 res = get_user_pages_fast(addr, n, gup_flags, p);
1661                 if (unlikely(res <= 0)) {
1662                         kvfree(p);
1663                         *pages = NULL;
1664                         return res;
1665                 }
1666                 *pages = p;
1667                 return (res == n ? len : res * PAGE_SIZE) - *start;
1668         }
1669         if (iov_iter_is_bvec(i)) {
1670                 struct page *page;
1671
1672                 page = first_bvec_segment(i, &len, start, maxsize, ~0U);
1673                 n = DIV_ROUND_UP(len, PAGE_SIZE);
1674                 *pages = p = get_pages_array(n);
1675                 if (!p)
1676                         return -ENOMEM;
1677                 while (n--)
1678                         get_page(*p++ = page++);
1679                 return len - *start;
1680         }
1681         if (iov_iter_is_pipe(i))
1682                 return pipe_get_pages_alloc(i, pages, maxsize, start);
1683         if (iov_iter_is_xarray(i))
1684                 return iter_xarray_get_pages_alloc(i, pages, maxsize, start);
1685         return -EFAULT;
1686 }
1687 EXPORT_SYMBOL(iov_iter_get_pages_alloc);
1688
1689 size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum,
1690                                struct iov_iter *i)
1691 {
1692         __wsum sum, next;
1693         sum = *csum;
1694         if (unlikely(iov_iter_is_pipe(i) || iov_iter_is_discard(i))) {
1695                 WARN_ON(1);
1696                 return 0;
1697         }
1698         iterate_and_advance(i, bytes, base, len, off, ({
1699                 next = csum_and_copy_from_user(base, addr + off, len);
1700                 sum = csum_block_add(sum, next, off);
1701                 next ? 0 : len;
1702         }), ({
1703                 sum = csum_and_memcpy(addr + off, base, len, sum, off);
1704         })
1705         )
1706         *csum = sum;
1707         return bytes;
1708 }
1709 EXPORT_SYMBOL(csum_and_copy_from_iter);
1710
1711 size_t csum_and_copy_to_iter(const void *addr, size_t bytes, void *_csstate,
1712                              struct iov_iter *i)
1713 {
1714         struct csum_state *csstate = _csstate;
1715         __wsum sum, next;
1716
1717         if (unlikely(iov_iter_is_discard(i))) {
1718                 WARN_ON(1);     /* for now */
1719                 return 0;
1720         }
1721
1722         sum = csum_shift(csstate->csum, csstate->off);
1723         if (unlikely(iov_iter_is_pipe(i)))
1724                 bytes = csum_and_copy_to_pipe_iter(addr, bytes, i, &sum);
1725         else iterate_and_advance(i, bytes, base, len, off, ({
1726                 next = csum_and_copy_to_user(addr + off, base, len);
1727                 sum = csum_block_add(sum, next, off);
1728                 next ? 0 : len;
1729         }), ({
1730                 sum = csum_and_memcpy(base, addr + off, len, sum, off);
1731         })
1732         )
1733         csstate->csum = csum_shift(sum, csstate->off);
1734         csstate->off += bytes;
1735         return bytes;
1736 }
1737 EXPORT_SYMBOL(csum_and_copy_to_iter);
1738
1739 size_t hash_and_copy_to_iter(const void *addr, size_t bytes, void *hashp,
1740                 struct iov_iter *i)
1741 {
1742 #ifdef CONFIG_CRYPTO_HASH
1743         struct ahash_request *hash = hashp;
1744         struct scatterlist sg;
1745         size_t copied;
1746
1747         copied = copy_to_iter(addr, bytes, i);
1748         sg_init_one(&sg, addr, copied);
1749         ahash_request_set_crypt(hash, &sg, NULL, copied);
1750         crypto_ahash_update(hash);
1751         return copied;
1752 #else
1753         return 0;
1754 #endif
1755 }
1756 EXPORT_SYMBOL(hash_and_copy_to_iter);
1757
1758 static int iov_npages(const struct iov_iter *i, int maxpages)
1759 {
1760         size_t skip = i->iov_offset, size = i->count;
1761         const struct iovec *p;
1762         int npages = 0;
1763
1764         for (p = i->iov; size; skip = 0, p++) {
1765                 unsigned offs = offset_in_page(p->iov_base + skip);
1766                 size_t len = min(p->iov_len - skip, size);
1767
1768                 if (len) {
1769                         size -= len;
1770                         npages += DIV_ROUND_UP(offs + len, PAGE_SIZE);
1771                         if (unlikely(npages > maxpages))
1772                                 return maxpages;
1773                 }
1774         }
1775         return npages;
1776 }
1777
1778 static int bvec_npages(const struct iov_iter *i, int maxpages)
1779 {
1780         size_t skip = i->iov_offset, size = i->count;
1781         const struct bio_vec *p;
1782         int npages = 0;
1783
1784         for (p = i->bvec; size; skip = 0, p++) {
1785                 unsigned offs = (p->bv_offset + skip) % PAGE_SIZE;
1786                 size_t len = min(p->bv_len - skip, size);
1787
1788                 size -= len;
1789                 npages += DIV_ROUND_UP(offs + len, PAGE_SIZE);
1790                 if (unlikely(npages > maxpages))
1791                         return maxpages;
1792         }
1793         return npages;
1794 }
1795
1796 int iov_iter_npages(const struct iov_iter *i, int maxpages)
1797 {
1798         if (unlikely(!i->count))
1799                 return 0;
1800         /* iovec and kvec have identical layouts */
1801         if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i)))
1802                 return iov_npages(i, maxpages);
1803         if (iov_iter_is_bvec(i))
1804                 return bvec_npages(i, maxpages);
1805         if (iov_iter_is_pipe(i)) {
1806                 unsigned int iter_head;
1807                 int npages;
1808                 size_t off;
1809
1810                 if (!sanity(i))
1811                         return 0;
1812
1813                 data_start(i, &iter_head, &off);
1814                 /* some of this one + all after this one */
1815                 npages = pipe_space_for_user(iter_head, i->pipe->tail, i->pipe);
1816                 return min(npages, maxpages);
1817         }
1818         if (iov_iter_is_xarray(i)) {
1819                 unsigned offset = (i->xarray_start + i->iov_offset) % PAGE_SIZE;
1820                 int npages = DIV_ROUND_UP(offset + i->count, PAGE_SIZE);
1821                 return min(npages, maxpages);
1822         }
1823         return 0;
1824 }
1825 EXPORT_SYMBOL(iov_iter_npages);
1826
1827 const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags)
1828 {
1829         *new = *old;
1830         if (unlikely(iov_iter_is_pipe(new))) {
1831                 WARN_ON(1);
1832                 return NULL;
1833         }
1834         if (unlikely(iov_iter_is_discard(new) || iov_iter_is_xarray(new)))
1835                 return NULL;
1836         if (iov_iter_is_bvec(new))
1837                 return new->bvec = kmemdup(new->bvec,
1838                                     new->nr_segs * sizeof(struct bio_vec),
1839                                     flags);
1840         else
1841                 /* iovec and kvec have identical layout */
1842                 return new->iov = kmemdup(new->iov,
1843                                    new->nr_segs * sizeof(struct iovec),
1844                                    flags);
1845 }
1846 EXPORT_SYMBOL(dup_iter);
1847
1848 static int copy_compat_iovec_from_user(struct iovec *iov,
1849                 const struct iovec __user *uvec, unsigned long nr_segs)
1850 {
1851         const struct compat_iovec __user *uiov =
1852                 (const struct compat_iovec __user *)uvec;
1853         int ret = -EFAULT, i;
1854
1855         if (!user_access_begin(uiov, nr_segs * sizeof(*uiov)))
1856                 return -EFAULT;
1857
1858         for (i = 0; i < nr_segs; i++) {
1859                 compat_uptr_t buf;
1860                 compat_ssize_t len;
1861
1862                 unsafe_get_user(len, &uiov[i].iov_len, uaccess_end);
1863                 unsafe_get_user(buf, &uiov[i].iov_base, uaccess_end);
1864
1865                 /* check for compat_size_t not fitting in compat_ssize_t ... */
1866                 if (len < 0) {
1867                         ret = -EINVAL;
1868                         goto uaccess_end;
1869                 }
1870                 iov[i].iov_base = compat_ptr(buf);
1871                 iov[i].iov_len = len;
1872         }
1873
1874         ret = 0;
1875 uaccess_end:
1876         user_access_end();
1877         return ret;
1878 }
1879
1880 static int copy_iovec_from_user(struct iovec *iov,
1881                 const struct iovec __user *uvec, unsigned long nr_segs)
1882 {
1883         unsigned long seg;
1884
1885         if (copy_from_user(iov, uvec, nr_segs * sizeof(*uvec)))
1886                 return -EFAULT;
1887         for (seg = 0; seg < nr_segs; seg++) {
1888                 if ((ssize_t)iov[seg].iov_len < 0)
1889                         return -EINVAL;
1890         }
1891
1892         return 0;
1893 }
1894
1895 struct iovec *iovec_from_user(const struct iovec __user *uvec,
1896                 unsigned long nr_segs, unsigned long fast_segs,
1897                 struct iovec *fast_iov, bool compat)
1898 {
1899         struct iovec *iov = fast_iov;
1900         int ret;
1901
1902         /*
1903          * SuS says "The readv() function *may* fail if the iovcnt argument was
1904          * less than or equal to 0, or greater than {IOV_MAX}."  Linux has
1905          * traditionally returned zero for zero segments, so...
1906          */
1907         if (nr_segs == 0)
1908                 return iov;
1909         if (nr_segs > UIO_MAXIOV)
1910                 return ERR_PTR(-EINVAL);
1911         if (nr_segs > fast_segs) {
1912                 iov = kmalloc_array(nr_segs, sizeof(struct iovec), GFP_KERNEL);
1913                 if (!iov)
1914                         return ERR_PTR(-ENOMEM);
1915         }
1916
1917         if (compat)
1918                 ret = copy_compat_iovec_from_user(iov, uvec, nr_segs);
1919         else
1920                 ret = copy_iovec_from_user(iov, uvec, nr_segs);
1921         if (ret) {
1922                 if (iov != fast_iov)
1923                         kfree(iov);
1924                 return ERR_PTR(ret);
1925         }
1926
1927         return iov;
1928 }
1929
1930 ssize_t __import_iovec(int type, const struct iovec __user *uvec,
1931                  unsigned nr_segs, unsigned fast_segs, struct iovec **iovp,
1932                  struct iov_iter *i, bool compat)
1933 {
1934         ssize_t total_len = 0;
1935         unsigned long seg;
1936         struct iovec *iov;
1937
1938         iov = iovec_from_user(uvec, nr_segs, fast_segs, *iovp, compat);
1939         if (IS_ERR(iov)) {
1940                 *iovp = NULL;
1941                 return PTR_ERR(iov);
1942         }
1943
1944         /*
1945          * According to the Single Unix Specification we should return EINVAL if
1946          * an element length is < 0 when cast to ssize_t or if the total length
1947          * would overflow the ssize_t return value of the system call.
1948          *
1949          * Linux caps all read/write calls to MAX_RW_COUNT, and avoids the
1950          * overflow case.
1951          */
1952         for (seg = 0; seg < nr_segs; seg++) {
1953                 ssize_t len = (ssize_t)iov[seg].iov_len;
1954
1955                 if (!access_ok(iov[seg].iov_base, len)) {
1956                         if (iov != *iovp)
1957                                 kfree(iov);
1958                         *iovp = NULL;
1959                         return -EFAULT;
1960                 }
1961
1962                 if (len > MAX_RW_COUNT - total_len) {
1963                         len = MAX_RW_COUNT - total_len;
1964                         iov[seg].iov_len = len;
1965                 }
1966                 total_len += len;
1967         }
1968
1969         iov_iter_init(i, type, iov, nr_segs, total_len);
1970         if (iov == *iovp)
1971                 *iovp = NULL;
1972         else
1973                 *iovp = iov;
1974         return total_len;
1975 }
1976
1977 /**
1978  * import_iovec() - Copy an array of &struct iovec from userspace
1979  *     into the kernel, check that it is valid, and initialize a new
1980  *     &struct iov_iter iterator to access it.
1981  *
1982  * @type: One of %READ or %WRITE.
1983  * @uvec: Pointer to the userspace array.
1984  * @nr_segs: Number of elements in userspace array.
1985  * @fast_segs: Number of elements in *@iovp.
1986  * @iovp: (input and output parameter) Pointer to pointer to (usually small
1987  *     on-stack) kernel array.
1988  * @i: Pointer to iterator that will be initialized on success.
1989  *
1990  * If the array pointed to by *@iovp is large enough to hold all @nr_segs,
1991  * then this function places %NULL in *@iovp on return. Otherwise, a new
1992  * array will be allocated and the result placed in *@iovp. This means that
1993  * the caller may call kfree() on *@iovp regardless of whether the small
1994  * on-stack array was used or not (and regardless of whether this function
1995  * returns an error or not).
1996  *
1997  * Return: Negative error code on error, bytes imported on success
1998  */
1999 ssize_t import_iovec(int type, const struct iovec __user *uvec,
2000                  unsigned nr_segs, unsigned fast_segs,
2001                  struct iovec **iovp, struct iov_iter *i)
2002 {
2003         return __import_iovec(type, uvec, nr_segs, fast_segs, iovp, i,
2004                               in_compat_syscall());
2005 }
2006 EXPORT_SYMBOL(import_iovec);
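/*
 * Illustrative sketch (editorial addition): the usual import_iovec() calling
 * convention in a readv-style path, as described in the comment above.
 * The surrounding context and foo_do_read() are hypothetical names used only
 * for this example.
 *
 *	struct iovec iovstack[UIO_FASTIOV], *iov = iovstack;
 *	struct iov_iter iter;
 *	ssize_t ret;
 *
 *	ret = import_iovec(READ, uvec, nr_segs, ARRAY_SIZE(iovstack),
 *			   &iov, &iter);
 *	if (ret < 0)
 *		return ret;
 *	ret = foo_do_read(file, &iter);
 *	kfree(iov);	// safe either way: NULL if the on-stack array was used
 *	return ret;
 */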
2007
2008 int import_single_range(int rw, void __user *buf, size_t len,
2009                  struct iovec *iov, struct iov_iter *i)
2010 {
2011         if (len > MAX_RW_COUNT)
2012                 len = MAX_RW_COUNT;
2013         if (unlikely(!access_ok(buf, len)))
2014                 return -EFAULT;
2015
2016         iov->iov_base = buf;
2017         iov->iov_len = len;
2018         iov_iter_init(i, rw, iov, 1, len);
2019         return 0;
2020 }
2021 EXPORT_SYMBOL(import_single_range);
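/*
 * Illustrative sketch (editorial addition): importing a single user buffer,
 * e.g. in a plain write(2)-style path.  foo_do_write() is hypothetical.
 *
 *	struct iovec iov;
 *	struct iov_iter iter;
 *	int ret;
 *
 *	ret = import_single_range(WRITE, buf, len, &iov, &iter);
 *	if (unlikely(ret))
 *		return ret;
 *	return foo_do_write(file, &iter);
 */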
2022
2023 /**
2024  * iov_iter_restore() - Restore a &struct iov_iter to the same state as when
2025  *     iov_iter_save_state() was called.
2026  *
2027  * @i: &struct iov_iter to restore
2028  * @state: state to restore from
2029  *
2030  * Used after iov_iter_save_state() to restore @i, if operations may
2031  * have advanced it.
2032  *
2033  * Note: only works on ITER_IOVEC, ITER_BVEC, and ITER_KVEC
2034  */
2035 void iov_iter_restore(struct iov_iter *i, struct iov_iter_state *state)
2036 {
2037         if (WARN_ON_ONCE(!iov_iter_is_bvec(i) && !iter_is_iovec(i) &&
2038                          !iov_iter_is_kvec(i)))
2039                 return;
2040         i->iov_offset = state->iov_offset;
2041         i->count = state->count;
2042         /*
2043          * For the *vec iters, nr_segs + iov is constant - if we increment
2044          * the vec, then we also decrement the nr_segs count. Hence we don't
2045          * need to track both of these, just one is enough and we can derive
2046          * the other from it. ITER_KVEC and ITER_IOVEC are the same struct
2047          * size, so we can just increment the iov pointer as they are unionized.
2048          * ITER_BVEC _may_ be the same size on some archs, but on others it is
2049          * not. Be safe and handle it separately.
2050          */
2051         BUILD_BUG_ON(sizeof(struct iovec) != sizeof(struct kvec));
2052         if (iov_iter_is_bvec(i))
2053                 i->bvec -= state->nr_segs - i->nr_segs;
2054         else
2055                 i->iov -= state->nr_segs - i->nr_segs;
2056         i->nr_segs = state->nr_segs;
2057 }
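/*
 * Illustrative sketch (editorial addition): pairing iov_iter_save_state()
 * with iov_iter_restore() so that a transfer which consumed part of the
 * iterator before failing can be retried from the original position.
 * foo_attempt_io() is a hypothetical helper.
 *
 *	struct iov_iter_state state;
 *	ssize_t ret;
 *
 *	iov_iter_save_state(iter, &state);
 *	ret = foo_attempt_io(file, iter);
 *	if (ret == -EAGAIN) {
 *		iov_iter_restore(iter, &state);	// rewind whatever was consumed
 *		ret = foo_attempt_io(file, iter);
 *	}
 *	return ret;
 */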