GNU Linux-libre 6.1.86-gnu
[releases.git] / fs / splice.c
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * "splice": joining two ropes together by interweaving their strands.
4  *
5  * This is the "extended pipe" functionality, where a pipe is used as
6  * an arbitrary in-memory buffer. Think of a pipe as a small kernel
7  * buffer that you can use to transfer data from one end to the other.
8  *
9  * The traditional unix read/write is extended with a "splice()" operation
10  * that transfers data buffers to or from a pipe buffer.
11  *
12  * Named by Larry McVoy, original implementation from Linus, extended by
13  * Jens to support splicing to files, network, direct splicing, etc and
14  * fixing lots of bugs.
15  *
16  * Copyright (C) 2005-2006 Jens Axboe <axboe@kernel.dk>
17  * Copyright (C) 2005-2006 Linus Torvalds <torvalds@osdl.org>
18  * Copyright (C) 2006 Ingo Molnar <mingo@elte.hu>
19  *
20  */
21 #include <linux/bvec.h>
22 #include <linux/fs.h>
23 #include <linux/file.h>
24 #include <linux/pagemap.h>
25 #include <linux/splice.h>
26 #include <linux/memcontrol.h>
27 #include <linux/mm_inline.h>
28 #include <linux/swap.h>
29 #include <linux/writeback.h>
30 #include <linux/export.h>
31 #include <linux/syscalls.h>
32 #include <linux/uio.h>
33 #include <linux/security.h>
34 #include <linux/gfp.h>
35 #include <linux/socket.h>
36 #include <linux/sched/signal.h>
37
38 #include "internal.h"
39
40 /*
41  * Attempt to steal a page from a pipe buffer. This should perhaps go into
42  * a vm helper function, it's already simplified quite a bit by the
43  * addition of remove_mapping(). If success is returned, the caller may
44  * attempt to reuse this page for another destination.
45  */
46 static bool page_cache_pipe_buf_try_steal(struct pipe_inode_info *pipe,
47                 struct pipe_buffer *buf)
48 {
49         struct folio *folio = page_folio(buf->page);
50         struct address_space *mapping;
51
52         folio_lock(folio);
53
54         mapping = folio_mapping(folio);
55         if (mapping) {
56                 WARN_ON(!folio_test_uptodate(folio));
57
58                 /*
59                  * At least for ext2 with nobh option, we need to wait on
60                  * writeback completing on this folio, since we'll remove it
61                  * from the pagecache.  Otherwise truncate wont wait on the
62                  * folio, allowing the disk blocks to be reused by someone else
63                  * before we actually wrote our data to them. fs corruption
64                  * ensues.
65                  */
66                 folio_wait_writeback(folio);
67
68                 if (!filemap_release_folio(folio, GFP_KERNEL))
69                         goto out_unlock;
70
71                 /*
72                  * If we succeeded in removing the mapping, set LRU flag
73                  * and return good.
74                  */
75                 if (remove_mapping(mapping, folio)) {
76                         buf->flags |= PIPE_BUF_FLAG_LRU;
77                         return true;
78                 }
79         }
80
81         /*
82          * Raced with truncate or failed to remove folio from current
83          * address space, unlock and return failure.
84          */
85 out_unlock:
86         folio_unlock(folio);
87         return false;
88 }
89
90 static void page_cache_pipe_buf_release(struct pipe_inode_info *pipe,
91                                         struct pipe_buffer *buf)
92 {
93         put_page(buf->page);
94         buf->flags &= ~PIPE_BUF_FLAG_LRU;
95 }
96
97 /*
98  * Check whether the contents of buf is OK to access. Since the content
99  * is a page cache page, IO may be in flight.
100  */
101 static int page_cache_pipe_buf_confirm(struct pipe_inode_info *pipe,
102                                        struct pipe_buffer *buf)
103 {
104         struct page *page = buf->page;
105         int err;
106
107         if (!PageUptodate(page)) {
108                 lock_page(page);
109
110                 /*
111                  * Page got truncated/unhashed. This will cause a 0-byte
112                  * splice, if this is the first page.
113                  */
114                 if (!page->mapping) {
115                         err = -ENODATA;
116                         goto error;
117                 }
118
119                 /*
120                  * Uh oh, read-error from disk.
121                  */
122                 if (!PageUptodate(page)) {
123                         err = -EIO;
124                         goto error;
125                 }
126
127                 /*
128                  * Page is ok afterall, we are done.
129                  */
130                 unlock_page(page);
131         }
132
133         return 0;
134 error:
135         unlock_page(page);
136         return err;
137 }
138
/*
 * Pipe buffer operations for buffers whose content is a page cache page:
 * ->confirm, ->release and ->try_steal use the page-cache specific helpers
 * defined in this file, ->get uses the generic helper.
 */
const struct pipe_buf_operations page_cache_pipe_buf_ops = {
        .confirm        = page_cache_pipe_buf_confirm,
        .release        = page_cache_pipe_buf_release,
        .try_steal      = page_cache_pipe_buf_try_steal,
        .get            = generic_pipe_buf_get,
};
145
146 static bool user_page_pipe_buf_try_steal(struct pipe_inode_info *pipe,
147                 struct pipe_buffer *buf)
148 {
149         if (!(buf->flags & PIPE_BUF_FLAG_GIFT))
150                 return false;
151
152         buf->flags |= PIPE_BUF_FLAG_LRU;
153         return generic_pipe_buf_try_steal(pipe, buf);
154 }
155
/*
 * Pipe buffer operations built around user_page_pipe_buf_try_steal(),
 * which only allows stealing when the buffer carries PIPE_BUF_FLAG_GIFT.
 * No ->confirm hook is set.
 */
static const struct pipe_buf_operations user_page_pipe_buf_ops = {
        .release        = page_cache_pipe_buf_release,
        .try_steal      = user_page_pipe_buf_try_steal,
        .get            = generic_pipe_buf_get,
};
161
/*
 * Wake up tasks sleeping on the pipe's read wait queue and send SIGIO
 * (POLL_IN) to the pipe's fasync readers.
 */
static void wakeup_pipe_readers(struct pipe_inode_info *pipe)
{
        /*
         * Full barrier ahead of the lockless waitqueue_active() check.
         * NOTE(review): presumably this pairs with a barrier on the waiter
         * side so that a concurrently queued sleeper is not missed --
         * confirm against the waitqueue_active() documentation.
         */
        smp_mb();
        if (waitqueue_active(&pipe->rd_wait))
                wake_up_interruptible(&pipe->rd_wait);
        kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
}
169
170 /**
171  * splice_to_pipe - fill passed data into a pipe
172  * @pipe:       pipe to fill
173  * @spd:        data to fill
174  *
175  * Description:
176  *    @spd contains a map of pages and len/offset tuples, along with
177  *    the struct pipe_buf_operations associated with these pages. This
178  *    function will link that data to the pipe.
179  *
180  */
181 ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
182                        struct splice_pipe_desc *spd)
183 {
184         unsigned int spd_pages = spd->nr_pages;
185         unsigned int tail = pipe->tail;
186         unsigned int head = pipe->head;
187         unsigned int mask = pipe->ring_size - 1;
188         int ret = 0, page_nr = 0;
189
190         if (!spd_pages)
191                 return 0;
192
193         if (unlikely(!pipe->readers)) {
194                 send_sig(SIGPIPE, current, 0);
195                 ret = -EPIPE;
196                 goto out;
197         }
198
199         while (!pipe_full(head, tail, pipe->max_usage)) {
200                 struct pipe_buffer *buf = &pipe->bufs[head & mask];
201
202                 buf->page = spd->pages[page_nr];
203                 buf->offset = spd->partial[page_nr].offset;
204                 buf->len = spd->partial[page_nr].len;
205                 buf->private = spd->partial[page_nr].private;
206                 buf->ops = spd->ops;
207                 buf->flags = 0;
208
209                 head++;
210                 pipe->head = head;
211                 page_nr++;
212                 ret += buf->len;
213
214                 if (!--spd->nr_pages)
215                         break;
216         }
217
218         if (!ret)
219                 ret = -EAGAIN;
220
221 out:
222         while (page_nr < spd_pages)
223                 spd->spd_release(spd, page_nr++);
224
225         return ret;
226 }
227 EXPORT_SYMBOL_GPL(splice_to_pipe);
228
229 ssize_t add_to_pipe(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
230 {
231         unsigned int head = pipe->head;
232         unsigned int tail = pipe->tail;
233         unsigned int mask = pipe->ring_size - 1;
234         int ret;
235
236         if (unlikely(!pipe->readers)) {
237                 send_sig(SIGPIPE, current, 0);
238                 ret = -EPIPE;
239         } else if (pipe_full(head, tail, pipe->max_usage)) {
240                 ret = -EAGAIN;
241         } else {
242                 pipe->bufs[head & mask] = *buf;
243                 pipe->head = head + 1;
244                 return buf->len;
245         }
246         pipe_buf_release(pipe, buf);
247         return ret;
248 }
249 EXPORT_SYMBOL(add_to_pipe);
250
251 /*
252  * Check if we need to grow the arrays holding pages and partial page
253  * descriptions.
254  */
255 int splice_grow_spd(const struct pipe_inode_info *pipe, struct splice_pipe_desc *spd)
256 {
257         unsigned int max_usage = READ_ONCE(pipe->max_usage);
258
259         spd->nr_pages_max = max_usage;
260         if (max_usage <= PIPE_DEF_BUFFERS)
261                 return 0;
262
263         spd->pages = kmalloc_array(max_usage, sizeof(struct page *), GFP_KERNEL);
264         spd->partial = kmalloc_array(max_usage, sizeof(struct partial_page),
265                                      GFP_KERNEL);
266
267         if (spd->pages && spd->partial)
268                 return 0;
269
270         kfree(spd->pages);
271         kfree(spd->partial);
272         return -ENOMEM;
273 }
274
275 void splice_shrink_spd(struct splice_pipe_desc *spd)
276 {
277         if (spd->nr_pages_max <= PIPE_DEF_BUFFERS)
278                 return;
279
280         kfree(spd->pages);
281         kfree(spd->partial);
282 }
283
284 /**
285  * generic_file_splice_read - splice data from file to a pipe
286  * @in:         file to splice from
287  * @ppos:       position in @in
288  * @pipe:       pipe to splice to
289  * @len:        number of bytes to splice
290  * @flags:      splice modifier flags
291  *
292  * Description:
293  *    Will read pages from given file and fill them into a pipe. Can be
294  *    used as long as it has more or less sane ->read_iter().
295  *
296  */
297 ssize_t generic_file_splice_read(struct file *in, loff_t *ppos,
298                                  struct pipe_inode_info *pipe, size_t len,
299                                  unsigned int flags)
300 {
301         struct iov_iter to;
302         struct kiocb kiocb;
303         int ret;
304
305         iov_iter_pipe(&to, ITER_DEST, pipe, len);
306         init_sync_kiocb(&kiocb, in);
307         kiocb.ki_pos = *ppos;
308         ret = call_read_iter(in, &kiocb, &to);
309         if (ret > 0) {
310                 *ppos = kiocb.ki_pos;
311                 file_accessed(in);
312         } else if (ret < 0) {
313                 /* free what was emitted */
314                 pipe_discard_from(pipe, to.start_head);
315                 /*
316                  * callers of ->splice_read() expect -EAGAIN on
317                  * "can't put anything in there", rather than -EFAULT.
318                  */
319                 if (ret == -EFAULT)
320                         ret = -EAGAIN;
321         }
322
323         return ret;
324 }
325 EXPORT_SYMBOL(generic_file_splice_read);
326
/*
 * Default pipe buffer operations: release, try_steal and get are all the
 * generic helpers; no ->confirm hook is set.
 */
const struct pipe_buf_operations default_pipe_buf_ops = {
        .release        = generic_pipe_buf_release,
        .try_steal      = generic_pipe_buf_try_steal,
        .get            = generic_pipe_buf_get,
};
332
/*
 * Pipe buffer operations for a socket and similar.  Same generic release
 * and get helpers as default_pipe_buf_ops, but no ->try_steal hook is
 * provided.
 */
const struct pipe_buf_operations nosteal_pipe_buf_ops = {
        .release        = generic_pipe_buf_release,
        .get            = generic_pipe_buf_get,
};
EXPORT_SYMBOL(nosteal_pipe_buf_ops);
339
340 /*
341  * Send 'sd->len' bytes to socket from 'sd->file' at position 'sd->pos'
342  * using sendpage(). Return the number of bytes sent.
343  */
344 static int pipe_to_sendpage(struct pipe_inode_info *pipe,
345                             struct pipe_buffer *buf, struct splice_desc *sd)
346 {
347         struct file *file = sd->u.file;
348         loff_t pos = sd->pos;
349         int more;
350
351         if (!likely(file->f_op->sendpage))
352                 return -EINVAL;
353
354         more = (sd->flags & SPLICE_F_MORE) ? MSG_MORE : 0;
355
356         if (sd->len < sd->total_len &&
357             pipe_occupancy(pipe->head, pipe->tail) > 1)
358                 more |= MSG_SENDPAGE_NOTLAST;
359
360         return file->f_op->sendpage(file, buf->page, buf->offset,
361                                     sd->len, &pos, more);
362 }
363
/*
 * Wake up tasks sleeping on the pipe's write wait queue and send SIGIO
 * (POLL_OUT) to the pipe's fasync writers.
 */
static void wakeup_pipe_writers(struct pipe_inode_info *pipe)
{
        /*
         * Full barrier ahead of the lockless waitqueue_active() check.
         * NOTE(review): presumably this pairs with a barrier on the waiter
         * side so that a concurrently queued sleeper is not missed --
         * confirm against the waitqueue_active() documentation.
         */
        smp_mb();
        if (waitqueue_active(&pipe->wr_wait))
                wake_up_interruptible(&pipe->wr_wait);
        kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
}
371
372 /**
373  * splice_from_pipe_feed - feed available data from a pipe to a file
374  * @pipe:       pipe to splice from
375  * @sd:         information to @actor
376  * @actor:      handler that splices the data
377  *
378  * Description:
379  *    This function loops over the pipe and calls @actor to do the
380  *    actual moving of a single struct pipe_buffer to the desired
381  *    destination.  It returns when there's no more buffers left in
382  *    the pipe or if the requested number of bytes (@sd->total_len)
383  *    have been copied.  It returns a positive number (one) if the
384  *    pipe needs to be filled with more data, zero if the required
385  *    number of bytes have been copied and -errno on error.
386  *
387  *    This, together with splice_from_pipe_{begin,end,next}, may be
388  *    used to implement the functionality of __splice_from_pipe() when
389  *    locking is required around copying the pipe buffers to the
390  *    destination.
391  */
392 static int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd,
393                           splice_actor *actor)
394 {
395         unsigned int head = pipe->head;
396         unsigned int tail = pipe->tail;
397         unsigned int mask = pipe->ring_size - 1;
398         int ret;
399
400         while (!pipe_empty(head, tail)) {
401                 struct pipe_buffer *buf = &pipe->bufs[tail & mask];
402
403                 sd->len = buf->len;
404                 if (sd->len > sd->total_len)
405                         sd->len = sd->total_len;
406
407                 ret = pipe_buf_confirm(pipe, buf);
408                 if (unlikely(ret)) {
409                         if (ret == -ENODATA)
410                                 ret = 0;
411                         return ret;
412                 }
413
414                 ret = actor(pipe, buf, sd);
415                 if (ret <= 0)
416                         return ret;
417
418                 buf->offset += ret;
419                 buf->len -= ret;
420
421                 sd->num_spliced += ret;
422                 sd->len -= ret;
423                 sd->pos += ret;
424                 sd->total_len -= ret;
425
426                 if (!buf->len) {
427                         pipe_buf_release(pipe, buf);
428                         tail++;
429                         pipe->tail = tail;
430                         if (pipe->files)
431                                 sd->need_wakeup = true;
432                 }
433
434                 if (!sd->total_len)
435                         return 0;
436         }
437
438         return 1;
439 }
440
441 /* We know we have a pipe buffer, but maybe it's empty? */
442 static inline bool eat_empty_buffer(struct pipe_inode_info *pipe)
443 {
444         unsigned int tail = pipe->tail;
445         unsigned int mask = pipe->ring_size - 1;
446         struct pipe_buffer *buf = &pipe->bufs[tail & mask];
447
448         if (unlikely(!buf->len)) {
449                 pipe_buf_release(pipe, buf);
450                 pipe->tail = tail+1;
451                 return true;
452         }
453
454         return false;
455 }
456
457 /**
458  * splice_from_pipe_next - wait for some data to splice from
459  * @pipe:       pipe to splice from
460  * @sd:         information about the splice operation
461  *
462  * Description:
463  *    This function will wait for some data and return a positive
464  *    value (one) if pipe buffers are available.  It will return zero
465  *    or -errno if no more data needs to be spliced.
466  */
467 static int splice_from_pipe_next(struct pipe_inode_info *pipe, struct splice_desc *sd)
468 {
469         /*
470          * Check for signal early to make process killable when there are
471          * always buffers available
472          */
473         if (signal_pending(current))
474                 return -ERESTARTSYS;
475
476 repeat:
477         while (pipe_empty(pipe->head, pipe->tail)) {
478                 if (!pipe->writers)
479                         return 0;
480
481                 if (sd->num_spliced)
482                         return 0;
483
484                 if (sd->flags & SPLICE_F_NONBLOCK)
485                         return -EAGAIN;
486
487                 if (signal_pending(current))
488                         return -ERESTARTSYS;
489
490                 if (sd->need_wakeup) {
491                         wakeup_pipe_writers(pipe);
492                         sd->need_wakeup = false;
493                 }
494
495                 pipe_wait_readable(pipe);
496         }
497
498         if (eat_empty_buffer(pipe))
499                 goto repeat;
500
501         return 1;
502 }
503
504 /**
505  * splice_from_pipe_begin - start splicing from pipe
506  * @sd:         information about the splice operation
507  *
508  * Description:
509  *    This function should be called before a loop containing
510  *    splice_from_pipe_next() and splice_from_pipe_feed() to
511  *    initialize the necessary fields of @sd.
512  */
513 static void splice_from_pipe_begin(struct splice_desc *sd)
514 {
515         sd->num_spliced = 0;
516         sd->need_wakeup = false;
517 }
518
519 /**
520  * splice_from_pipe_end - finish splicing from pipe
521  * @pipe:       pipe to splice from
522  * @sd:         information about the splice operation
523  *
524  * Description:
525  *    This function will wake up pipe writers if necessary.  It should
526  *    be called after a loop containing splice_from_pipe_next() and
527  *    splice_from_pipe_feed().
528  */
529 static void splice_from_pipe_end(struct pipe_inode_info *pipe, struct splice_desc *sd)
530 {
531         if (sd->need_wakeup)
532                 wakeup_pipe_writers(pipe);
533 }
534
535 /**
536  * __splice_from_pipe - splice data from a pipe to given actor
537  * @pipe:       pipe to splice from
538  * @sd:         information to @actor
539  * @actor:      handler that splices the data
540  *
541  * Description:
542  *    This function does little more than loop over the pipe and call
543  *    @actor to do the actual moving of a single struct pipe_buffer to
544  *    the desired destination. See pipe_to_file, pipe_to_sendpage, or
545  *    pipe_to_user.
546  *
547  */
548 ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, struct splice_desc *sd,
549                            splice_actor *actor)
550 {
551         int ret;
552
553         splice_from_pipe_begin(sd);
554         do {
555                 cond_resched();
556                 ret = splice_from_pipe_next(pipe, sd);
557                 if (ret > 0)
558                         ret = splice_from_pipe_feed(pipe, sd, actor);
559         } while (ret > 0);
560         splice_from_pipe_end(pipe, sd);
561
562         return sd->num_spliced ? sd->num_spliced : ret;
563 }
564 EXPORT_SYMBOL(__splice_from_pipe);
565
566 /**
567  * splice_from_pipe - splice data from a pipe to a file
568  * @pipe:       pipe to splice from
569  * @out:        file to splice to
570  * @ppos:       position in @out
571  * @len:        how many bytes to splice
572  * @flags:      splice modifier flags
573  * @actor:      handler that splices the data
574  *
575  * Description:
576  *    See __splice_from_pipe. This function locks the pipe inode,
577  *    otherwise it's identical to __splice_from_pipe().
578  *
579  */
580 ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out,
581                          loff_t *ppos, size_t len, unsigned int flags,
582                          splice_actor *actor)
583 {
584         ssize_t ret;
585         struct splice_desc sd = {
586                 .total_len = len,
587                 .flags = flags,
588                 .pos = *ppos,
589                 .u.file = out,
590         };
591
592         pipe_lock(pipe);
593         ret = __splice_from_pipe(pipe, &sd, actor);
594         pipe_unlock(pipe);
595
596         return ret;
597 }
598
599 /**
600  * iter_file_splice_write - splice data from a pipe to a file
601  * @pipe:       pipe info
602  * @out:        file to write to
603  * @ppos:       position in @out
604  * @len:        number of bytes to splice
605  * @flags:      splice modifier flags
606  *
607  * Description:
608  *    Will either move or copy pages (determined by @flags options) from
609  *    the given pipe inode to the given file.
610  *    This one is ->write_iter-based.
611  *
612  */
613 ssize_t
614 iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
615                           loff_t *ppos, size_t len, unsigned int flags)
616 {
617         struct splice_desc sd = {
618                 .total_len = len,
619                 .flags = flags,
620                 .pos = *ppos,
621                 .u.file = out,
622         };
623         int nbufs = pipe->max_usage;
624         struct bio_vec *array = kcalloc(nbufs, sizeof(struct bio_vec),
625                                         GFP_KERNEL);
626         ssize_t ret;
627
628         if (unlikely(!array))
629                 return -ENOMEM;
630
631         pipe_lock(pipe);
632
633         splice_from_pipe_begin(&sd);
634         while (sd.total_len) {
635                 struct iov_iter from;
636                 unsigned int head, tail, mask;
637                 size_t left;
638                 int n;
639
640                 ret = splice_from_pipe_next(pipe, &sd);
641                 if (ret <= 0)
642                         break;
643
644                 if (unlikely(nbufs < pipe->max_usage)) {
645                         kfree(array);
646                         nbufs = pipe->max_usage;
647                         array = kcalloc(nbufs, sizeof(struct bio_vec),
648                                         GFP_KERNEL);
649                         if (!array) {
650                                 ret = -ENOMEM;
651                                 break;
652                         }
653                 }
654
655                 head = pipe->head;
656                 tail = pipe->tail;
657                 mask = pipe->ring_size - 1;
658
659                 /* build the vector */
660                 left = sd.total_len;
661                 for (n = 0; !pipe_empty(head, tail) && left && n < nbufs; tail++) {
662                         struct pipe_buffer *buf = &pipe->bufs[tail & mask];
663                         size_t this_len = buf->len;
664
665                         /* zero-length bvecs are not supported, skip them */
666                         if (!this_len)
667                                 continue;
668                         this_len = min(this_len, left);
669
670                         ret = pipe_buf_confirm(pipe, buf);
671                         if (unlikely(ret)) {
672                                 if (ret == -ENODATA)
673                                         ret = 0;
674                                 goto done;
675                         }
676
677                         array[n].bv_page = buf->page;
678                         array[n].bv_len = this_len;
679                         array[n].bv_offset = buf->offset;
680                         left -= this_len;
681                         n++;
682                 }
683
684                 iov_iter_bvec(&from, ITER_SOURCE, array, n, sd.total_len - left);
685                 ret = vfs_iter_write(out, &from, &sd.pos, 0);
686                 if (ret <= 0)
687                         break;
688
689                 sd.num_spliced += ret;
690                 sd.total_len -= ret;
691                 *ppos = sd.pos;
692
693                 /* dismiss the fully eaten buffers, adjust the partial one */
694                 tail = pipe->tail;
695                 while (ret) {
696                         struct pipe_buffer *buf = &pipe->bufs[tail & mask];
697                         if (ret >= buf->len) {
698                                 ret -= buf->len;
699                                 buf->len = 0;
700                                 pipe_buf_release(pipe, buf);
701                                 tail++;
702                                 pipe->tail = tail;
703                                 if (pipe->files)
704                                         sd.need_wakeup = true;
705                         } else {
706                                 buf->offset += ret;
707                                 buf->len -= ret;
708                                 ret = 0;
709                         }
710                 }
711         }
712 done:
713         kfree(array);
714         splice_from_pipe_end(pipe, &sd);
715
716         pipe_unlock(pipe);
717
718         if (sd.num_spliced)
719                 ret = sd.num_spliced;
720
721         return ret;
722 }
723
724 EXPORT_SYMBOL(iter_file_splice_write);
725
726 /**
727  * generic_splice_sendpage - splice data from a pipe to a socket
728  * @pipe:       pipe to splice from
729  * @out:        socket to write to
730  * @ppos:       position in @out
731  * @len:        number of bytes to splice
732  * @flags:      splice modifier flags
733  *
734  * Description:
735  *    Will send @len bytes from the pipe to a network socket. No data copying
736  *    is involved.
737  *
738  */
ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, struct file *out,
                                loff_t *ppos, size_t len, unsigned int flags)
{
        /*
         * splice_from_pipe() takes the pipe lock and hands each buffer to
         * the pipe_to_sendpage() actor.
         */
        return splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_sendpage);
}

EXPORT_SYMBOL(generic_splice_sendpage);
746
/*
 * Emit a rate-limited debug message saying that splice @op ("read" or
 * "write" at the call sites in this file) is not supported for @file,
 * naming the current task.  Always returns -EINVAL so callers can return
 * the result directly.
 */
static int warn_unsupported(struct file *file, const char *op)
{
        pr_debug_ratelimited(
                "splice %s not supported for file %pD4 (pid: %d comm: %.20s)\n",
                op, file, current->pid, current->comm);
        return -EINVAL;
}
754
755 /*
756  * Attempt to initiate a splice from pipe to file.
757  */
758 static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
759                            loff_t *ppos, size_t len, unsigned int flags)
760 {
761         if (unlikely(!out->f_op->splice_write))
762                 return warn_unsupported(out, "write");
763         return out->f_op->splice_write(pipe, out, ppos, len, flags);
764 }
765
766 /*
767  * Indicate to the caller that there was a premature EOF when reading from the
768  * source and the caller didn't indicate they would be sending more data after
769  * this.
770  */
771 static void do_splice_eof(struct splice_desc *sd)
772 {
773         if (sd->splice_eof)
774                 sd->splice_eof(sd);
775 }
776
777 /*
778  * Attempt to initiate a splice from a file to a pipe.
779  */
780 static long do_splice_to(struct file *in, loff_t *ppos,
781                          struct pipe_inode_info *pipe, size_t len,
782                          unsigned int flags)
783 {
784         unsigned int p_space;
785         int ret;
786
787         if (unlikely(!(in->f_mode & FMODE_READ)))
788                 return -EBADF;
789
790         /* Don't try to read more the pipe has space for. */
791         p_space = pipe->max_usage - pipe_occupancy(pipe->head, pipe->tail);
792         len = min_t(size_t, len, p_space << PAGE_SHIFT);
793
794         ret = rw_verify_area(READ, in, ppos, len);
795         if (unlikely(ret < 0))
796                 return ret;
797
798         if (unlikely(len > MAX_RW_COUNT))
799                 len = MAX_RW_COUNT;
800
801         if (unlikely(!in->f_op->splice_read))
802                 return warn_unsupported(in, "read");
803         return in->f_op->splice_read(in, ppos, pipe, len, flags);
804 }
805
806 /**
807  * splice_direct_to_actor - splices data directly between two non-pipes
808  * @in:         file to splice from
809  * @sd:         actor information on where to splice to
810  * @actor:      handles the data splicing
811  *
812  * Description:
813  *    This is a special case helper to splice directly between two
814  *    points, without requiring an explicit pipe. Internally an allocated
815  *    pipe is cached in the process, and reused during the lifetime of
816  *    that process.
817  *
818  */
ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
                               splice_direct_actor *actor)
{
        struct pipe_inode_info *pipe;
        long ret, bytes;
        size_t len;
        int i, flags, more;

        /*
         * We require the input to be seekable, as we don't want to randomly
         * drop data for eg socket -> socket splicing. Use the piped splicing
         * for that!
         */
        if (unlikely(!(in->f_mode & FMODE_LSEEK)))
                return -EINVAL;

        /*
         * neither in nor out is a pipe, setup an internal pipe attached to
         * 'out' and transfer the wanted data from 'in' to 'out' through that
         */
        pipe = current->splice_pipe;
        if (unlikely(!pipe)) {
                pipe = alloc_pipe_info();
                if (!pipe)
                        return -ENOMEM;

                /*
                 * We don't have an immediate reader, but we'll read the stuff
                 * out of the pipe right after the splice_to_pipe(). So set
                 * PIPE_READERS appropriately.
                 */
                pipe->readers = 1;

                /* Cache the pipe in the task so later calls reuse it. */
                current->splice_pipe = pipe;
        }

        /*
         * Do the splice.
         */
        ret = 0;
        bytes = 0;
        len = sd->total_len;
        /* Keep the caller's flags (including NONBLOCK) for the input side. */
        flags = sd->flags;

        /*
         * Don't block on output, we have to drain the direct pipe.
         */
        sd->flags &= ~SPLICE_F_NONBLOCK;
        /* Remember whether the caller itself asked for SPLICE_F_MORE. */
        more = sd->flags & SPLICE_F_MORE;

        WARN_ON_ONCE(!pipe_empty(pipe->head, pipe->tail));

        while (len) {
                size_t read_len;
                loff_t pos = sd->pos, prev_pos = pos;

                /* Fill the internal pipe from @in. */
                ret = do_splice_to(in, &pos, pipe, len, flags);
                if (unlikely(ret <= 0))
                        goto read_failure;

                /* The actor is asked to move exactly what was just read. */
                read_len = ret;
                sd->total_len = read_len;

                /*
                 * If more data is pending, set SPLICE_F_MORE
                 * If this is the last data and SPLICE_F_MORE was not set
                 * initially, clears it.
                 */
                if (read_len < len)
                        sd->flags |= SPLICE_F_MORE;
                else if (!more)
                        sd->flags &= ~SPLICE_F_MORE;
                /*
                 * NOTE: nonblocking mode only applies to the input. We
                 * must not do the output in nonblocking mode as then we
                 * could get stuck data in the internal pipe:
                 */
                ret = actor(pipe, sd);
                if (unlikely(ret <= 0)) {
                        /* Nothing was consumed: rewind to before the read. */
                        sd->pos = prev_pos;
                        goto out_release;
                }

                bytes += ret;
                len -= ret;
                sd->pos = pos;

                /*
                 * Short output: only advance by what the actor consumed and
                 * drop whatever is left in the internal pipe.
                 */
                if (ret < read_len) {
                        sd->pos = prev_pos + ret;
                        goto out_release;
                }
        }

done:
        /* Reset the ring indices so the cached pipe starts out empty. */
        pipe->tail = pipe->head = 0;
        file_accessed(in);
        return bytes;

read_failure:
        /*
         * If the user did *not* set SPLICE_F_MORE *and* we didn't hit that
         * "use all of len" case that cleared SPLICE_F_MORE, *and* we did a
         * "->splice_in()" that returned EOF (ie zero) *and* we have sent at
         * least 1 byte *then* we will also do the ->splice_eof() call.
         */
        if (ret == 0 && !more && len > 0 && bytes)
                do_splice_eof(sd);
out_release:
        /*
         * If we did an incomplete transfer we must release
         * the pipe buffers in question:
         */
        for (i = 0; i < pipe->ring_size; i++) {
                struct pipe_buffer *buf = &pipe->bufs[i];

                /* Only slots with ->ops set are released. */
                if (buf->ops)
                        pipe_buf_release(pipe, buf);
        }

        /* Nothing transferred at all: report the last status instead. */
        if (!bytes)
                bytes = ret;

        goto done;
}
EXPORT_SYMBOL(splice_direct_to_actor);
944
945 static int direct_splice_actor(struct pipe_inode_info *pipe,
946                                struct splice_desc *sd)
947 {
948         struct file *file = sd->u.file;
949
950         return do_splice_from(pipe, file, sd->opos, sd->total_len,
951                               sd->flags);
952 }
953
954 static void direct_file_splice_eof(struct splice_desc *sd)
955 {
956         struct file *file = sd->u.file;
957
958         if (file->f_op->splice_eof)
959                 file->f_op->splice_eof(file);
960 }
961
962 /**
963  * do_splice_direct - splices data directly between two files
964  * @in:         file to splice from
965  * @ppos:       input file offset
966  * @out:        file to splice to
967  * @opos:       output file offset
968  * @len:        number of bytes to splice
969  * @flags:      splice modifier flags
970  *
971  * Description:
972  *    For use by do_sendfile(). splice can easily emulate sendfile, but
973  *    doing it in the application would incur an extra system call
974  *    (splice in + splice out, as compared to just sendfile()). So this helper
975  *    can splice directly through a process-private pipe.
976  *
977  */
978 long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
979                       loff_t *opos, size_t len, unsigned int flags)
980 {
981         struct splice_desc sd = {
982                 .len            = len,
983                 .total_len      = len,
984                 .flags          = flags,
985                 .pos            = *ppos,
986                 .u.file         = out,
987                 .splice_eof     = direct_file_splice_eof,
988                 .opos           = opos,
989         };
990         long ret;
991
992         if (unlikely(!(out->f_mode & FMODE_WRITE)))
993                 return -EBADF;
994
995         if (unlikely(out->f_flags & O_APPEND))
996                 return -EINVAL;
997
998         ret = rw_verify_area(WRITE, out, opos, len);
999         if (unlikely(ret < 0))
1000                 return ret;
1001
1002         ret = splice_direct_to_actor(in, &sd, direct_splice_actor);
1003         if (ret > 0)
1004                 *ppos = sd.pos;
1005
1006         return ret;
1007 }
1008 EXPORT_SYMBOL(do_splice_direct);
1009
1010 static int wait_for_space(struct pipe_inode_info *pipe, unsigned flags)
1011 {
1012         for (;;) {
1013                 if (unlikely(!pipe->readers)) {
1014                         send_sig(SIGPIPE, current, 0);
1015                         return -EPIPE;
1016                 }
1017                 if (!pipe_full(pipe->head, pipe->tail, pipe->max_usage))
1018                         return 0;
1019                 if (flags & SPLICE_F_NONBLOCK)
1020                         return -EAGAIN;
1021                 if (signal_pending(current))
1022                         return -ERESTARTSYS;
1023                 pipe_wait_writable(pipe);
1024         }
1025 }
1026
1027 static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
1028                                struct pipe_inode_info *opipe,
1029                                size_t len, unsigned int flags);
1030
1031 long splice_file_to_pipe(struct file *in,
1032                          struct pipe_inode_info *opipe,
1033                          loff_t *offset,
1034                          size_t len, unsigned int flags)
1035 {
1036         long ret;
1037
1038         pipe_lock(opipe);
1039         ret = wait_for_space(opipe, flags);
1040         if (!ret)
1041                 ret = do_splice_to(in, offset, opipe, len, flags);
1042         pipe_unlock(opipe);
1043         if (ret > 0)
1044                 wakeup_pipe_readers(opipe);
1045         return ret;
1046 }
1047
1048 /*
1049  * Determine where to splice to/from.
1050  */
1051 long do_splice(struct file *in, loff_t *off_in, struct file *out,
1052                loff_t *off_out, size_t len, unsigned int flags)
1053 {
1054         struct pipe_inode_info *ipipe;
1055         struct pipe_inode_info *opipe;
1056         loff_t offset;
1057         long ret;
1058
1059         if (unlikely(!(in->f_mode & FMODE_READ) ||
1060                      !(out->f_mode & FMODE_WRITE)))
1061                 return -EBADF;
1062
1063         ipipe = get_pipe_info(in, true);
1064         opipe = get_pipe_info(out, true);
1065
1066         if (ipipe && opipe) {
1067                 if (off_in || off_out)
1068                         return -ESPIPE;
1069
1070                 /* Splicing to self would be fun, but... */
1071                 if (ipipe == opipe)
1072                         return -EINVAL;
1073
1074                 if ((in->f_flags | out->f_flags) & O_NONBLOCK)
1075                         flags |= SPLICE_F_NONBLOCK;
1076
1077                 return splice_pipe_to_pipe(ipipe, opipe, len, flags);
1078         }
1079
1080         if (ipipe) {
1081                 if (off_in)
1082                         return -ESPIPE;
1083                 if (off_out) {
1084                         if (!(out->f_mode & FMODE_PWRITE))
1085                                 return -EINVAL;
1086                         offset = *off_out;
1087                 } else {
1088                         offset = out->f_pos;
1089                 }
1090
1091                 if (unlikely(out->f_flags & O_APPEND))
1092                         return -EINVAL;
1093
1094                 ret = rw_verify_area(WRITE, out, &offset, len);
1095                 if (unlikely(ret < 0))
1096                         return ret;
1097
1098                 if (in->f_flags & O_NONBLOCK)
1099                         flags |= SPLICE_F_NONBLOCK;
1100
1101                 file_start_write(out);
1102                 ret = do_splice_from(ipipe, out, &offset, len, flags);
1103                 file_end_write(out);
1104
1105                 if (!off_out)
1106                         out->f_pos = offset;
1107                 else
1108                         *off_out = offset;
1109
1110                 return ret;
1111         }
1112
1113         if (opipe) {
1114                 if (off_out)
1115                         return -ESPIPE;
1116                 if (off_in) {
1117                         if (!(in->f_mode & FMODE_PREAD))
1118                                 return -EINVAL;
1119                         offset = *off_in;
1120                 } else {
1121                         offset = in->f_pos;
1122                 }
1123
1124                 if (out->f_flags & O_NONBLOCK)
1125                         flags |= SPLICE_F_NONBLOCK;
1126
1127                 ret = splice_file_to_pipe(in, opipe, &offset, len, flags);
1128                 if (!off_in)
1129                         in->f_pos = offset;
1130                 else
1131                         *off_in = offset;
1132
1133                 return ret;
1134         }
1135
1136         return -EINVAL;
1137 }
1138
1139 static long __do_splice(struct file *in, loff_t __user *off_in,
1140                         struct file *out, loff_t __user *off_out,
1141                         size_t len, unsigned int flags)
1142 {
1143         struct pipe_inode_info *ipipe;
1144         struct pipe_inode_info *opipe;
1145         loff_t offset, *__off_in = NULL, *__off_out = NULL;
1146         long ret;
1147
1148         ipipe = get_pipe_info(in, true);
1149         opipe = get_pipe_info(out, true);
1150
1151         if (ipipe && off_in)
1152                 return -ESPIPE;
1153         if (opipe && off_out)
1154                 return -ESPIPE;
1155
1156         if (off_out) {
1157                 if (copy_from_user(&offset, off_out, sizeof(loff_t)))
1158                         return -EFAULT;
1159                 __off_out = &offset;
1160         }
1161         if (off_in) {
1162                 if (copy_from_user(&offset, off_in, sizeof(loff_t)))
1163                         return -EFAULT;
1164                 __off_in = &offset;
1165         }
1166
1167         ret = do_splice(in, __off_in, out, __off_out, len, flags);
1168         if (ret < 0)
1169                 return ret;
1170
1171         if (__off_out && copy_to_user(off_out, __off_out, sizeof(loff_t)))
1172                 return -EFAULT;
1173         if (__off_in && copy_to_user(off_in, __off_in, sizeof(loff_t)))
1174                 return -EFAULT;
1175
1176         return ret;
1177 }
1178
1179 static int iter_to_pipe(struct iov_iter *from,
1180                         struct pipe_inode_info *pipe,
1181                         unsigned flags)
1182 {
1183         struct pipe_buffer buf = {
1184                 .ops = &user_page_pipe_buf_ops,
1185                 .flags = flags
1186         };
1187         size_t total = 0;
1188         int ret = 0;
1189
1190         while (iov_iter_count(from)) {
1191                 struct page *pages[16];
1192                 ssize_t left;
1193                 size_t start;
1194                 int i, n;
1195
1196                 left = iov_iter_get_pages2(from, pages, ~0UL, 16, &start);
1197                 if (left <= 0) {
1198                         ret = left;
1199                         break;
1200                 }
1201
1202                 n = DIV_ROUND_UP(left + start, PAGE_SIZE);
1203                 for (i = 0; i < n; i++) {
1204                         int size = min_t(int, left, PAGE_SIZE - start);
1205
1206                         buf.page = pages[i];
1207                         buf.offset = start;
1208                         buf.len = size;
1209                         ret = add_to_pipe(pipe, &buf);
1210                         if (unlikely(ret < 0)) {
1211                                 iov_iter_revert(from, left);
1212                                 // this one got dropped by add_to_pipe()
1213                                 while (++i < n)
1214                                         put_page(pages[i]);
1215                                 goto out;
1216                         }
1217                         total += ret;
1218                         left -= size;
1219                         start = 0;
1220                 }
1221         }
1222 out:
1223         return total ? total : ret;
1224 }
1225
1226 static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
1227                         struct splice_desc *sd)
1228 {
1229         int n = copy_page_to_iter(buf->page, buf->offset, sd->len, sd->u.data);
1230         return n == sd->len ? n : -EFAULT;
1231 }
1232
1233 /*
1234  * For lack of a better implementation, implement vmsplice() to userspace
1235  * as a simple copy of the pipes pages to the user iov.
1236  */
1237 static long vmsplice_to_user(struct file *file, struct iov_iter *iter,
1238                              unsigned int flags)
1239 {
1240         struct pipe_inode_info *pipe = get_pipe_info(file, true);
1241         struct splice_desc sd = {
1242                 .total_len = iov_iter_count(iter),
1243                 .flags = flags,
1244                 .u.data = iter
1245         };
1246         long ret = 0;
1247
1248         if (!pipe)
1249                 return -EBADF;
1250
1251         if (sd.total_len) {
1252                 pipe_lock(pipe);
1253                 ret = __splice_from_pipe(pipe, &sd, pipe_to_user);
1254                 pipe_unlock(pipe);
1255         }
1256
1257         return ret;
1258 }
1259
1260 /*
1261  * vmsplice splices a user address range into a pipe. It can be thought of
1262  * as splice-from-memory, where the regular splice is splice-from-file (or
1263  * to file). In both cases the output is a pipe, naturally.
1264  */
1265 static long vmsplice_to_pipe(struct file *file, struct iov_iter *iter,
1266                              unsigned int flags)
1267 {
1268         struct pipe_inode_info *pipe;
1269         long ret = 0;
1270         unsigned buf_flag = 0;
1271
1272         if (flags & SPLICE_F_GIFT)
1273                 buf_flag = PIPE_BUF_FLAG_GIFT;
1274
1275         pipe = get_pipe_info(file, true);
1276         if (!pipe)
1277                 return -EBADF;
1278
1279         pipe_lock(pipe);
1280         ret = wait_for_space(pipe, flags);
1281         if (!ret)
1282                 ret = iter_to_pipe(iter, pipe, buf_flag);
1283         pipe_unlock(pipe);
1284         if (ret > 0)
1285                 wakeup_pipe_readers(pipe);
1286         return ret;
1287 }
1288
1289 static int vmsplice_type(struct fd f, int *type)
1290 {
1291         if (!f.file)
1292                 return -EBADF;
1293         if (f.file->f_mode & FMODE_WRITE) {
1294                 *type = ITER_SOURCE;
1295         } else if (f.file->f_mode & FMODE_READ) {
1296                 *type = ITER_DEST;
1297         } else {
1298                 fdput(f);
1299                 return -EBADF;
1300         }
1301         return 0;
1302 }
1303
1304 /*
1305  * Note that vmsplice only really supports true splicing _from_ user memory
1306  * to a pipe, not the other way around. Splicing from user memory is a simple
1307  * operation that can be supported without any funky alignment restrictions
1308  * or nasty vm tricks. We simply map in the user memory and fill them into
1309  * a pipe. The reverse isn't quite as easy, though. There are two possible
1310  * solutions for that:
1311  *
1312  *      - memcpy() the data internally, at which point we might as well just
1313  *        do a regular read() on the buffer anyway.
1314  *      - Lots of nasty vm tricks, that are neither fast nor flexible (it
1315  *        has restriction limitations on both ends of the pipe).
1316  *
1317  * Currently we punt and implement it as a normal copy, see pipe_to_user().
1318  *
1319  */
1320 SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, uiov,
1321                 unsigned long, nr_segs, unsigned int, flags)
1322 {
1323         struct iovec iovstack[UIO_FASTIOV];
1324         struct iovec *iov = iovstack;
1325         struct iov_iter iter;
1326         ssize_t error;
1327         struct fd f;
1328         int type;
1329
1330         if (unlikely(flags & ~SPLICE_F_ALL))
1331                 return -EINVAL;
1332
1333         f = fdget(fd);
1334         error = vmsplice_type(f, &type);
1335         if (error)
1336                 return error;
1337
1338         error = import_iovec(type, uiov, nr_segs,
1339                              ARRAY_SIZE(iovstack), &iov, &iter);
1340         if (error < 0)
1341                 goto out_fdput;
1342
1343         if (!iov_iter_count(&iter))
1344                 error = 0;
1345         else if (type == ITER_SOURCE)
1346                 error = vmsplice_to_pipe(f.file, &iter, flags);
1347         else
1348                 error = vmsplice_to_user(f.file, &iter, flags);
1349
1350         kfree(iov);
1351 out_fdput:
1352         fdput(f);
1353         return error;
1354 }
1355
1356 SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in,
1357                 int, fd_out, loff_t __user *, off_out,
1358                 size_t, len, unsigned int, flags)
1359 {
1360         struct fd in, out;
1361         long error;
1362
1363         if (unlikely(!len))
1364                 return 0;
1365
1366         if (unlikely(flags & ~SPLICE_F_ALL))
1367                 return -EINVAL;
1368
1369         error = -EBADF;
1370         in = fdget(fd_in);
1371         if (in.file) {
1372                 out = fdget(fd_out);
1373                 if (out.file) {
1374                         error = __do_splice(in.file, off_in, out.file, off_out,
1375                                                 len, flags);
1376                         fdput(out);
1377                 }
1378                 fdput(in);
1379         }
1380         return error;
1381 }
1382
1383 /*
1384  * Make sure there's data to read. Wait for input if we can, otherwise
1385  * return an appropriate error.
1386  */
1387 static int ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
1388 {
1389         int ret;
1390
1391         /*
1392          * Check the pipe occupancy without the inode lock first. This function
1393          * is speculative anyways, so missing one is ok.
1394          */
1395         if (!pipe_empty(pipe->head, pipe->tail))
1396                 return 0;
1397
1398         ret = 0;
1399         pipe_lock(pipe);
1400
1401         while (pipe_empty(pipe->head, pipe->tail)) {
1402                 if (signal_pending(current)) {
1403                         ret = -ERESTARTSYS;
1404                         break;
1405                 }
1406                 if (!pipe->writers)
1407                         break;
1408                 if (flags & SPLICE_F_NONBLOCK) {
1409                         ret = -EAGAIN;
1410                         break;
1411                 }
1412                 pipe_wait_readable(pipe);
1413         }
1414
1415         pipe_unlock(pipe);
1416         return ret;
1417 }
1418
1419 /*
1420  * Make sure there's writeable room. Wait for room if we can, otherwise
1421  * return an appropriate error.
1422  */
1423 static int opipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
1424 {
1425         int ret;
1426
1427         /*
1428          * Check pipe occupancy without the inode lock first. This function
1429          * is speculative anyways, so missing one is ok.
1430          */
1431         if (!pipe_full(pipe->head, pipe->tail, pipe->max_usage))
1432                 return 0;
1433
1434         ret = 0;
1435         pipe_lock(pipe);
1436
1437         while (pipe_full(pipe->head, pipe->tail, pipe->max_usage)) {
1438                 if (!pipe->readers) {
1439                         send_sig(SIGPIPE, current, 0);
1440                         ret = -EPIPE;
1441                         break;
1442                 }
1443                 if (flags & SPLICE_F_NONBLOCK) {
1444                         ret = -EAGAIN;
1445                         break;
1446                 }
1447                 if (signal_pending(current)) {
1448                         ret = -ERESTARTSYS;
1449                         break;
1450                 }
1451                 pipe_wait_writable(pipe);
1452         }
1453
1454         pipe_unlock(pipe);
1455         return ret;
1456 }
1457
1458 /*
1459  * Splice contents of ipipe to opipe.
1460  */
1461 static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
1462                                struct pipe_inode_info *opipe,
1463                                size_t len, unsigned int flags)
1464 {
1465         struct pipe_buffer *ibuf, *obuf;
1466         unsigned int i_head, o_head;
1467         unsigned int i_tail, o_tail;
1468         unsigned int i_mask, o_mask;
1469         int ret = 0;
1470         bool input_wakeup = false;
1471
1472
1473 retry:
1474         ret = ipipe_prep(ipipe, flags);
1475         if (ret)
1476                 return ret;
1477
1478         ret = opipe_prep(opipe, flags);
1479         if (ret)
1480                 return ret;
1481
1482         /*
1483          * Potential ABBA deadlock, work around it by ordering lock
1484          * grabbing by pipe info address. Otherwise two different processes
1485          * could deadlock (one doing tee from A -> B, the other from B -> A).
1486          */
1487         pipe_double_lock(ipipe, opipe);
1488
1489         i_tail = ipipe->tail;
1490         i_mask = ipipe->ring_size - 1;
1491         o_head = opipe->head;
1492         o_mask = opipe->ring_size - 1;
1493
1494         do {
1495                 size_t o_len;
1496
1497                 if (!opipe->readers) {
1498                         send_sig(SIGPIPE, current, 0);
1499                         if (!ret)
1500                                 ret = -EPIPE;
1501                         break;
1502                 }
1503
1504                 i_head = ipipe->head;
1505                 o_tail = opipe->tail;
1506
1507                 if (pipe_empty(i_head, i_tail) && !ipipe->writers)
1508                         break;
1509
1510                 /*
1511                  * Cannot make any progress, because either the input
1512                  * pipe is empty or the output pipe is full.
1513                  */
1514                 if (pipe_empty(i_head, i_tail) ||
1515                     pipe_full(o_head, o_tail, opipe->max_usage)) {
1516                         /* Already processed some buffers, break */
1517                         if (ret)
1518                                 break;
1519
1520                         if (flags & SPLICE_F_NONBLOCK) {
1521                                 ret = -EAGAIN;
1522                                 break;
1523                         }
1524
1525                         /*
1526                          * We raced with another reader/writer and haven't
1527                          * managed to process any buffers.  A zero return
1528                          * value means EOF, so retry instead.
1529                          */
1530                         pipe_unlock(ipipe);
1531                         pipe_unlock(opipe);
1532                         goto retry;
1533                 }
1534
1535                 ibuf = &ipipe->bufs[i_tail & i_mask];
1536                 obuf = &opipe->bufs[o_head & o_mask];
1537
1538                 if (len >= ibuf->len) {
1539                         /*
1540                          * Simply move the whole buffer from ipipe to opipe
1541                          */
1542                         *obuf = *ibuf;
1543                         ibuf->ops = NULL;
1544                         i_tail++;
1545                         ipipe->tail = i_tail;
1546                         input_wakeup = true;
1547                         o_len = obuf->len;
1548                         o_head++;
1549                         opipe->head = o_head;
1550                 } else {
1551                         /*
1552                          * Get a reference to this pipe buffer,
1553                          * so we can copy the contents over.
1554                          */
1555                         if (!pipe_buf_get(ipipe, ibuf)) {
1556                                 if (ret == 0)
1557                                         ret = -EFAULT;
1558                                 break;
1559                         }
1560                         *obuf = *ibuf;
1561
1562                         /*
1563                          * Don't inherit the gift and merge flags, we need to
1564                          * prevent multiple steals of this page.
1565                          */
1566                         obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
1567                         obuf->flags &= ~PIPE_BUF_FLAG_CAN_MERGE;
1568
1569                         obuf->len = len;
1570                         ibuf->offset += len;
1571                         ibuf->len -= len;
1572                         o_len = len;
1573                         o_head++;
1574                         opipe->head = o_head;
1575                 }
1576                 ret += o_len;
1577                 len -= o_len;
1578         } while (len);
1579
1580         pipe_unlock(ipipe);
1581         pipe_unlock(opipe);
1582
1583         /*
1584          * If we put data in the output pipe, wakeup any potential readers.
1585          */
1586         if (ret > 0)
1587                 wakeup_pipe_readers(opipe);
1588
1589         if (input_wakeup)
1590                 wakeup_pipe_writers(ipipe);
1591
1592         return ret;
1593 }
1594
1595 /*
1596  * Link contents of ipipe to opipe.
1597  */
1598 static int link_pipe(struct pipe_inode_info *ipipe,
1599                      struct pipe_inode_info *opipe,
1600                      size_t len, unsigned int flags)
1601 {
1602         struct pipe_buffer *ibuf, *obuf;
1603         unsigned int i_head, o_head;
1604         unsigned int i_tail, o_tail;
1605         unsigned int i_mask, o_mask;
1606         int ret = 0;
1607
1608         /*
1609          * Potential ABBA deadlock, work around it by ordering lock
1610          * grabbing by pipe info address. Otherwise two different processes
1611          * could deadlock (one doing tee from A -> B, the other from B -> A).
1612          */
1613         pipe_double_lock(ipipe, opipe);
1614
1615         i_tail = ipipe->tail;
1616         i_mask = ipipe->ring_size - 1;
1617         o_head = opipe->head;
1618         o_mask = opipe->ring_size - 1;
1619
1620         do {
1621                 if (!opipe->readers) {
1622                         send_sig(SIGPIPE, current, 0);
1623                         if (!ret)
1624                                 ret = -EPIPE;
1625                         break;
1626                 }
1627
1628                 i_head = ipipe->head;
1629                 o_tail = opipe->tail;
1630
1631                 /*
1632                  * If we have iterated all input buffers or run out of
1633                  * output room, break.
1634                  */
1635                 if (pipe_empty(i_head, i_tail) ||
1636                     pipe_full(o_head, o_tail, opipe->max_usage))
1637                         break;
1638
1639                 ibuf = &ipipe->bufs[i_tail & i_mask];
1640                 obuf = &opipe->bufs[o_head & o_mask];
1641
1642                 /*
1643                  * Get a reference to this pipe buffer,
1644                  * so we can copy the contents over.
1645                  */
1646                 if (!pipe_buf_get(ipipe, ibuf)) {
1647                         if (ret == 0)
1648                                 ret = -EFAULT;
1649                         break;
1650                 }
1651
1652                 *obuf = *ibuf;
1653
1654                 /*
1655                  * Don't inherit the gift and merge flag, we need to prevent
1656                  * multiple steals of this page.
1657                  */
1658                 obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
1659                 obuf->flags &= ~PIPE_BUF_FLAG_CAN_MERGE;
1660
1661                 if (obuf->len > len)
1662                         obuf->len = len;
1663                 ret += obuf->len;
1664                 len -= obuf->len;
1665
1666                 o_head++;
1667                 opipe->head = o_head;
1668                 i_tail++;
1669         } while (len);
1670
1671         pipe_unlock(ipipe);
1672         pipe_unlock(opipe);
1673
1674         /*
1675          * If we put data in the output pipe, wakeup any potential readers.
1676          */
1677         if (ret > 0)
1678                 wakeup_pipe_readers(opipe);
1679
1680         return ret;
1681 }
1682
1683 /*
1684  * This is a tee(1) implementation that works on pipes. It doesn't copy
1685  * any data, it simply references the 'in' pages on the 'out' pipe.
1686  * The 'flags' used are the SPLICE_F_* variants, currently the only
1687  * applicable one is SPLICE_F_NONBLOCK.
1688  */
1689 long do_tee(struct file *in, struct file *out, size_t len, unsigned int flags)
1690 {
1691         struct pipe_inode_info *ipipe = get_pipe_info(in, true);
1692         struct pipe_inode_info *opipe = get_pipe_info(out, true);
1693         int ret = -EINVAL;
1694
1695         if (unlikely(!(in->f_mode & FMODE_READ) ||
1696                      !(out->f_mode & FMODE_WRITE)))
1697                 return -EBADF;
1698
1699         /*
1700          * Duplicate the contents of ipipe to opipe without actually
1701          * copying the data.
1702          */
1703         if (ipipe && opipe && ipipe != opipe) {
1704                 if ((in->f_flags | out->f_flags) & O_NONBLOCK)
1705                         flags |= SPLICE_F_NONBLOCK;
1706
1707                 /*
1708                  * Keep going, unless we encounter an error. The ipipe/opipe
1709                  * ordering doesn't really matter.
1710                  */
1711                 ret = ipipe_prep(ipipe, flags);
1712                 if (!ret) {
1713                         ret = opipe_prep(opipe, flags);
1714                         if (!ret)
1715                                 ret = link_pipe(ipipe, opipe, len, flags);
1716                 }
1717         }
1718
1719         return ret;
1720 }
1721
1722 SYSCALL_DEFINE4(tee, int, fdin, int, fdout, size_t, len, unsigned int, flags)
1723 {
1724         struct fd in, out;
1725         int error;
1726
1727         if (unlikely(flags & ~SPLICE_F_ALL))
1728                 return -EINVAL;
1729
1730         if (unlikely(!len))
1731                 return 0;
1732
1733         error = -EBADF;
1734         in = fdget(fdin);
1735         if (in.file) {
1736                 out = fdget(fdout);
1737                 if (out.file) {
1738                         error = do_tee(in.file, out.file, len, flags);
1739                         fdput(out);
1740                 }
1741                 fdput(in);
1742         }
1743
1744         return error;
1745 }