]> git.ipfire.org Git - thirdparty/kernel/linux.git/blob - fs/splice.c
7cda013e5a1ef1f5781f6e1d6774c8865a30ab0d
[thirdparty/kernel/linux.git] / fs / splice.c
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3 * "splice": joining two ropes together by interweaving their strands.
4 *
5 * This is the "extended pipe" functionality, where a pipe is used as
6 * an arbitrary in-memory buffer. Think of a pipe as a small kernel
7 * buffer that you can use to transfer data from one end to the other.
8 *
9 * The traditional unix read/write is extended with a "splice()" operation
10 * that transfers data buffers to or from a pipe buffer.
11 *
12 * Named by Larry McVoy, original implementation from Linus, extended by
13 * Jens to support splicing to files, network, direct splicing, etc and
14 * fixing lots of bugs.
15 *
16 * Copyright (C) 2005-2006 Jens Axboe <axboe@kernel.dk>
17 * Copyright (C) 2005-2006 Linus Torvalds <torvalds@osdl.org>
18 * Copyright (C) 2006 Ingo Molnar <mingo@elte.hu>
19 *
20 */
21 #include <linux/bvec.h>
22 #include <linux/fs.h>
23 #include <linux/file.h>
24 #include <linux/pagemap.h>
25 #include <linux/splice.h>
26 #include <linux/memcontrol.h>
27 #include <linux/mm_inline.h>
28 #include <linux/swap.h>
29 #include <linux/writeback.h>
30 #include <linux/export.h>
31 #include <linux/syscalls.h>
32 #include <linux/uio.h>
33 #include <linux/fsnotify.h>
34 #include <linux/security.h>
35 #include <linux/gfp.h>
36 #include <linux/net.h>
37 #include <linux/socket.h>
38 #include <linux/sched/signal.h>
39
40 #include "internal.h"
41
42 /*
43 * Splice doesn't support FMODE_NOWAIT. Since pipes may set this flag to
44 * indicate they support non-blocking reads or writes, we must clear it
45 * here if set to avoid blocking other users of this pipe if splice is
46 * being done on it.
47 */
48 static noinline void noinline pipe_clear_nowait(struct file *file)
49 {
50 fmode_t fmode = READ_ONCE(file->f_mode);
51
52 do {
53 if (!(fmode & FMODE_NOWAIT))
54 break;
55 } while (!try_cmpxchg(&file->f_mode, &fmode, fmode & ~FMODE_NOWAIT));
56 }
57
58 /*
59 * Attempt to steal a page from a pipe buffer. This should perhaps go into
60 * a vm helper function, it's already simplified quite a bit by the
61 * addition of remove_mapping(). If success is returned, the caller may
62 * attempt to reuse this page for another destination.
63 */
static bool page_cache_pipe_buf_try_steal(struct pipe_inode_info *pipe,
		struct pipe_buffer *buf)
{
	struct folio *folio = page_folio(buf->page);
	struct address_space *mapping;

	/* Lock the folio for the duration of the steal attempt. */
	folio_lock(folio);

	mapping = folio_mapping(folio);
	if (mapping) {
		WARN_ON(!folio_test_uptodate(folio));

		/*
		 * At least for ext2 with nobh option, we need to wait on
		 * writeback completing on this folio, since we'll remove it
		 * from the pagecache.  Otherwise truncate wont wait on the
		 * folio, allowing the disk blocks to be reused by someone else
		 * before we actually wrote our data to them. fs corruption
		 * ensues.
		 */
		folio_wait_writeback(folio);

		/* Private (fs) data still attached and unreleasable: give up. */
		if (!filemap_release_folio(folio, GFP_KERNEL))
			goto out_unlock;

		/*
		 * If we succeeded in removing the mapping, set LRU flag
		 * and return good.  Note: the folio is returned LOCKED on
		 * success — the caller owns it now.
		 */
		if (remove_mapping(mapping, folio)) {
			buf->flags |= PIPE_BUF_FLAG_LRU;
			return true;
		}
	}

	/*
	 * Raced with truncate or failed to remove folio from current
	 * address space, unlock and return failure.
	 */
out_unlock:
	folio_unlock(folio);
	return false;
}
107
108 static void page_cache_pipe_buf_release(struct pipe_inode_info *pipe,
109 struct pipe_buffer *buf)
110 {
111 put_page(buf->page);
112 buf->flags &= ~PIPE_BUF_FLAG_LRU;
113 }
114
115 /*
116 * Check whether the contents of buf is OK to access. Since the content
117 * is a page cache page, IO may be in flight.
118 */
119 static int page_cache_pipe_buf_confirm(struct pipe_inode_info *pipe,
120 struct pipe_buffer *buf)
121 {
122 struct folio *folio = page_folio(buf->page);
123 int err;
124
125 if (!folio_test_uptodate(folio)) {
126 folio_lock(folio);
127
128 /*
129 * Folio got truncated/unhashed. This will cause a 0-byte
130 * splice, if this is the first page.
131 */
132 if (!folio->mapping) {
133 err = -ENODATA;
134 goto error;
135 }
136
137 /*
138 * Uh oh, read-error from disk.
139 */
140 if (!folio_test_uptodate(folio)) {
141 err = -EIO;
142 goto error;
143 }
144
145 /* Folio is ok after all, we are done */
146 folio_unlock(folio);
147 }
148
149 return 0;
150 error:
151 folio_unlock(folio);
152 return err;
153 }
154
/* Buffer ops for pipe buffers that reference page-cache pages directly. */
const struct pipe_buf_operations page_cache_pipe_buf_ops = {
	.confirm	= page_cache_pipe_buf_confirm,
	.release	= page_cache_pipe_buf_release,
	.try_steal	= page_cache_pipe_buf_try_steal,
	.get		= generic_pipe_buf_get,
};
161
162 static bool user_page_pipe_buf_try_steal(struct pipe_inode_info *pipe,
163 struct pipe_buffer *buf)
164 {
165 if (!(buf->flags & PIPE_BUF_FLAG_GIFT))
166 return false;
167
168 buf->flags |= PIPE_BUF_FLAG_LRU;
169 return generic_pipe_buf_try_steal(pipe, buf);
170 }
171
/* Buffer ops for pipe buffers backed by user pages (vmsplice and friends). */
static const struct pipe_buf_operations user_page_pipe_buf_ops = {
	.release	= page_cache_pipe_buf_release,
	.try_steal	= user_page_pipe_buf_try_steal,
	.get		= generic_pipe_buf_get,
};
177
/* Wake up anyone waiting to read from @pipe, and post SIGIO to readers. */
static void wakeup_pipe_readers(struct pipe_inode_info *pipe)
{
	/*
	 * Full barrier before waitqueue_active(): order our ring updates
	 * against the waiter's queue insertion, so we cannot miss a sleeper.
	 */
	smp_mb();
	if (waitqueue_active(&pipe->rd_wait))
		wake_up_interruptible(&pipe->rd_wait);
	kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
}
185
186 /**
187 * splice_to_pipe - fill passed data into a pipe
188 * @pipe: pipe to fill
189 * @spd: data to fill
190 *
191 * Description:
192 * @spd contains a map of pages and len/offset tuples, along with
193 * the struct pipe_buf_operations associated with these pages. This
194 * function will link that data to the pipe.
195 *
196 */
ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
		       struct splice_pipe_desc *spd)
{
	unsigned int spd_pages = spd->nr_pages;
	unsigned int tail = pipe->tail;
	unsigned int head = pipe->head;
	unsigned int mask = pipe->ring_size - 1;
	int ret = 0, page_nr = 0;

	if (!spd_pages)
		return 0;

	/* No readers left: raise SIGPIPE like a regular pipe write would. */
	if (unlikely(!pipe->readers)) {
		send_sig(SIGPIPE, current, 0);
		ret = -EPIPE;
		goto out;
	}

	/* Fill ring slots until the pipe is full or spd is exhausted. */
	while (!pipe_full(head, tail, pipe->max_usage)) {
		struct pipe_buffer *buf = &pipe->bufs[head & mask];

		buf->page = spd->pages[page_nr];
		buf->offset = spd->partial[page_nr].offset;
		buf->len = spd->partial[page_nr].len;
		buf->private = spd->partial[page_nr].private;
		buf->ops = spd->ops;
		buf->flags = 0;

		/* Publish each slot as soon as it is complete. */
		head++;
		pipe->head = head;
		page_nr++;
		ret += buf->len;

		if (!--spd->nr_pages)
			break;
	}

	/* Pipe was already full: tell the caller to retry. */
	if (!ret)
		ret = -EAGAIN;

out:
	/* Release any pages that did not make it into the pipe. */
	while (page_nr < spd_pages)
		spd->spd_release(spd, page_nr++);

	return ret;
}
EXPORT_SYMBOL_GPL(splice_to_pipe);
244
245 ssize_t add_to_pipe(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
246 {
247 unsigned int head = pipe->head;
248 unsigned int tail = pipe->tail;
249 unsigned int mask = pipe->ring_size - 1;
250 int ret;
251
252 if (unlikely(!pipe->readers)) {
253 send_sig(SIGPIPE, current, 0);
254 ret = -EPIPE;
255 } else if (pipe_full(head, tail, pipe->max_usage)) {
256 ret = -EAGAIN;
257 } else {
258 pipe->bufs[head & mask] = *buf;
259 pipe->head = head + 1;
260 return buf->len;
261 }
262 pipe_buf_release(pipe, buf);
263 return ret;
264 }
265 EXPORT_SYMBOL(add_to_pipe);
266
267 /*
268 * Check if we need to grow the arrays holding pages and partial page
269 * descriptions.
270 */
271 int splice_grow_spd(const struct pipe_inode_info *pipe, struct splice_pipe_desc *spd)
272 {
273 unsigned int max_usage = READ_ONCE(pipe->max_usage);
274
275 spd->nr_pages_max = max_usage;
276 if (max_usage <= PIPE_DEF_BUFFERS)
277 return 0;
278
279 spd->pages = kmalloc_array(max_usage, sizeof(struct page *), GFP_KERNEL);
280 spd->partial = kmalloc_array(max_usage, sizeof(struct partial_page),
281 GFP_KERNEL);
282
283 if (spd->pages && spd->partial)
284 return 0;
285
286 kfree(spd->pages);
287 kfree(spd->partial);
288 return -ENOMEM;
289 }
290
291 void splice_shrink_spd(struct splice_pipe_desc *spd)
292 {
293 if (spd->nr_pages_max <= PIPE_DEF_BUFFERS)
294 return;
295
296 kfree(spd->pages);
297 kfree(spd->partial);
298 }
299
300 /**
301 * copy_splice_read - Copy data from a file and splice the copy into a pipe
302 * @in: The file to read from
303 * @ppos: Pointer to the file position to read from
304 * @pipe: The pipe to splice into
305 * @len: The amount to splice
306 * @flags: The SPLICE_F_* flags
307 *
308 * This function allocates a bunch of pages sufficient to hold the requested
309 * amount of data (but limited by the remaining pipe capacity), passes it to
310 * the file's ->read_iter() to read into and then splices the used pages into
311 * the pipe.
312 *
313 * Return: On success, the number of bytes read will be returned and *@ppos
314 * will be updated if appropriate; 0 will be returned if there is no more data
315 * to be read; -EAGAIN will be returned if the pipe had no space, and some
316 * other negative error code will be returned on error. A short read may occur
317 * if the pipe has insufficient space, we reach the end of the data or we hit a
318 * hole.
319 */
ssize_t copy_splice_read(struct file *in, loff_t *ppos,
			 struct pipe_inode_info *pipe,
			 size_t len, unsigned int flags)
{
	struct iov_iter to;
	struct bio_vec *bv;
	struct kiocb kiocb;
	struct page **pages;
	ssize_t ret;
	size_t used, npages, chunk, remain, keep = 0;
	int i;

	/* Work out how much data we can actually add into the pipe */
	used = pipe_occupancy(pipe->head, pipe->tail);
	npages = max_t(ssize_t, pipe->max_usage - used, 0);
	len = min_t(size_t, len, npages * PAGE_SIZE);
	npages = DIV_ROUND_UP(len, PAGE_SIZE);

	/*
	 * One allocation carries both the bio_vec array and the page-pointer
	 * array; the latter lives directly after the former (see below).
	 */
	bv = kzalloc(array_size(npages, sizeof(bv[0])) +
		     array_size(npages, sizeof(struct page *)), GFP_KERNEL);
	if (!bv)
		return -ENOMEM;

	pages = (struct page **)(bv + npages);
	/* Bulk allocation may return fewer pages than requested. */
	npages = alloc_pages_bulk_array(GFP_USER, npages, pages);
	if (!npages) {
		kfree(bv);
		return -ENOMEM;
	}

	/* Re-clamp len to the pages we actually obtained. */
	remain = len = min_t(size_t, len, npages * PAGE_SIZE);

	/* Map each page into a bvec; only the last one may be partial. */
	for (i = 0; i < npages; i++) {
		chunk = min_t(size_t, PAGE_SIZE, remain);
		bv[i].bv_page = pages[i];
		bv[i].bv_offset = 0;
		bv[i].bv_len = chunk;
		remain -= chunk;
	}

	/* Do the I/O */
	iov_iter_bvec(&to, ITER_DEST, bv, npages, len);
	init_sync_kiocb(&kiocb, in);
	kiocb.ki_pos = *ppos;
	ret = call_read_iter(in, &kiocb, &to);

	if (ret > 0) {
		/* keep = number of pages that received at least one byte. */
		keep = DIV_ROUND_UP(ret, PAGE_SIZE);
		*ppos = kiocb.ki_pos;
	}

	/*
	 * Callers of ->splice_read() expect -EAGAIN on "can't put anything in
	 * there", rather than -EFAULT.
	 */
	if (ret == -EFAULT)
		ret = -EAGAIN;

	/* Free any pages that didn't get touched at all. */
	if (keep < npages)
		release_pages(pages + keep, npages - keep);

	/* Push the remaining pages into the pipe. */
	remain = ret;
	for (i = 0; i < keep; i++) {
		struct pipe_buffer *buf = pipe_head_buf(pipe);

		chunk = min_t(size_t, remain, PAGE_SIZE);
		*buf = (struct pipe_buffer) {
			.ops = &default_pipe_buf_ops,
			.page = bv[i].bv_page,
			.offset = 0,
			.len = chunk,
		};
		pipe->head++;
		remain -= chunk;
	}

	kfree(bv);
	return ret;
}
EXPORT_SYMBOL(copy_splice_read);
402
/* Default ops for pipe buffers holding privately allocated pages. */
const struct pipe_buf_operations default_pipe_buf_ops = {
	.release	= generic_pipe_buf_release,
	.try_steal	= generic_pipe_buf_try_steal,
	.get		= generic_pipe_buf_get,
};
408
409 /* Pipe buffer operations for a socket and similar. */
/* Like default_pipe_buf_ops, but with the .try_steal hook deliberately unset. */
const struct pipe_buf_operations nosteal_pipe_buf_ops = {
	.release	= generic_pipe_buf_release,
	.get		= generic_pipe_buf_get,
};
EXPORT_SYMBOL(nosteal_pipe_buf_ops);
415
/* Wake up anyone waiting to write into @pipe, and post SIGIO to writers. */
static void wakeup_pipe_writers(struct pipe_inode_info *pipe)
{
	/*
	 * Full barrier before waitqueue_active(): pairs with the waiter's
	 * enqueue so an emptied ring is visible before we test the queue.
	 */
	smp_mb();
	if (waitqueue_active(&pipe->wr_wait))
		wake_up_interruptible(&pipe->wr_wait);
	kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
}
423
424 /**
425 * splice_from_pipe_feed - feed available data from a pipe to a file
426 * @pipe: pipe to splice from
427 * @sd: information to @actor
428 * @actor: handler that splices the data
429 *
430 * Description:
431 * This function loops over the pipe and calls @actor to do the
432 * actual moving of a single struct pipe_buffer to the desired
433 * destination. It returns when there's no more buffers left in
434 * the pipe or if the requested number of bytes (@sd->total_len)
435 * have been copied. It returns a positive number (one) if the
436 * pipe needs to be filled with more data, zero if the required
437 * number of bytes have been copied and -errno on error.
438 *
439 * This, together with splice_from_pipe_{begin,end,next}, may be
440 * used to implement the functionality of __splice_from_pipe() when
441 * locking is required around copying the pipe buffers to the
442 * destination.
443 */
static int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd,
			  splice_actor *actor)
{
	unsigned int head = pipe->head;
	unsigned int tail = pipe->tail;
	unsigned int mask = pipe->ring_size - 1;
	int ret;

	while (!pipe_empty(head, tail)) {
		struct pipe_buffer *buf = &pipe->bufs[tail & mask];

		/* Cap this step at whatever the caller still wants. */
		sd->len = buf->len;
		if (sd->len > sd->total_len)
			sd->len = sd->total_len;

		ret = pipe_buf_confirm(pipe, buf);
		if (unlikely(ret)) {
			/* -ENODATA (truncation) is treated as clean EOF. */
			if (ret == -ENODATA)
				ret = 0;
			return ret;
		}

		ret = actor(pipe, buf, sd);
		if (ret <= 0)
			return ret;

		/* The actor consumed @ret bytes; advance the buffer. */
		buf->offset += ret;
		buf->len -= ret;

		sd->num_spliced += ret;
		sd->len -= ret;
		sd->pos += ret;
		sd->total_len -= ret;

		/* Fully drained buffer: release it and publish the new tail. */
		if (!buf->len) {
			pipe_buf_release(pipe, buf);
			tail++;
			pipe->tail = tail;
			if (pipe->files)
				sd->need_wakeup = true;
		}

		if (!sd->total_len)
			return 0;
	}

	/* Ring empty but the caller wants more: ask for a refill. */
	return 1;
}
492
493 /* We know we have a pipe buffer, but maybe it's empty? */
494 static inline bool eat_empty_buffer(struct pipe_inode_info *pipe)
495 {
496 unsigned int tail = pipe->tail;
497 unsigned int mask = pipe->ring_size - 1;
498 struct pipe_buffer *buf = &pipe->bufs[tail & mask];
499
500 if (unlikely(!buf->len)) {
501 pipe_buf_release(pipe, buf);
502 pipe->tail = tail+1;
503 return true;
504 }
505
506 return false;
507 }
508
509 /**
510 * splice_from_pipe_next - wait for some data to splice from
511 * @pipe: pipe to splice from
512 * @sd: information about the splice operation
513 *
514 * Description:
515 * This function will wait for some data and return a positive
516 * value (one) if pipe buffers are available. It will return zero
517 * or -errno if no more data needs to be spliced.
518 */
static int splice_from_pipe_next(struct pipe_inode_info *pipe, struct splice_desc *sd)
{
	/*
	 * Check for signal early to make process killable when there are
	 * always buffers available
	 */
	if (signal_pending(current))
		return -ERESTARTSYS;

repeat:
	while (pipe_empty(pipe->head, pipe->tail)) {
		/* Writer side gone: nothing more will ever arrive. */
		if (!pipe->writers)
			return 0;

		/* Already made progress: return it rather than blocking. */
		if (sd->num_spliced)
			return 0;

		if (sd->flags & SPLICE_F_NONBLOCK)
			return -EAGAIN;

		if (signal_pending(current))
			return -ERESTARTSYS;

		/* Give writers a chance to refill before we sleep. */
		if (sd->need_wakeup) {
			wakeup_pipe_writers(pipe);
			sd->need_wakeup = false;
		}

		pipe_wait_readable(pipe);
	}

	/* Skip over zero-length buffers, then re-check emptiness. */
	if (eat_empty_buffer(pipe))
		goto repeat;

	return 1;
}
555
556 /**
557 * splice_from_pipe_begin - start splicing from pipe
558 * @sd: information about the splice operation
559 *
560 * Description:
561 * This function should be called before a loop containing
562 * splice_from_pipe_next() and splice_from_pipe_feed() to
563 * initialize the necessary fields of @sd.
564 */
static void splice_from_pipe_begin(struct splice_desc *sd)
{
	/* Reset the per-operation counters before a feed/next loop. */
	sd->num_spliced = 0;
	sd->need_wakeup = false;
}
570
571 /**
572 * splice_from_pipe_end - finish splicing from pipe
573 * @pipe: pipe to splice from
574 * @sd: information about the splice operation
575 *
576 * Description:
577 * This function will wake up pipe writers if necessary. It should
578 * be called after a loop containing splice_from_pipe_next() and
579 * splice_from_pipe_feed().
580 */
static void splice_from_pipe_end(struct pipe_inode_info *pipe, struct splice_desc *sd)
{
	/* Deliver any wakeup deferred while buffers were being drained. */
	if (sd->need_wakeup)
		wakeup_pipe_writers(pipe);
}
586
587 /**
588 * __splice_from_pipe - splice data from a pipe to given actor
589 * @pipe: pipe to splice from
590 * @sd: information to @actor
591 * @actor: handler that splices the data
592 *
593 * Description:
594 * This function does little more than loop over the pipe and call
595 * @actor to do the actual moving of a single struct pipe_buffer to
596 * the desired destination. See pipe_to_file, pipe_to_sendmsg, or
597 * pipe_to_user.
598 *
599 */
600 ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, struct splice_desc *sd,
601 splice_actor *actor)
602 {
603 int ret;
604
605 splice_from_pipe_begin(sd);
606 do {
607 cond_resched();
608 ret = splice_from_pipe_next(pipe, sd);
609 if (ret > 0)
610 ret = splice_from_pipe_feed(pipe, sd, actor);
611 } while (ret > 0);
612 splice_from_pipe_end(pipe, sd);
613
614 return sd->num_spliced ? sd->num_spliced : ret;
615 }
616 EXPORT_SYMBOL(__splice_from_pipe);
617
618 /**
619 * splice_from_pipe - splice data from a pipe to a file
620 * @pipe: pipe to splice from
621 * @out: file to splice to
622 * @ppos: position in @out
623 * @len: how many bytes to splice
624 * @flags: splice modifier flags
625 * @actor: handler that splices the data
626 *
627 * Description:
628 * See __splice_from_pipe. This function locks the pipe inode,
629 * otherwise it's identical to __splice_from_pipe().
630 *
631 */
632 ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out,
633 loff_t *ppos, size_t len, unsigned int flags,
634 splice_actor *actor)
635 {
636 ssize_t ret;
637 struct splice_desc sd = {
638 .total_len = len,
639 .flags = flags,
640 .pos = *ppos,
641 .u.file = out,
642 };
643
644 pipe_lock(pipe);
645 ret = __splice_from_pipe(pipe, &sd, actor);
646 pipe_unlock(pipe);
647
648 return ret;
649 }
650
651 /**
652 * iter_file_splice_write - splice data from a pipe to a file
653 * @pipe: pipe info
654 * @out: file to write to
655 * @ppos: position in @out
656 * @len: number of bytes to splice
657 * @flags: splice modifier flags
658 *
659 * Description:
660 * Will either move or copy pages (determined by @flags options) from
661 * the given pipe inode to the given file.
662 * This one is ->write_iter-based.
663 *
664 */
ssize_t
iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
			  loff_t *ppos, size_t len, unsigned int flags)
{
	struct splice_desc sd = {
		.total_len = len,
		.flags = flags,
		.pos = *ppos,
		.u.file = out,
	};
	int nbufs = pipe->max_usage;
	struct bio_vec *array;
	ssize_t ret;

	if (!out->f_op->write_iter)
		return -EINVAL;

	array = kcalloc(nbufs, sizeof(struct bio_vec), GFP_KERNEL);
	if (unlikely(!array))
		return -ENOMEM;

	pipe_lock(pipe);

	splice_from_pipe_begin(&sd);
	while (sd.total_len) {
		struct kiocb kiocb;
		struct iov_iter from;
		unsigned int head, tail, mask;
		size_t left;
		int n;

		/* Wait for data (or EOF/error) before building the vector. */
		ret = splice_from_pipe_next(pipe, &sd);
		if (ret <= 0)
			break;

		/* Pipe may have been resized while we slept: regrow array. */
		if (unlikely(nbufs < pipe->max_usage)) {
			kfree(array);
			nbufs = pipe->max_usage;
			array = kcalloc(nbufs, sizeof(struct bio_vec),
					GFP_KERNEL);
			if (!array) {
				ret = -ENOMEM;
				break;
			}
		}

		head = pipe->head;
		tail = pipe->tail;
		mask = pipe->ring_size - 1;

		/* build the vector */
		left = sd.total_len;
		for (n = 0; !pipe_empty(head, tail) && left && n < nbufs; tail++) {
			struct pipe_buffer *buf = &pipe->bufs[tail & mask];
			size_t this_len = buf->len;

			/* zero-length bvecs are not supported, skip them */
			if (!this_len)
				continue;
			this_len = min(this_len, left);

			ret = pipe_buf_confirm(pipe, buf);
			if (unlikely(ret)) {
				if (ret == -ENODATA)
					ret = 0;
				goto done;
			}

			bvec_set_page(&array[n], buf->page, this_len,
				      buf->offset);
			left -= this_len;
			n++;
		}

		/* Issue one gathered write covering all confirmed buffers. */
		iov_iter_bvec(&from, ITER_SOURCE, array, n, sd.total_len - left);
		init_sync_kiocb(&kiocb, out);
		kiocb.ki_pos = sd.pos;
		ret = call_write_iter(out, &kiocb, &from);
		sd.pos = kiocb.ki_pos;
		if (ret <= 0)
			break;

		sd.num_spliced += ret;
		sd.total_len -= ret;
		*ppos = sd.pos;

		/* dismiss the fully eaten buffers, adjust the partial one */
		tail = pipe->tail;
		while (ret) {
			struct pipe_buffer *buf = &pipe->bufs[tail & mask];
			if (ret >= buf->len) {
				ret -= buf->len;
				buf->len = 0;
				pipe_buf_release(pipe, buf);
				tail++;
				pipe->tail = tail;
				if (pipe->files)
					sd.need_wakeup = true;
			} else {
				/* Short write: keep the unwritten remainder. */
				buf->offset += ret;
				buf->len -= ret;
				ret = 0;
			}
		}
	}
done:
	kfree(array);
	splice_from_pipe_end(pipe, &sd);

	pipe_unlock(pipe);

	/* Report progress in preference to a terminal error/status. */
	if (sd.num_spliced)
		ret = sd.num_spliced;

	return ret;
}

EXPORT_SYMBOL(iter_file_splice_write);
783
784 #ifdef CONFIG_NET
785 /**
786 * splice_to_socket - splice data from a pipe to a socket
787 * @pipe: pipe to splice from
788 * @out: socket to write to
789 * @ppos: position in @out
790 * @len: number of bytes to splice
791 * @flags: splice modifier flags
792 *
793 * Description:
794 * Will send @len bytes from the pipe to a network socket. No data copying
795 * is involved.
796 *
797 */
ssize_t splice_to_socket(struct pipe_inode_info *pipe, struct file *out,
			 loff_t *ppos, size_t len, unsigned int flags)
{
	struct socket *sock = sock_from_file(out);
	struct bio_vec bvec[16];
	struct msghdr msg = {};
	ssize_t ret = 0;
	size_t spliced = 0;
	bool need_wakeup = false;

	pipe_lock(pipe);

	while (len > 0) {
		unsigned int head, tail, mask, bc = 0;
		size_t remain = len;

		/*
		 * Check for signal early to make process killable when there
		 * are always buffers available
		 */
		ret = -ERESTARTSYS;
		if (signal_pending(current))
			break;

		/* Wait for data, mirroring splice_from_pipe_next(). */
		while (pipe_empty(pipe->head, pipe->tail)) {
			ret = 0;
			if (!pipe->writers)
				goto out;

			if (spliced)
				goto out;

			ret = -EAGAIN;
			if (flags & SPLICE_F_NONBLOCK)
				goto out;

			ret = -ERESTARTSYS;
			if (signal_pending(current))
				goto out;

			if (need_wakeup) {
				wakeup_pipe_writers(pipe);
				need_wakeup = false;
			}

			pipe_wait_readable(pipe);
		}

		head = pipe->head;
		tail = pipe->tail;
		mask = pipe->ring_size - 1;

		/* Gather up to ARRAY_SIZE(bvec) confirmed buffers. */
		while (!pipe_empty(head, tail)) {
			struct pipe_buffer *buf = &pipe->bufs[tail & mask];
			size_t seg;

			if (!buf->len) {
				tail++;
				continue;
			}

			seg = min_t(size_t, remain, buf->len);

			ret = pipe_buf_confirm(pipe, buf);
			if (unlikely(ret)) {
				if (ret == -ENODATA)
					ret = 0;
				break;
			}

			bvec_set_page(&bvec[bc++], buf->page, seg, buf->offset);
			remain -= seg;
			if (remain == 0 || bc >= ARRAY_SIZE(bvec))
				break;
			tail++;
		}

		if (!bc)
			break;

		/* Tell the socket more follows if we know it does. */
		msg.msg_flags = MSG_SPLICE_PAGES;
		if (flags & SPLICE_F_MORE)
			msg.msg_flags |= MSG_MORE;
		if (remain && pipe_occupancy(pipe->head, tail) > 0)
			msg.msg_flags |= MSG_MORE;
		if (out->f_flags & O_NONBLOCK)
			msg.msg_flags |= MSG_DONTWAIT;

		iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, bvec, bc,
			      len - remain);
		ret = sock_sendmsg(sock, &msg);
		if (ret <= 0)
			break;

		spliced += ret;
		len -= ret;
		tail = pipe->tail;
		/* Consume exactly the bytes the socket accepted. */
		while (ret > 0) {
			struct pipe_buffer *buf = &pipe->bufs[tail & mask];
			size_t seg = min_t(size_t, ret, buf->len);

			buf->offset += seg;
			buf->len -= seg;
			ret -= seg;

			if (!buf->len) {
				pipe_buf_release(pipe, buf);
				tail++;
			}
		}

		if (tail != pipe->tail) {
			pipe->tail = tail;
			if (pipe->files)
				need_wakeup = true;
		}
	}

out:
	pipe_unlock(pipe);
	if (need_wakeup)
		wakeup_pipe_writers(pipe);
	return spliced ?: ret;
}
922 #endif
923
/*
 * Rate-limited debug note that @file's f_op lacks the splice hook named
 * by @op ("read"/"write"); always returns -EINVAL for the caller to pass on.
 */
static int warn_unsupported(struct file *file, const char *op)
{
	pr_debug_ratelimited(
		"splice %s not supported for file %pD4 (pid: %d comm: %.20s)\n",
		op, file, current->pid, current->comm);
	return -EINVAL;
}
931
932 /*
933 * Attempt to initiate a splice from pipe to file.
934 */
/*
 * Attempt to initiate a splice from pipe to file via the file's
 * ->splice_write hook; -EINVAL (with a debug note) if it has none.
 */
static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
			   loff_t *ppos, size_t len, unsigned int flags)
{
	if (unlikely(!out->f_op->splice_write))
		return warn_unsupported(out, "write");
	return out->f_op->splice_write(pipe, out, ppos, len, flags);
}
942
943 /*
944 * Indicate to the caller that there was a premature EOF when reading from the
945 * source and the caller didn't indicate they would be sending more data after
946 * this.
947 */
/* Invoke the optional ->splice_eof callback recorded in @sd, if any. */
static void do_splice_eof(struct splice_desc *sd)
{
	if (sd->splice_eof)
		sd->splice_eof(sd);
}
953
954 /*
955 * Callers already called rw_verify_area() on the entire range.
956 * No need to call it for sub ranges.
957 */
static long do_splice_read(struct file *in, loff_t *ppos,
			   struct pipe_inode_info *pipe, size_t len,
			   unsigned int flags)
{
	unsigned int p_space;

	if (unlikely(!(in->f_mode & FMODE_READ)))
		return -EBADF;
	if (!len)
		return 0;

	/* Don't try to read more than the pipe has space for. */
	p_space = pipe->max_usage - pipe_occupancy(pipe->head, pipe->tail);
	len = min_t(size_t, len, p_space << PAGE_SHIFT);

	/* Cap at the usual single-syscall I/O limit. */
	if (unlikely(len > MAX_RW_COUNT))
		len = MAX_RW_COUNT;

	if (unlikely(!in->f_op->splice_read))
		return warn_unsupported(in, "read");
	/*
	 * O_DIRECT and DAX don't deal with the pagecache, so we allocate a
	 * buffer, copy into it and splice that into the pipe.
	 */
	if ((in->f_flags & O_DIRECT) || IS_DAX(in->f_mapping->host))
		return copy_splice_read(in, ppos, pipe, len, flags);
	return in->f_op->splice_read(in, ppos, pipe, len, flags);
}
986
987 /**
988 * vfs_splice_read - Read data from a file and splice it into a pipe
989 * @in: File to splice from
990 * @ppos: Input file offset
991 * @pipe: Pipe to splice to
992 * @len: Number of bytes to splice
993 * @flags: Splice modifier flags (SPLICE_F_*)
994 *
995 * Splice the requested amount of data from the input file to the pipe. This
996 * is synchronous as the caller must hold the pipe lock across the entire
997 * operation.
998 *
999 * If successful, it returns the amount of data spliced, 0 if it hit the EOF or
1000 * a hole and a negative error code otherwise.
1001 */
long vfs_splice_read(struct file *in, loff_t *ppos,
		     struct pipe_inode_info *pipe, size_t len,
		     unsigned int flags)
{
	int ret;

	/* Security/permission check over the whole range, once. */
	ret = rw_verify_area(READ, in, ppos, len);
	if (unlikely(ret < 0))
		return ret;

	return do_splice_read(in, ppos, pipe, len, flags);
}
EXPORT_SYMBOL_GPL(vfs_splice_read);
1015
1016 /**
1017 * splice_direct_to_actor - splices data directly between two non-pipes
1018 * @in: file to splice from
1019 * @sd: actor information on where to splice to
1020 * @actor: handles the data splicing
1021 *
1022 * Description:
1023 * This is a special case helper to splice directly between two
1024 * points, without requiring an explicit pipe. Internally an allocated
1025 * pipe is cached in the process, and reused during the lifetime of
1026 * that process.
1027 *
1028 */
ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
			       splice_direct_actor *actor)
{
	struct pipe_inode_info *pipe;
	long ret, bytes;
	size_t len;
	int i, flags, more;

	/*
	 * We require the input to be seekable, as we don't want to randomly
	 * drop data for eg socket -> socket splicing. Use the piped splicing
	 * for that!
	 */
	if (unlikely(!(in->f_mode & FMODE_LSEEK)))
		return -EINVAL;

	/*
	 * neither in nor out is a pipe, setup an internal pipe attached to
	 * 'out' and transfer the wanted data from 'in' to 'out' through that
	 */
	pipe = current->splice_pipe;
	if (unlikely(!pipe)) {
		pipe = alloc_pipe_info();
		if (!pipe)
			return -ENOMEM;

		/*
		 * We don't have an immediate reader, but we'll read the stuff
		 * out of the pipe right after the splice_to_pipe(). So set
		 * PIPE_READERS appropriately.
		 */
		pipe->readers = 1;

		/* Cache the pipe on the task for reuse across sendfile calls. */
		current->splice_pipe = pipe;
	}

	/*
	 * Do the splice.
	 */
	bytes = 0;
	len = sd->total_len;

	/* Don't block on output, we have to drain the direct pipe. */
	flags = sd->flags;
	sd->flags &= ~SPLICE_F_NONBLOCK;

	/*
	 * We signal MORE until we've read sufficient data to fulfill the
	 * request and we keep signalling it if the caller set it.
	 */
	more = sd->flags & SPLICE_F_MORE;
	sd->flags |= SPLICE_F_MORE;

	WARN_ON_ONCE(!pipe_empty(pipe->head, pipe->tail));

	while (len) {
		size_t read_len;
		loff_t pos = sd->pos, prev_pos = pos;

		/* Fill the internal pipe from the input file. */
		ret = do_splice_read(in, &pos, pipe, len, flags);
		if (unlikely(ret <= 0))
			goto read_failure;

		read_len = ret;
		sd->total_len = read_len;

		/*
		 * If we now have sufficient data to fulfill the request then
		 * we clear SPLICE_F_MORE if it was not set initially.
		 */
		if (read_len >= len && !more)
			sd->flags &= ~SPLICE_F_MORE;

		/*
		 * NOTE: nonblocking mode only applies to the input. We
		 * must not do the output in nonblocking mode as then we
		 * could get stuck data in the internal pipe:
		 */
		ret = actor(pipe, sd);
		if (unlikely(ret <= 0)) {
			sd->pos = prev_pos;
			goto out_release;
		}

		bytes += ret;
		len -= ret;
		sd->pos = pos;

		/* Short output: rewind to the bytes actually written. */
		if (ret < read_len) {
			sd->pos = prev_pos + ret;
			goto out_release;
		}
	}

done:
	/* Reset the (now fully drained) internal pipe for the next user. */
	pipe->tail = pipe->head = 0;
	file_accessed(in);
	return bytes;

read_failure:
	/*
	 * If the user did *not* set SPLICE_F_MORE *and* we didn't hit that
	 * "use all of len" case that cleared SPLICE_F_MORE, *and* we did a
	 * "->splice_in()" that returned EOF (ie zero) *and* we have sent at
	 * least 1 byte *then* we will also do the ->splice_eof() call.
	 */
	if (ret == 0 && !more && len > 0 && bytes)
		do_splice_eof(sd);
out_release:
	/*
	 * If we did an incomplete transfer we must release
	 * the pipe buffers in question:
	 */
	for (i = 0; i < pipe->ring_size; i++) {
		struct pipe_buffer *buf = &pipe->bufs[i];

		if (buf->ops)
			pipe_buf_release(pipe, buf);
	}

	if (!bytes)
		bytes = ret;

	goto done;
}
EXPORT_SYMBOL(splice_direct_to_actor);
1155
/*
 * Actor for do_splice_direct(): write the internal pipe's contents to the
 * destination file inside a start/end-write bracket.
 *
 * NOTE(review): do_splice_from() returns long but the actor signature
 * forces an int return — looks like a potential narrowing for very large
 * transfers; confirm against the splice_direct_actor contract.
 */
static int direct_splice_actor(struct pipe_inode_info *pipe,
			       struct splice_desc *sd)
{
	struct file *file = sd->u.file;
	long ret;

	file_start_write(file);
	ret = do_splice_from(pipe, file, sd->opos, sd->total_len, sd->flags);
	file_end_write(file);
	return ret;
}
1167
1168 static int splice_file_range_actor(struct pipe_inode_info *pipe,
1169 struct splice_desc *sd)
1170 {
1171 struct file *file = sd->u.file;
1172
1173 return do_splice_from(pipe, file, sd->opos, sd->total_len, sd->flags);
1174 }
1175
1176 static void direct_file_splice_eof(struct splice_desc *sd)
1177 {
1178 struct file *file = sd->u.file;
1179
1180 if (file->f_op->splice_eof)
1181 file->f_op->splice_eof(file);
1182 }
1183
1184 static long do_splice_direct_actor(struct file *in, loff_t *ppos,
1185 struct file *out, loff_t *opos,
1186 size_t len, unsigned int flags,
1187 splice_direct_actor *actor)
1188 {
1189 struct splice_desc sd = {
1190 .len = len,
1191 .total_len = len,
1192 .flags = flags,
1193 .pos = *ppos,
1194 .u.file = out,
1195 .splice_eof = direct_file_splice_eof,
1196 .opos = opos,
1197 };
1198 long ret;
1199
1200 if (unlikely(!(out->f_mode & FMODE_WRITE)))
1201 return -EBADF;
1202
1203 if (unlikely(out->f_flags & O_APPEND))
1204 return -EINVAL;
1205
1206 ret = splice_direct_to_actor(in, &sd, actor);
1207 if (ret > 0)
1208 *ppos = sd.pos;
1209
1210 return ret;
1211 }
/**
 * do_splice_direct - splices data directly between two files
 * @in:		file to splice from
 * @ppos:	input file offset
 * @out:	file to splice to
 * @opos:	output file offset
 * @len:	number of bytes to splice
 * @flags:	splice modifier flags
 *
 * Description:
 *    For use by do_sendfile(). splice can easily emulate sendfile, but
 *    doing it in the application would incur an extra system call
 *    (splice in + splice out, as compared to just sendfile()). So this helper
 *    can splice directly through a process-private pipe.
 *
 * Callers already called rw_verify_area() on the entire range.
 * On success, *@ppos is advanced by the number of bytes spliced.
 */
long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
		      loff_t *opos, size_t len, unsigned int flags)
{
	return do_splice_direct_actor(in, ppos, out, opos, len, flags,
				      direct_splice_actor);
}
EXPORT_SYMBOL(do_splice_direct);
1236
/**
 * splice_file_range - splices data between two files for copy_file_range()
 * @in:		file to splice from
 * @ppos:	input file offset
 * @out:	file to splice to
 * @opos:	output file offset
 * @len:	number of bytes to splice
 *
 * Description:
 *    For use by generic_copy_file_range() and ->copy_file_range() methods.
 *    Like do_splice_direct(), but vfs_copy_file_range() already holds
 *    start_file_write() on @out file.
 *
 * Callers already called rw_verify_area() on the entire range.
 */
long splice_file_range(struct file *in, loff_t *ppos, struct file *out,
		       loff_t *opos, size_t len)
{
	/* the caller must already hold write access on @out */
	lockdep_assert(file_write_started(out));

	return do_splice_direct_actor(in, ppos, out, opos, len, 0,
				      splice_file_range_actor);
}
EXPORT_SYMBOL(splice_file_range);
1261
/*
 * Wait until @pipe has room for at least one more buffer, or fail.
 * Must be called with the pipe lock held.
 *
 * The checks run in a fixed order on every iteration: a reader-less
 * pipe raises SIGPIPE and returns -EPIPE, available space returns 0,
 * SPLICE_F_NONBLOCK returns -EAGAIN, and a pending signal returns
 * -ERESTARTSYS before we go to sleep in pipe_wait_writable().
 */
static int wait_for_space(struct pipe_inode_info *pipe, unsigned flags)
{
	for (;;) {
		if (unlikely(!pipe->readers)) {
			send_sig(SIGPIPE, current, 0);
			return -EPIPE;
		}
		if (!pipe_full(pipe->head, pipe->tail, pipe->max_usage))
			return 0;
		if (flags & SPLICE_F_NONBLOCK)
			return -EAGAIN;
		if (signal_pending(current))
			return -ERESTARTSYS;
		pipe_wait_writable(pipe);
	}
}
1278
/* Forward declaration; defined below with the other pipe-to-pipe helpers. */
static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
			       struct pipe_inode_info *opipe,
			       size_t len, unsigned int flags);

/*
 * Splice data from file @in into @opipe, advancing *@offset.
 *
 * Takes the pipe lock, waits (subject to @flags) until at least one
 * pipe slot is free, then lets do_splice_read() fill the pipe.  Readers
 * are only woken after the lock has been dropped and data was actually
 * produced.
 */
long splice_file_to_pipe(struct file *in,
			 struct pipe_inode_info *opipe,
			 loff_t *offset,
			 size_t len, unsigned int flags)
{
	long ret;

	pipe_lock(opipe);
	/* -EPIPE/-EAGAIN/-ERESTARTSYS if no room can be had */
	ret = wait_for_space(opipe, flags);
	if (!ret)
		ret = do_splice_read(in, offset, opipe, len, flags);
	pipe_unlock(opipe);
	if (ret > 0)
		wakeup_pipe_readers(opipe);
	return ret;
}
1299
/*
 * Determine where to splice to/from.
 *
 * Dispatches on which side(s) of the transfer are pipes:
 *   - pipe -> pipe: splice_pipe_to_pipe()
 *   - pipe -> file: do_splice_from(), with write access held on @out
 *   - file -> pipe: splice_file_to_pipe()
 *   - neither a pipe: -EINVAL
 *
 * A non-NULL @off_in/@off_out supplies (and receives back) an explicit
 * offset for the corresponding non-pipe side; pipes cannot be seeked,
 * so an offset for a pipe side yields -ESPIPE.  Without an explicit
 * offset, f_pos is used and updated.
 */
long do_splice(struct file *in, loff_t *off_in, struct file *out,
	       loff_t *off_out, size_t len, unsigned int flags)
{
	struct pipe_inode_info *ipipe;
	struct pipe_inode_info *opipe;
	loff_t offset;
	long ret;

	if (unlikely(!(in->f_mode & FMODE_READ) ||
		     !(out->f_mode & FMODE_WRITE)))
		return -EBADF;

	/* NOTE(review): the 'true' arg presumably asks only for spliceable
	 * pipes — confirm against get_pipe_info()'s definition. */
	ipipe = get_pipe_info(in, true);
	opipe = get_pipe_info(out, true);

	if (ipipe && opipe) {
		if (off_in || off_out)
			return -ESPIPE;

		/* Splicing to self would be fun, but... */
		if (ipipe == opipe)
			return -EINVAL;

		if ((in->f_flags | out->f_flags) & O_NONBLOCK)
			flags |= SPLICE_F_NONBLOCK;

		ret = splice_pipe_to_pipe(ipipe, opipe, len, flags);
	} else if (ipipe) {
		/* pipe -> file */
		if (off_in)
			return -ESPIPE;
		if (off_out) {
			/* an explicit offset requires pwrite capability */
			if (!(out->f_mode & FMODE_PWRITE))
				return -EINVAL;
			offset = *off_out;
		} else {
			offset = out->f_pos;
		}

		if (unlikely(out->f_flags & O_APPEND))
			return -EINVAL;

		ret = rw_verify_area(WRITE, out, &offset, len);
		if (unlikely(ret < 0))
			return ret;

		if (in->f_flags & O_NONBLOCK)
			flags |= SPLICE_F_NONBLOCK;

		file_start_write(out);
		ret = do_splice_from(ipipe, out, &offset, len, flags);
		file_end_write(out);

		/* hand the (possibly advanced) offset back to the caller */
		if (!off_out)
			out->f_pos = offset;
		else
			*off_out = offset;
	} else if (opipe) {
		/* file -> pipe */
		if (off_out)
			return -ESPIPE;
		if (off_in) {
			/* an explicit offset requires pread capability */
			if (!(in->f_mode & FMODE_PREAD))
				return -EINVAL;
			offset = *off_in;
		} else {
			offset = in->f_pos;
		}

		ret = rw_verify_area(READ, in, &offset, len);
		if (unlikely(ret < 0))
			return ret;

		if (out->f_flags & O_NONBLOCK)
			flags |= SPLICE_F_NONBLOCK;

		ret = splice_file_to_pipe(in, opipe, &offset, len, flags);

		if (!off_in)
			in->f_pos = offset;
		else
			*off_in = offset;
	} else {
		ret = -EINVAL;
	}

	if (ret > 0) {
		/*
		 * Generate modify out before access in:
		 * do_splice_from() may've already sent modify out,
		 * and this ensures the events get merged.
		 */
		fsnotify_modify(out);
		fsnotify_access(in);
	}

	return ret;
}
1399
/*
 * splice(2) syscall glue: validate the offset pointers against the pipe
 * sides, copy the offsets in from user space, call do_splice(), then
 * copy the updated offsets back out.
 */
static long __do_splice(struct file *in, loff_t __user *off_in,
			struct file *out, loff_t __user *off_out,
			size_t len, unsigned int flags)
{
	struct pipe_inode_info *ipipe;
	struct pipe_inode_info *opipe;
	loff_t offset, *__off_in = NULL, *__off_out = NULL;
	long ret;

	ipipe = get_pipe_info(in, true);
	opipe = get_pipe_info(out, true);

	/* a pipe end cannot be seeked: an offset for it is -ESPIPE */
	if (ipipe) {
		if (off_in)
			return -ESPIPE;
		pipe_clear_nowait(in);
	}
	if (opipe) {
		if (off_out)
			return -ESPIPE;
		pipe_clear_nowait(out);
	}

	/*
	 * At most one offset is ever meaningfully used: if both are
	 * non-NULL then neither side is a pipe and do_splice() fails
	 * with -EINVAL, so sharing the single 'offset' scratch variable
	 * between __off_in and __off_out is fine.
	 */
	if (off_out) {
		if (copy_from_user(&offset, off_out, sizeof(loff_t)))
			return -EFAULT;
		__off_out = &offset;
	}
	if (off_in) {
		if (copy_from_user(&offset, off_in, sizeof(loff_t)))
			return -EFAULT;
		__off_in = &offset;
	}

	ret = do_splice(in, __off_in, out, __off_out, len, flags);
	if (ret < 0)
		return ret;

	/* write the advanced offsets back to user space */
	if (__off_out && copy_to_user(off_out, __off_out, sizeof(loff_t)))
		return -EFAULT;
	if (__off_in && copy_to_user(off_in, __off_in, sizeof(loff_t)))
		return -EFAULT;

	return ret;
}
1445
/*
 * Fill @pipe with references to the user pages backing @from.
 *
 * Pages are pinned in batches of up to 16 via iov_iter_get_pages2();
 * each page (or partial first/last page) becomes one pipe buffer owned
 * by user_page_pipe_buf_ops.  If add_to_pipe() fails partway, the
 * iterator is reverted for the unconsumed bytes and the page references
 * not yet handed to the pipe are dropped.  Returns the number of bytes
 * queued, or an error only if nothing at all was queued.
 */
static int iter_to_pipe(struct iov_iter *from,
			struct pipe_inode_info *pipe,
			unsigned flags)
{
	struct pipe_buffer buf = {
		.ops = &user_page_pipe_buf_ops,
		.flags = flags
	};
	size_t total = 0;
	int ret = 0;

	while (iov_iter_count(from)) {
		struct page *pages[16];
		ssize_t left;
		size_t start;
		int i, n;

		left = iov_iter_get_pages2(from, pages, ~0UL, 16, &start);
		if (left <= 0) {
			ret = left;
			break;
		}

		/* number of pages covering 'left' bytes at offset 'start' */
		n = DIV_ROUND_UP(left + start, PAGE_SIZE);
		for (i = 0; i < n; i++) {
			int size = min_t(int, left, PAGE_SIZE - start);

			buf.page = pages[i];
			buf.offset = start;
			buf.len = size;
			ret = add_to_pipe(pipe, &buf);
			if (unlikely(ret < 0)) {
				iov_iter_revert(from, left);
				// this one got dropped by add_to_pipe()
				while (++i < n)
					put_page(pages[i]);
				goto out;
			}
			total += ret;
			left -= size;
			start = 0;	/* only the first page has an offset */
		}
	}
out:
	return total ? total : ret;
}
1492
1493 static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
1494 struct splice_desc *sd)
1495 {
1496 int n = copy_page_to_iter(buf->page, buf->offset, sd->len, sd->u.data);
1497 return n == sd->len ? n : -EFAULT;
1498 }
1499
/*
 * For lack of a better implementation, implement vmsplice() to userspace
 * as a simple copy of the pipes pages to the user iov.
 *
 * Returns the number of bytes copied, 0 for an empty request, or a
 * negative error.
 */
static long vmsplice_to_user(struct file *file, struct iov_iter *iter,
			     unsigned int flags)
{
	struct pipe_inode_info *pipe = get_pipe_info(file, true);
	struct splice_desc sd = {
		.total_len = iov_iter_count(iter),
		.flags = flags,
		.u.data = iter
	};
	long ret = 0;

	if (!pipe)
		return -EBADF;

	/* splice doesn't support FMODE_NOWAIT, see pipe_clear_nowait() */
	pipe_clear_nowait(file);

	if (sd.total_len) {
		pipe_lock(pipe);
		/* drain pipe buffers into the user iov, one at a time */
		ret = __splice_from_pipe(pipe, &sd, pipe_to_user);
		pipe_unlock(pipe);
	}

	if (ret > 0)
		fsnotify_access(file);

	return ret;
}
1531
/*
 * vmsplice splices a user address range into a pipe. It can be thought of
 * as splice-from-memory, where the regular splice is splice-from-file (or
 * to file). In both cases the output is a pipe, naturally.
 */
static long vmsplice_to_pipe(struct file *file, struct iov_iter *iter,
			     unsigned int flags)
{
	struct pipe_inode_info *pipe;
	long ret = 0;
	unsigned buf_flag = 0;

	/* SPLICE_F_GIFT marks the buffers as gifted to the consumer */
	if (flags & SPLICE_F_GIFT)
		buf_flag = PIPE_BUF_FLAG_GIFT;

	pipe = get_pipe_info(file, true);
	if (!pipe)
		return -EBADF;

	/* splice doesn't support FMODE_NOWAIT, see pipe_clear_nowait() */
	pipe_clear_nowait(file);

	pipe_lock(pipe);
	/* wait for at least one free slot before pinning user pages */
	ret = wait_for_space(pipe, flags);
	if (!ret)
		ret = iter_to_pipe(iter, pipe, buf_flag);
	pipe_unlock(pipe);
	if (ret > 0) {
		wakeup_pipe_readers(pipe);
		fsnotify_modify(file);
	}
	return ret;
}
1564
1565 static int vmsplice_type(struct fd f, int *type)
1566 {
1567 if (!f.file)
1568 return -EBADF;
1569 if (f.file->f_mode & FMODE_WRITE) {
1570 *type = ITER_SOURCE;
1571 } else if (f.file->f_mode & FMODE_READ) {
1572 *type = ITER_DEST;
1573 } else {
1574 fdput(f);
1575 return -EBADF;
1576 }
1577 return 0;
1578 }
1579
/*
 * Note that vmsplice only really supports true splicing _from_ user memory
 * to a pipe, not the other way around. Splicing from user memory is a simple
 * operation that can be supported without any funky alignment restrictions
 * or nasty vm tricks. We simply map in the user memory and fill them into
 * a pipe. The reverse isn't quite as easy, though. There are two possible
 * solutions for that:
 *
 *	- memcpy() the data internally, at which point we might as well just
 *	  do a regular read() on the buffer anyway.
 *	- Lots of nasty vm tricks, that are neither fast nor flexible (it
 *	  has restriction limitations on both ends of the pipe).
 *
 * Currently we punt and implement it as a normal copy, see pipe_to_user().
 *
 */
SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, uiov,
		unsigned long, nr_segs, unsigned int, flags)
{
	struct iovec iovstack[UIO_FASTIOV];
	struct iovec *iov = iovstack;
	struct iov_iter iter;
	ssize_t error;
	struct fd f;
	int type;

	if (unlikely(flags & ~SPLICE_F_ALL))
		return -EINVAL;

	f = fdget(fd);
	/*
	 * On failure vmsplice_type() has already dropped the fd
	 * reference (or there was none), so a plain return is correct.
	 */
	error = vmsplice_type(f, &type);
	if (error)
		return error;

	error = import_iovec(type, uiov, nr_segs,
			     ARRAY_SIZE(iovstack), &iov, &iter);
	if (error < 0)
		goto out_fdput;

	if (!iov_iter_count(&iter))
		error = 0;		/* zero-length request: no-op */
	else if (type == ITER_SOURCE)
		error = vmsplice_to_pipe(f.file, &iter, flags);
	else
		error = vmsplice_to_user(f.file, &iter, flags);

	/* import_iovec() may have allocated a bigger iovec than iovstack */
	kfree(iov);
out_fdput:
	fdput(f);
	return error;
}
1631
/*
 * splice(2): move up to @len bytes between two file descriptors, at
 * least one of which must be a pipe; see __do_splice()/do_splice().
 */
SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in,
		int, fd_out, loff_t __user *, off_out,
		size_t, len, unsigned int, flags)
{
	struct fd in, out;
	long error;

	/* splicing zero bytes is a successful no-op */
	if (unlikely(!len))
		return 0;

	if (unlikely(flags & ~SPLICE_F_ALL))
		return -EINVAL;

	error = -EBADF;
	in = fdget(fd_in);
	if (in.file) {
		out = fdget(fd_out);
		if (out.file) {
			error = __do_splice(in.file, off_in, out.file, off_out,
					    len, flags);
			fdput(out);
		}
		fdput(in);
	}
	return error;
}
1658
/*
 * Make sure there's data to read. Wait for input if we can, otherwise
 * return an appropriate error.
 */
static int ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
{
	int ret;

	/*
	 * Check the pipe occupancy without the inode lock first. This function
	 * is speculative anyways, so missing one is ok.
	 */
	if (!pipe_empty(pipe->head, pipe->tail))
		return 0;

	ret = 0;
	pipe_lock(pipe);

	while (pipe_empty(pipe->head, pipe->tail)) {
		if (signal_pending(current)) {
			ret = -ERESTARTSYS;
			break;
		}
		/* no writers left: empty pipe means EOF, report success */
		if (!pipe->writers)
			break;
		if (flags & SPLICE_F_NONBLOCK) {
			ret = -EAGAIN;
			break;
		}
		pipe_wait_readable(pipe);
	}

	pipe_unlock(pipe);
	return ret;
}
1694
/*
 * Make sure there's writeable room. Wait for room if we can, otherwise
 * return an appropriate error.
 */
static int opipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
{
	int ret;

	/*
	 * Check pipe occupancy without the inode lock first. This function
	 * is speculative anyways, so missing one is ok.
	 */
	if (!pipe_full(pipe->head, pipe->tail, pipe->max_usage))
		return 0;

	ret = 0;
	pipe_lock(pipe);

	while (pipe_full(pipe->head, pipe->tail, pipe->max_usage)) {
		/* writing to a pipe with no readers raises SIGPIPE */
		if (!pipe->readers) {
			send_sig(SIGPIPE, current, 0);
			ret = -EPIPE;
			break;
		}
		if (flags & SPLICE_F_NONBLOCK) {
			ret = -EAGAIN;
			break;
		}
		if (signal_pending(current)) {
			ret = -ERESTARTSYS;
			break;
		}
		pipe_wait_writable(pipe);
	}

	pipe_unlock(pipe);
	return ret;
}
1733
/*
 * Splice contents of ipipe to opipe.
 *
 * Moves up to @len bytes of buffers from @ipipe to @opipe.  Buffers
 * that fit entirely are transferred by moving the pipe_buffer itself
 * (consuming the input slot); a trailing partial buffer is shared by
 * taking an extra reference and splitting the length between the two
 * pipes.  Returns the number of bytes moved, or an error if nothing
 * was moved.
 */
static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
			       struct pipe_inode_info *opipe,
			       size_t len, unsigned int flags)
{
	struct pipe_buffer *ibuf, *obuf;
	unsigned int i_head, o_head;
	unsigned int i_tail, o_tail;
	unsigned int i_mask, o_mask;
	int ret = 0;
	bool input_wakeup = false;


retry:
	/* make sure there is something to read and room to write */
	ret = ipipe_prep(ipipe, flags);
	if (ret)
		return ret;

	ret = opipe_prep(opipe, flags);
	if (ret)
		return ret;

	/*
	 * Potential ABBA deadlock, work around it by ordering lock
	 * grabbing by pipe info address. Otherwise two different processes
	 * could deadlock (one doing tee from A -> B, the other from B -> A).
	 */
	pipe_double_lock(ipipe, opipe);

	i_tail = ipipe->tail;
	i_mask = ipipe->ring_size - 1;
	o_head = opipe->head;
	o_mask = opipe->ring_size - 1;

	do {
		size_t o_len;

		if (!opipe->readers) {
			send_sig(SIGPIPE, current, 0);
			if (!ret)
				ret = -EPIPE;
			break;
		}

		/* re-read the ends that the other side may have moved */
		i_head = ipipe->head;
		o_tail = opipe->tail;

		/* input is empty and will stay that way: we're done */
		if (pipe_empty(i_head, i_tail) && !ipipe->writers)
			break;

		/*
		 * Cannot make any progress, because either the input
		 * pipe is empty or the output pipe is full.
		 */
		if (pipe_empty(i_head, i_tail) ||
		    pipe_full(o_head, o_tail, opipe->max_usage)) {
			/* Already processed some buffers, break */
			if (ret)
				break;

			if (flags & SPLICE_F_NONBLOCK) {
				ret = -EAGAIN;
				break;
			}

			/*
			 * We raced with another reader/writer and haven't
			 * managed to process any buffers. A zero return
			 * value means EOF, so retry instead.
			 */
			pipe_unlock(ipipe);
			pipe_unlock(opipe);
			goto retry;
		}

		ibuf = &ipipe->bufs[i_tail & i_mask];
		obuf = &opipe->bufs[o_head & o_mask];

		if (len >= ibuf->len) {
			/*
			 * Simply move the whole buffer from ipipe to opipe
			 */
			*obuf = *ibuf;
			ibuf->ops = NULL;	/* ownership moved to opipe */
			i_tail++;
			ipipe->tail = i_tail;
			input_wakeup = true;
			o_len = obuf->len;
			o_head++;
			opipe->head = o_head;
		} else {
			/*
			 * Get a reference to this pipe buffer,
			 * so we can copy the contents over.
			 */
			if (!pipe_buf_get(ipipe, ibuf)) {
				if (ret == 0)
					ret = -EFAULT;
				break;
			}
			*obuf = *ibuf;

			/*
			 * Don't inherit the gift and merge flags, we need to
			 * prevent multiple steals of this page.
			 */
			obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
			obuf->flags &= ~PIPE_BUF_FLAG_CAN_MERGE;

			/* output gets the first 'len' bytes, input keeps the rest */
			obuf->len = len;
			ibuf->offset += len;
			ibuf->len -= len;
			o_len = len;
			o_head++;
			opipe->head = o_head;
		}
		ret += o_len;
		len -= o_len;
	} while (len);

	pipe_unlock(ipipe);
	pipe_unlock(opipe);

	/*
	 * If we put data in the output pipe, wakeup any potential readers.
	 */
	if (ret > 0)
		wakeup_pipe_readers(opipe);

	/* input buffers were consumed: wake any blocked writers */
	if (input_wakeup)
		wakeup_pipe_writers(ipipe);

	return ret;
}
1870
/*
 * Link contents of ipipe to opipe.
 *
 * Unlike splice_pipe_to_pipe(), this does not consume the input: each
 * input buffer gets an extra reference and is duplicated into @opipe.
 * Only the local tail index advances; ipipe->tail itself is never
 * moved, so the data remains readable from @ipipe.  Returns the number
 * of bytes linked, or an error if nothing was linked.
 */
static int link_pipe(struct pipe_inode_info *ipipe,
		     struct pipe_inode_info *opipe,
		     size_t len, unsigned int flags)
{
	struct pipe_buffer *ibuf, *obuf;
	unsigned int i_head, o_head;
	unsigned int i_tail, o_tail;
	unsigned int i_mask, o_mask;
	int ret = 0;

	/*
	 * Potential ABBA deadlock, work around it by ordering lock
	 * grabbing by pipe info address. Otherwise two different processes
	 * could deadlock (one doing tee from A -> B, the other from B -> A).
	 */
	pipe_double_lock(ipipe, opipe);

	i_tail = ipipe->tail;
	i_mask = ipipe->ring_size - 1;
	o_head = opipe->head;
	o_mask = opipe->ring_size - 1;

	do {
		if (!opipe->readers) {
			send_sig(SIGPIPE, current, 0);
			if (!ret)
				ret = -EPIPE;
			break;
		}

		i_head = ipipe->head;
		o_tail = opipe->tail;

		/*
		 * If we have iterated all input buffers or run out of
		 * output room, break.
		 */
		if (pipe_empty(i_head, i_tail) ||
		    pipe_full(o_head, o_tail, opipe->max_usage))
			break;

		ibuf = &ipipe->bufs[i_tail & i_mask];
		obuf = &opipe->bufs[o_head & o_mask];

		/*
		 * Get a reference to this pipe buffer,
		 * so we can copy the contents over.
		 */
		if (!pipe_buf_get(ipipe, ibuf)) {
			if (ret == 0)
				ret = -EFAULT;
			break;
		}

		*obuf = *ibuf;

		/*
		 * Don't inherit the gift and merge flag, we need to prevent
		 * multiple steals of this page.
		 */
		obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
		obuf->flags &= ~PIPE_BUF_FLAG_CAN_MERGE;

		/* the last linked buffer may be truncated to fit 'len' */
		if (obuf->len > len)
			obuf->len = len;
		ret += obuf->len;
		len -= obuf->len;

		o_head++;
		opipe->head = o_head;
		/* advance only our local index: the input is not consumed */
		i_tail++;
	} while (len);

	pipe_unlock(ipipe);
	pipe_unlock(opipe);

	/*
	 * If we put data in the output pipe, wakeup any potential readers.
	 */
	if (ret > 0)
		wakeup_pipe_readers(opipe);

	return ret;
}
1958
/*
 * This is a tee(1) implementation that works on pipes. It doesn't copy
 * any data, it simply references the 'in' pages on the 'out' pipe.
 * The 'flags' used are the SPLICE_F_* variants, currently the only
 * applicable one is SPLICE_F_NONBLOCK.
 *
 * Both @in and @out must be pipes (and distinct ones), otherwise the
 * result is -EINVAL.
 */
long do_tee(struct file *in, struct file *out, size_t len, unsigned int flags)
{
	struct pipe_inode_info *ipipe = get_pipe_info(in, true);
	struct pipe_inode_info *opipe = get_pipe_info(out, true);
	int ret = -EINVAL;

	if (unlikely(!(in->f_mode & FMODE_READ) ||
		     !(out->f_mode & FMODE_WRITE)))
		return -EBADF;

	/*
	 * Duplicate the contents of ipipe to opipe without actually
	 * copying the data.
	 */
	if (ipipe && opipe && ipipe != opipe) {
		if ((in->f_flags | out->f_flags) & O_NONBLOCK)
			flags |= SPLICE_F_NONBLOCK;

		/*
		 * Keep going, unless we encounter an error. The ipipe/opipe
		 * ordering doesn't really matter.
		 */
		ret = ipipe_prep(ipipe, flags);
		if (!ret) {
			ret = opipe_prep(opipe, flags);
			if (!ret)
				ret = link_pipe(ipipe, opipe, len, flags);
		}
	}

	if (ret > 0) {
		/* tee read from 'in' (without consuming) and fed 'out' */
		fsnotify_access(in);
		fsnotify_modify(out);
	}

	return ret;
}
2002
/*
 * tee(2): duplicate up to @len bytes from pipe @fdin to pipe @fdout
 * without consuming the input; see do_tee().
 */
SYSCALL_DEFINE4(tee, int, fdin, int, fdout, size_t, len, unsigned int, flags)
{
	struct fd in, out;
	int error;

	if (unlikely(flags & ~SPLICE_F_ALL))
		return -EINVAL;

	/* duplicating zero bytes is a successful no-op */
	if (unlikely(!len))
		return 0;

	error = -EBADF;
	in = fdget(fdin);
	if (in.file) {
		out = fdget(fdout);
		if (out.file) {
			error = do_tee(in.file, out.file, len, flags);
			fdput(out);
		}
		fdput(in);
	}

	return error;
}