1 // SPDX-License-Identifier: GPL-2.0-only
3 * Copyright (C) 2017 Red Hat, Inc.
6 #include <linux/cred.h>
7 #include <linux/file.h>
8 #include <linux/mount.h>
9 #include <linux/xattr.h>
10 #include <linux/uio.h>
11 #include <linux/uaccess.h>
12 #include <linux/splice.h>
13 #include <linux/security.h>
16 #include "overlayfs.h"
18 #include "../internal.h" /* for sb_init_dio_done_wq */
/*
 * NOTE(review): presumably members of struct ovl_aio_req (the struct header
 * is not visible in this listing) - TODO confirm. The aio paths below set
 * aio_req->orig_iocb to the caller's kiocb before cloning it, and use
 * aio_req->work to defer write completion to a workqueue.
 */
23 struct kiocb
*orig_iocb
;
24 /* used for aio completion */
25 struct work_struct work
;
/*
 * Slab cache that struct ovl_aio_req objects are allocated from
 * (kmem_cache_zalloc) and returned to (kmem_cache_free). It is created in
 * ovl_aio_request_cache_init() and destroyed in
 * ovl_aio_request_cache_destroy().
 */
29 static struct kmem_cache
*ovl_aio_request_cachep
;
/*
 * Classify @realinode relative to the overlay @inode as a single character.
 * The result is printed with %c by the pr_debug() in ovl_open_realfile().
 *
 * The visible tests are: does @realinode differ from ovl_inode_upper(@inode),
 * and does ovl_has_upperdata(@inode) hold.
 *
 * NOTE(review): the character returned for each case is not visible in this
 * listing.
 */
31 static char ovl_whatisit(struct inode
*inode
, struct inode
*realinode
)
33 if (realinode
!= ovl_inode_upper(inode
))
35 if (ovl_has_upperdata(inode
))
/*
 * Flags OR'd into the open flags of every real file opened by
 * ovl_open_realfile(); ovl_real_fdget_meta() masks them out when comparing
 * the overlay file's f_flags with the real file's f_flags.
 */
41 /* No atime modification on underlying */
42 #define OVL_OPEN_FLAGS (O_NOATIME)
/*
 * Open the real (underlying) file at @realpath on behalf of overlay @file.
 *
 * The open flags are @file->f_flags with OVL_OPEN_FLAGS added. With the
 * overlay's override credentials in effect, the real inode is checked with
 * inode_permission(MAY_OPEN | acc_mode) using the idmap of the real mount,
 * and the file is opened with backing_file_open(). Credentials are reverted
 * before the result is logged with pr_debug().
 *
 * Callers treat the return value as a struct file pointer or an ERR_PTR
 * (they test it with IS_ERR()/PTR_ERR_OR_ZERO()).
 *
 * NOTE(review): the conditions guarding "acc_mode |= MAY_APPEND",
 * "realfile = ERR_PTR(err)" and the action taken when
 * inode_owner_or_capable() fails are not visible in this listing, nor are
 * the trailing arguments of backing_file_open().
 */
44 static struct file
*ovl_open_realfile(const struct file
*file
,
45 const struct path
*realpath
)
47 struct inode
*realinode
= d_inode(realpath
->dentry
);
48 struct inode
*inode
= file_inode(file
);
49 struct mnt_idmap
*real_idmap
;
50 struct file
*realfile
;
51 const struct cred
*old_cred
;
52 int flags
= file
->f_flags
| OVL_OPEN_FLAGS
;
53 int acc_mode
= ACC_MODE(flags
);
57 acc_mode
|= MAY_APPEND
;
59 old_cred
= ovl_override_creds(inode
->i_sb
);
60 real_idmap
= mnt_idmap(realpath
->mnt
);
61 err
= inode_permission(real_idmap
, realinode
, MAY_OPEN
| acc_mode
);
63 realfile
= ERR_PTR(err
);
65 if (!inode_owner_or_capable(real_idmap
, realinode
))
68 realfile
= backing_file_open(&file
->f_path
, flags
, realpath
,
71 revert_creds(old_cred
);
/* realfile may be an ERR_PTR here, hence the IS_ERR() guard on ->f_flags */
73 pr_debug("open(%p[%pD2/%c], 0%o) -> (%p, 0%o)\n",
74 file
, file
, ovl_whatisit(inode
, realinode
), file
->f_flags
,
75 realfile
, IS_ERR(realfile
) ? 0 : realfile
->f_flags
);
/*
 * The f_flags bits that ovl_change_flags() filters its argument down to and
 * then replaces on the target file.
 */
80 #define OVL_SETFL_MASK (O_APPEND | O_NONBLOCK | O_NDELAY | O_DIRECT)
/*
 * Replace the OVL_SETFL_MASK subset of @file->f_flags with the matching bits
 * of @flags (all other bits of @flags are discarded first).
 *
 * Visible checks before the update:
 *  - O_APPEND would be toggled on an inode that is IS_APPEND();
 *  - O_DIRECT is requested but @file lacks FMODE_CAN_ODIRECT;
 *  - the file's ->check_flags() hook, when present, is consulted.
 *
 * The new f_flags and the recomputed f_iocb_flags are stored under
 * @file->f_lock.
 *
 * NOTE(review): what happens when a check fails is not visible in this
 * listing (presumably an error return - TODO confirm).
 */
82 static int ovl_change_flags(struct file
*file
, unsigned int flags
)
84 struct inode
*inode
= file_inode(file
);
87 flags
&= OVL_SETFL_MASK
;
89 if (((flags
^ file
->f_flags
) & O_APPEND
) && IS_APPEND(inode
))
92 if ((flags
& O_DIRECT
) && !(file
->f_mode
& FMODE_CAN_ODIRECT
))
95 if (file
->f_op
->check_flags
) {
96 err
= file
->f_op
->check_flags(flags
);
101 spin_lock(&file
->f_lock
);
102 file
->f_flags
= (file
->f_flags
& ~OVL_SETFL_MASK
) | flags
;
103 file
->f_iocb_flags
= iocb_flags(file
);
104 spin_unlock(&file
->f_lock
);
/*
 * Fill @real with the real file that currently backs overlay @file.
 *
 * @real->file starts out as the real file cached in @file->private_data.
 * The current real path is resolved with ovl_path_real() or, after
 * ovl_verify_lowerdata(), with ovl_path_realdata().
 *
 * If the cached real file's inode no longer matches the resolved dentry
 * (the file was copied up since open), a fresh real file is opened with
 * ovl_open_realfile(), @real->flags is set to FDPUT_FPUT, and the result of
 * PTR_ERR_OR_ZERO() on it is returned. Otherwise, if the overlay and real
 * f_flags differ (ignoring OVL_OPEN_FLAGS), the real file's flags are
 * brought in line via ovl_change_flags() and its result is returned.
 *
 * NOTE(review): the last parameter of this function and the condition that
 * selects between ovl_path_real() and ovl_path_realdata() are not visible in
 * this listing; callers pass a boolean as the third argument. The outcome of
 * the "!realpath.dentry" test is not visible either.
 */
109 static int ovl_real_fdget_meta(const struct file
*file
, struct fd
*real
,
112 struct dentry
*dentry
= file_dentry(file
);
113 struct path realpath
;
117 real
->file
= file
->private_data
;
120 ovl_path_real(dentry
, &realpath
);
122 /* lazy lookup and verify of lowerdata */
123 err
= ovl_verify_lowerdata(dentry
);
127 ovl_path_realdata(dentry
, &realpath
);
129 if (!realpath
.dentry
)
132 /* Has it been copied up since we'd opened it? */
133 if (unlikely(file_inode(real
->file
) != d_inode(realpath
.dentry
))) {
134 real
->flags
= FDPUT_FPUT
;
135 real
->file
= ovl_open_realfile(file
, &realpath
);
137 return PTR_ERR_OR_ZERO(real
->file
);
140 /* Did the flags change since open? */
141 if (unlikely((file
->f_flags
^ real
->file
->f_flags
) & ~OVL_OPEN_FLAGS
))
142 return ovl_change_flags(real
->file
, file
->f_flags
);
/*
 * Fill @real with the real file backing overlay @file.
 *
 * For a directory, the real file comes from ovl_dir_real_file(file, false)
 * and the return value is PTR_ERR_OR_ZERO() of it. For anything else this
 * is ovl_real_fdget_meta() with its third argument set to false.
 */
147 static int ovl_real_fdget(const struct file
*file
, struct fd
*real
)
149 if (d_is_dir(file_dentry(file
))) {
151 real
->file
= ovl_dir_real_file(file
, false);
153 return PTR_ERR_OR_ZERO(real
->file
);
156 return ovl_real_fdget_meta(file
, real
, false);
/*
 * Open handler for overlay regular files.
 *
 * Visible sequence: verify lowerdata for the dentry, call
 * ovl_maybe_copy_up() with the open flags, strip O_CREAT/O_EXCL/O_NOCTTY/
 * O_TRUNC from @file->f_flags, resolve the real data path, open the real
 * file with ovl_open_realfile() and cache it in @file->private_data.
 *
 * Returns PTR_ERR() of the real file when opening it fails.
 *
 * NOTE(review): the handling of errors from ovl_verify_lowerdata() and
 * ovl_maybe_copy_up(), the outcome of the "!realpath.dentry" test and the
 * success return are not visible in this listing.
 */
159 static int ovl_open(struct inode
*inode
, struct file
*file
)
161 struct dentry
*dentry
= file_dentry(file
);
162 struct file
*realfile
;
163 struct path realpath
;
166 /* lazy lookup and verify lowerdata */
167 err
= ovl_verify_lowerdata(dentry
);
171 err
= ovl_maybe_copy_up(dentry
, file
->f_flags
);
175 /* No longer need these flags, so don't pass them on to underlying fs */
176 file
->f_flags
&= ~(O_CREAT
| O_EXCL
| O_NOCTTY
| O_TRUNC
);
178 ovl_path_realdata(dentry
, &realpath
);
179 if (!realpath
.dentry
)
182 realfile
= ovl_open_realfile(file
, &realpath
);
183 if (IS_ERR(realfile
))
184 return PTR_ERR(realfile
);
186 file
->private_data
= realfile
;
/*
 * Release handler: drops the reference on the real file that ovl_open()
 * stored in @file->private_data.
 */
191 static int ovl_release(struct inode
*inode
, struct file
*file
)
193 fput(file
->private_data
);
198 static loff_t
ovl_llseek(struct file
*file
, loff_t offset
, int whence
)
200 struct inode
*inode
= file_inode(file
);
202 const struct cred
*old_cred
;
206 * The two special cases below do not need to involve real fs,
207 * so we can optimize for concurrent callers.
210 if (whence
== SEEK_CUR
)
213 if (whence
== SEEK_SET
)
214 return vfs_setpos(file
, 0, 0);
217 ret
= ovl_real_fdget(file
, &real
);
222 * Overlay file f_pos is the master copy that is preserved
223 * through copy up and modified on read/write, but only real
224 * fs knows how to SEEK_HOLE/SEEK_DATA and real fs may impose
225 * limitations that are more strict than ->s_maxbytes for specific
226 * files, so we use the real file to perform seeks.
228 ovl_inode_lock(inode
);
229 real
.file
->f_pos
= file
->f_pos
;
231 old_cred
= ovl_override_creds(inode
->i_sb
);
232 ret
= vfs_llseek(real
.file
, offset
, whence
);
233 revert_creds(old_cred
);
235 file
->f_pos
= real
.file
->f_pos
;
236 ovl_inode_unlock(inode
);
/*
 * Called after an operation that modified the real file: refreshes the
 * overlay inode's attributes via ovl_copyattr().
 */
243 static void ovl_file_modified(struct file
*file
)
245 /* Update size/mtime */
246 ovl_copyattr(file_inode(file
));
/*
 * Called after a read-like access to overlay @file.
 *
 * Compares the overlay inode's mtime and ctime with those of its upper
 * inode; if either differs, the upper inode's mtime and ctime are copied
 * onto the overlay inode. Finally touch_atime() is applied to the overlay
 * file's path.
 *
 * NOTE(review): the outcome of the O_NOATIME test is not visible in this
 * listing (presumably an early return), and no check that upperinode is
 * non-NULL is visible - TODO confirm.
 */
249 static void ovl_file_accessed(struct file
*file
)
251 struct inode
*inode
, *upperinode
;
252 struct timespec64 ctime
, uctime
;
253 struct timespec64 mtime
, umtime
;
255 if (file
->f_flags
& O_NOATIME
)
258 inode
= file_inode(file
);
259 upperinode
= ovl_inode_upper(inode
);
264 ctime
= inode_get_ctime(inode
);
265 uctime
= inode_get_ctime(upperinode
);
266 mtime
= inode_get_mtime(inode
);
267 umtime
= inode_get_mtime(upperinode
);
268 if ((!timespec64_equal(&mtime
, &umtime
)) ||
269 !timespec64_equal(&ctime
, &uctime
)) {
270 inode_set_mtime_to_ts(inode
, inode_get_mtime(upperinode
));
271 inode_set_ctime_to_ts(inode
, uctime
);
274 touch_atime(&file
->f_path
);
/*
 * The kiocb flag bits that iocb_to_rw_flags() keeps when converting kiocb
 * flags to an rwf_t value.
 */
277 #define OVL_IOCB_MASK \
278 (IOCB_NOWAIT | IOCB_HIPRI | IOCB_DSYNC | IOCB_SYNC | IOCB_APPEND)
/*
 * Convert kiocb flags to rwf_t: keeps only the bits in OVL_IOCB_MASK and
 * casts the result with __force. Used by the synchronous read/write paths.
 */
280 static rwf_t
iocb_to_rw_flags(int flags
)
282 return (__force rwf_t
)(flags
& OVL_IOCB_MASK
);
/*
 * Drop one reference on @aio_req. When the last reference goes away, the
 * file reference held by the cloned kiocb is released with fput() and the
 * request is returned to ovl_aio_request_cachep.
 */
285 static inline void ovl_aio_put(struct ovl_aio_req
*aio_req
)
287 if (refcount_dec_and_test(&aio_req
->ref
)) {
288 fput(aio_req
->iocb
.ki_filp
);
289 kmem_cache_free(ovl_aio_request_cachep
, aio_req
);
/*
 * Common completion cleanup for an async request.
 *
 * For writes (IOCB_WRITE set on the cloned kiocb) it ends the write with
 * kiocb_end_write() and refreshes the overlay inode via ovl_file_modified()
 * on the original file. It then copies the final position back to the
 * original kiocb and drops one reference on @aio_req.
 */
293 static void ovl_aio_cleanup_handler(struct ovl_aio_req
*aio_req
)
295 struct kiocb
*iocb
= &aio_req
->iocb
;
296 struct kiocb
*orig_iocb
= aio_req
->orig_iocb
;
298 if (iocb
->ki_flags
& IOCB_WRITE
) {
299 kiocb_end_write(iocb
);
300 ovl_file_modified(orig_iocb
->ki_filp
);
303 orig_iocb
->ki_pos
= iocb
->ki_pos
;
304 ovl_aio_put(aio_req
);
/*
 * Completion callback for the cloned kiocb (installed directly for async
 * reads, and reached through ovl_aio_complete_work() for async writes).
 *
 * Runs the cleanup handler and then forwards @res to the original kiocb's
 * ->ki_complete().
 */
307 static void ovl_aio_rw_complete(struct kiocb
*iocb
, long res
)
309 struct ovl_aio_req
*aio_req
= container_of(iocb
,
310 struct ovl_aio_req
, iocb
);
/* Read before cleanup: the cleanup handler drops a ref and may free aio_req */
311 struct kiocb
*orig_iocb
= aio_req
->orig_iocb
;
313 ovl_aio_cleanup_handler(aio_req
);
314 orig_iocb
->ki_complete(orig_iocb
, res
);
/*
 * Workqueue handler set up by ovl_aio_queue_completion(): recovers the
 * request from its embedded work item and completes it with the result
 * saved in aio_req->res.
 */
317 static void ovl_aio_complete_work(struct work_struct
*work
)
319 struct ovl_aio_req
*aio_req
= container_of(work
,
320 struct ovl_aio_req
, work
);
322 ovl_aio_rw_complete(&aio_req
->iocb
, aio_req
->res
);
/*
 * Completion callback installed on the cloned kiocb for async writes (see
 * ovl_write_iter()). Instead of completing inline it initialises the
 * request's work item with ovl_aio_complete_work() and queues it on the
 * overlay superblock's s_dio_done_wq.
 *
 * NOTE(review): the statement that stores @res for the work handler (which
 * reads aio_req->res) and the final argument of queue_work() are not
 * visible in this listing.
 */
325 static void ovl_aio_queue_completion(struct kiocb
*iocb
, long res
)
327 struct ovl_aio_req
*aio_req
= container_of(iocb
,
328 struct ovl_aio_req
, iocb
);
329 struct kiocb
*orig_iocb
= aio_req
->orig_iocb
;
332 * Punt to a work queue to serialize updates of mtime/size.
335 INIT_WORK(&aio_req
->work
, ovl_aio_complete_work
);
336 queue_work(file_inode(orig_iocb
->ki_filp
)->i_sb
->s_dio_done_wq
,
/*
 * Make sure @sb has an s_dio_done_wq workqueue, which
 * ovl_aio_queue_completion() queues work on. When none exists yet it is
 * created with sb_init_dio_done_wq() and that call's result is returned.
 *
 * NOTE(review): the value returned when the workqueue already exists is not
 * visible in this listing (presumably 0 - TODO confirm).
 */
340 static int ovl_init_aio_done_wq(struct super_block
*sb
)
342 if (sb
->s_dio_done_wq
)
345 return sb_init_dio_done_wq(sb
);
/*
 * read_iter handler: forwards the read to the real file.
 *
 * After obtaining the real file with ovl_real_fdget(), and with the
 * overlay's override credentials in effect:
 *  - a synchronous kiocb is served by vfs_iter_read() at iocb->ki_pos with
 *    flags derived by iocb_to_rw_flags();
 *  - otherwise an ovl_aio_req is allocated, the kiocb is cloned onto a new
 *    reference of the real file, ovl_aio_rw_complete() is installed as the
 *    completion, and the read is issued with vfs_iocb_iter_read(). The
 *    request starts with two references: one is dropped here right after
 *    submission, the other by ovl_aio_cleanup_handler(), which is called
 *    directly when the result is not -EIOCBQUEUED.
 * Credentials are then reverted and ovl_file_accessed() is called.
 *
 * NOTE(review): the outcomes of the empty-iterator test, of the
 * IOCB_DIRECT/FMODE_CAN_ODIRECT test, and of a failed allocation are not
 * visible in this listing.
 */
348 static ssize_t
ovl_read_iter(struct kiocb
*iocb
, struct iov_iter
*iter
)
350 struct file
*file
= iocb
->ki_filp
;
352 const struct cred
*old_cred
;
355 if (!iov_iter_count(iter
))
358 ret
= ovl_real_fdget(file
, &real
);
363 if (iocb
->ki_flags
& IOCB_DIRECT
&&
364 !(real
.file
->f_mode
& FMODE_CAN_ODIRECT
))
367 old_cred
= ovl_override_creds(file_inode(file
)->i_sb
);
368 if (is_sync_kiocb(iocb
)) {
369 rwf_t rwf
= iocb_to_rw_flags(iocb
->ki_flags
);
371 ret
= vfs_iter_read(real
.file
, iter
, &iocb
->ki_pos
, rwf
);
373 struct ovl_aio_req
*aio_req
;
376 aio_req
= kmem_cache_zalloc(ovl_aio_request_cachep
, GFP_KERNEL
);
380 aio_req
->orig_iocb
= iocb
;
381 kiocb_clone(&aio_req
->iocb
, iocb
, get_file(real
.file
));
382 aio_req
->iocb
.ki_complete
= ovl_aio_rw_complete
;
383 refcount_set(&aio_req
->ref
, 2);
384 ret
= vfs_iocb_iter_read(real
.file
, &aio_req
->iocb
, iter
);
385 ovl_aio_put(aio_req
);
386 if (ret
!= -EIOCBQUEUED
)
387 ovl_aio_cleanup_handler(aio_req
);
390 revert_creds(old_cred
);
391 ovl_file_accessed(file
);
/*
 * write_iter handler: forwards the write to the real file.
 *
 * Works on a local copy (ifl) of iocb->ki_flags: IOCB_DSYNC/IOCB_SYNC are
 * cleared when ovl_should_sync() is false for this overlay, and
 * IOCB_DIO_CALLER_COMP is always cleared. file_remove_privs() is applied to
 * the overlay file before the real file is looked up.
 *
 * With the overlay's override credentials in effect:
 *  - a synchronous kiocb is written with vfs_iter_write() between
 *    file_start_write()/file_end_write() on the real file, followed by
 *    ovl_file_modified();
 *  - otherwise the completion workqueue is set up, an ovl_aio_req is
 *    allocated, the kiocb is cloned onto a new reference of the real file
 *    with ki_flags = ifl and ovl_aio_queue_completion() as completion,
 *    kiocb_start_write() is called and the write is issued with
 *    vfs_iocb_iter_write(). One of the two initial references is dropped
 *    right after submission; ovl_aio_cleanup_handler() is called directly
 *    when the result is not -EIOCBQUEUED.
 *
 * NOTE(review): error handling between the visible steps, and any locking
 * around them, is not visible in this listing.
 */
398 static ssize_t
ovl_write_iter(struct kiocb
*iocb
, struct iov_iter
*iter
)
400 struct file
*file
= iocb
->ki_filp
;
401 struct inode
*inode
= file_inode(file
);
403 const struct cred
*old_cred
;
405 int ifl
= iocb
->ki_flags
;
407 if (!iov_iter_count(iter
))
413 ret
= file_remove_privs(file
);
417 ret
= ovl_real_fdget(file
, &real
);
422 if (iocb
->ki_flags
& IOCB_DIRECT
&&
423 !(real
.file
->f_mode
& FMODE_CAN_ODIRECT
))
426 if (!ovl_should_sync(OVL_FS(inode
->i_sb
)))
427 ifl
&= ~(IOCB_DSYNC
| IOCB_SYNC
);
430 * Overlayfs doesn't support deferred completions, don't copy
431 * this property in case it is set by the issuer.
433 ifl
&= ~IOCB_DIO_CALLER_COMP
;
435 old_cred
= ovl_override_creds(file_inode(file
)->i_sb
);
436 if (is_sync_kiocb(iocb
)) {
437 rwf_t rwf
= iocb_to_rw_flags(ifl
);
439 file_start_write(real
.file
);
440 ret
= vfs_iter_write(real
.file
, iter
, &iocb
->ki_pos
, rwf
);
441 file_end_write(real
.file
);
443 ovl_file_modified(file
);
445 struct ovl_aio_req
*aio_req
;
447 ret
= ovl_init_aio_done_wq(inode
->i_sb
);
452 aio_req
= kmem_cache_zalloc(ovl_aio_request_cachep
, GFP_KERNEL
);
456 aio_req
->orig_iocb
= iocb
;
457 kiocb_clone(&aio_req
->iocb
, iocb
, get_file(real
.file
));
458 aio_req
->iocb
.ki_flags
= ifl
;
459 aio_req
->iocb
.ki_complete
= ovl_aio_queue_completion
;
460 refcount_set(&aio_req
->ref
, 2);
461 kiocb_start_write(&aio_req
->iocb
);
462 ret
= vfs_iocb_iter_write(real
.file
, &aio_req
->iocb
, iter
);
463 ovl_aio_put(aio_req
);
464 if (ret
!= -EIOCBQUEUED
)
465 ovl_aio_cleanup_handler(aio_req
);
468 revert_creds(old_cred
);
/*
 * splice_read handler: obtains the real file for @in, runs
 * vfs_splice_read() on it under the overlay's override credentials, then
 * calls ovl_file_accessed() on the overlay file.
 *
 * NOTE(review): the last parameter (used below as "flags") and the error
 * handling after ovl_real_fdget() are not visible in this listing.
 */
478 static ssize_t
ovl_splice_read(struct file
*in
, loff_t
*ppos
,
479 struct pipe_inode_info
*pipe
, size_t len
,
482 const struct cred
*old_cred
;
486 ret
= ovl_real_fdget(in
, &real
);
490 old_cred
= ovl_override_creds(file_inode(in
)->i_sb
);
491 ret
= vfs_splice_read(real
.file
, ppos
, pipe
, len
, flags
);
492 revert_creds(old_cred
);
493 ovl_file_accessed(in
);
500 * Calling iter_file_splice_write() directly from overlay's f_op may deadlock
501 * due to lock order inversion between pipe->mutex in iter_file_splice_write()
502 * and file_start_write(real.file) in ovl_write_iter().
504 * So do everything ovl_write_iter() does and call iter_file_splice_write() on real.file.
/*
 * splice_write handler.
 *
 * Applies file_remove_privs() to the overlay file, obtains the real file,
 * and under the overlay's override credentials calls
 * iter_file_splice_write() on the real file between file_start_write() and
 * file_end_write(). ovl_file_modified() is then called on the overlay file
 * before credentials are reverted.
 *
 * NOTE(review): error handling between the visible steps, and any locking
 * around them, is not visible in this listing.
 */
507 static ssize_t
ovl_splice_write(struct pipe_inode_info
*pipe
, struct file
*out
,
508 loff_t
*ppos
, size_t len
, unsigned int flags
)
511 const struct cred
*old_cred
;
512 struct inode
*inode
= file_inode(out
);
518 ret
= file_remove_privs(out
);
522 ret
= ovl_real_fdget(out
, &real
);
526 old_cred
= ovl_override_creds(inode
->i_sb
);
527 file_start_write(real
.file
);
529 ret
= iter_file_splice_write(pipe
, real
.file
, ppos
, len
, flags
);
531 file_end_write(real
.file
);
533 ovl_file_modified(out
);
534 revert_creds(old_cred
);
/*
 * fsync handler.
 *
 * Consults ovl_sync_status() for the overlay filesystem, then obtains the
 * real file via ovl_real_fdget_meta() with "!datasync" as its third
 * argument. vfs_fsync_range() is only called (under override credentials)
 * when the real file's inode is the overlay inode's upper inode.
 *
 * NOTE(review): how the result of ovl_sync_status() is acted upon is not
 * visible in this listing.
 */
543 static int ovl_fsync(struct file
*file
, loff_t start
, loff_t end
, int datasync
)
546 const struct cred
*old_cred
;
549 ret
= ovl_sync_status(OVL_FS(file_inode(file
)->i_sb
));
553 ret
= ovl_real_fdget_meta(file
, &real
, !datasync
);
557 /* Don't sync lower file for fear of receiving EROFS error */
558 if (file_inode(real
.file
) == ovl_inode_upper(file_inode(file
))) {
559 old_cred
= ovl_override_creds(file_inode(file
)->i_sb
);
560 ret
= vfs_fsync_range(real
.file
, start
, end
, datasync
);
561 revert_creds(old_cred
);
/*
 * mmap handler.
 *
 * Uses the real file cached in @file->private_data. After checking that the
 * real file implements ->mmap and that @vma->vm_file is @file (WARN_ON
 * otherwise), the vma is switched to the real file with vma_set_file() and
 * call_mmap() is invoked under override credentials. ovl_file_accessed() is
 * then called on the overlay file.
 *
 * NOTE(review): the values returned when either check fails are not visible
 * in this listing.
 */
569 static int ovl_mmap(struct file
*file
, struct vm_area_struct
*vma
)
571 struct file
*realfile
= file
->private_data
;
572 const struct cred
*old_cred
;
575 if (!realfile
->f_op
->mmap
)
578 if (WARN_ON(file
!= vma
->vm_file
))
581 vma_set_file(vma
, realfile
);
583 old_cred
= ovl_override_creds(file_inode(file
)->i_sb
);
/* vma->vm_file was set to realfile by vma_set_file() above */
584 ret
= call_mmap(vma
->vm_file
, vma
);
585 revert_creds(old_cred
);
586 ovl_file_accessed(file
);
/*
 * fallocate handler.
 *
 * Applies file_remove_privs() to the overlay file, obtains the real file,
 * runs vfs_fallocate() on it under override credentials with the caller's
 * @mode/@offset/@len, and then refreshes the overlay inode via
 * ovl_file_modified().
 *
 * NOTE(review): error handling between the visible steps, and any locking
 * around them, is not visible in this listing.
 */
591 static long ovl_fallocate(struct file
*file
, int mode
, loff_t offset
, loff_t len
)
593 struct inode
*inode
= file_inode(file
);
595 const struct cred
*old_cred
;
601 ret
= file_remove_privs(file
);
605 ret
= ovl_real_fdget(file
, &real
);
609 old_cred
= ovl_override_creds(file_inode(file
)->i_sb
);
610 ret
= vfs_fallocate(real
.file
, mode
, offset
, len
);
611 revert_creds(old_cred
);
614 ovl_file_modified(file
);
/*
 * fadvise handler: obtains the real file and passes @offset, @len and
 * @advice to vfs_fadvise() on it under the overlay's override credentials.
 *
 * NOTE(review): the error handling after ovl_real_fdget() is not visible in
 * this listing.
 */
624 static int ovl_fadvise(struct file
*file
, loff_t offset
, loff_t len
, int advice
)
627 const struct cred
*old_cred
;
630 ret
= ovl_real_fdget(file
, &real
);
634 old_cred
= ovl_override_creds(file_inode(file
)->i_sb
);
635 ret
= vfs_fadvise(real
.file
, offset
, len
, advice
);
636 revert_creds(old_cred
);
/*
 * Shared implementation behind ovl_copy_file_range() and
 * ovl_remap_file_range(); @op selects the operation.
 *
 * Runs with the destination inode locked (inode_lock/inode_unlock on
 * inode_out). Unless @op is OVL_DEDUPE, the destination's attributes are
 * refreshed with ovl_copyattr() and file_remove_privs() is applied to
 * @file_out first. The real files for @file_out and @file_in are then
 * obtained and, under override credentials, one of vfs_copy_file_range(),
 * vfs_clone_file_range() or vfs_dedupe_file_range_one() is called on them.
 * ovl_file_modified() is called on @file_out afterwards.
 *
 * NOTE(review): the dispatch on @op, the error handling between steps and
 * the trailing argument of vfs_dedupe_file_range_one() are not visible in
 * this listing.
 */
649 static loff_t
ovl_copyfile(struct file
*file_in
, loff_t pos_in
,
650 struct file
*file_out
, loff_t pos_out
,
651 loff_t len
, unsigned int flags
, enum ovl_copyop op
)
653 struct inode
*inode_out
= file_inode(file_out
);
654 struct fd real_in
, real_out
;
655 const struct cred
*old_cred
;
658 inode_lock(inode_out
);
659 if (op
!= OVL_DEDUPE
) {
661 ovl_copyattr(inode_out
);
662 ret
= file_remove_privs(file_out
);
667 ret
= ovl_real_fdget(file_out
, &real_out
);
671 ret
= ovl_real_fdget(file_in
, &real_in
);
677 old_cred
= ovl_override_creds(file_inode(file_out
)->i_sb
);
680 ret
= vfs_copy_file_range(real_in
.file
, pos_in
,
681 real_out
.file
, pos_out
, len
, flags
);
685 ret
= vfs_clone_file_range(real_in
.file
, pos_in
,
686 real_out
.file
, pos_out
, len
, flags
);
690 ret
= vfs_dedupe_file_range_one(real_in
.file
, pos_in
,
691 real_out
.file
, pos_out
, len
,
695 revert_creds(old_cred
);
698 ovl_file_modified(file_out
);
704 inode_unlock(inode_out
);
/*
 * copy_file_range handler: thin wrapper that hands all of its arguments to
 * ovl_copyfile().
 *
 * NOTE(review): the final (operation) argument passed to ovl_copyfile() is
 * not visible in this listing.
 */
709 static ssize_t
ovl_copy_file_range(struct file
*file_in
, loff_t pos_in
,
710 struct file
*file_out
, loff_t pos_out
,
711 size_t len
, unsigned int flags
)
713 return ovl_copyfile(file_in
, pos_in
, file_out
, pos_out
, len
, flags
,
/*
 * remap_file_range handler (clone / dedupe).
 *
 * Tests @remap_flags for bits other than REMAP_FILE_DEDUP and
 * REMAP_FILE_ADVISORY, and tests REMAP_FILE_DEDUP to pick the operation.
 * For OVL_DEDUPE, both files must already have an upper inode; the request
 * does not trigger a copy up. The work itself is done by ovl_copyfile().
 *
 * NOTE(review): the outcomes of the flag tests, the value returned when the
 * dedupe precondition fails, and the trailing arguments passed to
 * ovl_copyfile() are not visible in this listing.
 */
717 static loff_t
ovl_remap_file_range(struct file
*file_in
, loff_t pos_in
,
718 struct file
*file_out
, loff_t pos_out
,
719 loff_t len
, unsigned int remap_flags
)
723 if (remap_flags
& ~(REMAP_FILE_DEDUP
| REMAP_FILE_ADVISORY
))
726 if (remap_flags
& REMAP_FILE_DEDUP
)
732 * Don't copy up because of a dedupe request, this wouldn't make sense
733 * most of the time (data would be duplicated instead of deduplicated).
735 if (op
== OVL_DEDUPE
&&
736 (!ovl_inode_upper(file_inode(file_in
)) ||
737 !ovl_inode_upper(file_inode(file_out
))))
740 return ovl_copyfile(file_in
, pos_in
, file_out
, pos_out
, len
,
/*
 * flush handler: obtains the real file and, if it implements ->flush,
 * calls that hook with the same owner @id under the overlay's override
 * credentials.
 *
 * NOTE(review): the error handling after ovl_real_fdget() and the value
 * returned when the real file has no ->flush are not visible in this
 * listing.
 */
744 static int ovl_flush(struct file
*file
, fl_owner_t id
)
747 const struct cred
*old_cred
;
750 err
= ovl_real_fdget(file
, &real
);
754 if (real
.file
->f_op
->flush
) {
755 old_cred
= ovl_override_creds(file_inode(file
)->i_sb
);
756 err
= real
.file
->f_op
->flush(real
.file
, id
);
757 revert_creds(old_cred
);
/*
 * file_operations table that wires the handlers defined in this file.
 *
 * NOTE(review): only some of the initializers are visible in this listing;
 * handlers such as ovl_open, ovl_fsync, ovl_mmap and ovl_flush are defined
 * above but their table entries are not shown here.
 */
764 const struct file_operations ovl_file_operations
= {
766 .release
= ovl_release
,
767 .llseek
= ovl_llseek
,
768 .read_iter
= ovl_read_iter
,
769 .write_iter
= ovl_write_iter
,
772 .fallocate
= ovl_fallocate
,
773 .fadvise
= ovl_fadvise
,
775 .splice_read
= ovl_splice_read
,
776 .splice_write
= ovl_splice_write
,
778 .copy_file_range
= ovl_copy_file_range
,
779 .remap_file_range
= ovl_remap_file_range
,
/*
 * Create the "ovl_aio_req" slab cache (object size
 * sizeof(struct ovl_aio_req), SLAB_HWCACHE_ALIGN, no constructor) and store
 * it in ovl_aio_request_cachep.
 *
 * NOTE(review): the return values for the failure and success cases are not
 * visible in this listing.
 */
782 int __init
ovl_aio_request_cache_init(void)
784 ovl_aio_request_cachep
= kmem_cache_create("ovl_aio_req",
785 sizeof(struct ovl_aio_req
),
786 0, SLAB_HWCACHE_ALIGN
, NULL
);
787 if (!ovl_aio_request_cachep
)
/*
 * Destroy the slab cache created by ovl_aio_request_cache_init().
 */
793 void ovl_aio_request_cache_destroy(void)
795 kmem_cache_destroy(ovl_aio_request_cachep
);