]> git.ipfire.org Git - thirdparty/linux.git/blob - fs/namespace.c
add statmount(2) syscall
[thirdparty/linux.git] / fs / namespace.c
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3 * linux/fs/namespace.c
4 *
5 * (C) Copyright Al Viro 2000, 2001
6 *
7 * Based on code from fs/super.c, copyright Linus Torvalds and others.
8 * Heavily rewritten.
9 */
10
11 #include <linux/syscalls.h>
12 #include <linux/export.h>
13 #include <linux/capability.h>
14 #include <linux/mnt_namespace.h>
15 #include <linux/user_namespace.h>
16 #include <linux/namei.h>
17 #include <linux/security.h>
18 #include <linux/cred.h>
19 #include <linux/idr.h>
20 #include <linux/init.h> /* init_rootfs */
21 #include <linux/fs_struct.h> /* get_fs_root et.al. */
22 #include <linux/fsnotify.h> /* fsnotify_vfsmount_delete */
23 #include <linux/file.h>
24 #include <linux/uaccess.h>
25 #include <linux/proc_ns.h>
26 #include <linux/magic.h>
27 #include <linux/memblock.h>
28 #include <linux/proc_fs.h>
29 #include <linux/task_work.h>
30 #include <linux/sched/task.h>
31 #include <uapi/linux/mount.h>
32 #include <linux/fs_context.h>
33 #include <linux/shmem_fs.h>
34 #include <linux/mnt_idmapping.h>
35
36 #include "pnode.h"
37 #include "internal.h"
38
39 /* Maximum number of mounts in a mount namespace */
40 static unsigned int sysctl_mount_max __read_mostly = 100000;
41
42 static unsigned int m_hash_mask __ro_after_init;
43 static unsigned int m_hash_shift __ro_after_init;
44 static unsigned int mp_hash_mask __ro_after_init;
45 static unsigned int mp_hash_shift __ro_after_init;
46
47 static __initdata unsigned long mhash_entries;
48 static int __init set_mhash_entries(char *str)
49 {
50 if (!str)
51 return 0;
52 mhash_entries = simple_strtoul(str, &str, 0);
53 return 1;
54 }
55 __setup("mhash_entries=", set_mhash_entries);
56
57 static __initdata unsigned long mphash_entries;
58 static int __init set_mphash_entries(char *str)
59 {
60 if (!str)
61 return 0;
62 mphash_entries = simple_strtoul(str, &str, 0);
63 return 1;
64 }
65 __setup("mphash_entries=", set_mphash_entries);
66
67 static u64 event;
68 static DEFINE_IDA(mnt_id_ida);
69 static DEFINE_IDA(mnt_group_ida);
70
71 /* Don't allow confusion with old 32bit mount ID */
72 static atomic64_t mnt_id_ctr = ATOMIC64_INIT(1ULL << 32);
73
74 static struct hlist_head *mount_hashtable __ro_after_init;
75 static struct hlist_head *mountpoint_hashtable __ro_after_init;
76 static struct kmem_cache *mnt_cache __ro_after_init;
77 static DECLARE_RWSEM(namespace_sem);
78 static HLIST_HEAD(unmounted); /* protected by namespace_sem */
79 static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */
80
81 struct mount_kattr {
82 unsigned int attr_set;
83 unsigned int attr_clr;
84 unsigned int propagation;
85 unsigned int lookup_flags;
86 bool recurse;
87 struct user_namespace *mnt_userns;
88 struct mnt_idmap *mnt_idmap;
89 };
90
91 /* /sys/fs */
92 struct kobject *fs_kobj __ro_after_init;
93 EXPORT_SYMBOL_GPL(fs_kobj);
94
95 /*
96 * vfsmount lock may be taken for read to prevent changes to the
97 * vfsmount hash, ie. during mountpoint lookups or walking back
98 * up the tree.
99 *
100 * It should be taken for write in all cases where the vfsmount
101 * tree or hash is modified or when a vfsmount structure is modified.
102 */
103 __cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock);
104
105 static inline void lock_mount_hash(void)
106 {
107 write_seqlock(&mount_lock);
108 }
109
110 static inline void unlock_mount_hash(void)
111 {
112 write_sequnlock(&mount_lock);
113 }
114
115 static inline struct hlist_head *m_hash(struct vfsmount *mnt, struct dentry *dentry)
116 {
117 unsigned long tmp = ((unsigned long)mnt / L1_CACHE_BYTES);
118 tmp += ((unsigned long)dentry / L1_CACHE_BYTES);
119 tmp = tmp + (tmp >> m_hash_shift);
120 return &mount_hashtable[tmp & m_hash_mask];
121 }
122
123 static inline struct hlist_head *mp_hash(struct dentry *dentry)
124 {
125 unsigned long tmp = ((unsigned long)dentry / L1_CACHE_BYTES);
126 tmp = tmp + (tmp >> mp_hash_shift);
127 return &mountpoint_hashtable[tmp & mp_hash_mask];
128 }
129
130 static int mnt_alloc_id(struct mount *mnt)
131 {
132 int res = ida_alloc(&mnt_id_ida, GFP_KERNEL);
133
134 if (res < 0)
135 return res;
136 mnt->mnt_id = res;
137 mnt->mnt_id_unique = atomic64_inc_return(&mnt_id_ctr);
138 return 0;
139 }
140
141 static void mnt_free_id(struct mount *mnt)
142 {
143 ida_free(&mnt_id_ida, mnt->mnt_id);
144 }
145
146 /*
147 * Allocate a new peer group ID
148 */
149 static int mnt_alloc_group_id(struct mount *mnt)
150 {
151 int res = ida_alloc_min(&mnt_group_ida, 1, GFP_KERNEL);
152
153 if (res < 0)
154 return res;
155 mnt->mnt_group_id = res;
156 return 0;
157 }
158
159 /*
160 * Release a peer group ID
161 */
162 void mnt_release_group_id(struct mount *mnt)
163 {
164 ida_free(&mnt_group_ida, mnt->mnt_group_id);
165 mnt->mnt_group_id = 0;
166 }
167
168 /*
169 * vfsmount lock must be held for read
170 */
171 static inline void mnt_add_count(struct mount *mnt, int n)
172 {
173 #ifdef CONFIG_SMP
174 this_cpu_add(mnt->mnt_pcp->mnt_count, n);
175 #else
176 preempt_disable();
177 mnt->mnt_count += n;
178 preempt_enable();
179 #endif
180 }
181
182 /*
183 * vfsmount lock must be held for write
184 */
185 int mnt_get_count(struct mount *mnt)
186 {
187 #ifdef CONFIG_SMP
188 int count = 0;
189 int cpu;
190
191 for_each_possible_cpu(cpu) {
192 count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_count;
193 }
194
195 return count;
196 #else
197 return mnt->mnt_count;
198 #endif
199 }
200
201 static struct mount *alloc_vfsmnt(const char *name)
202 {
203 struct mount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
204 if (mnt) {
205 int err;
206
207 err = mnt_alloc_id(mnt);
208 if (err)
209 goto out_free_cache;
210
211 if (name) {
212 mnt->mnt_devname = kstrdup_const(name,
213 GFP_KERNEL_ACCOUNT);
214 if (!mnt->mnt_devname)
215 goto out_free_id;
216 }
217
218 #ifdef CONFIG_SMP
219 mnt->mnt_pcp = alloc_percpu(struct mnt_pcp);
220 if (!mnt->mnt_pcp)
221 goto out_free_devname;
222
223 this_cpu_add(mnt->mnt_pcp->mnt_count, 1);
224 #else
225 mnt->mnt_count = 1;
226 mnt->mnt_writers = 0;
227 #endif
228
229 INIT_HLIST_NODE(&mnt->mnt_hash);
230 INIT_LIST_HEAD(&mnt->mnt_child);
231 INIT_LIST_HEAD(&mnt->mnt_mounts);
232 INIT_LIST_HEAD(&mnt->mnt_list);
233 INIT_LIST_HEAD(&mnt->mnt_expire);
234 INIT_LIST_HEAD(&mnt->mnt_share);
235 INIT_LIST_HEAD(&mnt->mnt_slave_list);
236 INIT_LIST_HEAD(&mnt->mnt_slave);
237 INIT_HLIST_NODE(&mnt->mnt_mp_list);
238 INIT_LIST_HEAD(&mnt->mnt_umounting);
239 INIT_HLIST_HEAD(&mnt->mnt_stuck_children);
240 mnt->mnt.mnt_idmap = &nop_mnt_idmap;
241 }
242 return mnt;
243
244 #ifdef CONFIG_SMP
245 out_free_devname:
246 kfree_const(mnt->mnt_devname);
247 #endif
248 out_free_id:
249 mnt_free_id(mnt);
250 out_free_cache:
251 kmem_cache_free(mnt_cache, mnt);
252 return NULL;
253 }
254
255 /*
256 * Most r/o checks on a fs are for operations that take
257 * discrete amounts of time, like a write() or unlink().
258 * We must keep track of when those operations start
259 * (for permission checks) and when they end, so that
260 * we can determine when writes are able to occur to
261 * a filesystem.
262 */
263 /*
264 * __mnt_is_readonly: check whether a mount is read-only
265 * @mnt: the mount to check for its write status
266 *
267 * This shouldn't be used directly ouside of the VFS.
268 * It does not guarantee that the filesystem will stay
269 * r/w, just that it is right *now*. This can not and
270 * should not be used in place of IS_RDONLY(inode).
271 * mnt_want/drop_write() will _keep_ the filesystem
272 * r/w.
273 */
274 bool __mnt_is_readonly(struct vfsmount *mnt)
275 {
276 return (mnt->mnt_flags & MNT_READONLY) || sb_rdonly(mnt->mnt_sb);
277 }
278 EXPORT_SYMBOL_GPL(__mnt_is_readonly);
279
280 static inline void mnt_inc_writers(struct mount *mnt)
281 {
282 #ifdef CONFIG_SMP
283 this_cpu_inc(mnt->mnt_pcp->mnt_writers);
284 #else
285 mnt->mnt_writers++;
286 #endif
287 }
288
289 static inline void mnt_dec_writers(struct mount *mnt)
290 {
291 #ifdef CONFIG_SMP
292 this_cpu_dec(mnt->mnt_pcp->mnt_writers);
293 #else
294 mnt->mnt_writers--;
295 #endif
296 }
297
298 static unsigned int mnt_get_writers(struct mount *mnt)
299 {
300 #ifdef CONFIG_SMP
301 unsigned int count = 0;
302 int cpu;
303
304 for_each_possible_cpu(cpu) {
305 count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_writers;
306 }
307
308 return count;
309 #else
310 return mnt->mnt_writers;
311 #endif
312 }
313
314 static int mnt_is_readonly(struct vfsmount *mnt)
315 {
316 if (READ_ONCE(mnt->mnt_sb->s_readonly_remount))
317 return 1;
318 /*
319 * The barrier pairs with the barrier in sb_start_ro_state_change()
320 * making sure if we don't see s_readonly_remount set yet, we also will
321 * not see any superblock / mount flag changes done by remount.
322 * It also pairs with the barrier in sb_end_ro_state_change()
323 * assuring that if we see s_readonly_remount already cleared, we will
324 * see the values of superblock / mount flags updated by remount.
325 */
326 smp_rmb();
327 return __mnt_is_readonly(mnt);
328 }
329
330 /*
331 * Most r/o & frozen checks on a fs are for operations that take discrete
332 * amounts of time, like a write() or unlink(). We must keep track of when
333 * those operations start (for permission checks) and when they end, so that we
334 * can determine when writes are able to occur to a filesystem.
335 */
336 /**
337 * mnt_get_write_access - get write access to a mount without freeze protection
338 * @m: the mount on which to take a write
339 *
340 * This tells the low-level filesystem that a write is about to be performed to
341 * it, and makes sure that writes are allowed (mnt it read-write) before
342 * returning success. This operation does not protect against filesystem being
343 * frozen. When the write operation is finished, mnt_put_write_access() must be
344 * called. This is effectively a refcount.
345 */
346 int mnt_get_write_access(struct vfsmount *m)
347 {
348 struct mount *mnt = real_mount(m);
349 int ret = 0;
350
351 preempt_disable();
352 mnt_inc_writers(mnt);
353 /*
354 * The store to mnt_inc_writers must be visible before we pass
355 * MNT_WRITE_HOLD loop below, so that the slowpath can see our
356 * incremented count after it has set MNT_WRITE_HOLD.
357 */
358 smp_mb();
359 might_lock(&mount_lock.lock);
360 while (READ_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD) {
361 if (!IS_ENABLED(CONFIG_PREEMPT_RT)) {
362 cpu_relax();
363 } else {
364 /*
365 * This prevents priority inversion, if the task
366 * setting MNT_WRITE_HOLD got preempted on a remote
367 * CPU, and it prevents life lock if the task setting
368 * MNT_WRITE_HOLD has a lower priority and is bound to
369 * the same CPU as the task that is spinning here.
370 */
371 preempt_enable();
372 lock_mount_hash();
373 unlock_mount_hash();
374 preempt_disable();
375 }
376 }
377 /*
378 * The barrier pairs with the barrier sb_start_ro_state_change() making
379 * sure that if we see MNT_WRITE_HOLD cleared, we will also see
380 * s_readonly_remount set (or even SB_RDONLY / MNT_READONLY flags) in
381 * mnt_is_readonly() and bail in case we are racing with remount
382 * read-only.
383 */
384 smp_rmb();
385 if (mnt_is_readonly(m)) {
386 mnt_dec_writers(mnt);
387 ret = -EROFS;
388 }
389 preempt_enable();
390
391 return ret;
392 }
393 EXPORT_SYMBOL_GPL(mnt_get_write_access);
394
395 /**
396 * mnt_want_write - get write access to a mount
397 * @m: the mount on which to take a write
398 *
399 * This tells the low-level filesystem that a write is about to be performed to
400 * it, and makes sure that writes are allowed (mount is read-write, filesystem
401 * is not frozen) before returning success. When the write operation is
402 * finished, mnt_drop_write() must be called. This is effectively a refcount.
403 */
404 int mnt_want_write(struct vfsmount *m)
405 {
406 int ret;
407
408 sb_start_write(m->mnt_sb);
409 ret = mnt_get_write_access(m);
410 if (ret)
411 sb_end_write(m->mnt_sb);
412 return ret;
413 }
414 EXPORT_SYMBOL_GPL(mnt_want_write);
415
416 /**
417 * mnt_get_write_access_file - get write access to a file's mount
418 * @file: the file who's mount on which to take a write
419 *
420 * This is like mnt_get_write_access, but if @file is already open for write it
421 * skips incrementing mnt_writers (since the open file already has a reference)
422 * and instead only does the check for emergency r/o remounts. This must be
423 * paired with mnt_put_write_access_file.
424 */
425 int mnt_get_write_access_file(struct file *file)
426 {
427 if (file->f_mode & FMODE_WRITER) {
428 /*
429 * Superblock may have become readonly while there are still
430 * writable fd's, e.g. due to a fs error with errors=remount-ro
431 */
432 if (__mnt_is_readonly(file->f_path.mnt))
433 return -EROFS;
434 return 0;
435 }
436 return mnt_get_write_access(file->f_path.mnt);
437 }
438
439 /**
440 * mnt_want_write_file - get write access to a file's mount
441 * @file: the file who's mount on which to take a write
442 *
443 * This is like mnt_want_write, but if the file is already open for writing it
444 * skips incrementing mnt_writers (since the open file already has a reference)
445 * and instead only does the freeze protection and the check for emergency r/o
446 * remounts. This must be paired with mnt_drop_write_file.
447 */
448 int mnt_want_write_file(struct file *file)
449 {
450 int ret;
451
452 sb_start_write(file_inode(file)->i_sb);
453 ret = mnt_get_write_access_file(file);
454 if (ret)
455 sb_end_write(file_inode(file)->i_sb);
456 return ret;
457 }
458 EXPORT_SYMBOL_GPL(mnt_want_write_file);
459
460 /**
461 * mnt_put_write_access - give up write access to a mount
462 * @mnt: the mount on which to give up write access
463 *
464 * Tells the low-level filesystem that we are done
465 * performing writes to it. Must be matched with
466 * mnt_get_write_access() call above.
467 */
468 void mnt_put_write_access(struct vfsmount *mnt)
469 {
470 preempt_disable();
471 mnt_dec_writers(real_mount(mnt));
472 preempt_enable();
473 }
474 EXPORT_SYMBOL_GPL(mnt_put_write_access);
475
476 /**
477 * mnt_drop_write - give up write access to a mount
478 * @mnt: the mount on which to give up write access
479 *
480 * Tells the low-level filesystem that we are done performing writes to it and
481 * also allows filesystem to be frozen again. Must be matched with
482 * mnt_want_write() call above.
483 */
484 void mnt_drop_write(struct vfsmount *mnt)
485 {
486 mnt_put_write_access(mnt);
487 sb_end_write(mnt->mnt_sb);
488 }
489 EXPORT_SYMBOL_GPL(mnt_drop_write);
490
491 void mnt_put_write_access_file(struct file *file)
492 {
493 if (!(file->f_mode & FMODE_WRITER))
494 mnt_put_write_access(file->f_path.mnt);
495 }
496
497 void mnt_drop_write_file(struct file *file)
498 {
499 mnt_put_write_access_file(file);
500 sb_end_write(file_inode(file)->i_sb);
501 }
502 EXPORT_SYMBOL(mnt_drop_write_file);
503
504 /**
505 * mnt_hold_writers - prevent write access to the given mount
506 * @mnt: mnt to prevent write access to
507 *
508 * Prevents write access to @mnt if there are no active writers for @mnt.
509 * This function needs to be called and return successfully before changing
510 * properties of @mnt that need to remain stable for callers with write access
511 * to @mnt.
512 *
513 * After this functions has been called successfully callers must pair it with
514 * a call to mnt_unhold_writers() in order to stop preventing write access to
515 * @mnt.
516 *
517 * Context: This function expects lock_mount_hash() to be held serializing
518 * setting MNT_WRITE_HOLD.
519 * Return: On success 0 is returned.
520 * On error, -EBUSY is returned.
521 */
522 static inline int mnt_hold_writers(struct mount *mnt)
523 {
524 mnt->mnt.mnt_flags |= MNT_WRITE_HOLD;
525 /*
526 * After storing MNT_WRITE_HOLD, we'll read the counters. This store
527 * should be visible before we do.
528 */
529 smp_mb();
530
531 /*
532 * With writers on hold, if this value is zero, then there are
533 * definitely no active writers (although held writers may subsequently
534 * increment the count, they'll have to wait, and decrement it after
535 * seeing MNT_READONLY).
536 *
537 * It is OK to have counter incremented on one CPU and decremented on
538 * another: the sum will add up correctly. The danger would be when we
539 * sum up each counter, if we read a counter before it is incremented,
540 * but then read another CPU's count which it has been subsequently
541 * decremented from -- we would see more decrements than we should.
542 * MNT_WRITE_HOLD protects against this scenario, because
543 * mnt_want_write first increments count, then smp_mb, then spins on
544 * MNT_WRITE_HOLD, so it can't be decremented by another CPU while
545 * we're counting up here.
546 */
547 if (mnt_get_writers(mnt) > 0)
548 return -EBUSY;
549
550 return 0;
551 }
552
553 /**
554 * mnt_unhold_writers - stop preventing write access to the given mount
555 * @mnt: mnt to stop preventing write access to
556 *
557 * Stop preventing write access to @mnt allowing callers to gain write access
558 * to @mnt again.
559 *
560 * This function can only be called after a successful call to
561 * mnt_hold_writers().
562 *
563 * Context: This function expects lock_mount_hash() to be held.
564 */
565 static inline void mnt_unhold_writers(struct mount *mnt)
566 {
567 /*
568 * MNT_READONLY must become visible before ~MNT_WRITE_HOLD, so writers
569 * that become unheld will see MNT_READONLY.
570 */
571 smp_wmb();
572 mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD;
573 }
574
575 static int mnt_make_readonly(struct mount *mnt)
576 {
577 int ret;
578
579 ret = mnt_hold_writers(mnt);
580 if (!ret)
581 mnt->mnt.mnt_flags |= MNT_READONLY;
582 mnt_unhold_writers(mnt);
583 return ret;
584 }
585
586 int sb_prepare_remount_readonly(struct super_block *sb)
587 {
588 struct mount *mnt;
589 int err = 0;
590
591 /* Racy optimization. Recheck the counter under MNT_WRITE_HOLD */
592 if (atomic_long_read(&sb->s_remove_count))
593 return -EBUSY;
594
595 lock_mount_hash();
596 list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
597 if (!(mnt->mnt.mnt_flags & MNT_READONLY)) {
598 err = mnt_hold_writers(mnt);
599 if (err)
600 break;
601 }
602 }
603 if (!err && atomic_long_read(&sb->s_remove_count))
604 err = -EBUSY;
605
606 if (!err)
607 sb_start_ro_state_change(sb);
608 list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
609 if (mnt->mnt.mnt_flags & MNT_WRITE_HOLD)
610 mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD;
611 }
612 unlock_mount_hash();
613
614 return err;
615 }
616
617 static void free_vfsmnt(struct mount *mnt)
618 {
619 mnt_idmap_put(mnt_idmap(&mnt->mnt));
620 kfree_const(mnt->mnt_devname);
621 #ifdef CONFIG_SMP
622 free_percpu(mnt->mnt_pcp);
623 #endif
624 kmem_cache_free(mnt_cache, mnt);
625 }
626
627 static void delayed_free_vfsmnt(struct rcu_head *head)
628 {
629 free_vfsmnt(container_of(head, struct mount, mnt_rcu));
630 }
631
632 /* call under rcu_read_lock */
633 int __legitimize_mnt(struct vfsmount *bastard, unsigned seq)
634 {
635 struct mount *mnt;
636 if (read_seqretry(&mount_lock, seq))
637 return 1;
638 if (bastard == NULL)
639 return 0;
640 mnt = real_mount(bastard);
641 mnt_add_count(mnt, 1);
642 smp_mb(); // see mntput_no_expire()
643 if (likely(!read_seqretry(&mount_lock, seq)))
644 return 0;
645 if (bastard->mnt_flags & MNT_SYNC_UMOUNT) {
646 mnt_add_count(mnt, -1);
647 return 1;
648 }
649 lock_mount_hash();
650 if (unlikely(bastard->mnt_flags & MNT_DOOMED)) {
651 mnt_add_count(mnt, -1);
652 unlock_mount_hash();
653 return 1;
654 }
655 unlock_mount_hash();
656 /* caller will mntput() */
657 return -1;
658 }
659
660 /* call under rcu_read_lock */
661 static bool legitimize_mnt(struct vfsmount *bastard, unsigned seq)
662 {
663 int res = __legitimize_mnt(bastard, seq);
664 if (likely(!res))
665 return true;
666 if (unlikely(res < 0)) {
667 rcu_read_unlock();
668 mntput(bastard);
669 rcu_read_lock();
670 }
671 return false;
672 }
673
674 /**
675 * __lookup_mnt - find first child mount
676 * @mnt: parent mount
677 * @dentry: mountpoint
678 *
679 * If @mnt has a child mount @c mounted @dentry find and return it.
680 *
681 * Note that the child mount @c need not be unique. There are cases
682 * where shadow mounts are created. For example, during mount
683 * propagation when a source mount @mnt whose root got overmounted by a
684 * mount @o after path lookup but before @namespace_sem could be
685 * acquired gets copied and propagated. So @mnt gets copied including
686 * @o. When @mnt is propagated to a destination mount @d that already
687 * has another mount @n mounted at the same mountpoint then the source
688 * mount @mnt will be tucked beneath @n, i.e., @n will be mounted on
689 * @mnt and @mnt mounted on @d. Now both @n and @o are mounted at @mnt
690 * on @dentry.
691 *
692 * Return: The first child of @mnt mounted @dentry or NULL.
693 */
694 struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry)
695 {
696 struct hlist_head *head = m_hash(mnt, dentry);
697 struct mount *p;
698
699 hlist_for_each_entry_rcu(p, head, mnt_hash)
700 if (&p->mnt_parent->mnt == mnt && p->mnt_mountpoint == dentry)
701 return p;
702 return NULL;
703 }
704
705 /*
706 * lookup_mnt - Return the first child mount mounted at path
707 *
708 * "First" means first mounted chronologically. If you create the
709 * following mounts:
710 *
711 * mount /dev/sda1 /mnt
712 * mount /dev/sda2 /mnt
713 * mount /dev/sda3 /mnt
714 *
715 * Then lookup_mnt() on the base /mnt dentry in the root mount will
716 * return successively the root dentry and vfsmount of /dev/sda1, then
717 * /dev/sda2, then /dev/sda3, then NULL.
718 *
719 * lookup_mnt takes a reference to the found vfsmount.
720 */
721 struct vfsmount *lookup_mnt(const struct path *path)
722 {
723 struct mount *child_mnt;
724 struct vfsmount *m;
725 unsigned seq;
726
727 rcu_read_lock();
728 do {
729 seq = read_seqbegin(&mount_lock);
730 child_mnt = __lookup_mnt(path->mnt, path->dentry);
731 m = child_mnt ? &child_mnt->mnt : NULL;
732 } while (!legitimize_mnt(m, seq));
733 rcu_read_unlock();
734 return m;
735 }
736
737 /*
738 * __is_local_mountpoint - Test to see if dentry is a mountpoint in the
739 * current mount namespace.
740 *
741 * The common case is dentries are not mountpoints at all and that
742 * test is handled inline. For the slow case when we are actually
743 * dealing with a mountpoint of some kind, walk through all of the
744 * mounts in the current mount namespace and test to see if the dentry
745 * is a mountpoint.
746 *
747 * The mount_hashtable is not usable in the context because we
748 * need to identify all mounts that may be in the current mount
749 * namespace not just a mount that happens to have some specified
750 * parent mount.
751 */
752 bool __is_local_mountpoint(struct dentry *dentry)
753 {
754 struct mnt_namespace *ns = current->nsproxy->mnt_ns;
755 struct mount *mnt, *n;
756 bool is_covered = false;
757
758 down_read(&namespace_sem);
759 rbtree_postorder_for_each_entry_safe(mnt, n, &ns->mounts, mnt_node) {
760 is_covered = (mnt->mnt_mountpoint == dentry);
761 if (is_covered)
762 break;
763 }
764 up_read(&namespace_sem);
765
766 return is_covered;
767 }
768
769 static struct mountpoint *lookup_mountpoint(struct dentry *dentry)
770 {
771 struct hlist_head *chain = mp_hash(dentry);
772 struct mountpoint *mp;
773
774 hlist_for_each_entry(mp, chain, m_hash) {
775 if (mp->m_dentry == dentry) {
776 mp->m_count++;
777 return mp;
778 }
779 }
780 return NULL;
781 }
782
783 static struct mountpoint *get_mountpoint(struct dentry *dentry)
784 {
785 struct mountpoint *mp, *new = NULL;
786 int ret;
787
788 if (d_mountpoint(dentry)) {
789 /* might be worth a WARN_ON() */
790 if (d_unlinked(dentry))
791 return ERR_PTR(-ENOENT);
792 mountpoint:
793 read_seqlock_excl(&mount_lock);
794 mp = lookup_mountpoint(dentry);
795 read_sequnlock_excl(&mount_lock);
796 if (mp)
797 goto done;
798 }
799
800 if (!new)
801 new = kmalloc(sizeof(struct mountpoint), GFP_KERNEL);
802 if (!new)
803 return ERR_PTR(-ENOMEM);
804
805
806 /* Exactly one processes may set d_mounted */
807 ret = d_set_mounted(dentry);
808
809 /* Someone else set d_mounted? */
810 if (ret == -EBUSY)
811 goto mountpoint;
812
813 /* The dentry is not available as a mountpoint? */
814 mp = ERR_PTR(ret);
815 if (ret)
816 goto done;
817
818 /* Add the new mountpoint to the hash table */
819 read_seqlock_excl(&mount_lock);
820 new->m_dentry = dget(dentry);
821 new->m_count = 1;
822 hlist_add_head(&new->m_hash, mp_hash(dentry));
823 INIT_HLIST_HEAD(&new->m_list);
824 read_sequnlock_excl(&mount_lock);
825
826 mp = new;
827 new = NULL;
828 done:
829 kfree(new);
830 return mp;
831 }
832
833 /*
834 * vfsmount lock must be held. Additionally, the caller is responsible
835 * for serializing calls for given disposal list.
836 */
837 static void __put_mountpoint(struct mountpoint *mp, struct list_head *list)
838 {
839 if (!--mp->m_count) {
840 struct dentry *dentry = mp->m_dentry;
841 BUG_ON(!hlist_empty(&mp->m_list));
842 spin_lock(&dentry->d_lock);
843 dentry->d_flags &= ~DCACHE_MOUNTED;
844 spin_unlock(&dentry->d_lock);
845 dput_to_list(dentry, list);
846 hlist_del(&mp->m_hash);
847 kfree(mp);
848 }
849 }
850
851 /* called with namespace_lock and vfsmount lock */
852 static void put_mountpoint(struct mountpoint *mp)
853 {
854 __put_mountpoint(mp, &ex_mountpoints);
855 }
856
857 static inline int check_mnt(struct mount *mnt)
858 {
859 return mnt->mnt_ns == current->nsproxy->mnt_ns;
860 }
861
862 /*
863 * vfsmount lock must be held for write
864 */
865 static void touch_mnt_namespace(struct mnt_namespace *ns)
866 {
867 if (ns) {
868 ns->event = ++event;
869 wake_up_interruptible(&ns->poll);
870 }
871 }
872
873 /*
874 * vfsmount lock must be held for write
875 */
876 static void __touch_mnt_namespace(struct mnt_namespace *ns)
877 {
878 if (ns && ns->event != event) {
879 ns->event = event;
880 wake_up_interruptible(&ns->poll);
881 }
882 }
883
884 /*
885 * vfsmount lock must be held for write
886 */
887 static struct mountpoint *unhash_mnt(struct mount *mnt)
888 {
889 struct mountpoint *mp;
890 mnt->mnt_parent = mnt;
891 mnt->mnt_mountpoint = mnt->mnt.mnt_root;
892 list_del_init(&mnt->mnt_child);
893 hlist_del_init_rcu(&mnt->mnt_hash);
894 hlist_del_init(&mnt->mnt_mp_list);
895 mp = mnt->mnt_mp;
896 mnt->mnt_mp = NULL;
897 return mp;
898 }
899
900 /*
901 * vfsmount lock must be held for write
902 */
903 static void umount_mnt(struct mount *mnt)
904 {
905 put_mountpoint(unhash_mnt(mnt));
906 }
907
908 /*
909 * vfsmount lock must be held for write
910 */
911 void mnt_set_mountpoint(struct mount *mnt,
912 struct mountpoint *mp,
913 struct mount *child_mnt)
914 {
915 mp->m_count++;
916 mnt_add_count(mnt, 1); /* essentially, that's mntget */
917 child_mnt->mnt_mountpoint = mp->m_dentry;
918 child_mnt->mnt_parent = mnt;
919 child_mnt->mnt_mp = mp;
920 hlist_add_head(&child_mnt->mnt_mp_list, &mp->m_list);
921 }
922
923 /**
924 * mnt_set_mountpoint_beneath - mount a mount beneath another one
925 *
926 * @new_parent: the source mount
927 * @top_mnt: the mount beneath which @new_parent is mounted
928 * @new_mp: the new mountpoint of @top_mnt on @new_parent
929 *
930 * Remove @top_mnt from its current mountpoint @top_mnt->mnt_mp and
931 * parent @top_mnt->mnt_parent and mount it on top of @new_parent at
932 * @new_mp. And mount @new_parent on the old parent and old
933 * mountpoint of @top_mnt.
934 *
935 * Context: This function expects namespace_lock() and lock_mount_hash()
936 * to have been acquired in that order.
937 */
938 static void mnt_set_mountpoint_beneath(struct mount *new_parent,
939 struct mount *top_mnt,
940 struct mountpoint *new_mp)
941 {
942 struct mount *old_top_parent = top_mnt->mnt_parent;
943 struct mountpoint *old_top_mp = top_mnt->mnt_mp;
944
945 mnt_set_mountpoint(old_top_parent, old_top_mp, new_parent);
946 mnt_change_mountpoint(new_parent, new_mp, top_mnt);
947 }
948
949
950 static void __attach_mnt(struct mount *mnt, struct mount *parent)
951 {
952 hlist_add_head_rcu(&mnt->mnt_hash,
953 m_hash(&parent->mnt, mnt->mnt_mountpoint));
954 list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
955 }
956
957 /**
958 * attach_mnt - mount a mount, attach to @mount_hashtable and parent's
959 * list of child mounts
960 * @parent: the parent
961 * @mnt: the new mount
962 * @mp: the new mountpoint
963 * @beneath: whether to mount @mnt beneath or on top of @parent
964 *
965 * If @beneath is false, mount @mnt at @mp on @parent. Then attach @mnt
966 * to @parent's child mount list and to @mount_hashtable.
967 *
968 * If @beneath is true, remove @mnt from its current parent and
969 * mountpoint and mount it on @mp on @parent, and mount @parent on the
970 * old parent and old mountpoint of @mnt. Finally, attach @parent to
971 * @mnt_hashtable and @parent->mnt_parent->mnt_mounts.
972 *
973 * Note, when __attach_mnt() is called @mnt->mnt_parent already points
974 * to the correct parent.
975 *
976 * Context: This function expects namespace_lock() and lock_mount_hash()
977 * to have been acquired in that order.
978 */
979 static void attach_mnt(struct mount *mnt, struct mount *parent,
980 struct mountpoint *mp, bool beneath)
981 {
982 if (beneath)
983 mnt_set_mountpoint_beneath(mnt, parent, mp);
984 else
985 mnt_set_mountpoint(parent, mp, mnt);
986 /*
987 * Note, @mnt->mnt_parent has to be used. If @mnt was mounted
988 * beneath @parent then @mnt will need to be attached to
989 * @parent's old parent, not @parent. IOW, @mnt->mnt_parent
990 * isn't the same mount as @parent.
991 */
992 __attach_mnt(mnt, mnt->mnt_parent);
993 }
994
995 void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp, struct mount *mnt)
996 {
997 struct mountpoint *old_mp = mnt->mnt_mp;
998 struct mount *old_parent = mnt->mnt_parent;
999
1000 list_del_init(&mnt->mnt_child);
1001 hlist_del_init(&mnt->mnt_mp_list);
1002 hlist_del_init_rcu(&mnt->mnt_hash);
1003
1004 attach_mnt(mnt, parent, mp, false);
1005
1006 put_mountpoint(old_mp);
1007 mnt_add_count(old_parent, -1);
1008 }
1009
1010 static inline struct mount *node_to_mount(struct rb_node *node)
1011 {
1012 return rb_entry(node, struct mount, mnt_node);
1013 }
1014
1015 static void mnt_add_to_ns(struct mnt_namespace *ns, struct mount *mnt)
1016 {
1017 struct rb_node **link = &ns->mounts.rb_node;
1018 struct rb_node *parent = NULL;
1019
1020 WARN_ON(mnt->mnt.mnt_flags & MNT_ONRB);
1021 mnt->mnt_ns = ns;
1022 while (*link) {
1023 parent = *link;
1024 if (mnt->mnt_id_unique < node_to_mount(parent)->mnt_id_unique)
1025 link = &parent->rb_left;
1026 else
1027 link = &parent->rb_right;
1028 }
1029 rb_link_node(&mnt->mnt_node, parent, link);
1030 rb_insert_color(&mnt->mnt_node, &ns->mounts);
1031 mnt->mnt.mnt_flags |= MNT_ONRB;
1032 }
1033
1034 /*
1035 * vfsmount lock must be held for write
1036 */
1037 static void commit_tree(struct mount *mnt)
1038 {
1039 struct mount *parent = mnt->mnt_parent;
1040 struct mount *m;
1041 LIST_HEAD(head);
1042 struct mnt_namespace *n = parent->mnt_ns;
1043
1044 BUG_ON(parent == mnt);
1045
1046 list_add_tail(&head, &mnt->mnt_list);
1047 while (!list_empty(&head)) {
1048 m = list_first_entry(&head, typeof(*m), mnt_list);
1049 list_del(&m->mnt_list);
1050
1051 mnt_add_to_ns(n, m);
1052 }
1053 n->nr_mounts += n->pending_mounts;
1054 n->pending_mounts = 0;
1055
1056 __attach_mnt(mnt, parent);
1057 touch_mnt_namespace(n);
1058 }
1059
1060 static struct mount *next_mnt(struct mount *p, struct mount *root)
1061 {
1062 struct list_head *next = p->mnt_mounts.next;
1063 if (next == &p->mnt_mounts) {
1064 while (1) {
1065 if (p == root)
1066 return NULL;
1067 next = p->mnt_child.next;
1068 if (next != &p->mnt_parent->mnt_mounts)
1069 break;
1070 p = p->mnt_parent;
1071 }
1072 }
1073 return list_entry(next, struct mount, mnt_child);
1074 }
1075
1076 static struct mount *skip_mnt_tree(struct mount *p)
1077 {
1078 struct list_head *prev = p->mnt_mounts.prev;
1079 while (prev != &p->mnt_mounts) {
1080 p = list_entry(prev, struct mount, mnt_child);
1081 prev = p->mnt_mounts.prev;
1082 }
1083 return p;
1084 }
1085
1086 /**
1087 * vfs_create_mount - Create a mount for a configured superblock
1088 * @fc: The configuration context with the superblock attached
1089 *
1090 * Create a mount to an already configured superblock. If necessary, the
1091 * caller should invoke vfs_get_tree() before calling this.
1092 *
1093 * Note that this does not attach the mount to anything.
1094 */
1095 struct vfsmount *vfs_create_mount(struct fs_context *fc)
1096 {
1097 struct mount *mnt;
1098
1099 if (!fc->root)
1100 return ERR_PTR(-EINVAL);
1101
1102 mnt = alloc_vfsmnt(fc->source ?: "none");
1103 if (!mnt)
1104 return ERR_PTR(-ENOMEM);
1105
1106 if (fc->sb_flags & SB_KERNMOUNT)
1107 mnt->mnt.mnt_flags = MNT_INTERNAL;
1108
1109 atomic_inc(&fc->root->d_sb->s_active);
1110 mnt->mnt.mnt_sb = fc->root->d_sb;
1111 mnt->mnt.mnt_root = dget(fc->root);
1112 mnt->mnt_mountpoint = mnt->mnt.mnt_root;
1113 mnt->mnt_parent = mnt;
1114
1115 lock_mount_hash();
1116 list_add_tail(&mnt->mnt_instance, &mnt->mnt.mnt_sb->s_mounts);
1117 unlock_mount_hash();
1118 return &mnt->mnt;
1119 }
1120 EXPORT_SYMBOL(vfs_create_mount);
1121
1122 struct vfsmount *fc_mount(struct fs_context *fc)
1123 {
1124 int err = vfs_get_tree(fc);
1125 if (!err) {
1126 up_write(&fc->root->d_sb->s_umount);
1127 return vfs_create_mount(fc);
1128 }
1129 return ERR_PTR(err);
1130 }
1131 EXPORT_SYMBOL(fc_mount);
1132
1133 struct vfsmount *vfs_kern_mount(struct file_system_type *type,
1134 int flags, const char *name,
1135 void *data)
1136 {
1137 struct fs_context *fc;
1138 struct vfsmount *mnt;
1139 int ret = 0;
1140
1141 if (!type)
1142 return ERR_PTR(-EINVAL);
1143
1144 fc = fs_context_for_mount(type, flags);
1145 if (IS_ERR(fc))
1146 return ERR_CAST(fc);
1147
1148 if (name)
1149 ret = vfs_parse_fs_string(fc, "source",
1150 name, strlen(name));
1151 if (!ret)
1152 ret = parse_monolithic_mount_data(fc, data);
1153 if (!ret)
1154 mnt = fc_mount(fc);
1155 else
1156 mnt = ERR_PTR(ret);
1157
1158 put_fs_context(fc);
1159 return mnt;
1160 }
1161 EXPORT_SYMBOL_GPL(vfs_kern_mount);
1162
1163 struct vfsmount *
1164 vfs_submount(const struct dentry *mountpoint, struct file_system_type *type,
1165 const char *name, void *data)
1166 {
1167 /* Until it is worked out how to pass the user namespace
1168 * through from the parent mount to the submount don't support
1169 * unprivileged mounts with submounts.
1170 */
1171 if (mountpoint->d_sb->s_user_ns != &init_user_ns)
1172 return ERR_PTR(-EPERM);
1173
1174 return vfs_kern_mount(type, SB_SUBMOUNT, name, data);
1175 }
1176 EXPORT_SYMBOL_GPL(vfs_submount);
1177
1178 static struct mount *clone_mnt(struct mount *old, struct dentry *root,
1179 int flag)
1180 {
1181 struct super_block *sb = old->mnt.mnt_sb;
1182 struct mount *mnt;
1183 int err;
1184
1185 mnt = alloc_vfsmnt(old->mnt_devname);
1186 if (!mnt)
1187 return ERR_PTR(-ENOMEM);
1188
1189 if (flag & (CL_SLAVE | CL_PRIVATE | CL_SHARED_TO_SLAVE))
1190 mnt->mnt_group_id = 0; /* not a peer of original */
1191 else
1192 mnt->mnt_group_id = old->mnt_group_id;
1193
1194 if ((flag & CL_MAKE_SHARED) && !mnt->mnt_group_id) {
1195 err = mnt_alloc_group_id(mnt);
1196 if (err)
1197 goto out_free;
1198 }
1199
1200 mnt->mnt.mnt_flags = old->mnt.mnt_flags;
1201 mnt->mnt.mnt_flags &= ~(MNT_WRITE_HOLD|MNT_MARKED|MNT_INTERNAL|MNT_ONRB);
1202
1203 atomic_inc(&sb->s_active);
1204 mnt->mnt.mnt_idmap = mnt_idmap_get(mnt_idmap(&old->mnt));
1205
1206 mnt->mnt.mnt_sb = sb;
1207 mnt->mnt.mnt_root = dget(root);
1208 mnt->mnt_mountpoint = mnt->mnt.mnt_root;
1209 mnt->mnt_parent = mnt;
1210 lock_mount_hash();
1211 list_add_tail(&mnt->mnt_instance, &sb->s_mounts);
1212 unlock_mount_hash();
1213
1214 if ((flag & CL_SLAVE) ||
1215 ((flag & CL_SHARED_TO_SLAVE) && IS_MNT_SHARED(old))) {
1216 list_add(&mnt->mnt_slave, &old->mnt_slave_list);
1217 mnt->mnt_master = old;
1218 CLEAR_MNT_SHARED(mnt);
1219 } else if (!(flag & CL_PRIVATE)) {
1220 if ((flag & CL_MAKE_SHARED) || IS_MNT_SHARED(old))
1221 list_add(&mnt->mnt_share, &old->mnt_share);
1222 if (IS_MNT_SLAVE(old))
1223 list_add(&mnt->mnt_slave, &old->mnt_slave);
1224 mnt->mnt_master = old->mnt_master;
1225 } else {
1226 CLEAR_MNT_SHARED(mnt);
1227 }
1228 if (flag & CL_MAKE_SHARED)
1229 set_mnt_shared(mnt);
1230
1231 /* stick the duplicate mount on the same expiry list
1232 * as the original if that was on one */
1233 if (flag & CL_EXPIRE) {
1234 if (!list_empty(&old->mnt_expire))
1235 list_add(&mnt->mnt_expire, &old->mnt_expire);
1236 }
1237
1238 return mnt;
1239
1240 out_free:
1241 mnt_free_id(mnt);
1242 free_vfsmnt(mnt);
1243 return ERR_PTR(err);
1244 }
1245
1246 static void cleanup_mnt(struct mount *mnt)
1247 {
1248 struct hlist_node *p;
1249 struct mount *m;
1250 /*
1251 * The warning here probably indicates that somebody messed
1252 * up a mnt_want/drop_write() pair. If this happens, the
1253 * filesystem was probably unable to make r/w->r/o transitions.
1254 * The locking used to deal with mnt_count decrement provides barriers,
1255 * so mnt_get_writers() below is safe.
1256 */
1257 WARN_ON(mnt_get_writers(mnt));
1258 if (unlikely(mnt->mnt_pins.first))
1259 mnt_pin_kill(mnt);
1260 hlist_for_each_entry_safe(m, p, &mnt->mnt_stuck_children, mnt_umount) {
1261 hlist_del(&m->mnt_umount);
1262 mntput(&m->mnt);
1263 }
1264 fsnotify_vfsmount_delete(&mnt->mnt);
1265 dput(mnt->mnt.mnt_root);
1266 deactivate_super(mnt->mnt.mnt_sb);
1267 mnt_free_id(mnt);
1268 call_rcu(&mnt->mnt_rcu, delayed_free_vfsmnt);
1269 }
1270
1271 static void __cleanup_mnt(struct rcu_head *head)
1272 {
1273 cleanup_mnt(container_of(head, struct mount, mnt_rcu));
1274 }
1275
1276 static LLIST_HEAD(delayed_mntput_list);
1277 static void delayed_mntput(struct work_struct *unused)
1278 {
1279 struct llist_node *node = llist_del_all(&delayed_mntput_list);
1280 struct mount *m, *t;
1281
1282 llist_for_each_entry_safe(m, t, node, mnt_llist)
1283 cleanup_mnt(m);
1284 }
1285 static DECLARE_DELAYED_WORK(delayed_mntput_work, delayed_mntput);
1286
1287 static void mntput_no_expire(struct mount *mnt)
1288 {
1289 LIST_HEAD(list);
1290 int count;
1291
1292 rcu_read_lock();
1293 if (likely(READ_ONCE(mnt->mnt_ns))) {
1294 /*
1295 * Since we don't do lock_mount_hash() here,
1296 * ->mnt_ns can change under us. However, if it's
1297 * non-NULL, then there's a reference that won't
1298 * be dropped until after an RCU delay done after
1299 * turning ->mnt_ns NULL. So if we observe it
1300 * non-NULL under rcu_read_lock(), the reference
1301 * we are dropping is not the final one.
1302 */
1303 mnt_add_count(mnt, -1);
1304 rcu_read_unlock();
1305 return;
1306 }
1307 lock_mount_hash();
1308 /*
1309 * make sure that if __legitimize_mnt() has not seen us grab
1310 * mount_lock, we'll see their refcount increment here.
1311 */
1312 smp_mb();
1313 mnt_add_count(mnt, -1);
1314 count = mnt_get_count(mnt);
1315 if (count != 0) {
1316 WARN_ON(count < 0);
1317 rcu_read_unlock();
1318 unlock_mount_hash();
1319 return;
1320 }
1321 if (unlikely(mnt->mnt.mnt_flags & MNT_DOOMED)) {
1322 rcu_read_unlock();
1323 unlock_mount_hash();
1324 return;
1325 }
1326 mnt->mnt.mnt_flags |= MNT_DOOMED;
1327 rcu_read_unlock();
1328
1329 list_del(&mnt->mnt_instance);
1330
1331 if (unlikely(!list_empty(&mnt->mnt_mounts))) {
1332 struct mount *p, *tmp;
1333 list_for_each_entry_safe(p, tmp, &mnt->mnt_mounts, mnt_child) {
1334 __put_mountpoint(unhash_mnt(p), &list);
1335 hlist_add_head(&p->mnt_umount, &mnt->mnt_stuck_children);
1336 }
1337 }
1338 unlock_mount_hash();
1339 shrink_dentry_list(&list);
1340
1341 if (likely(!(mnt->mnt.mnt_flags & MNT_INTERNAL))) {
1342 struct task_struct *task = current;
1343 if (likely(!(task->flags & PF_KTHREAD))) {
1344 init_task_work(&mnt->mnt_rcu, __cleanup_mnt);
1345 if (!task_work_add(task, &mnt->mnt_rcu, TWA_RESUME))
1346 return;
1347 }
1348 if (llist_add(&mnt->mnt_llist, &delayed_mntput_list))
1349 schedule_delayed_work(&delayed_mntput_work, 1);
1350 return;
1351 }
1352 cleanup_mnt(mnt);
1353 }
1354
1355 void mntput(struct vfsmount *mnt)
1356 {
1357 if (mnt) {
1358 struct mount *m = real_mount(mnt);
1359 /* avoid cacheline pingpong */
1360 if (unlikely(m->mnt_expiry_mark))
1361 WRITE_ONCE(m->mnt_expiry_mark, 0);
1362 mntput_no_expire(m);
1363 }
1364 }
1365 EXPORT_SYMBOL(mntput);
1366
1367 struct vfsmount *mntget(struct vfsmount *mnt)
1368 {
1369 if (mnt)
1370 mnt_add_count(real_mount(mnt), 1);
1371 return mnt;
1372 }
1373 EXPORT_SYMBOL(mntget);
1374
1375 /*
1376 * Make a mount point inaccessible to new lookups.
1377 * Because there may still be current users, the caller MUST WAIT
1378 * for an RCU grace period before destroying the mount point.
1379 */
1380 void mnt_make_shortterm(struct vfsmount *mnt)
1381 {
1382 if (mnt)
1383 real_mount(mnt)->mnt_ns = NULL;
1384 }
1385
1386 /**
1387 * path_is_mountpoint() - Check if path is a mount in the current namespace.
1388 * @path: path to check
1389 *
1390 * d_mountpoint() can only be used reliably to establish if a dentry is
1391 * not mounted in any namespace and that common case is handled inline.
1392 * d_mountpoint() isn't aware of the possibility there may be multiple
1393 * mounts using a given dentry in a different namespace. This function
1394 * checks if the passed in path is a mountpoint rather than the dentry
1395 * alone.
1396 */
1397 bool path_is_mountpoint(const struct path *path)
1398 {
1399 unsigned seq;
1400 bool res;
1401
1402 if (!d_mountpoint(path->dentry))
1403 return false;
1404
1405 rcu_read_lock();
1406 do {
1407 seq = read_seqbegin(&mount_lock);
1408 res = __path_is_mountpoint(path);
1409 } while (read_seqretry(&mount_lock, seq));
1410 rcu_read_unlock();
1411
1412 return res;
1413 }
1414 EXPORT_SYMBOL(path_is_mountpoint);
1415
1416 struct vfsmount *mnt_clone_internal(const struct path *path)
1417 {
1418 struct mount *p;
1419 p = clone_mnt(real_mount(path->mnt), path->dentry, CL_PRIVATE);
1420 if (IS_ERR(p))
1421 return ERR_CAST(p);
1422 p->mnt.mnt_flags |= MNT_INTERNAL;
1423 return &p->mnt;
1424 }
1425
1426 /*
1427 * Returns the mount which either has the specified mnt_id, or has the next
1428 * smallest id afer the specified one.
1429 */
1430 static struct mount *mnt_find_id_at(struct mnt_namespace *ns, u64 mnt_id)
1431 {
1432 struct rb_node *node = ns->mounts.rb_node;
1433 struct mount *ret = NULL;
1434
1435 while (node) {
1436 struct mount *m = node_to_mount(node);
1437
1438 if (mnt_id <= m->mnt_id_unique) {
1439 ret = node_to_mount(node);
1440 if (mnt_id == m->mnt_id_unique)
1441 break;
1442 node = node->rb_left;
1443 } else {
1444 node = node->rb_right;
1445 }
1446 }
1447 return ret;
1448 }
1449
1450 #ifdef CONFIG_PROC_FS
1451
1452 /* iterator; we want it to have access to namespace_sem, thus here... */
1453 static void *m_start(struct seq_file *m, loff_t *pos)
1454 {
1455 struct proc_mounts *p = m->private;
1456
1457 down_read(&namespace_sem);
1458
1459 return mnt_find_id_at(p->ns, *pos);
1460 }
1461
1462 static void *m_next(struct seq_file *m, void *v, loff_t *pos)
1463 {
1464 struct mount *next = NULL, *mnt = v;
1465 struct rb_node *node = rb_next(&mnt->mnt_node);
1466
1467 ++*pos;
1468 if (node) {
1469 next = node_to_mount(node);
1470 *pos = next->mnt_id_unique;
1471 }
1472 return next;
1473 }
1474
1475 static void m_stop(struct seq_file *m, void *v)
1476 {
1477 up_read(&namespace_sem);
1478 }
1479
1480 static int m_show(struct seq_file *m, void *v)
1481 {
1482 struct proc_mounts *p = m->private;
1483 struct mount *r = v;
1484 return p->show(m, &r->mnt);
1485 }
1486
1487 const struct seq_operations mounts_op = {
1488 .start = m_start,
1489 .next = m_next,
1490 .stop = m_stop,
1491 .show = m_show,
1492 };
1493
1494 #endif /* CONFIG_PROC_FS */
1495
1496 /**
1497 * may_umount_tree - check if a mount tree is busy
1498 * @m: root of mount tree
1499 *
1500 * This is called to check if a tree of mounts has any
1501 * open files, pwds, chroots or sub mounts that are
1502 * busy.
1503 */
1504 int may_umount_tree(struct vfsmount *m)
1505 {
1506 struct mount *mnt = real_mount(m);
1507 int actual_refs = 0;
1508 int minimum_refs = 0;
1509 struct mount *p;
1510 BUG_ON(!m);
1511
1512 /* write lock needed for mnt_get_count */
1513 lock_mount_hash();
1514 for (p = mnt; p; p = next_mnt(p, mnt)) {
1515 actual_refs += mnt_get_count(p);
1516 minimum_refs += 2;
1517 }
1518 unlock_mount_hash();
1519
1520 if (actual_refs > minimum_refs)
1521 return 0;
1522
1523 return 1;
1524 }
1525
1526 EXPORT_SYMBOL(may_umount_tree);
1527
1528 /**
1529 * may_umount - check if a mount point is busy
1530 * @mnt: root of mount
1531 *
1532 * This is called to check if a mount point has any
1533 * open files, pwds, chroots or sub mounts. If the
1534 * mount has sub mounts this will return busy
1535 * regardless of whether the sub mounts are busy.
1536 *
1537 * Doesn't take quota and stuff into account. IOW, in some cases it will
1538 * give false negatives. The main reason why it's here is that we need
1539 * a non-destructive way to look for easily umountable filesystems.
1540 */
1541 int may_umount(struct vfsmount *mnt)
1542 {
1543 int ret = 1;
1544 down_read(&namespace_sem);
1545 lock_mount_hash();
1546 if (propagate_mount_busy(real_mount(mnt), 2))
1547 ret = 0;
1548 unlock_mount_hash();
1549 up_read(&namespace_sem);
1550 return ret;
1551 }
1552
1553 EXPORT_SYMBOL(may_umount);
1554
1555 static void namespace_unlock(void)
1556 {
1557 struct hlist_head head;
1558 struct hlist_node *p;
1559 struct mount *m;
1560 LIST_HEAD(list);
1561
1562 hlist_move_list(&unmounted, &head);
1563 list_splice_init(&ex_mountpoints, &list);
1564
1565 up_write(&namespace_sem);
1566
1567 shrink_dentry_list(&list);
1568
1569 if (likely(hlist_empty(&head)))
1570 return;
1571
1572 synchronize_rcu_expedited();
1573
1574 hlist_for_each_entry_safe(m, p, &head, mnt_umount) {
1575 hlist_del(&m->mnt_umount);
1576 mntput(&m->mnt);
1577 }
1578 }
1579
1580 static inline void namespace_lock(void)
1581 {
1582 down_write(&namespace_sem);
1583 }
1584
1585 enum umount_tree_flags {
1586 UMOUNT_SYNC = 1,
1587 UMOUNT_PROPAGATE = 2,
1588 UMOUNT_CONNECTED = 4,
1589 };
1590
1591 static bool disconnect_mount(struct mount *mnt, enum umount_tree_flags how)
1592 {
1593 /* Leaving mounts connected is only valid for lazy umounts */
1594 if (how & UMOUNT_SYNC)
1595 return true;
1596
1597 /* A mount without a parent has nothing to be connected to */
1598 if (!mnt_has_parent(mnt))
1599 return true;
1600
1601 /* Because the reference counting rules change when mounts are
1602 * unmounted and connected, umounted mounts may not be
1603 * connected to mounted mounts.
1604 */
1605 if (!(mnt->mnt_parent->mnt.mnt_flags & MNT_UMOUNT))
1606 return true;
1607
1608 /* Has it been requested that the mount remain connected? */
1609 if (how & UMOUNT_CONNECTED)
1610 return false;
1611
1612 /* Is the mount locked such that it needs to remain connected? */
1613 if (IS_MNT_LOCKED(mnt))
1614 return false;
1615
1616 /* By default disconnect the mount */
1617 return true;
1618 }
1619
1620 /*
1621 * mount_lock must be held
1622 * namespace_sem must be held for write
1623 */
1624 static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
1625 {
1626 LIST_HEAD(tmp_list);
1627 struct mount *p;
1628
1629 if (how & UMOUNT_PROPAGATE)
1630 propagate_mount_unlock(mnt);
1631
1632 /* Gather the mounts to umount */
1633 for (p = mnt; p; p = next_mnt(p, mnt)) {
1634 p->mnt.mnt_flags |= MNT_UMOUNT;
1635 if (p->mnt.mnt_flags & MNT_ONRB)
1636 move_from_ns(p, &tmp_list);
1637 else
1638 list_move(&p->mnt_list, &tmp_list);
1639 }
1640
1641 /* Hide the mounts from mnt_mounts */
1642 list_for_each_entry(p, &tmp_list, mnt_list) {
1643 list_del_init(&p->mnt_child);
1644 }
1645
1646 /* Add propogated mounts to the tmp_list */
1647 if (how & UMOUNT_PROPAGATE)
1648 propagate_umount(&tmp_list);
1649
1650 while (!list_empty(&tmp_list)) {
1651 struct mnt_namespace *ns;
1652 bool disconnect;
1653 p = list_first_entry(&tmp_list, struct mount, mnt_list);
1654 list_del_init(&p->mnt_expire);
1655 list_del_init(&p->mnt_list);
1656 ns = p->mnt_ns;
1657 if (ns) {
1658 ns->nr_mounts--;
1659 __touch_mnt_namespace(ns);
1660 }
1661 p->mnt_ns = NULL;
1662 if (how & UMOUNT_SYNC)
1663 p->mnt.mnt_flags |= MNT_SYNC_UMOUNT;
1664
1665 disconnect = disconnect_mount(p, how);
1666 if (mnt_has_parent(p)) {
1667 mnt_add_count(p->mnt_parent, -1);
1668 if (!disconnect) {
1669 /* Don't forget about p */
1670 list_add_tail(&p->mnt_child, &p->mnt_parent->mnt_mounts);
1671 } else {
1672 umount_mnt(p);
1673 }
1674 }
1675 change_mnt_propagation(p, MS_PRIVATE);
1676 if (disconnect)
1677 hlist_add_head(&p->mnt_umount, &unmounted);
1678 }
1679 }
1680
1681 static void shrink_submounts(struct mount *mnt);
1682
1683 static int do_umount_root(struct super_block *sb)
1684 {
1685 int ret = 0;
1686
1687 down_write(&sb->s_umount);
1688 if (!sb_rdonly(sb)) {
1689 struct fs_context *fc;
1690
1691 fc = fs_context_for_reconfigure(sb->s_root, SB_RDONLY,
1692 SB_RDONLY);
1693 if (IS_ERR(fc)) {
1694 ret = PTR_ERR(fc);
1695 } else {
1696 ret = parse_monolithic_mount_data(fc, NULL);
1697 if (!ret)
1698 ret = reconfigure_super(fc);
1699 put_fs_context(fc);
1700 }
1701 }
1702 up_write(&sb->s_umount);
1703 return ret;
1704 }
1705
1706 static int do_umount(struct mount *mnt, int flags)
1707 {
1708 struct super_block *sb = mnt->mnt.mnt_sb;
1709 int retval;
1710
1711 retval = security_sb_umount(&mnt->mnt, flags);
1712 if (retval)
1713 return retval;
1714
1715 /*
1716 * Allow userspace to request a mountpoint be expired rather than
1717 * unmounting unconditionally. Unmount only happens if:
1718 * (1) the mark is already set (the mark is cleared by mntput())
1719 * (2) the usage count == 1 [parent vfsmount] + 1 [sys_umount]
1720 */
1721 if (flags & MNT_EXPIRE) {
1722 if (&mnt->mnt == current->fs->root.mnt ||
1723 flags & (MNT_FORCE | MNT_DETACH))
1724 return -EINVAL;
1725
1726 /*
1727 * probably don't strictly need the lock here if we examined
1728 * all race cases, but it's a slowpath.
1729 */
1730 lock_mount_hash();
1731 if (mnt_get_count(mnt) != 2) {
1732 unlock_mount_hash();
1733 return -EBUSY;
1734 }
1735 unlock_mount_hash();
1736
1737 if (!xchg(&mnt->mnt_expiry_mark, 1))
1738 return -EAGAIN;
1739 }
1740
1741 /*
1742 * If we may have to abort operations to get out of this
1743 * mount, and they will themselves hold resources we must
1744 * allow the fs to do things. In the Unix tradition of
1745 * 'Gee thats tricky lets do it in userspace' the umount_begin
1746 * might fail to complete on the first run through as other tasks
1747 * must return, and the like. Thats for the mount program to worry
1748 * about for the moment.
1749 */
1750
1751 if (flags & MNT_FORCE && sb->s_op->umount_begin) {
1752 sb->s_op->umount_begin(sb);
1753 }
1754
1755 /*
1756 * No sense to grab the lock for this test, but test itself looks
1757 * somewhat bogus. Suggestions for better replacement?
1758 * Ho-hum... In principle, we might treat that as umount + switch
1759 * to rootfs. GC would eventually take care of the old vfsmount.
1760 * Actually it makes sense, especially if rootfs would contain a
1761 * /reboot - static binary that would close all descriptors and
1762 * call reboot(9). Then init(8) could umount root and exec /reboot.
1763 */
1764 if (&mnt->mnt == current->fs->root.mnt && !(flags & MNT_DETACH)) {
1765 /*
1766 * Special case for "unmounting" root ...
1767 * we just try to remount it readonly.
1768 */
1769 if (!ns_capable(sb->s_user_ns, CAP_SYS_ADMIN))
1770 return -EPERM;
1771 return do_umount_root(sb);
1772 }
1773
1774 namespace_lock();
1775 lock_mount_hash();
1776
1777 /* Recheck MNT_LOCKED with the locks held */
1778 retval = -EINVAL;
1779 if (mnt->mnt.mnt_flags & MNT_LOCKED)
1780 goto out;
1781
1782 event++;
1783 if (flags & MNT_DETACH) {
1784 if (mnt->mnt.mnt_flags & MNT_ONRB ||
1785 !list_empty(&mnt->mnt_list))
1786 umount_tree(mnt, UMOUNT_PROPAGATE);
1787 retval = 0;
1788 } else {
1789 shrink_submounts(mnt);
1790 retval = -EBUSY;
1791 if (!propagate_mount_busy(mnt, 2)) {
1792 if (mnt->mnt.mnt_flags & MNT_ONRB ||
1793 !list_empty(&mnt->mnt_list))
1794 umount_tree(mnt, UMOUNT_PROPAGATE|UMOUNT_SYNC);
1795 retval = 0;
1796 }
1797 }
1798 out:
1799 unlock_mount_hash();
1800 namespace_unlock();
1801 return retval;
1802 }
1803
1804 /*
1805 * __detach_mounts - lazily unmount all mounts on the specified dentry
1806 *
1807 * During unlink, rmdir, and d_drop it is possible to loose the path
1808 * to an existing mountpoint, and wind up leaking the mount.
1809 * detach_mounts allows lazily unmounting those mounts instead of
1810 * leaking them.
1811 *
1812 * The caller may hold dentry->d_inode->i_mutex.
1813 */
1814 void __detach_mounts(struct dentry *dentry)
1815 {
1816 struct mountpoint *mp;
1817 struct mount *mnt;
1818
1819 namespace_lock();
1820 lock_mount_hash();
1821 mp = lookup_mountpoint(dentry);
1822 if (!mp)
1823 goto out_unlock;
1824
1825 event++;
1826 while (!hlist_empty(&mp->m_list)) {
1827 mnt = hlist_entry(mp->m_list.first, struct mount, mnt_mp_list);
1828 if (mnt->mnt.mnt_flags & MNT_UMOUNT) {
1829 umount_mnt(mnt);
1830 hlist_add_head(&mnt->mnt_umount, &unmounted);
1831 }
1832 else umount_tree(mnt, UMOUNT_CONNECTED);
1833 }
1834 put_mountpoint(mp);
1835 out_unlock:
1836 unlock_mount_hash();
1837 namespace_unlock();
1838 }
1839
1840 /*
1841 * Is the caller allowed to modify his namespace?
1842 */
1843 bool may_mount(void)
1844 {
1845 return ns_capable(current->nsproxy->mnt_ns->user_ns, CAP_SYS_ADMIN);
1846 }
1847
1848 /**
1849 * path_mounted - check whether path is mounted
1850 * @path: path to check
1851 *
1852 * Determine whether @path refers to the root of a mount.
1853 *
1854 * Return: true if @path is the root of a mount, false if not.
1855 */
1856 static inline bool path_mounted(const struct path *path)
1857 {
1858 return path->mnt->mnt_root == path->dentry;
1859 }
1860
1861 static void warn_mandlock(void)
1862 {
1863 pr_warn_once("=======================================================\n"
1864 "WARNING: The mand mount option has been deprecated and\n"
1865 " and is ignored by this kernel. Remove the mand\n"
1866 " option from the mount to silence this warning.\n"
1867 "=======================================================\n");
1868 }
1869
1870 static int can_umount(const struct path *path, int flags)
1871 {
1872 struct mount *mnt = real_mount(path->mnt);
1873
1874 if (!may_mount())
1875 return -EPERM;
1876 if (!path_mounted(path))
1877 return -EINVAL;
1878 if (!check_mnt(mnt))
1879 return -EINVAL;
1880 if (mnt->mnt.mnt_flags & MNT_LOCKED) /* Check optimistically */
1881 return -EINVAL;
1882 if (flags & MNT_FORCE && !capable(CAP_SYS_ADMIN))
1883 return -EPERM;
1884 return 0;
1885 }
1886
1887 // caller is responsible for flags being sane
1888 int path_umount(struct path *path, int flags)
1889 {
1890 struct mount *mnt = real_mount(path->mnt);
1891 int ret;
1892
1893 ret = can_umount(path, flags);
1894 if (!ret)
1895 ret = do_umount(mnt, flags);
1896
1897 /* we mustn't call path_put() as that would clear mnt_expiry_mark */
1898 dput(path->dentry);
1899 mntput_no_expire(mnt);
1900 return ret;
1901 }
1902
1903 static int ksys_umount(char __user *name, int flags)
1904 {
1905 int lookup_flags = LOOKUP_MOUNTPOINT;
1906 struct path path;
1907 int ret;
1908
1909 // basic validity checks done first
1910 if (flags & ~(MNT_FORCE | MNT_DETACH | MNT_EXPIRE | UMOUNT_NOFOLLOW))
1911 return -EINVAL;
1912
1913 if (!(flags & UMOUNT_NOFOLLOW))
1914 lookup_flags |= LOOKUP_FOLLOW;
1915 ret = user_path_at(AT_FDCWD, name, lookup_flags, &path);
1916 if (ret)
1917 return ret;
1918 return path_umount(&path, flags);
1919 }
1920
1921 SYSCALL_DEFINE2(umount, char __user *, name, int, flags)
1922 {
1923 return ksys_umount(name, flags);
1924 }
1925
1926 #ifdef __ARCH_WANT_SYS_OLDUMOUNT
1927
1928 /*
1929 * The 2.0 compatible umount. No flags.
1930 */
1931 SYSCALL_DEFINE1(oldumount, char __user *, name)
1932 {
1933 return ksys_umount(name, 0);
1934 }
1935
1936 #endif
1937
1938 static bool is_mnt_ns_file(struct dentry *dentry)
1939 {
1940 /* Is this a proxy for a mount namespace? */
1941 return dentry->d_op == &ns_dentry_operations &&
1942 dentry->d_fsdata == &mntns_operations;
1943 }
1944
1945 static struct mnt_namespace *to_mnt_ns(struct ns_common *ns)
1946 {
1947 return container_of(ns, struct mnt_namespace, ns);
1948 }
1949
1950 struct ns_common *from_mnt_ns(struct mnt_namespace *mnt)
1951 {
1952 return &mnt->ns;
1953 }
1954
1955 static bool mnt_ns_loop(struct dentry *dentry)
1956 {
1957 /* Could bind mounting the mount namespace inode cause a
1958 * mount namespace loop?
1959 */
1960 struct mnt_namespace *mnt_ns;
1961 if (!is_mnt_ns_file(dentry))
1962 return false;
1963
1964 mnt_ns = to_mnt_ns(get_proc_ns(dentry->d_inode));
1965 return current->nsproxy->mnt_ns->seq >= mnt_ns->seq;
1966 }
1967
1968 struct mount *copy_tree(struct mount *mnt, struct dentry *dentry,
1969 int flag)
1970 {
1971 struct mount *res, *p, *q, *r, *parent;
1972
1973 if (!(flag & CL_COPY_UNBINDABLE) && IS_MNT_UNBINDABLE(mnt))
1974 return ERR_PTR(-EINVAL);
1975
1976 if (!(flag & CL_COPY_MNT_NS_FILE) && is_mnt_ns_file(dentry))
1977 return ERR_PTR(-EINVAL);
1978
1979 res = q = clone_mnt(mnt, dentry, flag);
1980 if (IS_ERR(q))
1981 return q;
1982
1983 q->mnt_mountpoint = mnt->mnt_mountpoint;
1984
1985 p = mnt;
1986 list_for_each_entry(r, &mnt->mnt_mounts, mnt_child) {
1987 struct mount *s;
1988 if (!is_subdir(r->mnt_mountpoint, dentry))
1989 continue;
1990
1991 for (s = r; s; s = next_mnt(s, r)) {
1992 if (!(flag & CL_COPY_UNBINDABLE) &&
1993 IS_MNT_UNBINDABLE(s)) {
1994 if (s->mnt.mnt_flags & MNT_LOCKED) {
1995 /* Both unbindable and locked. */
1996 q = ERR_PTR(-EPERM);
1997 goto out;
1998 } else {
1999 s = skip_mnt_tree(s);
2000 continue;
2001 }
2002 }
2003 if (!(flag & CL_COPY_MNT_NS_FILE) &&
2004 is_mnt_ns_file(s->mnt.mnt_root)) {
2005 s = skip_mnt_tree(s);
2006 continue;
2007 }
2008 while (p != s->mnt_parent) {
2009 p = p->mnt_parent;
2010 q = q->mnt_parent;
2011 }
2012 p = s;
2013 parent = q;
2014 q = clone_mnt(p, p->mnt.mnt_root, flag);
2015 if (IS_ERR(q))
2016 goto out;
2017 lock_mount_hash();
2018 list_add_tail(&q->mnt_list, &res->mnt_list);
2019 attach_mnt(q, parent, p->mnt_mp, false);
2020 unlock_mount_hash();
2021 }
2022 }
2023 return res;
2024 out:
2025 if (res) {
2026 lock_mount_hash();
2027 umount_tree(res, UMOUNT_SYNC);
2028 unlock_mount_hash();
2029 }
2030 return q;
2031 }
2032
2033 /* Caller should check returned pointer for errors */
2034
2035 struct vfsmount *collect_mounts(const struct path *path)
2036 {
2037 struct mount *tree;
2038 namespace_lock();
2039 if (!check_mnt(real_mount(path->mnt)))
2040 tree = ERR_PTR(-EINVAL);
2041 else
2042 tree = copy_tree(real_mount(path->mnt), path->dentry,
2043 CL_COPY_ALL | CL_PRIVATE);
2044 namespace_unlock();
2045 if (IS_ERR(tree))
2046 return ERR_CAST(tree);
2047 return &tree->mnt;
2048 }
2049
2050 static void free_mnt_ns(struct mnt_namespace *);
2051 static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *, bool);
2052
2053 void dissolve_on_fput(struct vfsmount *mnt)
2054 {
2055 struct mnt_namespace *ns;
2056 namespace_lock();
2057 lock_mount_hash();
2058 ns = real_mount(mnt)->mnt_ns;
2059 if (ns) {
2060 if (is_anon_ns(ns))
2061 umount_tree(real_mount(mnt), UMOUNT_CONNECTED);
2062 else
2063 ns = NULL;
2064 }
2065 unlock_mount_hash();
2066 namespace_unlock();
2067 if (ns)
2068 free_mnt_ns(ns);
2069 }
2070
2071 void drop_collected_mounts(struct vfsmount *mnt)
2072 {
2073 namespace_lock();
2074 lock_mount_hash();
2075 umount_tree(real_mount(mnt), 0);
2076 unlock_mount_hash();
2077 namespace_unlock();
2078 }
2079
2080 static bool has_locked_children(struct mount *mnt, struct dentry *dentry)
2081 {
2082 struct mount *child;
2083
2084 list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) {
2085 if (!is_subdir(child->mnt_mountpoint, dentry))
2086 continue;
2087
2088 if (child->mnt.mnt_flags & MNT_LOCKED)
2089 return true;
2090 }
2091 return false;
2092 }
2093
2094 /**
2095 * clone_private_mount - create a private clone of a path
2096 * @path: path to clone
2097 *
2098 * This creates a new vfsmount, which will be the clone of @path. The new mount
2099 * will not be attached anywhere in the namespace and will be private (i.e.
2100 * changes to the originating mount won't be propagated into this).
2101 *
2102 * Release with mntput().
2103 */
2104 struct vfsmount *clone_private_mount(const struct path *path)
2105 {
2106 struct mount *old_mnt = real_mount(path->mnt);
2107 struct mount *new_mnt;
2108
2109 down_read(&namespace_sem);
2110 if (IS_MNT_UNBINDABLE(old_mnt))
2111 goto invalid;
2112
2113 if (!check_mnt(old_mnt))
2114 goto invalid;
2115
2116 if (has_locked_children(old_mnt, path->dentry))
2117 goto invalid;
2118
2119 new_mnt = clone_mnt(old_mnt, path->dentry, CL_PRIVATE);
2120 up_read(&namespace_sem);
2121
2122 if (IS_ERR(new_mnt))
2123 return ERR_CAST(new_mnt);
2124
2125 /* Longterm mount to be removed by kern_unmount*() */
2126 new_mnt->mnt_ns = MNT_NS_INTERNAL;
2127
2128 return &new_mnt->mnt;
2129
2130 invalid:
2131 up_read(&namespace_sem);
2132 return ERR_PTR(-EINVAL);
2133 }
2134 EXPORT_SYMBOL_GPL(clone_private_mount);
2135
2136 int iterate_mounts(int (*f)(struct vfsmount *, void *), void *arg,
2137 struct vfsmount *root)
2138 {
2139 struct mount *mnt;
2140 int res = f(root, arg);
2141 if (res)
2142 return res;
2143 list_for_each_entry(mnt, &real_mount(root)->mnt_list, mnt_list) {
2144 res = f(&mnt->mnt, arg);
2145 if (res)
2146 return res;
2147 }
2148 return 0;
2149 }
2150
2151 static void lock_mnt_tree(struct mount *mnt)
2152 {
2153 struct mount *p;
2154
2155 for (p = mnt; p; p = next_mnt(p, mnt)) {
2156 int flags = p->mnt.mnt_flags;
2157 /* Don't allow unprivileged users to change mount flags */
2158 flags |= MNT_LOCK_ATIME;
2159
2160 if (flags & MNT_READONLY)
2161 flags |= MNT_LOCK_READONLY;
2162
2163 if (flags & MNT_NODEV)
2164 flags |= MNT_LOCK_NODEV;
2165
2166 if (flags & MNT_NOSUID)
2167 flags |= MNT_LOCK_NOSUID;
2168
2169 if (flags & MNT_NOEXEC)
2170 flags |= MNT_LOCK_NOEXEC;
2171 /* Don't allow unprivileged users to reveal what is under a mount */
2172 if (list_empty(&p->mnt_expire))
2173 flags |= MNT_LOCKED;
2174 p->mnt.mnt_flags = flags;
2175 }
2176 }
2177
2178 static void cleanup_group_ids(struct mount *mnt, struct mount *end)
2179 {
2180 struct mount *p;
2181
2182 for (p = mnt; p != end; p = next_mnt(p, mnt)) {
2183 if (p->mnt_group_id && !IS_MNT_SHARED(p))
2184 mnt_release_group_id(p);
2185 }
2186 }
2187
2188 static int invent_group_ids(struct mount *mnt, bool recurse)
2189 {
2190 struct mount *p;
2191
2192 for (p = mnt; p; p = recurse ? next_mnt(p, mnt) : NULL) {
2193 if (!p->mnt_group_id && !IS_MNT_SHARED(p)) {
2194 int err = mnt_alloc_group_id(p);
2195 if (err) {
2196 cleanup_group_ids(mnt, p);
2197 return err;
2198 }
2199 }
2200 }
2201
2202 return 0;
2203 }
2204
2205 int count_mounts(struct mnt_namespace *ns, struct mount *mnt)
2206 {
2207 unsigned int max = READ_ONCE(sysctl_mount_max);
2208 unsigned int mounts = 0;
2209 struct mount *p;
2210
2211 if (ns->nr_mounts >= max)
2212 return -ENOSPC;
2213 max -= ns->nr_mounts;
2214 if (ns->pending_mounts >= max)
2215 return -ENOSPC;
2216 max -= ns->pending_mounts;
2217
2218 for (p = mnt; p; p = next_mnt(p, mnt))
2219 mounts++;
2220
2221 if (mounts > max)
2222 return -ENOSPC;
2223
2224 ns->pending_mounts += mounts;
2225 return 0;
2226 }
2227
2228 enum mnt_tree_flags_t {
2229 MNT_TREE_MOVE = BIT(0),
2230 MNT_TREE_BENEATH = BIT(1),
2231 };
2232
2233 /**
2234 * attach_recursive_mnt - attach a source mount tree
2235 * @source_mnt: mount tree to be attached
2236 * @top_mnt: mount that @source_mnt will be mounted on or mounted beneath
2237 * @dest_mp: the mountpoint @source_mnt will be mounted at
2238 * @flags: modify how @source_mnt is supposed to be attached
2239 *
2240 * NOTE: in the table below explains the semantics when a source mount
2241 * of a given type is attached to a destination mount of a given type.
2242 * ---------------------------------------------------------------------------
2243 * | BIND MOUNT OPERATION |
2244 * |**************************************************************************
2245 * | source-->| shared | private | slave | unbindable |
2246 * | dest | | | | |
2247 * | | | | | | |
2248 * | v | | | | |
2249 * |**************************************************************************
2250 * | shared | shared (++) | shared (+) | shared(+++)| invalid |
2251 * | | | | | |
2252 * |non-shared| shared (+) | private | slave (*) | invalid |
2253 * ***************************************************************************
2254 * A bind operation clones the source mount and mounts the clone on the
2255 * destination mount.
2256 *
2257 * (++) the cloned mount is propagated to all the mounts in the propagation
2258 * tree of the destination mount and the cloned mount is added to
2259 * the peer group of the source mount.
2260 * (+) the cloned mount is created under the destination mount and is marked
2261 * as shared. The cloned mount is added to the peer group of the source
2262 * mount.
2263 * (+++) the mount is propagated to all the mounts in the propagation tree
2264 * of the destination mount and the cloned mount is made slave
2265 * of the same master as that of the source mount. The cloned mount
2266 * is marked as 'shared and slave'.
2267 * (*) the cloned mount is made a slave of the same master as that of the
2268 * source mount.
2269 *
2270 * ---------------------------------------------------------------------------
2271 * | MOVE MOUNT OPERATION |
2272 * |**************************************************************************
2273 * | source-->| shared | private | slave | unbindable |
2274 * | dest | | | | |
2275 * | | | | | | |
2276 * | v | | | | |
2277 * |**************************************************************************
2278 * | shared | shared (+) | shared (+) | shared(+++) | invalid |
2279 * | | | | | |
2280 * |non-shared| shared (+*) | private | slave (*) | unbindable |
2281 * ***************************************************************************
2282 *
2283 * (+) the mount is moved to the destination. And is then propagated to
2284 * all the mounts in the propagation tree of the destination mount.
2285 * (+*) the mount is moved to the destination.
2286 * (+++) the mount is moved to the destination and is then propagated to
2287 * all the mounts belonging to the destination mount's propagation tree.
2288 * the mount is marked as 'shared and slave'.
2289 * (*) the mount continues to be a slave at the new location.
2290 *
2291 * if the source mount is a tree, the operations explained above is
2292 * applied to each mount in the tree.
2293 * Must be called without spinlocks held, since this function can sleep
2294 * in allocations.
2295 *
2296 * Context: The function expects namespace_lock() to be held.
2297 * Return: If @source_mnt was successfully attached 0 is returned.
2298 * Otherwise a negative error code is returned.
2299 */
2300 static int attach_recursive_mnt(struct mount *source_mnt,
2301 struct mount *top_mnt,
2302 struct mountpoint *dest_mp,
2303 enum mnt_tree_flags_t flags)
2304 {
2305 struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns;
2306 HLIST_HEAD(tree_list);
2307 struct mnt_namespace *ns = top_mnt->mnt_ns;
2308 struct mountpoint *smp;
2309 struct mount *child, *dest_mnt, *p;
2310 struct hlist_node *n;
2311 int err = 0;
2312 bool moving = flags & MNT_TREE_MOVE, beneath = flags & MNT_TREE_BENEATH;
2313
2314 /*
2315 * Preallocate a mountpoint in case the new mounts need to be
2316 * mounted beneath mounts on the same mountpoint.
2317 */
2318 smp = get_mountpoint(source_mnt->mnt.mnt_root);
2319 if (IS_ERR(smp))
2320 return PTR_ERR(smp);
2321
2322 /* Is there space to add these mounts to the mount namespace? */
2323 if (!moving) {
2324 err = count_mounts(ns, source_mnt);
2325 if (err)
2326 goto out;
2327 }
2328
2329 if (beneath)
2330 dest_mnt = top_mnt->mnt_parent;
2331 else
2332 dest_mnt = top_mnt;
2333
2334 if (IS_MNT_SHARED(dest_mnt)) {
2335 err = invent_group_ids(source_mnt, true);
2336 if (err)
2337 goto out;
2338 err = propagate_mnt(dest_mnt, dest_mp, source_mnt, &tree_list);
2339 }
2340 lock_mount_hash();
2341 if (err)
2342 goto out_cleanup_ids;
2343
2344 if (IS_MNT_SHARED(dest_mnt)) {
2345 for (p = source_mnt; p; p = next_mnt(p, source_mnt))
2346 set_mnt_shared(p);
2347 }
2348
2349 if (moving) {
2350 if (beneath)
2351 dest_mp = smp;
2352 unhash_mnt(source_mnt);
2353 attach_mnt(source_mnt, top_mnt, dest_mp, beneath);
2354 touch_mnt_namespace(source_mnt->mnt_ns);
2355 } else {
2356 if (source_mnt->mnt_ns) {
2357 LIST_HEAD(head);
2358
2359 /* move from anon - the caller will destroy */
2360 for (p = source_mnt; p; p = next_mnt(p, source_mnt))
2361 move_from_ns(p, &head);
2362 list_del_init(&head);
2363 }
2364 if (beneath)
2365 mnt_set_mountpoint_beneath(source_mnt, top_mnt, smp);
2366 else
2367 mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt);
2368 commit_tree(source_mnt);
2369 }
2370
2371 hlist_for_each_entry_safe(child, n, &tree_list, mnt_hash) {
2372 struct mount *q;
2373 hlist_del_init(&child->mnt_hash);
2374 q = __lookup_mnt(&child->mnt_parent->mnt,
2375 child->mnt_mountpoint);
2376 if (q)
2377 mnt_change_mountpoint(child, smp, q);
2378 /* Notice when we are propagating across user namespaces */
2379 if (child->mnt_parent->mnt_ns->user_ns != user_ns)
2380 lock_mnt_tree(child);
2381 child->mnt.mnt_flags &= ~MNT_LOCKED;
2382 commit_tree(child);
2383 }
2384 put_mountpoint(smp);
2385 unlock_mount_hash();
2386
2387 return 0;
2388
2389 out_cleanup_ids:
2390 while (!hlist_empty(&tree_list)) {
2391 child = hlist_entry(tree_list.first, struct mount, mnt_hash);
2392 child->mnt_parent->mnt_ns->pending_mounts = 0;
2393 umount_tree(child, UMOUNT_SYNC);
2394 }
2395 unlock_mount_hash();
2396 cleanup_group_ids(source_mnt, NULL);
2397 out:
2398 ns->pending_mounts = 0;
2399
2400 read_seqlock_excl(&mount_lock);
2401 put_mountpoint(smp);
2402 read_sequnlock_excl(&mount_lock);
2403
2404 return err;
2405 }
2406
2407 /**
2408 * do_lock_mount - lock mount and mountpoint
2409 * @path: target path
2410 * @beneath: whether the intention is to mount beneath @path
2411 *
2412 * Follow the mount stack on @path until the top mount @mnt is found. If
2413 * the initial @path->{mnt,dentry} is a mountpoint lookup the first
2414 * mount stacked on top of it. Then simply follow @{mnt,mnt->mnt_root}
2415 * until nothing is stacked on top of it anymore.
2416 *
2417 * Acquire the inode_lock() on the top mount's ->mnt_root to protect
2418 * against concurrent removal of the new mountpoint from another mount
2419 * namespace.
2420 *
2421 * If @beneath is requested, acquire inode_lock() on @mnt's mountpoint
2422 * @mp on @mnt->mnt_parent must be acquired. This protects against a
2423 * concurrent unlink of @mp->mnt_dentry from another mount namespace
2424 * where @mnt doesn't have a child mount mounted @mp. A concurrent
2425 * removal of @mnt->mnt_root doesn't matter as nothing will be mounted
2426 * on top of it for @beneath.
2427 *
2428 * In addition, @beneath needs to make sure that @mnt hasn't been
2429 * unmounted or moved from its current mountpoint in between dropping
2430 * @mount_lock and acquiring @namespace_sem. For the !@beneath case @mnt
2431 * being unmounted would be detected later by e.g., calling
2432 * check_mnt(mnt) in the function it's called from. For the @beneath
2433 * case however, it's useful to detect it directly in do_lock_mount().
2434 * If @mnt hasn't been unmounted then @mnt->mnt_mountpoint still points
2435 * to @mnt->mnt_mp->m_dentry. But if @mnt has been unmounted it will
2436 * point to @mnt->mnt_root and @mnt->mnt_mp will be NULL.
2437 *
2438 * Return: Either the target mountpoint on the top mount or the top
2439 * mount's mountpoint.
2440 */
2441 static struct mountpoint *do_lock_mount(struct path *path, bool beneath)
2442 {
2443 struct vfsmount *mnt = path->mnt;
2444 struct dentry *dentry;
2445 struct mountpoint *mp = ERR_PTR(-ENOENT);
2446
2447 for (;;) {
2448 struct mount *m;
2449
2450 if (beneath) {
2451 m = real_mount(mnt);
2452 read_seqlock_excl(&mount_lock);
2453 dentry = dget(m->mnt_mountpoint);
2454 read_sequnlock_excl(&mount_lock);
2455 } else {
2456 dentry = path->dentry;
2457 }
2458
2459 inode_lock(dentry->d_inode);
2460 if (unlikely(cant_mount(dentry))) {
2461 inode_unlock(dentry->d_inode);
2462 goto out;
2463 }
2464
2465 namespace_lock();
2466
2467 if (beneath && (!is_mounted(mnt) || m->mnt_mountpoint != dentry)) {
2468 namespace_unlock();
2469 inode_unlock(dentry->d_inode);
2470 goto out;
2471 }
2472
2473 mnt = lookup_mnt(path);
2474 if (likely(!mnt))
2475 break;
2476
2477 namespace_unlock();
2478 inode_unlock(dentry->d_inode);
2479 if (beneath)
2480 dput(dentry);
2481 path_put(path);
2482 path->mnt = mnt;
2483 path->dentry = dget(mnt->mnt_root);
2484 }
2485
2486 mp = get_mountpoint(dentry);
2487 if (IS_ERR(mp)) {
2488 namespace_unlock();
2489 inode_unlock(dentry->d_inode);
2490 }
2491
2492 out:
2493 if (beneath)
2494 dput(dentry);
2495
2496 return mp;
2497 }
2498
2499 static inline struct mountpoint *lock_mount(struct path *path)
2500 {
2501 return do_lock_mount(path, false);
2502 }
2503
2504 static void unlock_mount(struct mountpoint *where)
2505 {
2506 struct dentry *dentry = where->m_dentry;
2507
2508 read_seqlock_excl(&mount_lock);
2509 put_mountpoint(where);
2510 read_sequnlock_excl(&mount_lock);
2511
2512 namespace_unlock();
2513 inode_unlock(dentry->d_inode);
2514 }
2515
2516 static int graft_tree(struct mount *mnt, struct mount *p, struct mountpoint *mp)
2517 {
2518 if (mnt->mnt.mnt_sb->s_flags & SB_NOUSER)
2519 return -EINVAL;
2520
2521 if (d_is_dir(mp->m_dentry) !=
2522 d_is_dir(mnt->mnt.mnt_root))
2523 return -ENOTDIR;
2524
2525 return attach_recursive_mnt(mnt, p, mp, 0);
2526 }
2527
2528 /*
2529 * Sanity check the flags to change_mnt_propagation.
2530 */
2531
2532 static int flags_to_propagation_type(int ms_flags)
2533 {
2534 int type = ms_flags & ~(MS_REC | MS_SILENT);
2535
2536 /* Fail if any non-propagation flags are set */
2537 if (type & ~(MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
2538 return 0;
2539 /* Only one propagation flag should be set */
2540 if (!is_power_of_2(type))
2541 return 0;
2542 return type;
2543 }
2544
2545 /*
2546 * recursively change the type of the mountpoint.
2547 */
2548 static int do_change_type(struct path *path, int ms_flags)
2549 {
2550 struct mount *m;
2551 struct mount *mnt = real_mount(path->mnt);
2552 int recurse = ms_flags & MS_REC;
2553 int type;
2554 int err = 0;
2555
2556 if (!path_mounted(path))
2557 return -EINVAL;
2558
2559 type = flags_to_propagation_type(ms_flags);
2560 if (!type)
2561 return -EINVAL;
2562
2563 namespace_lock();
2564 if (type == MS_SHARED) {
2565 err = invent_group_ids(mnt, recurse);
2566 if (err)
2567 goto out_unlock;
2568 }
2569
2570 lock_mount_hash();
2571 for (m = mnt; m; m = (recurse ? next_mnt(m, mnt) : NULL))
2572 change_mnt_propagation(m, type);
2573 unlock_mount_hash();
2574
2575 out_unlock:
2576 namespace_unlock();
2577 return err;
2578 }
2579
2580 static struct mount *__do_loopback(struct path *old_path, int recurse)
2581 {
2582 struct mount *mnt = ERR_PTR(-EINVAL), *old = real_mount(old_path->mnt);
2583
2584 if (IS_MNT_UNBINDABLE(old))
2585 return mnt;
2586
2587 if (!check_mnt(old) && old_path->dentry->d_op != &ns_dentry_operations)
2588 return mnt;
2589
2590 if (!recurse && has_locked_children(old, old_path->dentry))
2591 return mnt;
2592
2593 if (recurse)
2594 mnt = copy_tree(old, old_path->dentry, CL_COPY_MNT_NS_FILE);
2595 else
2596 mnt = clone_mnt(old, old_path->dentry, 0);
2597
2598 if (!IS_ERR(mnt))
2599 mnt->mnt.mnt_flags &= ~MNT_LOCKED;
2600
2601 return mnt;
2602 }
2603
2604 /*
2605 * do loopback mount.
2606 */
2607 static int do_loopback(struct path *path, const char *old_name,
2608 int recurse)
2609 {
2610 struct path old_path;
2611 struct mount *mnt = NULL, *parent;
2612 struct mountpoint *mp;
2613 int err;
2614 if (!old_name || !*old_name)
2615 return -EINVAL;
2616 err = kern_path(old_name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &old_path);
2617 if (err)
2618 return err;
2619
2620 err = -EINVAL;
2621 if (mnt_ns_loop(old_path.dentry))
2622 goto out;
2623
2624 mp = lock_mount(path);
2625 if (IS_ERR(mp)) {
2626 err = PTR_ERR(mp);
2627 goto out;
2628 }
2629
2630 parent = real_mount(path->mnt);
2631 if (!check_mnt(parent))
2632 goto out2;
2633
2634 mnt = __do_loopback(&old_path, recurse);
2635 if (IS_ERR(mnt)) {
2636 err = PTR_ERR(mnt);
2637 goto out2;
2638 }
2639
2640 err = graft_tree(mnt, parent, mp);
2641 if (err) {
2642 lock_mount_hash();
2643 umount_tree(mnt, UMOUNT_SYNC);
2644 unlock_mount_hash();
2645 }
2646 out2:
2647 unlock_mount(mp);
2648 out:
2649 path_put(&old_path);
2650 return err;
2651 }
2652
2653 static struct file *open_detached_copy(struct path *path, bool recursive)
2654 {
2655 struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns;
2656 struct mnt_namespace *ns = alloc_mnt_ns(user_ns, true);
2657 struct mount *mnt, *p;
2658 struct file *file;
2659
2660 if (IS_ERR(ns))
2661 return ERR_CAST(ns);
2662
2663 namespace_lock();
2664 mnt = __do_loopback(path, recursive);
2665 if (IS_ERR(mnt)) {
2666 namespace_unlock();
2667 free_mnt_ns(ns);
2668 return ERR_CAST(mnt);
2669 }
2670
2671 lock_mount_hash();
2672 for (p = mnt; p; p = next_mnt(p, mnt)) {
2673 mnt_add_to_ns(ns, p);
2674 ns->nr_mounts++;
2675 }
2676 ns->root = mnt;
2677 mntget(&mnt->mnt);
2678 unlock_mount_hash();
2679 namespace_unlock();
2680
2681 mntput(path->mnt);
2682 path->mnt = &mnt->mnt;
2683 file = dentry_open(path, O_PATH, current_cred());
2684 if (IS_ERR(file))
2685 dissolve_on_fput(path->mnt);
2686 else
2687 file->f_mode |= FMODE_NEED_UNMOUNT;
2688 return file;
2689 }
2690
2691 SYSCALL_DEFINE3(open_tree, int, dfd, const char __user *, filename, unsigned, flags)
2692 {
2693 struct file *file;
2694 struct path path;
2695 int lookup_flags = LOOKUP_AUTOMOUNT | LOOKUP_FOLLOW;
2696 bool detached = flags & OPEN_TREE_CLONE;
2697 int error;
2698 int fd;
2699
2700 BUILD_BUG_ON(OPEN_TREE_CLOEXEC != O_CLOEXEC);
2701
2702 if (flags & ~(AT_EMPTY_PATH | AT_NO_AUTOMOUNT | AT_RECURSIVE |
2703 AT_SYMLINK_NOFOLLOW | OPEN_TREE_CLONE |
2704 OPEN_TREE_CLOEXEC))
2705 return -EINVAL;
2706
2707 if ((flags & (AT_RECURSIVE | OPEN_TREE_CLONE)) == AT_RECURSIVE)
2708 return -EINVAL;
2709
2710 if (flags & AT_NO_AUTOMOUNT)
2711 lookup_flags &= ~LOOKUP_AUTOMOUNT;
2712 if (flags & AT_SYMLINK_NOFOLLOW)
2713 lookup_flags &= ~LOOKUP_FOLLOW;
2714 if (flags & AT_EMPTY_PATH)
2715 lookup_flags |= LOOKUP_EMPTY;
2716
2717 if (detached && !may_mount())
2718 return -EPERM;
2719
2720 fd = get_unused_fd_flags(flags & O_CLOEXEC);
2721 if (fd < 0)
2722 return fd;
2723
2724 error = user_path_at(dfd, filename, lookup_flags, &path);
2725 if (unlikely(error)) {
2726 file = ERR_PTR(error);
2727 } else {
2728 if (detached)
2729 file = open_detached_copy(&path, flags & AT_RECURSIVE);
2730 else
2731 file = dentry_open(&path, O_PATH, current_cred());
2732 path_put(&path);
2733 }
2734 if (IS_ERR(file)) {
2735 put_unused_fd(fd);
2736 return PTR_ERR(file);
2737 }
2738 fd_install(fd, file);
2739 return fd;
2740 }
2741
2742 /*
2743 * Don't allow locked mount flags to be cleared.
2744 *
2745 * No locks need to be held here while testing the various MNT_LOCK
2746 * flags because those flags can never be cleared once they are set.
2747 */
2748 static bool can_change_locked_flags(struct mount *mnt, unsigned int mnt_flags)
2749 {
2750 unsigned int fl = mnt->mnt.mnt_flags;
2751
2752 if ((fl & MNT_LOCK_READONLY) &&
2753 !(mnt_flags & MNT_READONLY))
2754 return false;
2755
2756 if ((fl & MNT_LOCK_NODEV) &&
2757 !(mnt_flags & MNT_NODEV))
2758 return false;
2759
2760 if ((fl & MNT_LOCK_NOSUID) &&
2761 !(mnt_flags & MNT_NOSUID))
2762 return false;
2763
2764 if ((fl & MNT_LOCK_NOEXEC) &&
2765 !(mnt_flags & MNT_NOEXEC))
2766 return false;
2767
2768 if ((fl & MNT_LOCK_ATIME) &&
2769 ((fl & MNT_ATIME_MASK) != (mnt_flags & MNT_ATIME_MASK)))
2770 return false;
2771
2772 return true;
2773 }
2774
2775 static int change_mount_ro_state(struct mount *mnt, unsigned int mnt_flags)
2776 {
2777 bool readonly_request = (mnt_flags & MNT_READONLY);
2778
2779 if (readonly_request == __mnt_is_readonly(&mnt->mnt))
2780 return 0;
2781
2782 if (readonly_request)
2783 return mnt_make_readonly(mnt);
2784
2785 mnt->mnt.mnt_flags &= ~MNT_READONLY;
2786 return 0;
2787 }
2788
2789 static void set_mount_attributes(struct mount *mnt, unsigned int mnt_flags)
2790 {
2791 mnt_flags |= mnt->mnt.mnt_flags & ~MNT_USER_SETTABLE_MASK;
2792 mnt->mnt.mnt_flags = mnt_flags;
2793 touch_mnt_namespace(mnt->mnt_ns);
2794 }
2795
2796 static void mnt_warn_timestamp_expiry(struct path *mountpoint, struct vfsmount *mnt)
2797 {
2798 struct super_block *sb = mnt->mnt_sb;
2799
2800 if (!__mnt_is_readonly(mnt) &&
2801 (!(sb->s_iflags & SB_I_TS_EXPIRY_WARNED)) &&
2802 (ktime_get_real_seconds() + TIME_UPTIME_SEC_MAX > sb->s_time_max)) {
2803 char *buf = (char *)__get_free_page(GFP_KERNEL);
2804 char *mntpath = buf ? d_path(mountpoint, buf, PAGE_SIZE) : ERR_PTR(-ENOMEM);
2805
2806 pr_warn("%s filesystem being %s at %s supports timestamps until %ptTd (0x%llx)\n",
2807 sb->s_type->name,
2808 is_mounted(mnt) ? "remounted" : "mounted",
2809 mntpath, &sb->s_time_max,
2810 (unsigned long long)sb->s_time_max);
2811
2812 free_page((unsigned long)buf);
2813 sb->s_iflags |= SB_I_TS_EXPIRY_WARNED;
2814 }
2815 }
2816
2817 /*
2818 * Handle reconfiguration of the mountpoint only without alteration of the
2819 * superblock it refers to. This is triggered by specifying MS_REMOUNT|MS_BIND
2820 * to mount(2).
2821 */
2822 static int do_reconfigure_mnt(struct path *path, unsigned int mnt_flags)
2823 {
2824 struct super_block *sb = path->mnt->mnt_sb;
2825 struct mount *mnt = real_mount(path->mnt);
2826 int ret;
2827
2828 if (!check_mnt(mnt))
2829 return -EINVAL;
2830
2831 if (!path_mounted(path))
2832 return -EINVAL;
2833
2834 if (!can_change_locked_flags(mnt, mnt_flags))
2835 return -EPERM;
2836
2837 /*
2838 * We're only checking whether the superblock is read-only not
2839 * changing it, so only take down_read(&sb->s_umount).
2840 */
2841 down_read(&sb->s_umount);
2842 lock_mount_hash();
2843 ret = change_mount_ro_state(mnt, mnt_flags);
2844 if (ret == 0)
2845 set_mount_attributes(mnt, mnt_flags);
2846 unlock_mount_hash();
2847 up_read(&sb->s_umount);
2848
2849 mnt_warn_timestamp_expiry(path, &mnt->mnt);
2850
2851 return ret;
2852 }
2853
2854 /*
2855 * change filesystem flags. dir should be a physical root of filesystem.
2856 * If you've mounted a non-root directory somewhere and want to do remount
2857 * on it - tough luck.
2858 */
2859 static int do_remount(struct path *path, int ms_flags, int sb_flags,
2860 int mnt_flags, void *data)
2861 {
2862 int err;
2863 struct super_block *sb = path->mnt->mnt_sb;
2864 struct mount *mnt = real_mount(path->mnt);
2865 struct fs_context *fc;
2866
2867 if (!check_mnt(mnt))
2868 return -EINVAL;
2869
2870 if (!path_mounted(path))
2871 return -EINVAL;
2872
2873 if (!can_change_locked_flags(mnt, mnt_flags))
2874 return -EPERM;
2875
2876 fc = fs_context_for_reconfigure(path->dentry, sb_flags, MS_RMT_MASK);
2877 if (IS_ERR(fc))
2878 return PTR_ERR(fc);
2879
2880 fc->oldapi = true;
2881 err = parse_monolithic_mount_data(fc, data);
2882 if (!err) {
2883 down_write(&sb->s_umount);
2884 err = -EPERM;
2885 if (ns_capable(sb->s_user_ns, CAP_SYS_ADMIN)) {
2886 err = reconfigure_super(fc);
2887 if (!err) {
2888 lock_mount_hash();
2889 set_mount_attributes(mnt, mnt_flags);
2890 unlock_mount_hash();
2891 }
2892 }
2893 up_write(&sb->s_umount);
2894 }
2895
2896 mnt_warn_timestamp_expiry(path, &mnt->mnt);
2897
2898 put_fs_context(fc);
2899 return err;
2900 }
2901
2902 static inline int tree_contains_unbindable(struct mount *mnt)
2903 {
2904 struct mount *p;
2905 for (p = mnt; p; p = next_mnt(p, mnt)) {
2906 if (IS_MNT_UNBINDABLE(p))
2907 return 1;
2908 }
2909 return 0;
2910 }
2911
2912 /*
2913 * Check that there aren't references to earlier/same mount namespaces in the
2914 * specified subtree. Such references can act as pins for mount namespaces
2915 * that aren't checked by the mount-cycle checking code, thereby allowing
2916 * cycles to be made.
2917 */
2918 static bool check_for_nsfs_mounts(struct mount *subtree)
2919 {
2920 struct mount *p;
2921 bool ret = false;
2922
2923 lock_mount_hash();
2924 for (p = subtree; p; p = next_mnt(p, subtree))
2925 if (mnt_ns_loop(p->mnt.mnt_root))
2926 goto out;
2927
2928 ret = true;
2929 out:
2930 unlock_mount_hash();
2931 return ret;
2932 }
2933
2934 static int do_set_group(struct path *from_path, struct path *to_path)
2935 {
2936 struct mount *from, *to;
2937 int err;
2938
2939 from = real_mount(from_path->mnt);
2940 to = real_mount(to_path->mnt);
2941
2942 namespace_lock();
2943
2944 err = -EINVAL;
2945 /* To and From must be mounted */
2946 if (!is_mounted(&from->mnt))
2947 goto out;
2948 if (!is_mounted(&to->mnt))
2949 goto out;
2950
2951 err = -EPERM;
2952 /* We should be allowed to modify mount namespaces of both mounts */
2953 if (!ns_capable(from->mnt_ns->user_ns, CAP_SYS_ADMIN))
2954 goto out;
2955 if (!ns_capable(to->mnt_ns->user_ns, CAP_SYS_ADMIN))
2956 goto out;
2957
2958 err = -EINVAL;
2959 /* To and From paths should be mount roots */
2960 if (!path_mounted(from_path))
2961 goto out;
2962 if (!path_mounted(to_path))
2963 goto out;
2964
2965 /* Setting sharing groups is only allowed across same superblock */
2966 if (from->mnt.mnt_sb != to->mnt.mnt_sb)
2967 goto out;
2968
2969 /* From mount root should be wider than To mount root */
2970 if (!is_subdir(to->mnt.mnt_root, from->mnt.mnt_root))
2971 goto out;
2972
2973 /* From mount should not have locked children in place of To's root */
2974 if (has_locked_children(from, to->mnt.mnt_root))
2975 goto out;
2976
2977 /* Setting sharing groups is only allowed on private mounts */
2978 if (IS_MNT_SHARED(to) || IS_MNT_SLAVE(to))
2979 goto out;
2980
2981 /* From should not be private */
2982 if (!IS_MNT_SHARED(from) && !IS_MNT_SLAVE(from))
2983 goto out;
2984
2985 if (IS_MNT_SLAVE(from)) {
2986 struct mount *m = from->mnt_master;
2987
2988 list_add(&to->mnt_slave, &m->mnt_slave_list);
2989 to->mnt_master = m;
2990 }
2991
2992 if (IS_MNT_SHARED(from)) {
2993 to->mnt_group_id = from->mnt_group_id;
2994 list_add(&to->mnt_share, &from->mnt_share);
2995 lock_mount_hash();
2996 set_mnt_shared(to);
2997 unlock_mount_hash();
2998 }
2999
3000 err = 0;
3001 out:
3002 namespace_unlock();
3003 return err;
3004 }
3005
3006 /**
3007 * path_overmounted - check if path is overmounted
3008 * @path: path to check
3009 *
3010 * Check if path is overmounted, i.e., if there's a mount on top of
3011 * @path->mnt with @path->dentry as mountpoint.
3012 *
3013 * Context: This function expects namespace_lock() to be held.
3014 * Return: If path is overmounted true is returned, false if not.
3015 */
3016 static inline bool path_overmounted(const struct path *path)
3017 {
3018 rcu_read_lock();
3019 if (unlikely(__lookup_mnt(path->mnt, path->dentry))) {
3020 rcu_read_unlock();
3021 return true;
3022 }
3023 rcu_read_unlock();
3024 return false;
3025 }
3026
3027 /**
3028 * can_move_mount_beneath - check that we can mount beneath the top mount
3029 * @from: mount to mount beneath
3030 * @to: mount under which to mount
3031 *
3032 * - Make sure that @to->dentry is actually the root of a mount under
3033 * which we can mount another mount.
3034 * - Make sure that nothing can be mounted beneath the caller's current
3035 * root or the rootfs of the namespace.
3036 * - Make sure that the caller can unmount the topmost mount ensuring
3037 * that the caller could reveal the underlying mountpoint.
3038 * - Ensure that nothing has been mounted on top of @from before we
3039 * grabbed @namespace_sem to avoid creating pointless shadow mounts.
3040 * - Prevent mounting beneath a mount if the propagation relationship
3041 * between the source mount, parent mount, and top mount would lead to
3042 * nonsensical mount trees.
3043 *
3044 * Context: This function expects namespace_lock() to be held.
3045 * Return: On success 0, and on error a negative error code is returned.
3046 */
3047 static int can_move_mount_beneath(const struct path *from,
3048 const struct path *to,
3049 const struct mountpoint *mp)
3050 {
3051 struct mount *mnt_from = real_mount(from->mnt),
3052 *mnt_to = real_mount(to->mnt),
3053 *parent_mnt_to = mnt_to->mnt_parent;
3054
3055 if (!mnt_has_parent(mnt_to))
3056 return -EINVAL;
3057
3058 if (!path_mounted(to))
3059 return -EINVAL;
3060
3061 if (IS_MNT_LOCKED(mnt_to))
3062 return -EINVAL;
3063
3064 /* Avoid creating shadow mounts during mount propagation. */
3065 if (path_overmounted(from))
3066 return -EINVAL;
3067
3068 /*
3069 * Mounting beneath the rootfs only makes sense when the
3070 * semantics of pivot_root(".", ".") are used.
3071 */
3072 if (&mnt_to->mnt == current->fs->root.mnt)
3073 return -EINVAL;
3074 if (parent_mnt_to == current->nsproxy->mnt_ns->root)
3075 return -EINVAL;
3076
3077 for (struct mount *p = mnt_from; mnt_has_parent(p); p = p->mnt_parent)
3078 if (p == mnt_to)
3079 return -EINVAL;
3080
3081 /*
3082 * If the parent mount propagates to the child mount this would
3083 * mean mounting @mnt_from on @mnt_to->mnt_parent and then
3084 * propagating a copy @c of @mnt_from on top of @mnt_to. This
3085 * defeats the whole purpose of mounting beneath another mount.
3086 */
3087 if (propagation_would_overmount(parent_mnt_to, mnt_to, mp))
3088 return -EINVAL;
3089
3090 /*
3091 * If @mnt_to->mnt_parent propagates to @mnt_from this would
3092 * mean propagating a copy @c of @mnt_from on top of @mnt_from.
3093 * Afterwards @mnt_from would be mounted on top of
3094 * @mnt_to->mnt_parent and @mnt_to would be unmounted from
3095 * @mnt->mnt_parent and remounted on @mnt_from. But since @c is
3096 * already mounted on @mnt_from, @mnt_to would ultimately be
3097 * remounted on top of @c. Afterwards, @mnt_from would be
3098 * covered by a copy @c of @mnt_from and @c would be covered by
3099 * @mnt_from itself. This defeats the whole purpose of mounting
3100 * @mnt_from beneath @mnt_to.
3101 */
3102 if (propagation_would_overmount(parent_mnt_to, mnt_from, mp))
3103 return -EINVAL;
3104
3105 return 0;
3106 }
3107
3108 static int do_move_mount(struct path *old_path, struct path *new_path,
3109 bool beneath)
3110 {
3111 struct mnt_namespace *ns;
3112 struct mount *p;
3113 struct mount *old;
3114 struct mount *parent;
3115 struct mountpoint *mp, *old_mp;
3116 int err;
3117 bool attached;
3118 enum mnt_tree_flags_t flags = 0;
3119
3120 mp = do_lock_mount(new_path, beneath);
3121 if (IS_ERR(mp))
3122 return PTR_ERR(mp);
3123
3124 old = real_mount(old_path->mnt);
3125 p = real_mount(new_path->mnt);
3126 parent = old->mnt_parent;
3127 attached = mnt_has_parent(old);
3128 if (attached)
3129 flags |= MNT_TREE_MOVE;
3130 old_mp = old->mnt_mp;
3131 ns = old->mnt_ns;
3132
3133 err = -EINVAL;
3134 /* The mountpoint must be in our namespace. */
3135 if (!check_mnt(p))
3136 goto out;
3137
3138 /* The thing moved must be mounted... */
3139 if (!is_mounted(&old->mnt))
3140 goto out;
3141
3142 /* ... and either ours or the root of anon namespace */
3143 if (!(attached ? check_mnt(old) : is_anon_ns(ns)))
3144 goto out;
3145
3146 if (old->mnt.mnt_flags & MNT_LOCKED)
3147 goto out;
3148
3149 if (!path_mounted(old_path))
3150 goto out;
3151
3152 if (d_is_dir(new_path->dentry) !=
3153 d_is_dir(old_path->dentry))
3154 goto out;
3155 /*
3156 * Don't move a mount residing in a shared parent.
3157 */
3158 if (attached && IS_MNT_SHARED(parent))
3159 goto out;
3160
3161 if (beneath) {
3162 err = can_move_mount_beneath(old_path, new_path, mp);
3163 if (err)
3164 goto out;
3165
3166 err = -EINVAL;
3167 p = p->mnt_parent;
3168 flags |= MNT_TREE_BENEATH;
3169 }
3170
3171 /*
3172 * Don't move a mount tree containing unbindable mounts to a destination
3173 * mount which is shared.
3174 */
3175 if (IS_MNT_SHARED(p) && tree_contains_unbindable(old))
3176 goto out;
3177 err = -ELOOP;
3178 if (!check_for_nsfs_mounts(old))
3179 goto out;
3180 for (; mnt_has_parent(p); p = p->mnt_parent)
3181 if (p == old)
3182 goto out;
3183
3184 err = attach_recursive_mnt(old, real_mount(new_path->mnt), mp, flags);
3185 if (err)
3186 goto out;
3187
3188 /* if the mount is moved, it should no longer be expire
3189 * automatically */
3190 list_del_init(&old->mnt_expire);
3191 if (attached)
3192 put_mountpoint(old_mp);
3193 out:
3194 unlock_mount(mp);
3195 if (!err) {
3196 if (attached)
3197 mntput_no_expire(parent);
3198 else
3199 free_mnt_ns(ns);
3200 }
3201 return err;
3202 }
3203
3204 static int do_move_mount_old(struct path *path, const char *old_name)
3205 {
3206 struct path old_path;
3207 int err;
3208
3209 if (!old_name || !*old_name)
3210 return -EINVAL;
3211
3212 err = kern_path(old_name, LOOKUP_FOLLOW, &old_path);
3213 if (err)
3214 return err;
3215
3216 err = do_move_mount(&old_path, path, false);
3217 path_put(&old_path);
3218 return err;
3219 }
3220
3221 /*
3222 * add a mount into a namespace's mount tree
3223 */
3224 static int do_add_mount(struct mount *newmnt, struct mountpoint *mp,
3225 const struct path *path, int mnt_flags)
3226 {
3227 struct mount *parent = real_mount(path->mnt);
3228
3229 mnt_flags &= ~MNT_INTERNAL_FLAGS;
3230
3231 if (unlikely(!check_mnt(parent))) {
3232 /* that's acceptable only for automounts done in private ns */
3233 if (!(mnt_flags & MNT_SHRINKABLE))
3234 return -EINVAL;
3235 /* ... and for those we'd better have mountpoint still alive */
3236 if (!parent->mnt_ns)
3237 return -EINVAL;
3238 }
3239
3240 /* Refuse the same filesystem on the same mount point */
3241 if (path->mnt->mnt_sb == newmnt->mnt.mnt_sb && path_mounted(path))
3242 return -EBUSY;
3243
3244 if (d_is_symlink(newmnt->mnt.mnt_root))
3245 return -EINVAL;
3246
3247 newmnt->mnt.mnt_flags = mnt_flags;
3248 return graft_tree(newmnt, parent, mp);
3249 }
3250
3251 static bool mount_too_revealing(const struct super_block *sb, int *new_mnt_flags);
3252
3253 /*
3254 * Create a new mount using a superblock configuration and request it
3255 * be added to the namespace tree.
3256 */
3257 static int do_new_mount_fc(struct fs_context *fc, struct path *mountpoint,
3258 unsigned int mnt_flags)
3259 {
3260 struct vfsmount *mnt;
3261 struct mountpoint *mp;
3262 struct super_block *sb = fc->root->d_sb;
3263 int error;
3264
3265 error = security_sb_kern_mount(sb);
3266 if (!error && mount_too_revealing(sb, &mnt_flags))
3267 error = -EPERM;
3268
3269 if (unlikely(error)) {
3270 fc_drop_locked(fc);
3271 return error;
3272 }
3273
3274 up_write(&sb->s_umount);
3275
3276 mnt = vfs_create_mount(fc);
3277 if (IS_ERR(mnt))
3278 return PTR_ERR(mnt);
3279
3280 mnt_warn_timestamp_expiry(mountpoint, mnt);
3281
3282 mp = lock_mount(mountpoint);
3283 if (IS_ERR(mp)) {
3284 mntput(mnt);
3285 return PTR_ERR(mp);
3286 }
3287 error = do_add_mount(real_mount(mnt), mp, mountpoint, mnt_flags);
3288 unlock_mount(mp);
3289 if (error < 0)
3290 mntput(mnt);
3291 return error;
3292 }
3293
3294 /*
3295 * create a new mount for userspace and request it to be added into the
3296 * namespace's tree
3297 */
3298 static int do_new_mount(struct path *path, const char *fstype, int sb_flags,
3299 int mnt_flags, const char *name, void *data)
3300 {
3301 struct file_system_type *type;
3302 struct fs_context *fc;
3303 const char *subtype = NULL;
3304 int err = 0;
3305
3306 if (!fstype)
3307 return -EINVAL;
3308
3309 type = get_fs_type(fstype);
3310 if (!type)
3311 return -ENODEV;
3312
3313 if (type->fs_flags & FS_HAS_SUBTYPE) {
3314 subtype = strchr(fstype, '.');
3315 if (subtype) {
3316 subtype++;
3317 if (!*subtype) {
3318 put_filesystem(type);
3319 return -EINVAL;
3320 }
3321 }
3322 }
3323
3324 fc = fs_context_for_mount(type, sb_flags);
3325 put_filesystem(type);
3326 if (IS_ERR(fc))
3327 return PTR_ERR(fc);
3328
3329 if (subtype)
3330 err = vfs_parse_fs_string(fc, "subtype",
3331 subtype, strlen(subtype));
3332 if (!err && name)
3333 err = vfs_parse_fs_string(fc, "source", name, strlen(name));
3334 if (!err)
3335 err = parse_monolithic_mount_data(fc, data);
3336 if (!err && !mount_capable(fc))
3337 err = -EPERM;
3338 if (!err)
3339 err = vfs_get_tree(fc);
3340 if (!err)
3341 err = do_new_mount_fc(fc, path, mnt_flags);
3342
3343 put_fs_context(fc);
3344 return err;
3345 }
3346
3347 int finish_automount(struct vfsmount *m, const struct path *path)
3348 {
3349 struct dentry *dentry = path->dentry;
3350 struct mountpoint *mp;
3351 struct mount *mnt;
3352 int err;
3353
3354 if (!m)
3355 return 0;
3356 if (IS_ERR(m))
3357 return PTR_ERR(m);
3358
3359 mnt = real_mount(m);
3360 /* The new mount record should have at least 2 refs to prevent it being
3361 * expired before we get a chance to add it
3362 */
3363 BUG_ON(mnt_get_count(mnt) < 2);
3364
3365 if (m->mnt_sb == path->mnt->mnt_sb &&
3366 m->mnt_root == dentry) {
3367 err = -ELOOP;
3368 goto discard;
3369 }
3370
3371 /*
3372 * we don't want to use lock_mount() - in this case finding something
3373 * that overmounts our mountpoint to be means "quitely drop what we've
3374 * got", not "try to mount it on top".
3375 */
3376 inode_lock(dentry->d_inode);
3377 namespace_lock();
3378 if (unlikely(cant_mount(dentry))) {
3379 err = -ENOENT;
3380 goto discard_locked;
3381 }
3382 if (path_overmounted(path)) {
3383 err = 0;
3384 goto discard_locked;
3385 }
3386 mp = get_mountpoint(dentry);
3387 if (IS_ERR(mp)) {
3388 err = PTR_ERR(mp);
3389 goto discard_locked;
3390 }
3391
3392 err = do_add_mount(mnt, mp, path, path->mnt->mnt_flags | MNT_SHRINKABLE);
3393 unlock_mount(mp);
3394 if (unlikely(err))
3395 goto discard;
3396 mntput(m);
3397 return 0;
3398
3399 discard_locked:
3400 namespace_unlock();
3401 inode_unlock(dentry->d_inode);
3402 discard:
3403 /* remove m from any expiration list it may be on */
3404 if (!list_empty(&mnt->mnt_expire)) {
3405 namespace_lock();
3406 list_del_init(&mnt->mnt_expire);
3407 namespace_unlock();
3408 }
3409 mntput(m);
3410 mntput(m);
3411 return err;
3412 }
3413
3414 /**
3415 * mnt_set_expiry - Put a mount on an expiration list
3416 * @mnt: The mount to list.
3417 * @expiry_list: The list to add the mount to.
3418 */
3419 void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list)
3420 {
3421 namespace_lock();
3422
3423 list_add_tail(&real_mount(mnt)->mnt_expire, expiry_list);
3424
3425 namespace_unlock();
3426 }
3427 EXPORT_SYMBOL(mnt_set_expiry);
3428
3429 /*
3430 * process a list of expirable mountpoints with the intent of discarding any
3431 * mountpoints that aren't in use and haven't been touched since last we came
3432 * here
3433 */
3434 void mark_mounts_for_expiry(struct list_head *mounts)
3435 {
3436 struct mount *mnt, *next;
3437 LIST_HEAD(graveyard);
3438
3439 if (list_empty(mounts))
3440 return;
3441
3442 namespace_lock();
3443 lock_mount_hash();
3444
3445 /* extract from the expiration list every vfsmount that matches the
3446 * following criteria:
3447 * - only referenced by its parent vfsmount
3448 * - still marked for expiry (marked on the last call here; marks are
3449 * cleared by mntput())
3450 */
3451 list_for_each_entry_safe(mnt, next, mounts, mnt_expire) {
3452 if (!xchg(&mnt->mnt_expiry_mark, 1) ||
3453 propagate_mount_busy(mnt, 1))
3454 continue;
3455 list_move(&mnt->mnt_expire, &graveyard);
3456 }
3457 while (!list_empty(&graveyard)) {
3458 mnt = list_first_entry(&graveyard, struct mount, mnt_expire);
3459 touch_mnt_namespace(mnt->mnt_ns);
3460 umount_tree(mnt, UMOUNT_PROPAGATE|UMOUNT_SYNC);
3461 }
3462 unlock_mount_hash();
3463 namespace_unlock();
3464 }
3465
3466 EXPORT_SYMBOL_GPL(mark_mounts_for_expiry);
3467
3468 /*
3469 * Ripoff of 'select_parent()'
3470 *
3471 * search the list of submounts for a given mountpoint, and move any
3472 * shrinkable submounts to the 'graveyard' list.
3473 */
3474 static int select_submounts(struct mount *parent, struct list_head *graveyard)
3475 {
3476 struct mount *this_parent = parent;
3477 struct list_head *next;
3478 int found = 0;
3479
3480 repeat:
3481 next = this_parent->mnt_mounts.next;
3482 resume:
3483 while (next != &this_parent->mnt_mounts) {
3484 struct list_head *tmp = next;
3485 struct mount *mnt = list_entry(tmp, struct mount, mnt_child);
3486
3487 next = tmp->next;
3488 if (!(mnt->mnt.mnt_flags & MNT_SHRINKABLE))
3489 continue;
3490 /*
3491 * Descend a level if the d_mounts list is non-empty.
3492 */
3493 if (!list_empty(&mnt->mnt_mounts)) {
3494 this_parent = mnt;
3495 goto repeat;
3496 }
3497
3498 if (!propagate_mount_busy(mnt, 1)) {
3499 list_move_tail(&mnt->mnt_expire, graveyard);
3500 found++;
3501 }
3502 }
3503 /*
3504 * All done at this level ... ascend and resume the search
3505 */
3506 if (this_parent != parent) {
3507 next = this_parent->mnt_child.next;
3508 this_parent = this_parent->mnt_parent;
3509 goto resume;
3510 }
3511 return found;
3512 }
3513
3514 /*
3515 * process a list of expirable mountpoints with the intent of discarding any
3516 * submounts of a specific parent mountpoint
3517 *
3518 * mount_lock must be held for write
3519 */
3520 static void shrink_submounts(struct mount *mnt)
3521 {
3522 LIST_HEAD(graveyard);
3523 struct mount *m;
3524
3525 /* extract submounts of 'mountpoint' from the expiration list */
3526 while (select_submounts(mnt, &graveyard)) {
3527 while (!list_empty(&graveyard)) {
3528 m = list_first_entry(&graveyard, struct mount,
3529 mnt_expire);
3530 touch_mnt_namespace(m->mnt_ns);
3531 umount_tree(m, UMOUNT_PROPAGATE|UMOUNT_SYNC);
3532 }
3533 }
3534 }
3535
3536 static void *copy_mount_options(const void __user * data)
3537 {
3538 char *copy;
3539 unsigned left, offset;
3540
3541 if (!data)
3542 return NULL;
3543
3544 copy = kmalloc(PAGE_SIZE, GFP_KERNEL);
3545 if (!copy)
3546 return ERR_PTR(-ENOMEM);
3547
3548 left = copy_from_user(copy, data, PAGE_SIZE);
3549
3550 /*
3551 * Not all architectures have an exact copy_from_user(). Resort to
3552 * byte at a time.
3553 */
3554 offset = PAGE_SIZE - left;
3555 while (left) {
3556 char c;
3557 if (get_user(c, (const char __user *)data + offset))
3558 break;
3559 copy[offset] = c;
3560 left--;
3561 offset++;
3562 }
3563
3564 if (left == PAGE_SIZE) {
3565 kfree(copy);
3566 return ERR_PTR(-EFAULT);
3567 }
3568
3569 return copy;
3570 }
3571
3572 static char *copy_mount_string(const void __user *data)
3573 {
3574 return data ? strndup_user(data, PATH_MAX) : NULL;
3575 }
3576
3577 /*
3578 * Flags is a 32-bit value that allows up to 31 non-fs dependent flags to
3579 * be given to the mount() call (ie: read-only, no-dev, no-suid etc).
3580 *
3581 * data is a (void *) that can point to any structure up to
3582 * PAGE_SIZE-1 bytes, which can contain arbitrary fs-dependent
3583 * information (or be NULL).
3584 *
3585 * Pre-0.97 versions of mount() didn't have a flags word.
3586 * When the flags word was introduced its top half was required
3587 * to have the magic value 0xC0ED, and this remained so until 2.4.0-test9.
3588 * Therefore, if this magic number is present, it carries no information
3589 * and must be discarded.
3590 */
3591 int path_mount(const char *dev_name, struct path *path,
3592 const char *type_page, unsigned long flags, void *data_page)
3593 {
3594 unsigned int mnt_flags = 0, sb_flags;
3595 int ret;
3596
3597 /* Discard magic */
3598 if ((flags & MS_MGC_MSK) == MS_MGC_VAL)
3599 flags &= ~MS_MGC_MSK;
3600
3601 /* Basic sanity checks */
3602 if (data_page)
3603 ((char *)data_page)[PAGE_SIZE - 1] = 0;
3604
3605 if (flags & MS_NOUSER)
3606 return -EINVAL;
3607
3608 ret = security_sb_mount(dev_name, path, type_page, flags, data_page);
3609 if (ret)
3610 return ret;
3611 if (!may_mount())
3612 return -EPERM;
3613 if (flags & SB_MANDLOCK)
3614 warn_mandlock();
3615
3616 /* Default to relatime unless overriden */
3617 if (!(flags & MS_NOATIME))
3618 mnt_flags |= MNT_RELATIME;
3619
3620 /* Separate the per-mountpoint flags */
3621 if (flags & MS_NOSUID)
3622 mnt_flags |= MNT_NOSUID;
3623 if (flags & MS_NODEV)
3624 mnt_flags |= MNT_NODEV;
3625 if (flags & MS_NOEXEC)
3626 mnt_flags |= MNT_NOEXEC;
3627 if (flags & MS_NOATIME)
3628 mnt_flags |= MNT_NOATIME;
3629 if (flags & MS_NODIRATIME)
3630 mnt_flags |= MNT_NODIRATIME;
3631 if (flags & MS_STRICTATIME)
3632 mnt_flags &= ~(MNT_RELATIME | MNT_NOATIME);
3633 if (flags & MS_RDONLY)
3634 mnt_flags |= MNT_READONLY;
3635 if (flags & MS_NOSYMFOLLOW)
3636 mnt_flags |= MNT_NOSYMFOLLOW;
3637
3638 /* The default atime for remount is preservation */
3639 if ((flags & MS_REMOUNT) &&
3640 ((flags & (MS_NOATIME | MS_NODIRATIME | MS_RELATIME |
3641 MS_STRICTATIME)) == 0)) {
3642 mnt_flags &= ~MNT_ATIME_MASK;
3643 mnt_flags |= path->mnt->mnt_flags & MNT_ATIME_MASK;
3644 }
3645
3646 sb_flags = flags & (SB_RDONLY |
3647 SB_SYNCHRONOUS |
3648 SB_MANDLOCK |
3649 SB_DIRSYNC |
3650 SB_SILENT |
3651 SB_POSIXACL |
3652 SB_LAZYTIME |
3653 SB_I_VERSION);
3654
3655 if ((flags & (MS_REMOUNT | MS_BIND)) == (MS_REMOUNT | MS_BIND))
3656 return do_reconfigure_mnt(path, mnt_flags);
3657 if (flags & MS_REMOUNT)
3658 return do_remount(path, flags, sb_flags, mnt_flags, data_page);
3659 if (flags & MS_BIND)
3660 return do_loopback(path, dev_name, flags & MS_REC);
3661 if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
3662 return do_change_type(path, flags);
3663 if (flags & MS_MOVE)
3664 return do_move_mount_old(path, dev_name);
3665
3666 return do_new_mount(path, type_page, sb_flags, mnt_flags, dev_name,
3667 data_page);
3668 }
3669
3670 long do_mount(const char *dev_name, const char __user *dir_name,
3671 const char *type_page, unsigned long flags, void *data_page)
3672 {
3673 struct path path;
3674 int ret;
3675
3676 ret = user_path_at(AT_FDCWD, dir_name, LOOKUP_FOLLOW, &path);
3677 if (ret)
3678 return ret;
3679 ret = path_mount(dev_name, &path, type_page, flags, data_page);
3680 path_put(&path);
3681 return ret;
3682 }
3683
3684 static struct ucounts *inc_mnt_namespaces(struct user_namespace *ns)
3685 {
3686 return inc_ucount(ns, current_euid(), UCOUNT_MNT_NAMESPACES);
3687 }
3688
3689 static void dec_mnt_namespaces(struct ucounts *ucounts)
3690 {
3691 dec_ucount(ucounts, UCOUNT_MNT_NAMESPACES);
3692 }
3693
3694 static void free_mnt_ns(struct mnt_namespace *ns)
3695 {
3696 if (!is_anon_ns(ns))
3697 ns_free_inum(&ns->ns);
3698 dec_mnt_namespaces(ns->ucounts);
3699 put_user_ns(ns->user_ns);
3700 kfree(ns);
3701 }
3702
3703 /*
3704 * Assign a sequence number so we can detect when we attempt to bind
3705 * mount a reference to an older mount namespace into the current
3706 * mount namespace, preventing reference counting loops. A 64bit
3707 * number incrementing at 10Ghz will take 12,427 years to wrap which
3708 * is effectively never, so we can ignore the possibility.
3709 */
3710 static atomic64_t mnt_ns_seq = ATOMIC64_INIT(1);
3711
3712 static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool anon)
3713 {
3714 struct mnt_namespace *new_ns;
3715 struct ucounts *ucounts;
3716 int ret;
3717
3718 ucounts = inc_mnt_namespaces(user_ns);
3719 if (!ucounts)
3720 return ERR_PTR(-ENOSPC);
3721
3722 new_ns = kzalloc(sizeof(struct mnt_namespace), GFP_KERNEL_ACCOUNT);
3723 if (!new_ns) {
3724 dec_mnt_namespaces(ucounts);
3725 return ERR_PTR(-ENOMEM);
3726 }
3727 if (!anon) {
3728 ret = ns_alloc_inum(&new_ns->ns);
3729 if (ret) {
3730 kfree(new_ns);
3731 dec_mnt_namespaces(ucounts);
3732 return ERR_PTR(ret);
3733 }
3734 }
3735 new_ns->ns.ops = &mntns_operations;
3736 if (!anon)
3737 new_ns->seq = atomic64_add_return(1, &mnt_ns_seq);
3738 refcount_set(&new_ns->ns.count, 1);
3739 new_ns->mounts = RB_ROOT;
3740 init_waitqueue_head(&new_ns->poll);
3741 new_ns->user_ns = get_user_ns(user_ns);
3742 new_ns->ucounts = ucounts;
3743 return new_ns;
3744 }
3745
3746 __latent_entropy
3747 struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
3748 struct user_namespace *user_ns, struct fs_struct *new_fs)
3749 {
3750 struct mnt_namespace *new_ns;
3751 struct vfsmount *rootmnt = NULL, *pwdmnt = NULL;
3752 struct mount *p, *q;
3753 struct mount *old;
3754 struct mount *new;
3755 int copy_flags;
3756
3757 BUG_ON(!ns);
3758
3759 if (likely(!(flags & CLONE_NEWNS))) {
3760 get_mnt_ns(ns);
3761 return ns;
3762 }
3763
3764 old = ns->root;
3765
3766 new_ns = alloc_mnt_ns(user_ns, false);
3767 if (IS_ERR(new_ns))
3768 return new_ns;
3769
3770 namespace_lock();
3771 /* First pass: copy the tree topology */
3772 copy_flags = CL_COPY_UNBINDABLE | CL_EXPIRE;
3773 if (user_ns != ns->user_ns)
3774 copy_flags |= CL_SHARED_TO_SLAVE;
3775 new = copy_tree(old, old->mnt.mnt_root, copy_flags);
3776 if (IS_ERR(new)) {
3777 namespace_unlock();
3778 free_mnt_ns(new_ns);
3779 return ERR_CAST(new);
3780 }
3781 if (user_ns != ns->user_ns) {
3782 lock_mount_hash();
3783 lock_mnt_tree(new);
3784 unlock_mount_hash();
3785 }
3786 new_ns->root = new;
3787
3788 /*
3789 * Second pass: switch the tsk->fs->* elements and mark new vfsmounts
3790 * as belonging to new namespace. We have already acquired a private
3791 * fs_struct, so tsk->fs->lock is not needed.
3792 */
3793 p = old;
3794 q = new;
3795 while (p) {
3796 mnt_add_to_ns(new_ns, q);
3797 new_ns->nr_mounts++;
3798 if (new_fs) {
3799 if (&p->mnt == new_fs->root.mnt) {
3800 new_fs->root.mnt = mntget(&q->mnt);
3801 rootmnt = &p->mnt;
3802 }
3803 if (&p->mnt == new_fs->pwd.mnt) {
3804 new_fs->pwd.mnt = mntget(&q->mnt);
3805 pwdmnt = &p->mnt;
3806 }
3807 }
3808 p = next_mnt(p, old);
3809 q = next_mnt(q, new);
3810 if (!q)
3811 break;
3812 // an mntns binding we'd skipped?
3813 while (p->mnt.mnt_root != q->mnt.mnt_root)
3814 p = next_mnt(skip_mnt_tree(p), old);
3815 }
3816 namespace_unlock();
3817
3818 if (rootmnt)
3819 mntput(rootmnt);
3820 if (pwdmnt)
3821 mntput(pwdmnt);
3822
3823 return new_ns;
3824 }
3825
3826 struct dentry *mount_subtree(struct vfsmount *m, const char *name)
3827 {
3828 struct mount *mnt = real_mount(m);
3829 struct mnt_namespace *ns;
3830 struct super_block *s;
3831 struct path path;
3832 int err;
3833
3834 ns = alloc_mnt_ns(&init_user_ns, true);
3835 if (IS_ERR(ns)) {
3836 mntput(m);
3837 return ERR_CAST(ns);
3838 }
3839 ns->root = mnt;
3840 ns->nr_mounts++;
3841 mnt_add_to_ns(ns, mnt);
3842
3843 err = vfs_path_lookup(m->mnt_root, m,
3844 name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &path);
3845
3846 put_mnt_ns(ns);
3847
3848 if (err)
3849 return ERR_PTR(err);
3850
3851 /* trade a vfsmount reference for active sb one */
3852 s = path.mnt->mnt_sb;
3853 atomic_inc(&s->s_active);
3854 mntput(path.mnt);
3855 /* lock the sucker */
3856 down_write(&s->s_umount);
3857 /* ... and return the root of (sub)tree on it */
3858 return path.dentry;
3859 }
3860 EXPORT_SYMBOL(mount_subtree);
3861
3862 SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name,
3863 char __user *, type, unsigned long, flags, void __user *, data)
3864 {
3865 int ret;
3866 char *kernel_type;
3867 char *kernel_dev;
3868 void *options;
3869
3870 kernel_type = copy_mount_string(type);
3871 ret = PTR_ERR(kernel_type);
3872 if (IS_ERR(kernel_type))
3873 goto out_type;
3874
3875 kernel_dev = copy_mount_string(dev_name);
3876 ret = PTR_ERR(kernel_dev);
3877 if (IS_ERR(kernel_dev))
3878 goto out_dev;
3879
3880 options = copy_mount_options(data);
3881 ret = PTR_ERR(options);
3882 if (IS_ERR(options))
3883 goto out_data;
3884
3885 ret = do_mount(kernel_dev, dir_name, kernel_type, flags, options);
3886
3887 kfree(options);
3888 out_data:
3889 kfree(kernel_dev);
3890 out_dev:
3891 kfree(kernel_type);
3892 out_type:
3893 return ret;
3894 }
3895
3896 #define FSMOUNT_VALID_FLAGS \
3897 (MOUNT_ATTR_RDONLY | MOUNT_ATTR_NOSUID | MOUNT_ATTR_NODEV | \
3898 MOUNT_ATTR_NOEXEC | MOUNT_ATTR__ATIME | MOUNT_ATTR_NODIRATIME | \
3899 MOUNT_ATTR_NOSYMFOLLOW)
3900
3901 #define MOUNT_SETATTR_VALID_FLAGS (FSMOUNT_VALID_FLAGS | MOUNT_ATTR_IDMAP)
3902
3903 #define MOUNT_SETATTR_PROPAGATION_FLAGS \
3904 (MS_UNBINDABLE | MS_PRIVATE | MS_SLAVE | MS_SHARED)
3905
3906 static unsigned int attr_flags_to_mnt_flags(u64 attr_flags)
3907 {
3908 unsigned int mnt_flags = 0;
3909
3910 if (attr_flags & MOUNT_ATTR_RDONLY)
3911 mnt_flags |= MNT_READONLY;
3912 if (attr_flags & MOUNT_ATTR_NOSUID)
3913 mnt_flags |= MNT_NOSUID;
3914 if (attr_flags & MOUNT_ATTR_NODEV)
3915 mnt_flags |= MNT_NODEV;
3916 if (attr_flags & MOUNT_ATTR_NOEXEC)
3917 mnt_flags |= MNT_NOEXEC;
3918 if (attr_flags & MOUNT_ATTR_NODIRATIME)
3919 mnt_flags |= MNT_NODIRATIME;
3920 if (attr_flags & MOUNT_ATTR_NOSYMFOLLOW)
3921 mnt_flags |= MNT_NOSYMFOLLOW;
3922
3923 return mnt_flags;
3924 }
3925
3926 /*
3927 * Create a kernel mount representation for a new, prepared superblock
3928 * (specified by fs_fd) and attach to an open_tree-like file descriptor.
3929 */
3930 SYSCALL_DEFINE3(fsmount, int, fs_fd, unsigned int, flags,
3931 unsigned int, attr_flags)
3932 {
3933 struct mnt_namespace *ns;
3934 struct fs_context *fc;
3935 struct file *file;
3936 struct path newmount;
3937 struct mount *mnt;
3938 struct fd f;
3939 unsigned int mnt_flags = 0;
3940 long ret;
3941
3942 if (!may_mount())
3943 return -EPERM;
3944
3945 if ((flags & ~(FSMOUNT_CLOEXEC)) != 0)
3946 return -EINVAL;
3947
3948 if (attr_flags & ~FSMOUNT_VALID_FLAGS)
3949 return -EINVAL;
3950
3951 mnt_flags = attr_flags_to_mnt_flags(attr_flags);
3952
3953 switch (attr_flags & MOUNT_ATTR__ATIME) {
3954 case MOUNT_ATTR_STRICTATIME:
3955 break;
3956 case MOUNT_ATTR_NOATIME:
3957 mnt_flags |= MNT_NOATIME;
3958 break;
3959 case MOUNT_ATTR_RELATIME:
3960 mnt_flags |= MNT_RELATIME;
3961 break;
3962 default:
3963 return -EINVAL;
3964 }
3965
3966 f = fdget(fs_fd);
3967 if (!f.file)
3968 return -EBADF;
3969
3970 ret = -EINVAL;
3971 if (f.file->f_op != &fscontext_fops)
3972 goto err_fsfd;
3973
3974 fc = f.file->private_data;
3975
3976 ret = mutex_lock_interruptible(&fc->uapi_mutex);
3977 if (ret < 0)
3978 goto err_fsfd;
3979
3980 /* There must be a valid superblock or we can't mount it */
3981 ret = -EINVAL;
3982 if (!fc->root)
3983 goto err_unlock;
3984
3985 ret = -EPERM;
3986 if (mount_too_revealing(fc->root->d_sb, &mnt_flags)) {
3987 pr_warn("VFS: Mount too revealing\n");
3988 goto err_unlock;
3989 }
3990
3991 ret = -EBUSY;
3992 if (fc->phase != FS_CONTEXT_AWAITING_MOUNT)
3993 goto err_unlock;
3994
3995 if (fc->sb_flags & SB_MANDLOCK)
3996 warn_mandlock();
3997
3998 newmount.mnt = vfs_create_mount(fc);
3999 if (IS_ERR(newmount.mnt)) {
4000 ret = PTR_ERR(newmount.mnt);
4001 goto err_unlock;
4002 }
4003 newmount.dentry = dget(fc->root);
4004 newmount.mnt->mnt_flags = mnt_flags;
4005
4006 /* We've done the mount bit - now move the file context into more or
4007 * less the same state as if we'd done an fspick(). We don't want to
4008 * do any memory allocation or anything like that at this point as we
4009 * don't want to have to handle any errors incurred.
4010 */
4011 vfs_clean_context(fc);
4012
4013 ns = alloc_mnt_ns(current->nsproxy->mnt_ns->user_ns, true);
4014 if (IS_ERR(ns)) {
4015 ret = PTR_ERR(ns);
4016 goto err_path;
4017 }
4018 mnt = real_mount(newmount.mnt);
4019 ns->root = mnt;
4020 ns->nr_mounts = 1;
4021 mnt_add_to_ns(ns, mnt);
4022 mntget(newmount.mnt);
4023
4024 /* Attach to an apparent O_PATH fd with a note that we need to unmount
4025 * it, not just simply put it.
4026 */
4027 file = dentry_open(&newmount, O_PATH, fc->cred);
4028 if (IS_ERR(file)) {
4029 dissolve_on_fput(newmount.mnt);
4030 ret = PTR_ERR(file);
4031 goto err_path;
4032 }
4033 file->f_mode |= FMODE_NEED_UNMOUNT;
4034
4035 ret = get_unused_fd_flags((flags & FSMOUNT_CLOEXEC) ? O_CLOEXEC : 0);
4036 if (ret >= 0)
4037 fd_install(ret, file);
4038 else
4039 fput(file);
4040
4041 err_path:
4042 path_put(&newmount);
4043 err_unlock:
4044 mutex_unlock(&fc->uapi_mutex);
4045 err_fsfd:
4046 fdput(f);
4047 return ret;
4048 }
4049
4050 /*
4051 * Move a mount from one place to another. In combination with
4052 * fsopen()/fsmount() this is used to install a new mount and in combination
4053 * with open_tree(OPEN_TREE_CLONE [| AT_RECURSIVE]) it can be used to copy
4054 * a mount subtree.
4055 *
4056 * Note the flags value is a combination of MOVE_MOUNT_* flags.
4057 */
4058 SYSCALL_DEFINE5(move_mount,
4059 int, from_dfd, const char __user *, from_pathname,
4060 int, to_dfd, const char __user *, to_pathname,
4061 unsigned int, flags)
4062 {
4063 struct path from_path, to_path;
4064 unsigned int lflags;
4065 int ret = 0;
4066
4067 if (!may_mount())
4068 return -EPERM;
4069
4070 if (flags & ~MOVE_MOUNT__MASK)
4071 return -EINVAL;
4072
4073 if ((flags & (MOVE_MOUNT_BENEATH | MOVE_MOUNT_SET_GROUP)) ==
4074 (MOVE_MOUNT_BENEATH | MOVE_MOUNT_SET_GROUP))
4075 return -EINVAL;
4076
4077 /* If someone gives a pathname, they aren't permitted to move
4078 * from an fd that requires unmount as we can't get at the flag
4079 * to clear it afterwards.
4080 */
4081 lflags = 0;
4082 if (flags & MOVE_MOUNT_F_SYMLINKS) lflags |= LOOKUP_FOLLOW;
4083 if (flags & MOVE_MOUNT_F_AUTOMOUNTS) lflags |= LOOKUP_AUTOMOUNT;
4084 if (flags & MOVE_MOUNT_F_EMPTY_PATH) lflags |= LOOKUP_EMPTY;
4085
4086 ret = user_path_at(from_dfd, from_pathname, lflags, &from_path);
4087 if (ret < 0)
4088 return ret;
4089
4090 lflags = 0;
4091 if (flags & MOVE_MOUNT_T_SYMLINKS) lflags |= LOOKUP_FOLLOW;
4092 if (flags & MOVE_MOUNT_T_AUTOMOUNTS) lflags |= LOOKUP_AUTOMOUNT;
4093 if (flags & MOVE_MOUNT_T_EMPTY_PATH) lflags |= LOOKUP_EMPTY;
4094
4095 ret = user_path_at(to_dfd, to_pathname, lflags, &to_path);
4096 if (ret < 0)
4097 goto out_from;
4098
4099 ret = security_move_mount(&from_path, &to_path);
4100 if (ret < 0)
4101 goto out_to;
4102
4103 if (flags & MOVE_MOUNT_SET_GROUP)
4104 ret = do_set_group(&from_path, &to_path);
4105 else
4106 ret = do_move_mount(&from_path, &to_path,
4107 (flags & MOVE_MOUNT_BENEATH));
4108
4109 out_to:
4110 path_put(&to_path);
4111 out_from:
4112 path_put(&from_path);
4113 return ret;
4114 }
4115
4116 /*
4117 * Return true if path is reachable from root
4118 *
4119 * namespace_sem or mount_lock is held
4120 */
4121 bool is_path_reachable(struct mount *mnt, struct dentry *dentry,
4122 const struct path *root)
4123 {
4124 while (&mnt->mnt != root->mnt && mnt_has_parent(mnt)) {
4125 dentry = mnt->mnt_mountpoint;
4126 mnt = mnt->mnt_parent;
4127 }
4128 return &mnt->mnt == root->mnt && is_subdir(dentry, root->dentry);
4129 }
4130
4131 bool path_is_under(const struct path *path1, const struct path *path2)
4132 {
4133 bool res;
4134 read_seqlock_excl(&mount_lock);
4135 res = is_path_reachable(real_mount(path1->mnt), path1->dentry, path2);
4136 read_sequnlock_excl(&mount_lock);
4137 return res;
4138 }
4139 EXPORT_SYMBOL(path_is_under);
4140
4141 /*
4142 * pivot_root Semantics:
4143 * Moves the root file system of the current process to the directory put_old,
4144 * makes new_root as the new root file system of the current process, and sets
4145 * root/cwd of all processes which had them on the current root to new_root.
4146 *
4147 * Restrictions:
4148 * The new_root and put_old must be directories, and must not be on the
4149 * same file system as the current process root. The put_old must be
4150 * underneath new_root, i.e. adding a non-zero number of /.. to the string
4151 * pointed to by put_old must yield the same directory as new_root. No other
4152 * file system may be mounted on put_old. After all, new_root is a mountpoint.
4153 *
4154 * Also, the current root cannot be on the 'rootfs' (initial ramfs) filesystem.
4155 * See Documentation/filesystems/ramfs-rootfs-initramfs.rst for alternatives
4156 * in this situation.
4157 *
4158 * Notes:
4159 * - we don't move root/cwd if they are not at the root (reason: if something
4160 * cared enough to change them, it's probably wrong to force them elsewhere)
4161 * - it's okay to pick a root that isn't the root of a file system, e.g.
4162 * /nfs/my_root where /nfs is the mount point. It must be a mountpoint,
4163 * though, so you may need to say mount --bind /nfs/my_root /nfs/my_root
4164 * first.
4165 */
4166 SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
4167 const char __user *, put_old)
4168 {
4169 struct path new, old, root;
4170 struct mount *new_mnt, *root_mnt, *old_mnt, *root_parent, *ex_parent;
4171 struct mountpoint *old_mp, *root_mp;
4172 int error;
4173
4174 if (!may_mount())
4175 return -EPERM;
4176
4177 error = user_path_at(AT_FDCWD, new_root,
4178 LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &new);
4179 if (error)
4180 goto out0;
4181
4182 error = user_path_at(AT_FDCWD, put_old,
4183 LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &old);
4184 if (error)
4185 goto out1;
4186
4187 error = security_sb_pivotroot(&old, &new);
4188 if (error)
4189 goto out2;
4190
4191 get_fs_root(current->fs, &root);
4192 old_mp = lock_mount(&old);
4193 error = PTR_ERR(old_mp);
4194 if (IS_ERR(old_mp))
4195 goto out3;
4196
4197 error = -EINVAL;
4198 new_mnt = real_mount(new.mnt);
4199 root_mnt = real_mount(root.mnt);
4200 old_mnt = real_mount(old.mnt);
4201 ex_parent = new_mnt->mnt_parent;
4202 root_parent = root_mnt->mnt_parent;
4203 if (IS_MNT_SHARED(old_mnt) ||
4204 IS_MNT_SHARED(ex_parent) ||
4205 IS_MNT_SHARED(root_parent))
4206 goto out4;
4207 if (!check_mnt(root_mnt) || !check_mnt(new_mnt))
4208 goto out4;
4209 if (new_mnt->mnt.mnt_flags & MNT_LOCKED)
4210 goto out4;
4211 error = -ENOENT;
4212 if (d_unlinked(new.dentry))
4213 goto out4;
4214 error = -EBUSY;
4215 if (new_mnt == root_mnt || old_mnt == root_mnt)
4216 goto out4; /* loop, on the same file system */
4217 error = -EINVAL;
4218 if (!path_mounted(&root))
4219 goto out4; /* not a mountpoint */
4220 if (!mnt_has_parent(root_mnt))
4221 goto out4; /* not attached */
4222 if (!path_mounted(&new))
4223 goto out4; /* not a mountpoint */
4224 if (!mnt_has_parent(new_mnt))
4225 goto out4; /* not attached */
4226 /* make sure we can reach put_old from new_root */
4227 if (!is_path_reachable(old_mnt, old.dentry, &new))
4228 goto out4;
4229 /* make certain new is below the root */
4230 if (!is_path_reachable(new_mnt, new.dentry, &root))
4231 goto out4;
4232 lock_mount_hash();
4233 umount_mnt(new_mnt);
4234 root_mp = unhash_mnt(root_mnt); /* we'll need its mountpoint */
4235 if (root_mnt->mnt.mnt_flags & MNT_LOCKED) {
4236 new_mnt->mnt.mnt_flags |= MNT_LOCKED;
4237 root_mnt->mnt.mnt_flags &= ~MNT_LOCKED;
4238 }
4239 /* mount old root on put_old */
4240 attach_mnt(root_mnt, old_mnt, old_mp, false);
4241 /* mount new_root on / */
4242 attach_mnt(new_mnt, root_parent, root_mp, false);
4243 mnt_add_count(root_parent, -1);
4244 touch_mnt_namespace(current->nsproxy->mnt_ns);
4245 /* A moved mount should not expire automatically */
4246 list_del_init(&new_mnt->mnt_expire);
4247 put_mountpoint(root_mp);
4248 unlock_mount_hash();
4249 chroot_fs_refs(&root, &new);
4250 error = 0;
4251 out4:
4252 unlock_mount(old_mp);
4253 if (!error)
4254 mntput_no_expire(ex_parent);
4255 out3:
4256 path_put(&root);
4257 out2:
4258 path_put(&old);
4259 out1:
4260 path_put(&new);
4261 out0:
4262 return error;
4263 }
4264
4265 static unsigned int recalc_flags(struct mount_kattr *kattr, struct mount *mnt)
4266 {
4267 unsigned int flags = mnt->mnt.mnt_flags;
4268
4269 /* flags to clear */
4270 flags &= ~kattr->attr_clr;
4271 /* flags to raise */
4272 flags |= kattr->attr_set;
4273
4274 return flags;
4275 }
4276
4277 static int can_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt)
4278 {
4279 struct vfsmount *m = &mnt->mnt;
4280 struct user_namespace *fs_userns = m->mnt_sb->s_user_ns;
4281
4282 if (!kattr->mnt_idmap)
4283 return 0;
4284
4285 /*
4286 * Creating an idmapped mount with the filesystem wide idmapping
4287 * doesn't make sense so block that. We don't allow mushy semantics.
4288 */
4289 if (!check_fsmapping(kattr->mnt_idmap, m->mnt_sb))
4290 return -EINVAL;
4291
4292 /*
4293 * Once a mount has been idmapped we don't allow it to change its
4294 * mapping. It makes things simpler and callers can just create
4295 * another bind-mount they can idmap if they want to.
4296 */
4297 if (is_idmapped_mnt(m))
4298 return -EPERM;
4299
4300 /* The underlying filesystem doesn't support idmapped mounts yet. */
4301 if (!(m->mnt_sb->s_type->fs_flags & FS_ALLOW_IDMAP))
4302 return -EINVAL;
4303
4304 /* We're not controlling the superblock. */
4305 if (!ns_capable(fs_userns, CAP_SYS_ADMIN))
4306 return -EPERM;
4307
4308 /* Mount has already been visible in the filesystem hierarchy. */
4309 if (!is_anon_ns(mnt->mnt_ns))
4310 return -EINVAL;
4311
4312 return 0;
4313 }
4314
4315 /**
4316 * mnt_allow_writers() - check whether the attribute change allows writers
4317 * @kattr: the new mount attributes
4318 * @mnt: the mount to which @kattr will be applied
4319 *
4320 * Check whether thew new mount attributes in @kattr allow concurrent writers.
4321 *
4322 * Return: true if writers need to be held, false if not
4323 */
4324 static inline bool mnt_allow_writers(const struct mount_kattr *kattr,
4325 const struct mount *mnt)
4326 {
4327 return (!(kattr->attr_set & MNT_READONLY) ||
4328 (mnt->mnt.mnt_flags & MNT_READONLY)) &&
4329 !kattr->mnt_idmap;
4330 }
4331
4332 static int mount_setattr_prepare(struct mount_kattr *kattr, struct mount *mnt)
4333 {
4334 struct mount *m;
4335 int err;
4336
4337 for (m = mnt; m; m = next_mnt(m, mnt)) {
4338 if (!can_change_locked_flags(m, recalc_flags(kattr, m))) {
4339 err = -EPERM;
4340 break;
4341 }
4342
4343 err = can_idmap_mount(kattr, m);
4344 if (err)
4345 break;
4346
4347 if (!mnt_allow_writers(kattr, m)) {
4348 err = mnt_hold_writers(m);
4349 if (err)
4350 break;
4351 }
4352
4353 if (!kattr->recurse)
4354 return 0;
4355 }
4356
4357 if (err) {
4358 struct mount *p;
4359
4360 /*
4361 * If we had to call mnt_hold_writers() MNT_WRITE_HOLD will
4362 * be set in @mnt_flags. The loop unsets MNT_WRITE_HOLD for all
4363 * mounts and needs to take care to include the first mount.
4364 */
4365 for (p = mnt; p; p = next_mnt(p, mnt)) {
4366 /* If we had to hold writers unblock them. */
4367 if (p->mnt.mnt_flags & MNT_WRITE_HOLD)
4368 mnt_unhold_writers(p);
4369
4370 /*
4371 * We're done once the first mount we changed got
4372 * MNT_WRITE_HOLD unset.
4373 */
4374 if (p == m)
4375 break;
4376 }
4377 }
4378 return err;
4379 }
4380
4381 static void do_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt)
4382 {
4383 if (!kattr->mnt_idmap)
4384 return;
4385
4386 /*
4387 * Pairs with smp_load_acquire() in mnt_idmap().
4388 *
4389 * Since we only allow a mount to change the idmapping once and
4390 * verified this in can_idmap_mount() we know that the mount has
4391 * @nop_mnt_idmap attached to it. So there's no need to drop any
4392 * references.
4393 */
4394 smp_store_release(&mnt->mnt.mnt_idmap, mnt_idmap_get(kattr->mnt_idmap));
4395 }
4396
4397 static void mount_setattr_commit(struct mount_kattr *kattr, struct mount *mnt)
4398 {
4399 struct mount *m;
4400
4401 for (m = mnt; m; m = next_mnt(m, mnt)) {
4402 unsigned int flags;
4403
4404 do_idmap_mount(kattr, m);
4405 flags = recalc_flags(kattr, m);
4406 WRITE_ONCE(m->mnt.mnt_flags, flags);
4407
4408 /* If we had to hold writers unblock them. */
4409 if (m->mnt.mnt_flags & MNT_WRITE_HOLD)
4410 mnt_unhold_writers(m);
4411
4412 if (kattr->propagation)
4413 change_mnt_propagation(m, kattr->propagation);
4414 if (!kattr->recurse)
4415 break;
4416 }
4417 touch_mnt_namespace(mnt->mnt_ns);
4418 }
4419
4420 static int do_mount_setattr(struct path *path, struct mount_kattr *kattr)
4421 {
4422 struct mount *mnt = real_mount(path->mnt);
4423 int err = 0;
4424
4425 if (!path_mounted(path))
4426 return -EINVAL;
4427
4428 if (kattr->mnt_userns) {
4429 struct mnt_idmap *mnt_idmap;
4430
4431 mnt_idmap = alloc_mnt_idmap(kattr->mnt_userns);
4432 if (IS_ERR(mnt_idmap))
4433 return PTR_ERR(mnt_idmap);
4434 kattr->mnt_idmap = mnt_idmap;
4435 }
4436
4437 if (kattr->propagation) {
4438 /*
4439 * Only take namespace_lock() if we're actually changing
4440 * propagation.
4441 */
4442 namespace_lock();
4443 if (kattr->propagation == MS_SHARED) {
4444 err = invent_group_ids(mnt, kattr->recurse);
4445 if (err) {
4446 namespace_unlock();
4447 return err;
4448 }
4449 }
4450 }
4451
4452 err = -EINVAL;
4453 lock_mount_hash();
4454
4455 /* Ensure that this isn't anything purely vfs internal. */
4456 if (!is_mounted(&mnt->mnt))
4457 goto out;
4458
4459 /*
4460 * If this is an attached mount make sure it's located in the callers
4461 * mount namespace. If it's not don't let the caller interact with it.
4462 * If this is a detached mount make sure it has an anonymous mount
4463 * namespace attached to it, i.e. we've created it via OPEN_TREE_CLONE.
4464 */
4465 if (!(mnt_has_parent(mnt) ? check_mnt(mnt) : is_anon_ns(mnt->mnt_ns)))
4466 goto out;
4467
4468 /*
4469 * First, we get the mount tree in a shape where we can change mount
4470 * properties without failure. If we succeeded to do so we commit all
4471 * changes and if we failed we clean up.
4472 */
4473 err = mount_setattr_prepare(kattr, mnt);
4474 if (!err)
4475 mount_setattr_commit(kattr, mnt);
4476
4477 out:
4478 unlock_mount_hash();
4479
4480 if (kattr->propagation) {
4481 if (err)
4482 cleanup_group_ids(mnt, NULL);
4483 namespace_unlock();
4484 }
4485
4486 return err;
4487 }
4488
4489 static int build_mount_idmapped(const struct mount_attr *attr, size_t usize,
4490 struct mount_kattr *kattr, unsigned int flags)
4491 {
4492 int err = 0;
4493 struct ns_common *ns;
4494 struct user_namespace *mnt_userns;
4495 struct fd f;
4496
4497 if (!((attr->attr_set | attr->attr_clr) & MOUNT_ATTR_IDMAP))
4498 return 0;
4499
4500 /*
4501 * We currently do not support clearing an idmapped mount. If this ever
4502 * is a use-case we can revisit this but for now let's keep it simple
4503 * and not allow it.
4504 */
4505 if (attr->attr_clr & MOUNT_ATTR_IDMAP)
4506 return -EINVAL;
4507
4508 if (attr->userns_fd > INT_MAX)
4509 return -EINVAL;
4510
4511 f = fdget(attr->userns_fd);
4512 if (!f.file)
4513 return -EBADF;
4514
4515 if (!proc_ns_file(f.file)) {
4516 err = -EINVAL;
4517 goto out_fput;
4518 }
4519
4520 ns = get_proc_ns(file_inode(f.file));
4521 if (ns->ops->type != CLONE_NEWUSER) {
4522 err = -EINVAL;
4523 goto out_fput;
4524 }
4525
4526 /*
4527 * The initial idmapping cannot be used to create an idmapped
4528 * mount. We use the initial idmapping as an indicator of a mount
4529 * that is not idmapped. It can simply be passed into helpers that
4530 * are aware of idmapped mounts as a convenient shortcut. A user
4531 * can just create a dedicated identity mapping to achieve the same
4532 * result.
4533 */
4534 mnt_userns = container_of(ns, struct user_namespace, ns);
4535 if (mnt_userns == &init_user_ns) {
4536 err = -EPERM;
4537 goto out_fput;
4538 }
4539
4540 /* We're not controlling the target namespace. */
4541 if (!ns_capable(mnt_userns, CAP_SYS_ADMIN)) {
4542 err = -EPERM;
4543 goto out_fput;
4544 }
4545
4546 kattr->mnt_userns = get_user_ns(mnt_userns);
4547
4548 out_fput:
4549 fdput(f);
4550 return err;
4551 }
4552
4553 static int build_mount_kattr(const struct mount_attr *attr, size_t usize,
4554 struct mount_kattr *kattr, unsigned int flags)
4555 {
4556 unsigned int lookup_flags = LOOKUP_AUTOMOUNT | LOOKUP_FOLLOW;
4557
4558 if (flags & AT_NO_AUTOMOUNT)
4559 lookup_flags &= ~LOOKUP_AUTOMOUNT;
4560 if (flags & AT_SYMLINK_NOFOLLOW)
4561 lookup_flags &= ~LOOKUP_FOLLOW;
4562 if (flags & AT_EMPTY_PATH)
4563 lookup_flags |= LOOKUP_EMPTY;
4564
4565 *kattr = (struct mount_kattr) {
4566 .lookup_flags = lookup_flags,
4567 .recurse = !!(flags & AT_RECURSIVE),
4568 };
4569
4570 if (attr->propagation & ~MOUNT_SETATTR_PROPAGATION_FLAGS)
4571 return -EINVAL;
4572 if (hweight32(attr->propagation & MOUNT_SETATTR_PROPAGATION_FLAGS) > 1)
4573 return -EINVAL;
4574 kattr->propagation = attr->propagation;
4575
4576 if ((attr->attr_set | attr->attr_clr) & ~MOUNT_SETATTR_VALID_FLAGS)
4577 return -EINVAL;
4578
4579 kattr->attr_set = attr_flags_to_mnt_flags(attr->attr_set);
4580 kattr->attr_clr = attr_flags_to_mnt_flags(attr->attr_clr);
4581
4582 /*
4583 * Since the MOUNT_ATTR_<atime> values are an enum, not a bitmap,
4584 * users wanting to transition to a different atime setting cannot
4585 * simply specify the atime setting in @attr_set, but must also
4586 * specify MOUNT_ATTR__ATIME in the @attr_clr field.
4587 * So ensure that MOUNT_ATTR__ATIME can't be partially set in
4588 * @attr_clr and that @attr_set can't have any atime bits set if
4589 * MOUNT_ATTR__ATIME isn't set in @attr_clr.
4590 */
4591 if (attr->attr_clr & MOUNT_ATTR__ATIME) {
4592 if ((attr->attr_clr & MOUNT_ATTR__ATIME) != MOUNT_ATTR__ATIME)
4593 return -EINVAL;
4594
4595 /*
4596 * Clear all previous time settings as they are mutually
4597 * exclusive.
4598 */
4599 kattr->attr_clr |= MNT_RELATIME | MNT_NOATIME;
4600 switch (attr->attr_set & MOUNT_ATTR__ATIME) {
4601 case MOUNT_ATTR_RELATIME:
4602 kattr->attr_set |= MNT_RELATIME;
4603 break;
4604 case MOUNT_ATTR_NOATIME:
4605 kattr->attr_set |= MNT_NOATIME;
4606 break;
4607 case MOUNT_ATTR_STRICTATIME:
4608 break;
4609 default:
4610 return -EINVAL;
4611 }
4612 } else {
4613 if (attr->attr_set & MOUNT_ATTR__ATIME)
4614 return -EINVAL;
4615 }
4616
4617 return build_mount_idmapped(attr, usize, kattr, flags);
4618 }
4619
4620 static void finish_mount_kattr(struct mount_kattr *kattr)
4621 {
4622 put_user_ns(kattr->mnt_userns);
4623 kattr->mnt_userns = NULL;
4624
4625 if (kattr->mnt_idmap)
4626 mnt_idmap_put(kattr->mnt_idmap);
4627 }
4628
4629 SYSCALL_DEFINE5(mount_setattr, int, dfd, const char __user *, path,
4630 unsigned int, flags, struct mount_attr __user *, uattr,
4631 size_t, usize)
4632 {
4633 int err;
4634 struct path target;
4635 struct mount_attr attr;
4636 struct mount_kattr kattr;
4637
4638 BUILD_BUG_ON(sizeof(struct mount_attr) != MOUNT_ATTR_SIZE_VER0);
4639
4640 if (flags & ~(AT_EMPTY_PATH |
4641 AT_RECURSIVE |
4642 AT_SYMLINK_NOFOLLOW |
4643 AT_NO_AUTOMOUNT))
4644 return -EINVAL;
4645
4646 if (unlikely(usize > PAGE_SIZE))
4647 return -E2BIG;
4648 if (unlikely(usize < MOUNT_ATTR_SIZE_VER0))
4649 return -EINVAL;
4650
4651 if (!may_mount())
4652 return -EPERM;
4653
4654 err = copy_struct_from_user(&attr, sizeof(attr), uattr, usize);
4655 if (err)
4656 return err;
4657
4658 /* Don't bother walking through the mounts if this is a nop. */
4659 if (attr.attr_set == 0 &&
4660 attr.attr_clr == 0 &&
4661 attr.propagation == 0)
4662 return 0;
4663
4664 err = build_mount_kattr(&attr, usize, &kattr, flags);
4665 if (err)
4666 return err;
4667
4668 err = user_path_at(dfd, path, kattr.lookup_flags, &target);
4669 if (!err) {
4670 err = do_mount_setattr(&target, &kattr);
4671 path_put(&target);
4672 }
4673 finish_mount_kattr(&kattr);
4674 return err;
4675 }
4676
4677 int show_path(struct seq_file *m, struct dentry *root)
4678 {
4679 if (root->d_sb->s_op->show_path)
4680 return root->d_sb->s_op->show_path(m, root);
4681
4682 seq_dentry(m, root, " \t\n\\");
4683 return 0;
4684 }
4685
4686 static struct vfsmount *lookup_mnt_in_ns(u64 id, struct mnt_namespace *ns)
4687 {
4688 struct mount *mnt = mnt_find_id_at(ns, id);
4689
4690 if (!mnt || mnt->mnt_id_unique != id)
4691 return NULL;
4692
4693 return &mnt->mnt;
4694 }
4695
4696 struct kstatmount {
4697 struct statmount __user *const buf;
4698 size_t const bufsize;
4699 struct vfsmount *const mnt;
4700 u64 const mask;
4701 struct seq_file seq;
4702 struct path root;
4703 struct statmount sm;
4704 size_t pos;
4705 int err;
4706 };
4707
4708 typedef int (*statmount_func_t)(struct kstatmount *);
4709
4710 static int statmount_string_seq(struct kstatmount *s, statmount_func_t func)
4711 {
4712 size_t rem = s->bufsize - s->pos - sizeof(s->sm);
4713 struct seq_file *seq = &s->seq;
4714 int ret;
4715
4716 seq->count = 0;
4717 seq->size = min(seq->size, rem);
4718 seq->buf = kvmalloc(seq->size, GFP_KERNEL_ACCOUNT);
4719 if (!seq->buf)
4720 return -ENOMEM;
4721
4722 ret = func(s);
4723 if (ret)
4724 return ret;
4725
4726 if (seq_has_overflowed(seq)) {
4727 if (seq->size == rem)
4728 return -EOVERFLOW;
4729 seq->size *= 2;
4730 if (seq->size > MAX_RW_COUNT)
4731 return -ENOMEM;
4732 kvfree(seq->buf);
4733 return 0;
4734 }
4735
4736 /* Done */
4737 return 1;
4738 }
4739
4740 static void statmount_string(struct kstatmount *s, u64 mask, statmount_func_t func,
4741 u32 *str)
4742 {
4743 int ret = s->pos + sizeof(s->sm) >= s->bufsize ? -EOVERFLOW : 0;
4744 struct statmount *sm = &s->sm;
4745 struct seq_file *seq = &s->seq;
4746
4747 if (s->err || !(s->mask & mask))
4748 return;
4749
4750 seq->size = PAGE_SIZE;
4751 while (!ret)
4752 ret = statmount_string_seq(s, func);
4753
4754 if (ret < 0) {
4755 s->err = ret;
4756 } else {
4757 seq->buf[seq->count++] = '\0';
4758 if (copy_to_user(s->buf->str + s->pos, seq->buf, seq->count)) {
4759 s->err = -EFAULT;
4760 } else {
4761 *str = s->pos;
4762 s->pos += seq->count;
4763 }
4764 }
4765 kvfree(seq->buf);
4766 sm->mask |= mask;
4767 }
4768
4769 static void statmount_numeric(struct kstatmount *s, u64 mask, statmount_func_t func)
4770 {
4771 if (s->err || !(s->mask & mask))
4772 return;
4773
4774 s->err = func(s);
4775 s->sm.mask |= mask;
4776 }
4777
4778 static u64 mnt_to_attr_flags(struct vfsmount *mnt)
4779 {
4780 unsigned int mnt_flags = READ_ONCE(mnt->mnt_flags);
4781 u64 attr_flags = 0;
4782
4783 if (mnt_flags & MNT_READONLY)
4784 attr_flags |= MOUNT_ATTR_RDONLY;
4785 if (mnt_flags & MNT_NOSUID)
4786 attr_flags |= MOUNT_ATTR_NOSUID;
4787 if (mnt_flags & MNT_NODEV)
4788 attr_flags |= MOUNT_ATTR_NODEV;
4789 if (mnt_flags & MNT_NOEXEC)
4790 attr_flags |= MOUNT_ATTR_NOEXEC;
4791 if (mnt_flags & MNT_NODIRATIME)
4792 attr_flags |= MOUNT_ATTR_NODIRATIME;
4793 if (mnt_flags & MNT_NOSYMFOLLOW)
4794 attr_flags |= MOUNT_ATTR_NOSYMFOLLOW;
4795
4796 if (mnt_flags & MNT_NOATIME)
4797 attr_flags |= MOUNT_ATTR_NOATIME;
4798 else if (mnt_flags & MNT_RELATIME)
4799 attr_flags |= MOUNT_ATTR_RELATIME;
4800 else
4801 attr_flags |= MOUNT_ATTR_STRICTATIME;
4802
4803 if (is_idmapped_mnt(mnt))
4804 attr_flags |= MOUNT_ATTR_IDMAP;
4805
4806 return attr_flags;
4807 }
4808
4809 static u64 mnt_to_propagation_flags(struct mount *m)
4810 {
4811 u64 propagation = 0;
4812
4813 if (IS_MNT_SHARED(m))
4814 propagation |= MS_SHARED;
4815 if (IS_MNT_SLAVE(m))
4816 propagation |= MS_SLAVE;
4817 if (IS_MNT_UNBINDABLE(m))
4818 propagation |= MS_UNBINDABLE;
4819 if (!propagation)
4820 propagation |= MS_PRIVATE;
4821
4822 return propagation;
4823 }
4824
4825 static int statmount_sb_basic(struct kstatmount *s)
4826 {
4827 struct super_block *sb = s->mnt->mnt_sb;
4828
4829 s->sm.sb_dev_major = MAJOR(sb->s_dev);
4830 s->sm.sb_dev_minor = MINOR(sb->s_dev);
4831 s->sm.sb_magic = sb->s_magic;
4832 s->sm.sb_flags = sb->s_flags & (SB_RDONLY|SB_SYNCHRONOUS|SB_DIRSYNC|SB_LAZYTIME);
4833
4834 return 0;
4835 }
4836
4837 static int statmount_mnt_basic(struct kstatmount *s)
4838 {
4839 struct mount *m = real_mount(s->mnt);
4840
4841 s->sm.mnt_id = m->mnt_id_unique;
4842 s->sm.mnt_parent_id = m->mnt_parent->mnt_id_unique;
4843 s->sm.mnt_id_old = m->mnt_id;
4844 s->sm.mnt_parent_id_old = m->mnt_parent->mnt_id;
4845 s->sm.mnt_attr = mnt_to_attr_flags(&m->mnt);
4846 s->sm.mnt_propagation = mnt_to_propagation_flags(m);
4847 s->sm.mnt_peer_group = IS_MNT_SHARED(m) ? m->mnt_group_id : 0;
4848 s->sm.mnt_master = IS_MNT_SLAVE(m) ? m->mnt_master->mnt_group_id : 0;
4849
4850 return 0;
4851 }
4852
4853 static int statmount_propagate_from(struct kstatmount *s)
4854 {
4855 struct mount *m = real_mount(s->mnt);
4856
4857 if (!IS_MNT_SLAVE(m))
4858 return 0;
4859
4860 s->sm.propagate_from = get_dominating_id(m, &current->fs->root);
4861
4862 return 0;
4863 }
4864
4865 static int statmount_mnt_root(struct kstatmount *s)
4866 {
4867 struct seq_file *seq = &s->seq;
4868 int err = show_path(seq, s->mnt->mnt_root);
4869
4870 if (!err && !seq_has_overflowed(seq)) {
4871 seq->buf[seq->count] = '\0';
4872 seq->count = string_unescape_inplace(seq->buf, UNESCAPE_OCTAL);
4873 }
4874 return err;
4875 }
4876
4877 static int statmount_mnt_point(struct kstatmount *s)
4878 {
4879 struct vfsmount *mnt = s->mnt;
4880 struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt };
4881 int err = seq_path_root(&s->seq, &mnt_path, &s->root, "");
4882
4883 return err == SEQ_SKIP ? 0 : err;
4884 }
4885
4886 static int statmount_fs_type(struct kstatmount *s)
4887 {
4888 struct seq_file *seq = &s->seq;
4889 struct super_block *sb = s->mnt->mnt_sb;
4890
4891 seq_puts(seq, sb->s_type->name);
4892 return 0;
4893 }
4894
4895 static int do_statmount(struct kstatmount *s)
4896 {
4897 struct statmount *sm = &s->sm;
4898 struct mount *m = real_mount(s->mnt);
4899 size_t copysize = min_t(size_t, s->bufsize, sizeof(*sm));
4900 int err;
4901
4902 /*
4903 * Don't trigger audit denials. We just want to determine what
4904 * mounts to show users.
4905 */
4906 if (!is_path_reachable(m, m->mnt.mnt_root, &s->root) &&
4907 !ns_capable_noaudit(&init_user_ns, CAP_SYS_ADMIN))
4908 return -EPERM;
4909
4910 err = security_sb_statfs(s->mnt->mnt_root);
4911 if (err)
4912 return err;
4913
4914 statmount_numeric(s, STATMOUNT_SB_BASIC, statmount_sb_basic);
4915 statmount_numeric(s, STATMOUNT_MNT_BASIC, statmount_mnt_basic);
4916 statmount_numeric(s, STATMOUNT_PROPAGATE_FROM, statmount_propagate_from);
4917 statmount_string(s, STATMOUNT_FS_TYPE, statmount_fs_type, &sm->fs_type);
4918 statmount_string(s, STATMOUNT_MNT_ROOT, statmount_mnt_root, &sm->mnt_root);
4919 statmount_string(s, STATMOUNT_MNT_POINT, statmount_mnt_point, &sm->mnt_point);
4920
4921 if (s->err)
4922 return s->err;
4923
4924 /* Return the number of bytes copied to the buffer */
4925 sm->size = copysize + s->pos;
4926
4927 if (copy_to_user(s->buf, sm, copysize))
4928 return -EFAULT;
4929
4930 return 0;
4931 }
4932
4933 SYSCALL_DEFINE4(statmount, const struct mnt_id_req __user *, req,
4934 struct statmount __user *, buf, size_t, bufsize,
4935 unsigned int, flags)
4936 {
4937 struct vfsmount *mnt;
4938 struct mnt_id_req kreq;
4939 int ret;
4940
4941 if (flags)
4942 return -EINVAL;
4943
4944 if (copy_from_user(&kreq, req, sizeof(kreq)))
4945 return -EFAULT;
4946
4947 down_read(&namespace_sem);
4948 mnt = lookup_mnt_in_ns(kreq.mnt_id, current->nsproxy->mnt_ns);
4949 ret = -ENOENT;
4950 if (mnt) {
4951 struct kstatmount s = {
4952 .mask = kreq.request_mask,
4953 .buf = buf,
4954 .bufsize = bufsize,
4955 .mnt = mnt,
4956 };
4957
4958 get_fs_root(current->fs, &s.root);
4959 ret = do_statmount(&s);
4960 path_put(&s.root);
4961 }
4962 up_read(&namespace_sem);
4963
4964 return ret;
4965 }
4966
4967 static void __init init_mount_tree(void)
4968 {
4969 struct vfsmount *mnt;
4970 struct mount *m;
4971 struct mnt_namespace *ns;
4972 struct path root;
4973
4974 mnt = vfs_kern_mount(&rootfs_fs_type, 0, "rootfs", NULL);
4975 if (IS_ERR(mnt))
4976 panic("Can't create rootfs");
4977
4978 ns = alloc_mnt_ns(&init_user_ns, false);
4979 if (IS_ERR(ns))
4980 panic("Can't allocate initial namespace");
4981 m = real_mount(mnt);
4982 ns->root = m;
4983 ns->nr_mounts = 1;
4984 mnt_add_to_ns(ns, m);
4985 init_task.nsproxy->mnt_ns = ns;
4986 get_mnt_ns(ns);
4987
4988 root.mnt = mnt;
4989 root.dentry = mnt->mnt_root;
4990 mnt->mnt_flags |= MNT_LOCKED;
4991
4992 set_fs_pwd(current->fs, &root);
4993 set_fs_root(current->fs, &root);
4994 }
4995
4996 void __init mnt_init(void)
4997 {
4998 int err;
4999
5000 mnt_cache = kmem_cache_create("mnt_cache", sizeof(struct mount),
5001 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL);
5002
5003 mount_hashtable = alloc_large_system_hash("Mount-cache",
5004 sizeof(struct hlist_head),
5005 mhash_entries, 19,
5006 HASH_ZERO,
5007 &m_hash_shift, &m_hash_mask, 0, 0);
5008 mountpoint_hashtable = alloc_large_system_hash("Mountpoint-cache",
5009 sizeof(struct hlist_head),
5010 mphash_entries, 19,
5011 HASH_ZERO,
5012 &mp_hash_shift, &mp_hash_mask, 0, 0);
5013
5014 if (!mount_hashtable || !mountpoint_hashtable)
5015 panic("Failed to allocate mount hash table\n");
5016
5017 kernfs_init();
5018
5019 err = sysfs_init();
5020 if (err)
5021 printk(KERN_WARNING "%s: sysfs_init error: %d\n",
5022 __func__, err);
5023 fs_kobj = kobject_create_and_add("fs", NULL);
5024 if (!fs_kobj)
5025 printk(KERN_WARNING "%s: kobj create error\n", __func__);
5026 shmem_init();
5027 init_rootfs();
5028 init_mount_tree();
5029 }
5030
5031 void put_mnt_ns(struct mnt_namespace *ns)
5032 {
5033 if (!refcount_dec_and_test(&ns->ns.count))
5034 return;
5035 drop_collected_mounts(&ns->root->mnt);
5036 free_mnt_ns(ns);
5037 }
5038
5039 struct vfsmount *kern_mount(struct file_system_type *type)
5040 {
5041 struct vfsmount *mnt;
5042 mnt = vfs_kern_mount(type, SB_KERNMOUNT, type->name, NULL);
5043 if (!IS_ERR(mnt)) {
5044 /*
5045 * it is a longterm mount, don't release mnt until
5046 * we unmount before file sys is unregistered
5047 */
5048 real_mount(mnt)->mnt_ns = MNT_NS_INTERNAL;
5049 }
5050 return mnt;
5051 }
5052 EXPORT_SYMBOL_GPL(kern_mount);
5053
5054 void kern_unmount(struct vfsmount *mnt)
5055 {
5056 /* release long term mount so mount point can be released */
5057 if (!IS_ERR(mnt)) {
5058 mnt_make_shortterm(mnt);
5059 synchronize_rcu(); /* yecchhh... */
5060 mntput(mnt);
5061 }
5062 }
5063 EXPORT_SYMBOL(kern_unmount);
5064
5065 void kern_unmount_array(struct vfsmount *mnt[], unsigned int num)
5066 {
5067 unsigned int i;
5068
5069 for (i = 0; i < num; i++)
5070 mnt_make_shortterm(mnt[i]);
5071 synchronize_rcu_expedited();
5072 for (i = 0; i < num; i++)
5073 mntput(mnt[i]);
5074 }
5075 EXPORT_SYMBOL(kern_unmount_array);
5076
5077 bool our_mnt(struct vfsmount *mnt)
5078 {
5079 return check_mnt(real_mount(mnt));
5080 }
5081
5082 bool current_chrooted(void)
5083 {
5084 /* Does the current process have a non-standard root */
5085 struct path ns_root;
5086 struct path fs_root;
5087 bool chrooted;
5088
5089 /* Find the namespace root */
5090 ns_root.mnt = &current->nsproxy->mnt_ns->root->mnt;
5091 ns_root.dentry = ns_root.mnt->mnt_root;
5092 path_get(&ns_root);
5093 while (d_mountpoint(ns_root.dentry) && follow_down_one(&ns_root))
5094 ;
5095
5096 get_fs_root(current->fs, &fs_root);
5097
5098 chrooted = !path_equal(&fs_root, &ns_root);
5099
5100 path_put(&fs_root);
5101 path_put(&ns_root);
5102
5103 return chrooted;
5104 }
5105
5106 static bool mnt_already_visible(struct mnt_namespace *ns,
5107 const struct super_block *sb,
5108 int *new_mnt_flags)
5109 {
5110 int new_flags = *new_mnt_flags;
5111 struct mount *mnt, *n;
5112 bool visible = false;
5113
5114 down_read(&namespace_sem);
5115 rbtree_postorder_for_each_entry_safe(mnt, n, &ns->mounts, mnt_node) {
5116 struct mount *child;
5117 int mnt_flags;
5118
5119 if (mnt->mnt.mnt_sb->s_type != sb->s_type)
5120 continue;
5121
5122 /* This mount is not fully visible if it's root directory
5123 * is not the root directory of the filesystem.
5124 */
5125 if (mnt->mnt.mnt_root != mnt->mnt.mnt_sb->s_root)
5126 continue;
5127
5128 /* A local view of the mount flags */
5129 mnt_flags = mnt->mnt.mnt_flags;
5130
5131 /* Don't miss readonly hidden in the superblock flags */
5132 if (sb_rdonly(mnt->mnt.mnt_sb))
5133 mnt_flags |= MNT_LOCK_READONLY;
5134
5135 /* Verify the mount flags are equal to or more permissive
5136 * than the proposed new mount.
5137 */
5138 if ((mnt_flags & MNT_LOCK_READONLY) &&
5139 !(new_flags & MNT_READONLY))
5140 continue;
5141 if ((mnt_flags & MNT_LOCK_ATIME) &&
5142 ((mnt_flags & MNT_ATIME_MASK) != (new_flags & MNT_ATIME_MASK)))
5143 continue;
5144
5145 /* This mount is not fully visible if there are any
5146 * locked child mounts that cover anything except for
5147 * empty directories.
5148 */
5149 list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) {
5150 struct inode *inode = child->mnt_mountpoint->d_inode;
5151 /* Only worry about locked mounts */
5152 if (!(child->mnt.mnt_flags & MNT_LOCKED))
5153 continue;
5154 /* Is the directory permanetly empty? */
5155 if (!is_empty_dir_inode(inode))
5156 goto next;
5157 }
5158 /* Preserve the locked attributes */
5159 *new_mnt_flags |= mnt_flags & (MNT_LOCK_READONLY | \
5160 MNT_LOCK_ATIME);
5161 visible = true;
5162 goto found;
5163 next: ;
5164 }
5165 found:
5166 up_read(&namespace_sem);
5167 return visible;
5168 }
5169
5170 static bool mount_too_revealing(const struct super_block *sb, int *new_mnt_flags)
5171 {
5172 const unsigned long required_iflags = SB_I_NOEXEC | SB_I_NODEV;
5173 struct mnt_namespace *ns = current->nsproxy->mnt_ns;
5174 unsigned long s_iflags;
5175
5176 if (ns->user_ns == &init_user_ns)
5177 return false;
5178
5179 /* Can this filesystem be too revealing? */
5180 s_iflags = sb->s_iflags;
5181 if (!(s_iflags & SB_I_USERNS_VISIBLE))
5182 return false;
5183
5184 if ((s_iflags & required_iflags) != required_iflags) {
5185 WARN_ONCE(1, "Expected s_iflags to contain 0x%lx\n",
5186 required_iflags);
5187 return true;
5188 }
5189
5190 return !mnt_already_visible(ns, sb, new_mnt_flags);
5191 }
5192
5193 bool mnt_may_suid(struct vfsmount *mnt)
5194 {
5195 /*
5196 * Foreign mounts (accessed via fchdir or through /proc
5197 * symlinks) are always treated as if they are nosuid. This
5198 * prevents namespaces from trusting potentially unsafe
5199 * suid/sgid bits, file caps, or security labels that originate
5200 * in other namespaces.
5201 */
5202 return !(mnt->mnt_flags & MNT_NOSUID) && check_mnt(real_mount(mnt)) &&
5203 current_in_userns(mnt->mnt_sb->s_user_ns);
5204 }
5205
5206 static struct ns_common *mntns_get(struct task_struct *task)
5207 {
5208 struct ns_common *ns = NULL;
5209 struct nsproxy *nsproxy;
5210
5211 task_lock(task);
5212 nsproxy = task->nsproxy;
5213 if (nsproxy) {
5214 ns = &nsproxy->mnt_ns->ns;
5215 get_mnt_ns(to_mnt_ns(ns));
5216 }
5217 task_unlock(task);
5218
5219 return ns;
5220 }
5221
5222 static void mntns_put(struct ns_common *ns)
5223 {
5224 put_mnt_ns(to_mnt_ns(ns));
5225 }
5226
5227 static int mntns_install(struct nsset *nsset, struct ns_common *ns)
5228 {
5229 struct nsproxy *nsproxy = nsset->nsproxy;
5230 struct fs_struct *fs = nsset->fs;
5231 struct mnt_namespace *mnt_ns = to_mnt_ns(ns), *old_mnt_ns;
5232 struct user_namespace *user_ns = nsset->cred->user_ns;
5233 struct path root;
5234 int err;
5235
5236 if (!ns_capable(mnt_ns->user_ns, CAP_SYS_ADMIN) ||
5237 !ns_capable(user_ns, CAP_SYS_CHROOT) ||
5238 !ns_capable(user_ns, CAP_SYS_ADMIN))
5239 return -EPERM;
5240
5241 if (is_anon_ns(mnt_ns))
5242 return -EINVAL;
5243
5244 if (fs->users != 1)
5245 return -EINVAL;
5246
5247 get_mnt_ns(mnt_ns);
5248 old_mnt_ns = nsproxy->mnt_ns;
5249 nsproxy->mnt_ns = mnt_ns;
5250
5251 /* Find the root */
5252 err = vfs_path_lookup(mnt_ns->root->mnt.mnt_root, &mnt_ns->root->mnt,
5253 "/", LOOKUP_DOWN, &root);
5254 if (err) {
5255 /* revert to old namespace */
5256 nsproxy->mnt_ns = old_mnt_ns;
5257 put_mnt_ns(mnt_ns);
5258 return err;
5259 }
5260
5261 put_mnt_ns(old_mnt_ns);
5262
5263 /* Update the pwd and root */
5264 set_fs_pwd(fs, &root);
5265 set_fs_root(fs, &root);
5266
5267 path_put(&root);
5268 return 0;
5269 }
5270
5271 static struct user_namespace *mntns_owner(struct ns_common *ns)
5272 {
5273 return to_mnt_ns(ns)->user_ns;
5274 }
5275
5276 const struct proc_ns_operations mntns_operations = {
5277 .name = "mnt",
5278 .type = CLONE_NEWNS,
5279 .get = mntns_get,
5280 .put = mntns_put,
5281 .install = mntns_install,
5282 .owner = mntns_owner,
5283 };
5284
5285 #ifdef CONFIG_SYSCTL
5286 static struct ctl_table fs_namespace_sysctls[] = {
5287 {
5288 .procname = "mount-max",
5289 .data = &sysctl_mount_max,
5290 .maxlen = sizeof(unsigned int),
5291 .mode = 0644,
5292 .proc_handler = proc_dointvec_minmax,
5293 .extra1 = SYSCTL_ONE,
5294 },
5295 { }
5296 };
5297
5298 static int __init init_fs_namespace_sysctls(void)
5299 {
5300 register_sysctl_init("fs", fs_namespace_sysctls);
5301 return 0;
5302 }
5303 fs_initcall(init_fs_namespace_sysctls);
5304
5305 #endif /* CONFIG_SYSCTL */