[thirdparty/systemd.git] / src / nspawn / nspawn-patch-uid.c

/* SPDX-License-Identifier: LGPL-2.1-or-later */

#include <fcntl.h>
#include <sys/statvfs.h>
#include <sys/vfs.h>
#include <unistd.h>

#include "acl-util.h"
#include "dirent-util.h"
#include "fd-util.h"
#include "fileio.h"
#include "fs-util.h"
#include "missing_magic.h"
#include "nspawn-def.h"
#include "nspawn-patch-uid.h"
#include "stat-util.h"
#include "stdio-util.h"
#include "string-util.h"
#include "strv.h"
#include "user-util.h"

#if HAVE_ACL

/* Reads the ACL of the given type, either from the inode 'fd' itself refers to (if 'name' is NULL), or from
 * the entry called 'name' inside the directory 'fd' refers to. On success 0 is returned and the ACL object is
 * stored in *ret. On failure a negative errno-style error code is returned and *ret is left untouched. */
static int get_acl(int fd, const char *name, acl_type_t type, acl_t *ret) {
        acl_t result;

        assert(fd >= 0);
        assert(ret);

        if (!name) {
                /* Operate on the inode behind fd itself. acl_get_fd() takes no type argument, hence it is
                 * only used for the access ACL; every other type goes through the fd's /proc path. */
                if (type == ACL_TYPE_ACCESS)
                        result = acl_get_fd(fd);
                else
                        result = acl_get_file(FORMAT_PROC_FD_PATH(fd), type);
        } else {
                _cleanup_close_ int entry_fd = -EBADF;

                /* Pin the directory entry without following a symlink, then address it via its /proc path. */
                entry_fd = openat(fd, name, O_PATH|O_CLOEXEC|O_NOFOLLOW);
                if (entry_fd < 0)
                        return -errno;

                result = acl_get_file(FORMAT_PROC_FD_PATH(entry_fd), type);
        }
        if (!result)
                return -errno;

        *ret = result;
        return 0;
}

/* Applies 'acl' as the ACL of the given type, either to the inode 'fd' itself refers to (if 'name' is NULL),
 * or to the entry called 'name' inside the directory 'fd' refers to. Returns 0 on success, or a negative
 * errno-style error code on failure. */
static int set_acl(int fd, const char *name, acl_type_t type, acl_t acl) {
        int r;

        assert(fd >= 0);
        assert(acl);

        if (!name) {
                /* Operate on the inode behind fd itself. acl_set_fd() takes no type argument, hence it is
                 * only used for the access ACL; every other type goes through the fd's /proc path. */
                if (type == ACL_TYPE_ACCESS)
                        r = acl_set_fd(fd, acl);
                else
                        r = acl_set_file(FORMAT_PROC_FD_PATH(fd), type, acl);
        } else {
                _cleanup_close_ int entry_fd = -EBADF;

                /* Pin the directory entry without following a symlink, then address it via its /proc path. */
                entry_fd = openat(fd, name, O_PATH|O_CLOEXEC|O_NOFOLLOW);
                if (entry_fd < 0)
                        return -errno;

                r = acl_set_file(FORMAT_PROC_FD_PATH(entry_fd), type, acl);
        }
        if (r < 0)
                return -errno;

        return 0;
}

/* Computes a version of 'acl' in which the upper 16 bits of the qualifier of every ACL_USER and ACL_GROUP
 * entry are replaced by 'shift', while the lower 16 bits are retained.
 *
 * Returns > 0 and stores a newly allocated ACL in *ret if at least one entry had to be changed. Returns 0
 * and stores NULL in *ret if all entries already carry the right qualifier. Returns a negative errno-style
 * error code on failure, in which case *ret is left untouched. The input ACL is never modified. */
static int shift_acl(acl_t acl, uid_t shift, acl_t *ret) {
        _cleanup_(acl_freep) acl_t copy = NULL;
        acl_entry_t i;
        int r;

        assert(acl);
        assert(ret);

        r = acl_get_entry(acl, ACL_FIRST_ENTRY, &i);
        if (r < 0)
                return -errno;
        while (r > 0) {
                uid_t new_uid;
                bool modify = false;
                acl_tag_t tag;

                if (acl_get_tag_type(i, &tag) < 0)
                        return -errno;

                if (IN_SET(tag, ACL_USER, ACL_GROUP)) {
                        uid_t *qualifier, old_uid;

                        /* We don't distinguish here between uid_t and gid_t, let's make sure the compiler checks that
                         * this is actually OK */
                        assert_cc(sizeof(uid_t) == sizeof(gid_t));

                        qualifier = acl_get_qualifier(i);
                        if (!qualifier)
                                return -errno;

                        /* acl_get_qualifier() hands out a copy that the caller has to release with acl_free().
                         * Fetch the value and release the copy right away, so that none of the exit paths
                         * below can leak it. */
                        old_uid = *qualifier;
                        acl_free(qualifier);

                        new_uid = shift | (old_uid & UINT32_C(0xFFFF));
                        if (!uid_is_valid(new_uid))
                                return -EINVAL;

                        modify = new_uid != old_uid;
                        if (modify && !copy) {
                                int n;

                                /* There's no copy of the ACL yet? if so, let's create one, and start the loop from the
                                 * beginning, so that we copy all entries, starting from the first, this time. */

                                n = acl_entries(acl);
                                if (n < 0)
                                        return -errno;

                                copy = acl_init(n);
                                if (!copy)
                                        return -errno;

                                /* Seek back to the beginning */
                                r = acl_get_entry(acl, ACL_FIRST_ENTRY, &i);
                                if (r < 0)
                                        return -errno;
                                continue;
                        }
                }

                if (copy) {
                        acl_entry_t new_entry;

                        /* Once a copy exists every entry is duplicated into it, changed or not. */
                        if (acl_create_entry(&copy, &new_entry) < 0)
                                return -errno;

                        if (acl_copy_entry(new_entry, i) < 0)
                                return -errno;

                        if (modify)
                                if (acl_set_qualifier(new_entry, &new_uid) < 0)
                                        return -errno;
                }

                r = acl_get_entry(acl, ACL_NEXT_ENTRY, &i);
                if (r < 0)
                        return -errno;
        }

        *ret = TAKE_PTR(copy);

        return !!*ret;
}

static int patch_acls(int fd, const char *name, const struct stat *st, uid_t shift) {
        _cleanup_(acl_freep) acl_t acl = NULL, shifted = NULL;
        bool changed = false;
        int r;

        assert(fd >= 0);
        assert(st);

        /* ACLs are not supported on symlinks, there's no point in trying */
        if (S_ISLNK(st->st_mode))
                return 0;

        r = get_acl(fd, name, ACL_TYPE_ACCESS, &acl);
        if (r == -EOPNOTSUPP)
                return 0;
        if (r < 0)
                return r;

        r = shift_acl(acl, shift, &shifted);
        if (r < 0)
                return r;
        if (r > 0) {
                r = set_acl(fd, name, ACL_TYPE_ACCESS, shifted);
                if (r < 0)
                        return r;

                changed = true;
        }

        if (S_ISDIR(st->st_mode)) {
                acl_free(acl);

                if (shifted)
                        acl_free(shifted);

                acl = shifted = NULL;

                r = get_acl(fd, name, ACL_TYPE_DEFAULT, &acl);
                if (r < 0)
                        return r;

                r = shift_acl(acl, shift, &shifted);
                if (r < 0)
                        return r;
                if (r > 0) {
                        r = set_acl(fd, name, ACL_TYPE_DEFAULT, shifted);
                        if (r < 0)
                                return r;

                        changed = true;
                }
        }

        return changed;
}

#else

/* Variant used when building without ACL support (HAVE_ACL unset): there is nothing to patch, hence always
 * report "nothing changed". */
static int patch_acls(int fd, const char *name, const struct stat *st, uid_t shift) {
        return 0;
}

#endif

/* Moves ownership and ACLs of a single inode into the UID/GID range identified by 'shift': the upper 16 bits
 * of the owning UID and GID are replaced by 'shift', the lower 16 bits are retained. The inode is either the
 * one 'fd' refers to (if 'name' is NULL), or the entry 'name' below the directory 'fd' (symlinks are not
 * followed). 'st' must describe the inode as it currently is.
 *
 * Returns > 0 if anything was changed, 0 if the inode was already set up correctly, and a negative
 * errno-style error code on failure. */
static int patch_fd(int fd, const char *name, const struct stat *st, uid_t shift) {
        bool ownership_changed = false;
        uid_t target_uid;
        gid_t target_gid;
        int r;

        assert(fd >= 0);
        assert(st);

        target_uid =         shift | (st->st_uid & UINT32_C(0xFFFF));
        target_gid = (gid_t) shift | (st->st_gid & UINT32_C(0xFFFF));

        if (!uid_is_valid(target_uid) || !gid_is_valid(target_gid))
                return -EINVAL;

        if (st->st_uid != target_uid || st->st_gid != target_gid) {
                r = name ? fchownat(fd, name, target_uid, target_gid, AT_SYMLINK_NOFOLLOW)
                         : fchown(fd, target_uid, target_gid);
                if (r < 0)
                        return -errno;

                /* The Linux kernel alters the mode in some cases of chown(). Let's undo this. */
                if (!name)
                        r = fchmod(fd, st->st_mode);
                else if (S_ISLNK(st->st_mode))
                        /* Changing the mode of a symlink is not supported by Linux kernel. Don't bother. */
                        r = 0;
                else
                        r = fchmodat(fd, name, st->st_mode, 0);
                if (r < 0)
                        return -errno;

                ownership_changed = true;
        }

        /* ACL entries carry UIDs/GIDs too, shift them as well */
        r = patch_acls(fd, name, st, shift);
        if (r < 0)
                return r;

        return ownership_changed || r > 0;
}

/*
 * Tells whether the file system described by 'sfs' is one of those that are
 * fully compatible with user namespaces or must otherwise be excluded from
 * UID/GID patching. Some of the file systems checked for below can be fully
 * mounted inside user namespaces, however their inodes may relate to host
 * resources or may only be valid in the global user namespace, therefore no
 * patching should be applied to them.
 *
 * Returns non-zero if the file system type matches one of the entries below,
 * and zero otherwise.
 */
static int is_fs_fully_userns_compatible(const struct statfs *sfs) {

        assert(sfs);

        /* Control group hierarchies */
        if (F_TYPE_EQUAL(sfs->f_type, CGROUP_SUPER_MAGIC) ||
            F_TYPE_EQUAL(sfs->f_type, CGROUP2_SUPER_MAGIC))
                return true;

        /* Security module interfaces */
        if (F_TYPE_EQUAL(sfs->f_type, SELINUX_MAGIC) ||
            F_TYPE_EQUAL(sfs->f_type, SMACK_MAGIC) ||
            F_TYPE_EQUAL(sfs->f_type, SECURITYFS_MAGIC))
                return true;

        /* Debugging and tracing interfaces */
        if (F_TYPE_EQUAL(sfs->f_type, DEBUGFS_MAGIC) ||
            F_TYPE_EQUAL(sfs->f_type, TRACEFS_MAGIC))
                return true;

        /* Everything else on the list */
        if (F_TYPE_EQUAL(sfs->f_type, BINFMTFS_MAGIC) ||
            F_TYPE_EQUAL(sfs->f_type, DEVPTS_SUPER_MAGIC) ||
            F_TYPE_EQUAL(sfs->f_type, EFIVARFS_MAGIC) ||
            F_TYPE_EQUAL(sfs->f_type, HUGETLBFS_MAGIC) ||
            F_TYPE_EQUAL(sfs->f_type, MQUEUE_MAGIC) ||
            F_TYPE_EQUAL(sfs->f_type, PROC_SUPER_MAGIC) ||
            F_TYPE_EQUAL(sfs->f_type, PSTOREFS_MAGIC) ||
            F_TYPE_EQUAL(sfs->f_type, BPF_FS_MAGIC) ||
            F_TYPE_EQUAL(sfs->f_type, SYSFS_MAGIC))
                return true;

        return false;
}

/* Patches ownership and ACLs of the inode 'fd' refers to and, if it is a directory, of everything below it.
 * 'st' must describe the inode behind 'fd'. If 'donate_fd' is true this function takes possession of 'fd'
 * and closes it on every exit path; otherwise the caller's fd is left open and a duplicate is used for
 * reading the directory. 'is_toplevel' is true only for the root of the tree being patched.
 *
 * Special kernel file systems (see is_fs_fully_userns_compatible()) are not entered. Read-only subtrees
 * below the top level are skipped with a debug message.
 *
 * Returns > 0 if anything was changed, 0 if nothing was changed, and a negative errno-style error code on
 * failure. */
static int recurse_fd(int fd, bool donate_fd, const struct stat *st, uid_t shift, bool is_toplevel) {
        _cleanup_closedir_ DIR *d = NULL;
        bool changed = false;
        struct statfs sfs;
        int r;

        assert(fd >= 0);
        assert(st);

        if (fstatfs(fd, &sfs) < 0) {
                /* Leave via the common exit path, so that a donated fd is closed */
                r = -errno;
                goto finish;
        }

        /* We generally want to permit crossing of mount boundaries when patching the UIDs/GIDs. However, we probably
         * shouldn't do this for /proc and /sys if that is already mounted into place. Hence, let's stop the recursion
         * when we hit procfs, sysfs or some other special file systems. */

        r = is_fs_fully_userns_compatible(&sfs);
        if (r < 0)
                goto finish;
        if (r > 0) {
                r = 0; /* don't recurse */
                goto finish;
        }

        /* Also, if we hit a read-only file system, then don't bother, skip the whole subtree */
        if ((sfs.f_flags & ST_RDONLY) ||
            access_fd(fd, W_OK) == -EROFS)
                goto read_only;

        if (S_ISDIR(st->st_mode)) {
                if (!donate_fd) {
                        int copy;

                        /* The fd is handed over to the directory stream below. If it isn't ours to give
                         * away, work on a duplicate which we then own. */
                        copy = fcntl(fd, F_DUPFD_CLOEXEC, 3);
                        if (copy < 0) {
                                r = -errno;
                                goto finish;
                        }

                        fd = copy;
                        donate_fd = true;
                }

                d = take_fdopendir(&fd);
                if (!d) {
                        r = -errno;
                        goto finish;
                }

                FOREACH_DIRENT_ALL(de, d, r = -errno; goto finish) {
                        struct stat fst;

                        if (dot_or_dot_dot(de->d_name))
                                continue;

                        if (fstatat(dirfd(d), de->d_name, &fst, AT_SYMLINK_NOFOLLOW) < 0) {
                                r = -errno;
                                goto finish;
                        }

                        if (S_ISDIR(fst.st_mode)) {
                                int subdir_fd;

                                subdir_fd = openat(dirfd(d), de->d_name, O_RDONLY|O_NONBLOCK|O_DIRECTORY|O_CLOEXEC|O_NOFOLLOW|O_NOATIME);
                                if (subdir_fd < 0) {
                                        r = -errno;
                                        goto finish;

                                }

                                /* The recursive call takes possession of subdir_fd */
                                r = recurse_fd(subdir_fd, true, &fst, shift, false);
                                if (r < 0)
                                        goto finish;
                                if (r > 0)
                                        changed = true;

                        } else {
                                r = patch_fd(dirfd(d), de->d_name, &fst, shift);
                                if (r < 0)
                                        goto finish;
                                if (r > 0)
                                        changed = true;
                        }
                }
        }

        /* After we descended, also patch the directory itself. It's key to do this in this order so that the top-level
         * directory is patched as very last object in the tree, so that we can use it as quick indicator whether the
         * tree is properly chown()ed already. */
        r = patch_fd(d ? dirfd(d) : fd, NULL, st, shift);
        if (r == -EROFS)
                goto read_only;
        if (r < 0) /* Propagate all other failures instead of overriding them with 'changed' below */
                goto finish;
        if (r > 0)
                changed = true;

        r = changed;
        goto finish;

read_only:
        if (!is_toplevel) {
                _cleanup_free_ char *name = NULL;

                /* When we hit a read-only subtree we simply skip it, but log about it. Note that for
                 * directories the fd has been passed on to the directory stream, hence query that. */
                (void) fd_get_path(d ? dirfd(d) : fd, &name);
                log_debug("Skipping read-only file or directory %s.", strna(name));
                r = changed;
        }

finish:
        if (donate_fd)
                safe_close(fd);

        return r;
}

/* Validates the requested UID shift and range, then patches the directory tree 'fd' refers to. If 'donate_fd'
 * is true this function takes possession of 'fd' and makes sure it is closed on every exit path.
 *
 * Returns > 0 if anything was changed, 0 if nothing was changed (which includes the case that the top-level
 * directory already is owned by the right range), and a negative errno-style error code on failure:
 * -EOPNOTSUPP if 'shift' is not a multiple of 2^16 or 'range' is not 2^16, -EINVAL if 'shift' is the "busy"
 * base, -EBADE if the upper 16 bits of the top-level directory's owning UID and GID differ. */
static int fd_patch_uid_internal(int fd, bool donate_fd, uid_t shift, uid_t range) {
        struct stat st;
        int r;

        assert(fd >= 0);

        /* Recursively adjusts the UID/GIDs of all files of a directory tree. This is used to automatically fix up an
         * OS tree to the used user namespace UID range. Note that this automatic adjustment only works for UID ranges
         * following the concept that the upper 16-bit of a UID identify the container, and the lower 16-bit are the actual
         * UID within the container. */

        if ((shift & 0xFFFF) != 0) {
                /* We only support containers where the shift starts at a 2^16 boundary */
                r = -EOPNOTSUPP;
                goto finish;
        }

        if (shift == UID_BUSY_BASE) {
                r = -EINVAL;
                goto finish;
        }

        if (range != 0x10000) {
                /* We only support containers with 16-bit UID ranges for the patching logic */
                r = -EOPNOTSUPP;
                goto finish;
        }

        if (fstat(fd, &st) < 0) {
                r = -errno;
                goto finish;
        }

        if ((uint32_t) st.st_uid >> 16 != (uint32_t) st.st_gid >> 16) {
                /* We only support containers where the uid/gid container ID match */
                r = -EBADE;
                goto finish;
        }

        /* Try to detect if the range is already right. Of course, this is a pretty drastic optimization, as we assume
         * that if the top-level dir has the right upper 16-bit assigned, then everything below will have too... */
        if (((uint32_t) (st.st_uid ^ shift) >> 16) == 0) {
                /* Nothing to do, but leave via the common exit path, so that a donated fd is closed */
                r = 0;
                goto finish;
        }

        /* Before we start recursively chowning, mark the top-level dir as "busy" by chowning it to the "busy"
         * range. Should we be interrupted in the middle of our work, we'll see it owned by this user and will start
         * chown()ing it again, unconditionally, as the busy UID is not a valid UID we'd ever pick for ourselves. */

        if ((st.st_uid & UID_BUSY_MASK) != UID_BUSY_BASE) {
                if (fchown(fd,
                           UID_BUSY_BASE | (st.st_uid & ~UID_BUSY_MASK),
                           (gid_t) UID_BUSY_BASE | (st.st_gid & ~(gid_t) UID_BUSY_MASK)) < 0) {
                        r = -errno;
                        goto finish;
                }
        }

        /* From here on recurse_fd() is responsible for closing a donated fd */
        return recurse_fd(fd, donate_fd, &st, shift, true);

finish:
        if (donate_fd)
                safe_close(fd);

        return r;
}

/* Opens the directory at 'path' (a symlink in the last path component is not followed) and patches the
 * UIDs/GIDs of the tree below it, as done by fd_patch_uid_internal(). Returns a negative errno-style error
 * code if the directory cannot be opened, and otherwise whatever fd_patch_uid_internal() returns. */
int path_patch_uid(const char *path, uid_t shift, uid_t range) {
        int root_fd = open(path, O_RDONLY|O_NONBLOCK|O_DIRECTORY|O_CLOEXEC|O_NOFOLLOW|O_NOATIME);

        if (root_fd < 0)
                return -errno;

        /* The fd is donated, i.e. the callee is in charge of closing it */
        return fd_patch_uid_internal(root_fd, true, shift, range);
}
Commit	Line	Data
db9ecf05	1	/* SPDX-License-Identifier: LGPL-2.1-or-later */
7336138e LP	2
7336138e LP	3	#include <fcntl.h>
3603efde	4	#include <sys/statvfs.h>
88cd066e	5	#include <sys/vfs.h>
7336138e LP	6	#include <unistd.h>
	7
	8	#include "acl-util.h"
	9	#include "dirent-util.h"
	10	#include "fd-util.h"
9f81a592	11	#include "fileio.h"
3603efde	12	#include "fs-util.h"
f5947a5e	13	#include "missing_magic.h"
3603efde	14	#include "nspawn-def.h"
7336138e	15	#include "nspawn-patch-uid.h"
88cd066e	16	#include "stat-util.h"
7336138e LP	17	#include "stdio-util.h"
	18	#include "string-util.h"
	19	#include "strv.h"
	20	#include "user-util.h"
	21
349cc4a5	22	#if HAVE_ACL
7336138e LP	23
7336138e LP	24	static int get_acl(int fd, const char name, acl_type_t type, acl_t ret) {
7336138e LP	25	acl_t acl;
	26
	27	assert(fd >= 0);
	28	assert(ret);
	29
	30	if (name) {
254d1313	31	_cleanup_close_ int child_fd = -EBADF;
7336138e LP	32
	33	child_fd = openat(fd, name, O_PATH\|O_CLOEXEC\|O_NOFOLLOW);
	34	if (child_fd < 0)
	35	return -errno;
	36
ddb6eeaf	37	acl = acl_get_file(FORMAT_PROC_FD_PATH(child_fd), type);
7336138e LP	38	} else if (type == ACL_TYPE_ACCESS)
7336138e LP	39	acl = acl_get_fd(fd);
ddb6eeaf LP	40	else
ddb6eeaf LP	41	acl = acl_get_file(FORMAT_PROC_FD_PATH(fd), type);
7336138e LP	42	if (!acl)
	43	return -errno;
	44
	45	*ret = acl;
	46	return 0;
	47	}
	48
	49	static int set_acl(int fd, const char *name, acl_type_t type, acl_t acl) {
7336138e LP	50	int r;
	51
	52	assert(fd >= 0);
	53	assert(acl);
	54
	55	if (name) {
254d1313	56	_cleanup_close_ int child_fd = -EBADF;
7336138e LP	57
	58	child_fd = openat(fd, name, O_PATH\|O_CLOEXEC\|O_NOFOLLOW);
	59	if (child_fd < 0)
	60	return -errno;
	61
ddb6eeaf	62	r = acl_set_file(FORMAT_PROC_FD_PATH(child_fd), type, acl);
7336138e LP	63	} else if (type == ACL_TYPE_ACCESS)
7336138e LP	64	r = acl_set_fd(fd, acl);
ddb6eeaf LP	65	else
ddb6eeaf LP	66	r = acl_set_file(FORMAT_PROC_FD_PATH(fd), type, acl);
7336138e LP	67	if (r < 0)
	68	return -errno;
	69
	70	return 0;
	71	}
	72
	73	static int shift_acl(acl_t acl, uid_t shift, acl_t *ret) {
	74	_cleanup_(acl_freep) acl_t copy = NULL;
	75	acl_entry_t i;
	76	int r;
	77
	78	assert(acl);
	79	assert(ret);
	80
	81	r = acl_get_entry(acl, ACL_FIRST_ENTRY, &i);
	82	if (r < 0)
	83	return -errno;
	84	while (r > 0) {
	85	uid_t *old_uid, new_uid;
	86	bool modify = false;
	87	acl_tag_t tag;
	88
	89	if (acl_get_tag_type(i, &tag) < 0)
	90	return -errno;
	91
	92	if (IN_SET(tag, ACL_USER, ACL_GROUP)) {
	93
5238e957	94	/* We don't distinguish here between uid_t and gid_t, let's make sure the compiler checks that
7336138e LP	95	* this is actually OK */
	96	assert_cc(sizeof(uid_t) == sizeof(gid_t));
	97
	98	old_uid = acl_get_qualifier(i);
	99	if (!old_uid)
	100	return -errno;
	101
	102	new_uid = shift \| (*old_uid & UINT32_C(0xFFFF));
	103	if (!uid_is_valid(new_uid))
	104	return -EINVAL;
	105
	106	modify = new_uid != *old_uid;
	107	if (modify && !copy) {
	108	int n;
	109
	110	/* There's no copy of the ACL yet? if so, let's create one, and start the loop from the
	111	* beginning, so that we copy all entries, starting from the first, this time. */
	112
	113	n = acl_entries(acl);
	114	if (n < 0)
	115	return -errno;
	116
	117	copy = acl_init(n);
	118	if (!copy)
	119	return -errno;
	120
	121	/* Seek back to the beginning */
	122	r = acl_get_entry(acl, ACL_FIRST_ENTRY, &i);
	123	if (r < 0)
	124	return -errno;
	125	continue;
	126	}
	127	}
	128
	129	if (copy) {
	130	acl_entry_t new_entry;
	131
	132	if (acl_create_entry(&copy, &new_entry) < 0)
	133	return -errno;
	134
	135	if (acl_copy_entry(new_entry, i) < 0)
	136	return -errno;
	137
	138	if (modify)
	139	if (acl_set_qualifier(new_entry, &new_uid) < 0)
	140	return -errno;
	141	}
	142
	143	r = acl_get_entry(acl, ACL_NEXT_ENTRY, &i);
	144	if (r < 0)
	145	return -errno;
	146	}
	147
1cc6c93a	148	*ret = TAKE_PTR(copy);
7336138e LP	149
	150	return !!*ret;
	151	}
	152
	153	static int patch_acls(int fd, const char name, const struct stat st, uid_t shift) {
	154	_cleanup_(acl_freep) acl_t acl = NULL, shifted = NULL;
	155	bool changed = false;
	156	int r;
	157
	158	assert(fd >= 0);
	159	assert(st);
	160
	161	/* ACLs are not supported on symlinks, there's no point in trying */
	162	if (S_ISLNK(st->st_mode))
	163	return 0;
	164
	165	r = get_acl(fd, name, ACL_TYPE_ACCESS, &acl);
	166	if (r == -EOPNOTSUPP)
	167	return 0;
	168	if (r < 0)
	169	return r;
	170
	171	r = shift_acl(acl, shift, &shifted);
	172	if (r < 0)
	173	return r;
	174	if (r > 0) {
	175	r = set_acl(fd, name, ACL_TYPE_ACCESS, shifted);
	176	if (r < 0)
	177	return r;
	178
	179	changed = true;
	180	}
	181
	182	if (S_ISDIR(st->st_mode)) {
	183	acl_free(acl);
34680637 LP	184
	185	if (shifted)
	186	acl_free(shifted);
7336138e LP	187
	188	acl = shifted = NULL;
	189
	190	r = get_acl(fd, name, ACL_TYPE_DEFAULT, &acl);
	191	if (r < 0)
	192	return r;
	193
	194	r = shift_acl(acl, shift, &shifted);
	195	if (r < 0)
	196	return r;
	197	if (r > 0) {
	198	r = set_acl(fd, name, ACL_TYPE_DEFAULT, shifted);
	199	if (r < 0)
	200	return r;
	201
	202	changed = true;
	203	}
	204	}
	205
	206	return changed;
	207	}
	208
	209	#else
	210
	211	static int patch_acls(int fd, const char name, const struct stat st, uid_t shift) {
	212	return 0;
	213	}
	214
	215	#endif
	216
	217	static int patch_fd(int fd, const char name, const struct stat st, uid_t shift) {
	218	uid_t new_uid;
	219	gid_t new_gid;
	220	bool changed = false;
	221	int r;
	222
	223	assert(fd >= 0);
	224	assert(st);
	225
	226	new_uid = shift \| (st->st_uid & UINT32_C(0xFFFF));
	227	new_gid = (gid_t) shift \| (st->st_gid & UINT32_C(0xFFFF));
	228
	229	if (!uid_is_valid(new_uid) \|\| !gid_is_valid(new_gid))
	230	return -EINVAL;
	231
	232	if (st->st_uid != new_uid \|\| st->st_gid != new_gid) {
	233	if (name)
	234	r = fchownat(fd, name, new_uid, new_gid, AT_SYMLINK_NOFOLLOW);
	235	else
	236	r = fchown(fd, new_uid, new_gid);
	237	if (r < 0)
	238	return -errno;
	239
	240	/* The Linux kernel alters the mode in some cases of chown(). Let's undo this. */
0c6aeb46 LP	241	if (name) {
0c6aeb46 LP	242	if (!S_ISLNK(st->st_mode))
677e6445	243	r = fchmodat(fd, name, st->st_mode, 0);
0cdffada	244	else /* Changing the mode of a symlink is not supported by Linux kernel. Don't bother. */
677e6445	245	r = 0;
0c6aeb46	246	} else
677e6445	247	r = fchmod(fd, st->st_mode);
7336138e	248	if (r < 0)
677e6445	249	return -errno;
7336138e LP	250
	251	changed = true;
	252	}
	253
	254	r = patch_acls(fd, name, st, shift);
	255	if (r < 0)
	256	return r;
	257
	258	return r > 0 \|\| changed;
	259	}
	260
231bfb1b DH	261	/*
	262	* Check if the filesystem is fully compatible with user namespaces or
	263	* UID/GID patching. Some filesystems in this list can be fully mounted inside
	264	* user namespaces, however their inodes may relate to host resources or only
	265	* valid in the global user namespace, therefore no patching should be applied.
	266	*/
3603efde LP	267	static int is_fs_fully_userns_compatible(const struct statfs *sfs) {
	268
	269	assert(sfs);
	270
	271	return F_TYPE_EQUAL(sfs->f_type, BINFMTFS_MAGIC) \|\|
	272	F_TYPE_EQUAL(sfs->f_type, CGROUP_SUPER_MAGIC) \|\|
	273	F_TYPE_EQUAL(sfs->f_type, CGROUP2_SUPER_MAGIC) \|\|
	274	F_TYPE_EQUAL(sfs->f_type, DEBUGFS_MAGIC) \|\|
	275	F_TYPE_EQUAL(sfs->f_type, DEVPTS_SUPER_MAGIC) \|\|
	276	F_TYPE_EQUAL(sfs->f_type, EFIVARFS_MAGIC) \|\|
	277	F_TYPE_EQUAL(sfs->f_type, HUGETLBFS_MAGIC) \|\|
	278	F_TYPE_EQUAL(sfs->f_type, MQUEUE_MAGIC) \|\|
	279	F_TYPE_EQUAL(sfs->f_type, PROC_SUPER_MAGIC) \|\|
	280	F_TYPE_EQUAL(sfs->f_type, PSTOREFS_MAGIC) \|\|
	281	F_TYPE_EQUAL(sfs->f_type, SELINUX_MAGIC) \|\|
	282	F_TYPE_EQUAL(sfs->f_type, SMACK_MAGIC) \|\|
	283	F_TYPE_EQUAL(sfs->f_type, SECURITYFS_MAGIC) \|\|
	284	F_TYPE_EQUAL(sfs->f_type, BPF_FS_MAGIC) \|\|
	285	F_TYPE_EQUAL(sfs->f_type, TRACEFS_MAGIC) \|\|
	286	F_TYPE_EQUAL(sfs->f_type, SYSFS_MAGIC);
88cd066e LP	287	}
88cd066e LP	288
4aeb20f5	289	static int recurse_fd(int fd, bool donate_fd, const struct stat *st, uid_t shift, bool is_toplevel) {
3603efde	290	_cleanup_closedir_ DIR *d = NULL;
7336138e	291	bool changed = false;
3603efde	292	struct statfs sfs;
7336138e LP	293	int r;
	294
	295	assert(fd >= 0);
	296
3603efde LP	297	if (fstatfs(fd, &sfs) < 0)
	298	return -errno;
	299
	300	/* We generally want to permit crossing of mount boundaries when patching the UIDs/GIDs. However, we probably
	301	* shouldn't do this for /proc and /sys if that is already mounted into place. Hence, let's stop the recursion
	302	* when we hit procfs, sysfs or some other special file systems. */
	303
	304	r = is_fs_fully_userns_compatible(&sfs);
88cd066e LP	305	if (r < 0)
	306	goto finish;
	307	if (r > 0) {
	308	r = 0; /* don't recurse */
	309	goto finish;
	310	}
	311
3603efde LP	312	/* Also, if we hit a read-only file system, then don't bother, skip the whole subtree */
	313	if ((sfs.f_flags & ST_RDONLY) \|\|
	314	access_fd(fd, W_OK) == -EROFS)
	315	goto read_only;
7336138e LP	316
7336138e LP	317	if (S_ISDIR(st->st_mode)) {
7336138e LP	318	if (!donate_fd) {
	319	int copy;
	320
	321	copy = fcntl(fd, F_DUPFD_CLOEXEC, 3);
88cd066e LP	322	if (copy < 0) {
	323	r = -errno;
	324	goto finish;
	325	}
7336138e LP	326
	327	fd = copy;
	328	donate_fd = true;
	329	}
	330
9f81a592	331	d = take_fdopendir(&fd);
7336138e LP	332	if (!d) {
	333	r = -errno;
	334	goto finish;
	335	}
7336138e LP	336
	337	FOREACH_DIRENT_ALL(de, d, r = -errno; goto finish) {
	338	struct stat fst;
	339
49bfc877	340	if (dot_or_dot_dot(de->d_name))
7336138e LP	341	continue;
	342
	343	if (fstatat(dirfd(d), de->d_name, &fst, AT_SYMLINK_NOFOLLOW) < 0) {
	344	r = -errno;
	345	goto finish;
	346	}
	347
	348	if (S_ISDIR(fst.st_mode)) {
	349	int subdir_fd;
	350
	351	subdir_fd = openat(dirfd(d), de->d_name, O_RDONLY\|O_NONBLOCK\|O_DIRECTORY\|O_CLOEXEC\|O_NOFOLLOW\|O_NOATIME);
	352	if (subdir_fd < 0) {
	353	r = -errno;
	354	goto finish;
	355
	356	}
	357
4aeb20f5	358	r = recurse_fd(subdir_fd, true, &fst, shift, false);
7336138e LP	359	if (r < 0)
	360	goto finish;
	361	if (r > 0)
	362	changed = true;
	363
	364	} else {
	365	r = patch_fd(dirfd(d), de->d_name, &fst, shift);
	366	if (r < 0)
	367	goto finish;
	368	if (r > 0)
	369	changed = true;
	370	}
	371	}
	372	}
	373
3603efde LP	374	/* After we descended, also patch the directory itself. It's key to do this in this order so that the top-level
	375	* directory is patched as very last object in the tree, so that we can use it as quick indicator whether the
	376	* tree is properly chown()ed already. */
	377	r = patch_fd(d ? dirfd(d) : fd, NULL, st, shift);
	378	if (r == -EROFS)
	379	goto read_only;
	380	if (r > 0)
	381	changed = true;
	382
7336138e	383	r = changed;
3603efde LP	384	goto finish;
	385
	386	read_only:
	387	if (!is_toplevel) {
	388	_cleanup_free_ char *name = NULL;
	389
	390	/* When we hit a ready-only subtree we simply skip it, but log about it. */
	391	(void) fd_get_path(fd, &name);
ebcf6976	392	log_debug("Skipping read-only file or directory %s.", strna(name));
3603efde LP	393	r = changed;
3603efde LP	394	}
7336138e LP	395
	396	finish:
	397	if (donate_fd)
	398	safe_close(fd);
	399
	400	return r;
	401	}
	402
	403	static int fd_patch_uid_internal(int fd, bool donate_fd, uid_t shift, uid_t range) {
	404	struct stat st;
	405	int r;
	406
	407	assert(fd >= 0);
	408
	409	/* Recursively adjusts the UID/GIDs of all files of a directory tree. This is used to automatically fix up an
	410	* OS tree to the used user namespace UID range. Note that this automatic adjustment only works for UID ranges
da890466	411	* following the concept that the upper 16-bit of a UID identify the container, and the lower 16-bit are the actual
7336138e LP	412	* UID within the container. */
	413
	414	if ((shift & 0xFFFF) != 0) {
	415	/* We only support containers where the shift starts at a 2^16 boundary */
	416	r = -EOPNOTSUPP;
	417	goto finish;
	418	}
	419
3603efde LP	420	if (shift == UID_BUSY_BASE) {
	421	r = -EINVAL;
	422	goto finish;
	423	}
	424
7336138e	425	if (range != 0x10000) {
da890466	426	/* We only support containers with 16-bit UID ranges for the patching logic */
7336138e LP	427	r = -EOPNOTSUPP;
	428	goto finish;
	429	}
	430
	431	if (fstat(fd, &st) < 0) {
	432	r = -errno;
	433	goto finish;
	434	}
	435
	436	if ((uint32_t) st.st_uid >> 16 != (uint32_t) st.st_gid >> 16) {
	437	/* We only support containers where the uid/gid container ID match */
	438	r = -EBADE;
	439	goto finish;
	440	}
	441
	442	/* Try to detect if the range is already right. Of course, this a pretty drastic optimization, as we assume
da890466	443	* that if the top-level dir has the right upper 16-bit assigned, then everything below will have too... */
7336138e LP	444	if (((uint32_t) (st.st_uid ^ shift) >> 16) == 0)
	445	return 0;
	446
3603efde LP	447	/* Before we start recursively chowning, mark the top-level dir as "busy" by chowning it to the "busy"
	448	* range. Should we be interrupted in the middle of our work, we'll see it owned by this user and will start
	449	* chown()ing it again, unconditionally, as the busy UID is not a valid UID we'd everpick for ourselves. */
	450
	451	if ((st.st_uid & UID_BUSY_MASK) != UID_BUSY_BASE) {
	452	if (fchown(fd,
	453	UID_BUSY_BASE \| (st.st_uid & ~UID_BUSY_MASK),
	454	(gid_t) UID_BUSY_BASE \| (st.st_gid & ~(gid_t) UID_BUSY_MASK)) < 0) {
	455	r = -errno;
	456	goto finish;
	457	}
	458	}
	459
4aeb20f5	460	return recurse_fd(fd, donate_fd, &st, shift, true);
7336138e LP	461
	462	finish:
	463	if (donate_fd)
	464	safe_close(fd);
	465
	466	return r;
	467	}
	468
7336138e LP	469	int path_patch_uid(const char *path, uid_t shift, uid_t range) {
	470	int fd;
	471
	472	fd = open(path, O_RDONLY\|O_NONBLOCK\|O_DIRECTORY\|O_CLOEXEC\|O_NOFOLLOW\|O_NOATIME);
	473	if (fd < 0)
	474	return -errno;
	475
	476	return fd_patch_uid_internal(fd, true, shift, range);
	477	}