[thirdparty/systemd.git] / src / nspawn / nspawn-patch-uid.c

/* SPDX-License-Identifier: LGPL-2.1+ */
/***
  This file is part of systemd.

  Copyright 2016 Lennart Poettering

  systemd is free software; you can redistribute it and/or modify it
  under the terms of the GNU Lesser General Public License as published by
  the Free Software Foundation; either version 2.1 of the License, or
  (at your option) any later version.

  systemd is distributed in the hope that it will be useful, but
  WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  Lesser General Public License for more details.

  You should have received a copy of the GNU Lesser General Public License
  along with systemd; If not, see <http://www.gnu.org/licenses/>.
***/

#include <fcntl.h>
#include <linux/magic.h>
#if HAVE_ACL
#include <sys/acl.h>
#endif
#include <sys/stat.h>
#include <sys/statvfs.h>
#include <sys/vfs.h>
#include <unistd.h>

#include "acl-util.h"
#include "dirent-util.h"
#include "fd-util.h"
#include "fs-util.h"
#include "missing.h"
#include "nspawn-def.h"
#include "nspawn-patch-uid.h"
#include "stat-util.h"
#include "stdio-util.h"
#include "string-util.h"
#include "strv.h"
#include "user-util.h"

#if HAVE_ACL

/* Reads an ACL of the given type from an inode.
 *
 * If "name" is NULL the inode referred to by "fd" itself is queried, otherwise the entry "name" below the
 * directory "fd" is. The entry is opened with O_PATH|O_NOFOLLOW and then addressed via its /proc/self/fd/
 * path, since acl_get_file() only takes a path.
 *
 * On success stores the ACL in *ret (the caller releases it with acl_free()) and returns 0. On failure
 * returns a negative errno-style error and leaves *ret untouched. */
static int get_acl(int fd, const char *name, acl_type_t type, acl_t *ret) {
        char procfs_path[STRLEN("/proc/self/fd/") + DECIMAL_STR_MAX(int) + 1];
        _cleanup_close_ int child_fd = -1;
        acl_t result;

        assert(fd >= 0);
        assert(ret);

        if (!name && type == ACL_TYPE_ACCESS)
                /* The access ACL of an fd we already hold can be read directly. */
                result = acl_get_fd(fd);
        else {
                int target = fd;

                if (name) {
                        child_fd = openat(fd, name, O_PATH|O_CLOEXEC|O_NOFOLLOW);
                        if (child_fd < 0)
                                return -errno;

                        target = child_fd;
                }

                /* acl_get_fd() is only used for access ACLs above; everything else goes through the path
                 * based call, pointed at the fd's procfs link. */
                xsprintf(procfs_path, "/proc/self/fd/%i", target);
                result = acl_get_file(procfs_path, type);
        }
        if (!result)
                return -errno;

        *ret = result;
        return 0;
}

/* Writes an ACL of the given type to an inode.
 *
 * If "name" is NULL the inode referred to by "fd" itself is modified, otherwise the entry "name" below the
 * directory "fd" is. The entry is opened with O_PATH|O_NOFOLLOW and then addressed via its /proc/self/fd/
 * path, since acl_set_file() only takes a path.
 *
 * Returns 0 on success, a negative errno-style error otherwise. */
static int set_acl(int fd, const char *name, acl_type_t type, acl_t acl) {
        char procfs_path[STRLEN("/proc/self/fd/") + DECIMAL_STR_MAX(int) + 1];
        _cleanup_close_ int child_fd = -1;
        int q;

        assert(fd >= 0);
        assert(acl);

        if (!name && type == ACL_TYPE_ACCESS)
                /* The access ACL of an fd we already hold can be written directly. */
                q = acl_set_fd(fd, acl);
        else {
                int target = fd;

                if (name) {
                        child_fd = openat(fd, name, O_PATH|O_CLOEXEC|O_NOFOLLOW);
                        if (child_fd < 0)
                                return -errno;

                        target = child_fd;
                }

                xsprintf(procfs_path, "/proc/self/fd/%i", target);
                q = acl_set_file(procfs_path, type, acl);
        }
        if (q < 0)
                return -errno;

        return 0;
}

/* Produces a copy of "acl" in which the qualifier of every ACL_USER/ACL_GROUP entry has its upper bits
 * replaced by "shift", keeping only the lower 16 bits of the original ID.
 *
 * A copy is only allocated if at least one entry actually needs changing. Return values:
 *   > 0: *ret holds a newly allocated, shifted ACL (the caller releases it with acl_free())
 *     0: nothing needed changing, *ret is set to NULL
 *   < 0: negative errno-style error (-EINVAL if a shifted ID would not be a valid UID) */
static int shift_acl(acl_t acl, uid_t shift, acl_t *ret) {
        _cleanup_(acl_freep) acl_t copy = NULL;
        acl_entry_t i;
        int r;

        assert(acl);
        assert(ret);

        r = acl_get_entry(acl, ACL_FIRST_ENTRY, &i);
        if (r < 0)
                return -errno;
        while (r > 0) {
                uid_t new_uid = 0;
                bool modify = false;
                acl_tag_t tag;

                if (acl_get_tag_type(i, &tag) < 0)
                        return -errno;

                if (IN_SET(tag, ACL_USER, ACL_GROUP)) {
                        uid_t *qualifier, old_uid;

                        /* We don't distinguish here between uid_t and gid_t, let's make sure the compiler checks that
                         * this is actually OK */
                        assert_cc(sizeof(uid_t) == sizeof(gid_t));

                        qualifier = acl_get_qualifier(i);
                        if (!qualifier)
                                return -errno;

                        /* acl_get_qualifier() hands out memory the caller has to release with acl_free(). Copy
                         * the value out and release it right away, so that none of the exit paths below (nor
                         * the next loop iteration) can leak it. */
                        old_uid = *qualifier;
                        (void) acl_free(qualifier);

                        new_uid = shift | (old_uid & UINT32_C(0xFFFF));
                        if (!uid_is_valid(new_uid))
                                return -EINVAL;

                        modify = new_uid != old_uid;
                        if (modify && !copy) {
                                int n;

                                /* There's no copy of the ACL yet? if so, let's create one, and start the loop from the
                                 * beginning, so that we copy all entries, starting from the first, this time. */

                                n = acl_entries(acl);
                                if (n < 0)
                                        return -errno;

                                copy = acl_init(n);
                                if (!copy)
                                        return -errno;

                                /* Seek back to the beginning */
                                r = acl_get_entry(acl, ACL_FIRST_ENTRY, &i);
                                if (r < 0)
                                        return -errno;
                                continue;
                        }
                }

                if (copy) {
                        acl_entry_t new_entry;

                        if (acl_create_entry(&copy, &new_entry) < 0)
                                return -errno;

                        if (acl_copy_entry(new_entry, i) < 0)
                                return -errno;

                        /* Entries that need no change are copied verbatim, the others get the shifted ID. */
                        if (modify)
                                if (acl_set_qualifier(new_entry, &new_uid) < 0)
                                        return -errno;
                }

                r = acl_get_entry(acl, ACL_NEXT_ENTRY, &i);
                if (r < 0)
                        return -errno;
        }

        /* Pass ownership of the copy (if any) to the caller. */
        *ret = TAKE_PTR(copy);

        return !!*ret;
}

static int patch_acls(int fd, const char *name, const struct stat *st, uid_t shift) {
        _cleanup_(acl_freep) acl_t acl = NULL, shifted = NULL;
        bool changed = false;
        int r;

        assert(fd >= 0);
        assert(st);

        /* ACLs are not supported on symlinks, there's no point in trying */
        if (S_ISLNK(st->st_mode))
                return 0;

        r = get_acl(fd, name, ACL_TYPE_ACCESS, &acl);
        if (r == -EOPNOTSUPP)
                return 0;
        if (r < 0)
                return r;

        r = shift_acl(acl, shift, &shifted);
        if (r < 0)
                return r;
        if (r > 0) {
                r = set_acl(fd, name, ACL_TYPE_ACCESS, shifted);
                if (r < 0)
                        return r;

                changed = true;
        }

        if (S_ISDIR(st->st_mode)) {
                acl_free(acl);
                acl_free(shifted);

                acl = shifted = NULL;

                r = get_acl(fd, name, ACL_TYPE_DEFAULT, &acl);
                if (r < 0)
                        return r;

                r = shift_acl(acl, shift, &shifted);
                if (r < 0)
                        return r;
                if (r > 0) {
                        r = set_acl(fd, name, ACL_TYPE_DEFAULT, shifted);
                        if (r < 0)
                                return r;

                        changed = true;
                }
        }

        return changed;
}

#else

/* Variant used when built without ACL support: there are no ACLs to patch, hence always report
 * "nothing changed" (0). All parameters are ignored. */
static int patch_acls(int fd, const char *name, const struct stat *st, uid_t shift) {
        (void) fd;
        (void) name;
        (void) st;
        (void) shift;

        return 0;
}

#endif

/* Adjusts ownership and ACLs of a single inode to the UID/GID range starting at "shift".
 *
 * The inode is "fd" itself if "name" is NULL, otherwise the entry "name" below the directory "fd" (symlinks
 * are not followed). "st" is the inode's current stat data. The new owner is "shift" combined with the lower
 * 16 bits of the current UID/GID.
 *
 * Returns > 0 if ownership or ACLs were changed, 0 if everything was in place already, and a negative
 * errno-style error otherwise (-EINVAL if the resulting UID or GID would not be valid). */
static int patch_fd(int fd, const char *name, const struct stat *st, uid_t shift) {
        uid_t target_uid;
        gid_t target_gid;
        bool owner_changed;
        int q;

        assert(fd >= 0);
        assert(st);

        target_uid =         shift | (st->st_uid & UINT32_C(0xFFFF));
        target_gid = (gid_t) shift | (st->st_gid & UINT32_C(0xFFFF));

        if (!(uid_is_valid(target_uid) && gid_is_valid(target_gid)))
                return -EINVAL;

        owner_changed = st->st_uid != target_uid || st->st_gid != target_gid;
        if (owner_changed) {
                q = name ? fchownat(fd, name, target_uid, target_gid, AT_SYMLINK_NOFOLLOW)
                         : fchown(fd, target_uid, target_gid);
                if (q < 0)
                        return -errno;

                /* The Linux kernel alters the mode in some cases of chown(). Let's undo this. */
                if (!name)
                        q = fchmod(fd, st->st_mode);
                else if (S_ISLNK(st->st_mode))
                        /* AT_SYMLINK_NOFOLLOW is not available for fchmodat() */
                        q = 0;
                else
                        q = fchmodat(fd, name, st->st_mode, 0);
                if (q < 0)
                        return -errno;
        }

        q = patch_acls(fd, name, st, shift);
        if (q < 0)
                return q;

        return owner_changed || q > 0;
}

/*
 * Tells whether the file system described by "sfs" is one that is fully
 * compatible with user namespaces or otherwise must not be UID/GID patched.
 * Some of the file systems listed here can be fully mounted inside user
 * namespaces, however their inodes may relate to host resources or only be
 * valid in the global user namespace, therefore no patching should be applied.
 *
 * Returns 1 if the file system type is on the list, 0 if it is not.
 */
static int is_fs_fully_userns_compatible(const struct statfs *sfs) {

        assert(sfs);

        /* procfs, sysfs and the control group hierarchies */
        if (F_TYPE_EQUAL(sfs->f_type, PROC_SUPER_MAGIC) ||
            F_TYPE_EQUAL(sfs->f_type, SYSFS_MAGIC) ||
            F_TYPE_EQUAL(sfs->f_type, CGROUP_SUPER_MAGIC) ||
            F_TYPE_EQUAL(sfs->f_type, CGROUP2_SUPER_MAGIC))
                return true;

        /* securityfs, SELinux and SMACK */
        if (F_TYPE_EQUAL(sfs->f_type, SECURITYFS_MAGIC) ||
            F_TYPE_EQUAL(sfs->f_type, SELINUX_MAGIC) ||
            F_TYPE_EQUAL(sfs->f_type, SMACK_MAGIC))
                return true;

        /* debugfs, tracefs, bpffs and pstore */
        if (F_TYPE_EQUAL(sfs->f_type, DEBUGFS_MAGIC) ||
            F_TYPE_EQUAL(sfs->f_type, TRACEFS_MAGIC) ||
            F_TYPE_EQUAL(sfs->f_type, BPF_FS_MAGIC) ||
            F_TYPE_EQUAL(sfs->f_type, PSTOREFS_MAGIC))
                return true;

        /* All remaining entries of the list */
        if (F_TYPE_EQUAL(sfs->f_type, BINFMTFS_MAGIC) ||
            F_TYPE_EQUAL(sfs->f_type, DEVPTS_SUPER_MAGIC) ||
            F_TYPE_EQUAL(sfs->f_type, EFIVARFS_MAGIC) ||
            F_TYPE_EQUAL(sfs->f_type, HUGETLBFS_MAGIC) ||
            F_TYPE_EQUAL(sfs->f_type, MQUEUE_MAGIC))
                return true;

        return false;
}

/* Patches ownership and ACLs of the inode "fd" refers to and, if "st" says it is a directory, of everything
 * below it.
 *
 *   fd          - the inode to process
 *   donate_fd   - if true, this function takes possession of "fd" and closes it on every exit path; if
 *                 false, "fd" is left open and a duplicate is used for reading the directory
 *   st          - stat data of "fd"
 *   shift       - base of the UID/GID range to shift everything into
 *   is_toplevel - whether this is the root of the tree being patched; affects read-only handling only
 *
 * File systems for which is_fs_fully_userns_compatible() returns true are not entered (0 is returned).
 * Read-only file systems are skipped: below the top level this is logged and not treated as an error, while
 * at the top level 0 is returned if the read-only state was detected up front and -EROFS if only the final
 * patch_fd() call hit it.
 *
 * Returns > 0 if anything was changed, 0 if nothing was changed, and a negative errno-style error
 * otherwise. */
static int recurse_fd(int fd, bool donate_fd, const struct stat *st, uid_t shift, bool is_toplevel) {
        _cleanup_closedir_ DIR *d = NULL;
        bool changed = false;
        struct statfs sfs;
        int r;

        assert(fd >= 0);

        if (fstatfs(fd, &sfs) < 0) {
                /* Leave via "finish" rather than returning directly, so that a donated fd is not leaked. */
                r = -errno;
                goto finish;
        }

        /* We generally want to permit crossing of mount boundaries when patching the UIDs/GIDs. However, we probably
         * shouldn't do this for /proc and /sys if that is already mounted into place. Hence, let's stop the recursion
         * when we hit procfs, sysfs or some other special file systems. */

        r = is_fs_fully_userns_compatible(&sfs);
        if (r < 0)
                goto finish;
        if (r > 0) {
                r = 0; /* don't recurse */
                goto finish;
        }

        /* Also, if we hit a read-only file system, then don't bother, skip the whole subtree */
        if ((sfs.f_flags & ST_RDONLY) ||
            access_fd(fd, W_OK) == -EROFS)
                goto read_only;

        if (S_ISDIR(st->st_mode)) {
                struct dirent *de;

                if (!donate_fd) {
                        int copy;

                        /* fdopendir() takes possession of the fd passed to it. If the caller wants to keep
                         * its fd, work on a duplicate that we own instead. */
                        copy = fcntl(fd, F_DUPFD_CLOEXEC, 3);
                        if (copy < 0) {
                                r = -errno;
                                goto finish;
                        }

                        fd = copy;
                        donate_fd = true;
                }

                d = fdopendir(fd);
                if (!d) {
                        r = -errno;
                        goto finish;
                }
                fd = -1; /* now owned by "d", which is closed by _cleanup_closedir_ */

                FOREACH_DIRENT_ALL(de, d, r = -errno; goto finish) {
                        struct stat fst;

                        if (dot_or_dot_dot(de->d_name))
                                continue;

                        if (fstatat(dirfd(d), de->d_name, &fst, AT_SYMLINK_NOFOLLOW) < 0) {
                                r = -errno;
                                goto finish;
                        }

                        if (S_ISDIR(fst.st_mode)) {
                                int subdir_fd;

                                subdir_fd = openat(dirfd(d), de->d_name, O_RDONLY|O_NONBLOCK|O_DIRECTORY|O_CLOEXEC|O_NOFOLLOW|O_NOATIME);
                                if (subdir_fd < 0) {
                                        r = -errno;
                                        goto finish;

                                }

                                /* subdir_fd is donated, the callee closes it in all cases. */
                                r = recurse_fd(subdir_fd, true, &fst, shift, false);
                                if (r < 0)
                                        goto finish;
                                if (r > 0)
                                        changed = true;

                        } else {
                                r = patch_fd(dirfd(d), de->d_name, &fst, shift);
                                if (r < 0)
                                        goto finish;
                                if (r > 0)
                                        changed = true;
                        }
                }
        }

        /* After we descended, also patch the directory itself. It's key to do this in this order so that the top-level
         * directory is patched as very last object in the tree, so that we can use it as quick indicator whether the
         * tree is properly chown()ed already. */
        r = patch_fd(d ? dirfd(d) : fd, NULL, st, shift);
        if (r == -EROFS)
                goto read_only;
        if (r > 0)
                changed = true;

        r = changed;
        goto finish;

read_only:
        if (!is_toplevel) {
                _cleanup_free_ char *name = NULL;

                /* When we hit a read-only subtree we simply skip it, but log about it. If we get here after
                 * the directory stream was set up, "fd" is -1 already, hence use the stream's fd for
                 * looking up the path in that case. */
                (void) fd_get_path(d ? dirfd(d) : fd, &name);
                log_debug("Skippping read-only file or directory %s.", strna(name));
                r = changed;
        }

finish:
        if (donate_fd)
                safe_close(fd);

        return r;
}

/* Validates the parameters and, if needed, starts the recursive UID/GID patching of the tree rooted at "fd".
 *
 *   fd        - the top-level directory of the tree
 *   donate_fd - if true, "fd" is closed before returning, on every exit path
 *   shift     - base of the UID/GID range to shift the tree into; must be a multiple of 2^16
 *   range     - size of the UID/GID range; must be exactly 0x10000
 *
 * Returns:
 *   > 0 / 0      - result of recurse_fd(): whether anything was changed; 0 is also returned right away if
 *                  the top-level directory is owned by the requested range already
 *   -EOPNOTSUPP  - "shift" is not 2^16 aligned, or "range" is not 0x10000
 *   -EINVAL      - "shift" is UID_BUSY_BASE
 *   -EBADE       - the upper 16 bits of the top-level directory's UID and GID differ
 *   other < 0    - errno-style error of a failed system call */
static int fd_patch_uid_internal(int fd, bool donate_fd, uid_t shift, uid_t range) {
        struct stat st;
        int r;

        assert(fd >= 0);

        /* Recursively adjusts the UID/GIDs of all files of a directory tree. This is used to automatically fix up an
         * OS tree to the used user namespace UID range. Note that this automatic adjustment only works for UID ranges
         * following the concept that the upper 16bit of a UID identify the container, and the lower 16bit are the actual
         * UID within the container. */

        if ((shift & 0xFFFF) != 0) {
                /* We only support containers where the shift starts at a 2^16 boundary */
                r = -EOPNOTSUPP;
                goto finish;
        }

        if (shift == UID_BUSY_BASE) {
                r = -EINVAL;
                goto finish;
        }

        if (range != 0x10000) {
                /* We only support containers with 16bit UID ranges for the patching logic */
                r = -EOPNOTSUPP;
                goto finish;
        }

        if (fstat(fd, &st) < 0) {
                r = -errno;
                goto finish;
        }

        if ((uint32_t) st.st_uid >> 16 != (uint32_t) st.st_gid >> 16) {
                /* We only support containers where the uid/gid container ID match */
                r = -EBADE;
                goto finish;
        }

        /* Try to detect if the range is already right. Of course, this a pretty drastic optimization, as we assume
         * that if the top-level dir has the right upper 16bit assigned, then everything below will have too... */
        if (((uint32_t) (st.st_uid ^ shift) >> 16) == 0) {
                /* Leave via "finish" rather than returning directly, so that a donated fd is not leaked. */
                r = 0;
                goto finish;
        }

        /* Before we start recursively chowning, mark the top-level dir as "busy" by chowning it to the "busy"
         * range. Should we be interrupted in the middle of our work, we'll see it owned by this user and will start
         * chown()ing it again, unconditionally, as the busy UID is not a valid UID we'd ever pick for ourselves. */

        if ((st.st_uid & UID_BUSY_MASK) != UID_BUSY_BASE) {
                if (fchown(fd,
                           UID_BUSY_BASE | (st.st_uid & ~UID_BUSY_MASK),
                           (gid_t) UID_BUSY_BASE | (st.st_gid & ~(gid_t) UID_BUSY_MASK)) < 0) {
                        r = -errno;
                        goto finish;
                }
        }

        /* From here on recurse_fd() is responsible for "fd": it is passed the same donate_fd flag. */
        return recurse_fd(fd, donate_fd, &st, shift, true);

finish:
        if (donate_fd)
                safe_close(fd);

        return r;
}

/* Patches the UID/GID ownership of the directory tree rooted at "fd" to the range described by "shift" and
 * "range". The fd is not donated, i.e. fd_patch_uid_internal() is called with donate_fd=false. Returns
 * whatever fd_patch_uid_internal() returns. */
int fd_patch_uid(int fd, uid_t shift, uid_t range) {
        const bool donate_fd = false;

        return fd_patch_uid_internal(fd, donate_fd, shift, range);
}

/* Like fd_patch_uid(), but takes the path of the top-level directory. The path must refer to a directory
 * and is not followed if it is a symlink (O_DIRECTORY|O_NOFOLLOW). Returns a negative errno-style error if
 * the directory cannot be opened, otherwise whatever fd_patch_uid_internal() returns. */
int path_patch_uid(const char *path, uid_t shift, uid_t range) {
        const int open_flags = O_RDONLY|O_NONBLOCK|O_DIRECTORY|O_CLOEXEC|O_NOFOLLOW|O_NOATIME;
        int root_fd;

        root_fd = open(path, open_flags);
        if (root_fd < 0)
                return -errno;

        /* The fd is donated (donate_fd=true): it is not closed here. */
        return fd_patch_uid_internal(root_fd, true, shift, range);
}
Commit	Line	Data
53e1b683	1	/* SPDX-License-Identifier: LGPL-2.1+ */
7336138e LP	2	/***
	3	This file is part of systemd.
	4
	5	Copyright 2016 Lennart Poettering
	6
	7	systemd is free software; you can redistribute it and/or modify it
	8	under the terms of the GNU Lesser General Public License as published by
	9	the Free Software Foundation; either version 2.1 of the License, or
	10	(at your option) any later version.
	11
	12	systemd is distributed in the hope that it will be useful, but
	13	WITHOUT ANY WARRANTY; without even the implied warranty of
	14	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	15	Lesser General Public License for more details.
	16
	17	You should have received a copy of the GNU Lesser General Public License
	18	along with systemd; If not, see <http://www.gnu.org/licenses/>.
	19	***/
	20
	21	#include <fcntl.h>
88cd066e	22	#include <linux/magic.h>
349cc4a5	23	#if HAVE_ACL
7336138e LP	24	#include <sys/acl.h>
	25	#endif
	26	#include <sys/stat.h>
3603efde	27	#include <sys/statvfs.h>
88cd066e	28	#include <sys/vfs.h>
7336138e LP	29	#include <unistd.h>
	30
	31	#include "acl-util.h"
	32	#include "dirent-util.h"
	33	#include "fd-util.h"
3603efde	34	#include "fs-util.h"
88cd066e	35	#include "missing.h"
3603efde	36	#include "nspawn-def.h"
7336138e	37	#include "nspawn-patch-uid.h"
88cd066e	38	#include "stat-util.h"
7336138e LP	39	#include "stdio-util.h"
	40	#include "string-util.h"
	41	#include "strv.h"
	42	#include "user-util.h"
	43
349cc4a5	44	#if HAVE_ACL
7336138e LP	45
7336138e LP	46	static int get_acl(int fd, const char name, acl_type_t type, acl_t ret) {
fbd0b64f	47	char procfs_path[STRLEN("/proc/self/fd/") + DECIMAL_STR_MAX(int) + 1];
7336138e LP	48	acl_t acl;
	49
	50	assert(fd >= 0);
	51	assert(ret);
	52
	53	if (name) {
	54	_cleanup_close_ int child_fd = -1;
	55
	56	child_fd = openat(fd, name, O_PATH\|O_CLOEXEC\|O_NOFOLLOW);
	57	if (child_fd < 0)
	58	return -errno;
	59
	60	xsprintf(procfs_path, "/proc/self/fd/%i", child_fd);
	61	acl = acl_get_file(procfs_path, type);
	62	} else if (type == ACL_TYPE_ACCESS)
	63	acl = acl_get_fd(fd);
	64	else {
	65	xsprintf(procfs_path, "/proc/self/fd/%i", fd);
	66	acl = acl_get_file(procfs_path, type);
	67	}
	68	if (!acl)
	69	return -errno;
	70
	71	*ret = acl;
	72	return 0;
	73	}
	74
	75	static int set_acl(int fd, const char *name, acl_type_t type, acl_t acl) {
fbd0b64f	76	char procfs_path[STRLEN("/proc/self/fd/") + DECIMAL_STR_MAX(int) + 1];
7336138e LP	77	int r;
	78
	79	assert(fd >= 0);
	80	assert(acl);
	81
	82	if (name) {
	83	_cleanup_close_ int child_fd = -1;
	84
	85	child_fd = openat(fd, name, O_PATH\|O_CLOEXEC\|O_NOFOLLOW);
	86	if (child_fd < 0)
	87	return -errno;
	88
	89	xsprintf(procfs_path, "/proc/self/fd/%i", child_fd);
	90	r = acl_set_file(procfs_path, type, acl);
	91	} else if (type == ACL_TYPE_ACCESS)
	92	r = acl_set_fd(fd, acl);
	93	else {
	94	xsprintf(procfs_path, "/proc/self/fd/%i", fd);
	95	r = acl_set_file(procfs_path, type, acl);
	96	}
	97	if (r < 0)
	98	return -errno;
	99
	100	return 0;
	101	}
	102
	103	static int shift_acl(acl_t acl, uid_t shift, acl_t *ret) {
	104	_cleanup_(acl_freep) acl_t copy = NULL;
	105	acl_entry_t i;
	106	int r;
	107
	108	assert(acl);
	109	assert(ret);
	110
	111	r = acl_get_entry(acl, ACL_FIRST_ENTRY, &i);
	112	if (r < 0)
	113	return -errno;
	114	while (r > 0) {
	115	uid_t *old_uid, new_uid;
	116	bool modify = false;
	117	acl_tag_t tag;
	118
	119	if (acl_get_tag_type(i, &tag) < 0)
	120	return -errno;
	121
	122	if (IN_SET(tag, ACL_USER, ACL_GROUP)) {
	123
	124	/* We don't distuingish here between uid_t and gid_t, let's make sure the compiler checks that
	125	* this is actually OK */
	126	assert_cc(sizeof(uid_t) == sizeof(gid_t));
	127
	128	old_uid = acl_get_qualifier(i);
	129	if (!old_uid)
	130	return -errno;
	131
	132	new_uid = shift \| (*old_uid & UINT32_C(0xFFFF));
	133	if (!uid_is_valid(new_uid))
	134	return -EINVAL;
	135
	136	modify = new_uid != *old_uid;
	137	if (modify && !copy) {
	138	int n;
	139
	140	/* There's no copy of the ACL yet? if so, let's create one, and start the loop from the
141	* beginning, so that we copy all entries, starting from the first, this time. */
142
143	n = acl_entries(acl);
144	if (n < 0)
145	return -errno;
146
147	copy = acl_init(n);
148	if (!copy)
149	return -errno;
150
151	/* Seek back to the beginning */
152	r = acl_get_entry(acl, ACL_FIRST_ENTRY, &i);
153	if (r < 0)
154	return -errno;
155	continue;
156	}
157	}
158
159	if (copy) {
160	acl_entry_t new_entry;
161
162	if (acl_create_entry(&copy, &new_entry) < 0)
163	return -errno;
164
165	if (acl_copy_entry(new_entry, i) < 0)
166	return -errno;
167
168	if (modify)
169	if (acl_set_qualifier(new_entry, &new_uid) < 0)
170	return -errno;
171	}
172
173	r = acl_get_entry(acl, ACL_NEXT_ENTRY, &i);
174	if (r < 0)
175	return -errno;
176	}
177
1cc6c93a	178	*ret = TAKE_PTR(copy);
7336138e LP	179
	180	return !!*ret;
	181	}
	182
	183	static int patch_acls(int fd, const char name, const struct stat st, uid_t shift) {
	184	_cleanup_(acl_freep) acl_t acl = NULL, shifted = NULL;
	185	bool changed = false;
	186	int r;
	187
	188	assert(fd >= 0);
	189	assert(st);
	190
	191	/* ACLs are not supported on symlinks, there's no point in trying */
	192	if (S_ISLNK(st->st_mode))
	193	return 0;
	194
	195	r = get_acl(fd, name, ACL_TYPE_ACCESS, &acl);
	196	if (r == -EOPNOTSUPP)
	197	return 0;
	198	if (r < 0)
	199	return r;
	200
	201	r = shift_acl(acl, shift, &shifted);
	202	if (r < 0)
	203	return r;
	204	if (r > 0) {
	205	r = set_acl(fd, name, ACL_TYPE_ACCESS, shifted);
	206	if (r < 0)
	207	return r;
	208
	209	changed = true;
	210	}
	211
	212	if (S_ISDIR(st->st_mode)) {
	213	acl_free(acl);
	214	acl_free(shifted);
	215
	216	acl = shifted = NULL;
	217
	218	r = get_acl(fd, name, ACL_TYPE_DEFAULT, &acl);
	219	if (r < 0)
	220	return r;
	221
	222	r = shift_acl(acl, shift, &shifted);
	223	if (r < 0)
	224	return r;
	225	if (r > 0) {
	226	r = set_acl(fd, name, ACL_TYPE_DEFAULT, shifted);
	227	if (r < 0)
	228	return r;
	229
	230	changed = true;
	231	}
	232	}
	233
	234	return changed;
	235	}
	236
	237	#else
	238
	239	static int patch_acls(int fd, const char name, const struct stat st, uid_t shift) {
	240	return 0;
	241	}
	242
243	#endif
244
245	static int patch_fd(int fd, const char name, const struct stat st, uid_t shift) {
246	uid_t new_uid;
247	gid_t new_gid;
248	bool changed = false;
249	int r;
250
251	assert(fd >= 0);
252	assert(st);
253
254	new_uid = shift \| (st->st_uid & UINT32_C(0xFFFF));
255	new_gid = (gid_t) shift \| (st->st_gid & UINT32_C(0xFFFF));
256
257	if (!uid_is_valid(new_uid) \|\| !gid_is_valid(new_gid))
258	return -EINVAL;
259
260	if (st->st_uid != new_uid \|\| st->st_gid != new_gid) {
261	if (name)
262	r = fchownat(fd, name, new_uid, new_gid, AT_SYMLINK_NOFOLLOW);
263	else
264	r = fchown(fd, new_uid, new_gid);
265	if (r < 0)
266	return -errno;
267
268	/* The Linux kernel alters the mode in some cases of chown(). Let's undo this. */
0c6aeb46 LP	269	if (name) {
	270	if (!S_ISLNK(st->st_mode))
	271	r = fchmodat(fd, name, st->st_mode, 0);
	272	else /* AT_SYMLINK_NOFOLLOW is not available for fchmodat() */
	273	r = 0;
	274	} else
7336138e LP	275	r = fchmod(fd, st->st_mode);
	276	if (r < 0)
	277	return -errno;
	278
	279	changed = true;
	280	}
	281
	282	r = patch_acls(fd, name, st, shift);
	283	if (r < 0)
	284	return r;
	285
	286	return r > 0 \|\| changed;
	287	}
	288
231bfb1b DH	289	/*
	290	* Check if the filesystem is fully compatible with user namespaces or
	291	* UID/GID patching. Some filesystems in this list can be fully mounted inside
	292	* user namespaces, however their inodes may relate to host resources or only
	293	* valid in the global user namespace, therefore no patching should be applied.
	294	*/
3603efde LP	295	static int is_fs_fully_userns_compatible(const struct statfs *sfs) {
	296
	297	assert(sfs);
	298
	299	return F_TYPE_EQUAL(sfs->f_type, BINFMTFS_MAGIC) \|\|
	300	F_TYPE_EQUAL(sfs->f_type, CGROUP_SUPER_MAGIC) \|\|
	301	F_TYPE_EQUAL(sfs->f_type, CGROUP2_SUPER_MAGIC) \|\|
	302	F_TYPE_EQUAL(sfs->f_type, DEBUGFS_MAGIC) \|\|
	303	F_TYPE_EQUAL(sfs->f_type, DEVPTS_SUPER_MAGIC) \|\|
	304	F_TYPE_EQUAL(sfs->f_type, EFIVARFS_MAGIC) \|\|
	305	F_TYPE_EQUAL(sfs->f_type, HUGETLBFS_MAGIC) \|\|
	306	F_TYPE_EQUAL(sfs->f_type, MQUEUE_MAGIC) \|\|
	307	F_TYPE_EQUAL(sfs->f_type, PROC_SUPER_MAGIC) \|\|
	308	F_TYPE_EQUAL(sfs->f_type, PSTOREFS_MAGIC) \|\|
	309	F_TYPE_EQUAL(sfs->f_type, SELINUX_MAGIC) \|\|
	310	F_TYPE_EQUAL(sfs->f_type, SMACK_MAGIC) \|\|
	311	F_TYPE_EQUAL(sfs->f_type, SECURITYFS_MAGIC) \|\|
	312	F_TYPE_EQUAL(sfs->f_type, BPF_FS_MAGIC) \|\|
	313	F_TYPE_EQUAL(sfs->f_type, TRACEFS_MAGIC) \|\|
	314	F_TYPE_EQUAL(sfs->f_type, SYSFS_MAGIC);
88cd066e LP	315	}
88cd066e LP	316
4aeb20f5	317	static int recurse_fd(int fd, bool donate_fd, const struct stat *st, uid_t shift, bool is_toplevel) {
3603efde	318	_cleanup_closedir_ DIR *d = NULL;
7336138e	319	bool changed = false;
3603efde	320	struct statfs sfs;
7336138e LP	321	int r;
	322
	323	assert(fd >= 0);
	324
3603efde LP	325	if (fstatfs(fd, &sfs) < 0)
	326	return -errno;
	327
	328	/* We generally want to permit crossing of mount boundaries when patching the UIDs/GIDs. However, we probably
	329	* shouldn't do this for /proc and /sys if that is already mounted into place. Hence, let's stop the recursion
	330	* when we hit procfs, sysfs or some other special file systems. */
	331
	332	r = is_fs_fully_userns_compatible(&sfs);
88cd066e LP	333	if (r < 0)
	334	goto finish;
	335	if (r > 0) {
	336	r = 0; /* don't recurse */
	337	goto finish;
	338	}
	339
3603efde LP	340	/* Also, if we hit a read-only file system, then don't bother, skip the whole subtree */
	341	if ((sfs.f_flags & ST_RDONLY) \|\|
	342	access_fd(fd, W_OK) == -EROFS)
	343	goto read_only;
7336138e LP	344
7336138e LP	345	if (S_ISDIR(st->st_mode)) {
7336138e LP	346	struct dirent *de;
	347
	348	if (!donate_fd) {
	349	int copy;
	350
	351	copy = fcntl(fd, F_DUPFD_CLOEXEC, 3);
88cd066e LP	352	if (copy < 0) {
	353	r = -errno;
	354	goto finish;
	355	}
7336138e LP	356
	357	fd = copy;
	358	donate_fd = true;
	359	}
	360
	361	d = fdopendir(fd);
	362	if (!d) {
	363	r = -errno;
	364	goto finish;
	365	}
	366	fd = -1;
	367
	368	FOREACH_DIRENT_ALL(de, d, r = -errno; goto finish) {
	369	struct stat fst;
	370
49bfc877	371	if (dot_or_dot_dot(de->d_name))
7336138e LP	372	continue;
	373
	374	if (fstatat(dirfd(d), de->d_name, &fst, AT_SYMLINK_NOFOLLOW) < 0) {
	375	r = -errno;
	376	goto finish;
	377	}
	378
	379	if (S_ISDIR(fst.st_mode)) {
	380	int subdir_fd;
	381
	382	subdir_fd = openat(dirfd(d), de->d_name, O_RDONLY\|O_NONBLOCK\|O_DIRECTORY\|O_CLOEXEC\|O_NOFOLLOW\|O_NOATIME);
	383	if (subdir_fd < 0) {
	384	r = -errno;
	385	goto finish;
	386
	387	}
	388
4aeb20f5	389	r = recurse_fd(subdir_fd, true, &fst, shift, false);
7336138e LP	390	if (r < 0)
	391	goto finish;
	392	if (r > 0)
	393	changed = true;
	394
	395	} else {
	396	r = patch_fd(dirfd(d), de->d_name, &fst, shift);
	397	if (r < 0)
	398	goto finish;
	399	if (r > 0)
	400	changed = true;
	401	}
	402	}
	403	}
	404
3603efde LP	405	/* After we descended, also patch the directory itself. It's key to do this in this order so that the top-level
	406	* directory is patched as very last object in the tree, so that we can use it as quick indicator whether the
	407	* tree is properly chown()ed already. */
	408	r = patch_fd(d ? dirfd(d) : fd, NULL, st, shift);
	409	if (r == -EROFS)
	410	goto read_only;
	411	if (r > 0)
	412	changed = true;
	413
7336138e	414	r = changed;
3603efde LP	415	goto finish;
	416
	417	read_only:
	418	if (!is_toplevel) {
	419	_cleanup_free_ char *name = NULL;
	420
	421	/* When we hit a ready-only subtree we simply skip it, but log about it. */
	422	(void) fd_get_path(fd, &name);
	423	log_debug("Skippping read-only file or directory %s.", strna(name));
	424	r = changed;
	425	}
7336138e LP	426
	427	finish:
	428	if (donate_fd)
	429	safe_close(fd);
	430
	431	return r;
	432	}
	433
	434	static int fd_patch_uid_internal(int fd, bool donate_fd, uid_t shift, uid_t range) {
	435	struct stat st;
	436	int r;
	437
	438	assert(fd >= 0);
	439
	440	/* Recursively adjusts the UID/GIDs of all files of a directory tree. This is used to automatically fix up an
	441	* OS tree to the used user namespace UID range. Note that this automatic adjustment only works for UID ranges
	442	* following the concept that the upper 16bit of a UID identify the container, and the lower 16bit are the actual
	443	* UID within the container. */
	444
	445	if ((shift & 0xFFFF) != 0) {
	446	/* We only support containers where the shift starts at a 2^16 boundary */
	447	r = -EOPNOTSUPP;
	448	goto finish;
	449	}
	450
3603efde LP	451	if (shift == UID_BUSY_BASE) {
	452	r = -EINVAL;
	453	goto finish;
	454	}
	455
7336138e LP	456	if (range != 0x10000) {
	457	/* We only support containers with 16bit UID ranges for the patching logic */
	458	r = -EOPNOTSUPP;
	459	goto finish;
	460	}
	461
	462	if (fstat(fd, &st) < 0) {
	463	r = -errno;
	464	goto finish;
	465	}
	466
	467	if ((uint32_t) st.st_uid >> 16 != (uint32_t) st.st_gid >> 16) {
	468	/* We only support containers where the uid/gid container ID match */
	469	r = -EBADE;
	470	goto finish;
	471	}
	472
	473	/* Try to detect if the range is already right. Of course, this a pretty drastic optimization, as we assume
	474	* that if the top-level dir has the right upper 16bit assigned, then everything below will have too... */
	475	if (((uint32_t) (st.st_uid ^ shift) >> 16) == 0)
	476	return 0;
	477
3603efde LP	478	/* Before we start recursively chowning, mark the top-level dir as "busy" by chowning it to the "busy"
	479	* range. Should we be interrupted in the middle of our work, we'll see it owned by this user and will start
	480	* chown()ing it again, unconditionally, as the busy UID is not a valid UID we'd everpick for ourselves. */
	481
	482	if ((st.st_uid & UID_BUSY_MASK) != UID_BUSY_BASE) {
	483	if (fchown(fd,
	484	UID_BUSY_BASE \| (st.st_uid & ~UID_BUSY_MASK),
	485	(gid_t) UID_BUSY_BASE \| (st.st_gid & ~(gid_t) UID_BUSY_MASK)) < 0) {
	486	r = -errno;
	487	goto finish;
	488	}
	489	}
	490
4aeb20f5	491	return recurse_fd(fd, donate_fd, &st, shift, true);
7336138e LP	492
	493	finish:
	494	if (donate_fd)
	495	safe_close(fd);
	496
	497	return r;
	498	}
	499
	500	int fd_patch_uid(int fd, uid_t shift, uid_t range) {
	501	return fd_patch_uid_internal(fd, false, shift, range);
	502	}
	503
	504	int path_patch_uid(const char *path, uid_t shift, uid_t range) {
	505	int fd;
	506
	507	fd = open(path, O_RDONLY\|O_NONBLOCK\|O_DIRECTORY\|O_CLOEXEC\|O_NOFOLLOW\|O_NOATIME);
	508	if (fd < 0)
	509	return -errno;
	510
	511	return fd_patch_uid_internal(fd, true, shift, range);
	512	}