]>
git.ipfire.org Git - thirdparty/systemd.git/blob - src/nspawn/nspawn-patch-uid.c
1 /* SPDX-License-Identifier: LGPL-2.1+ */
3 This file is part of systemd.
5 Copyright 2016 Lennart Poettering
7 systemd is free software; you can redistribute it and/or modify it
8 under the terms of the GNU Lesser General Public License as published by
9 the Free Software Foundation; either version 2.1 of the License, or
10 (at your option) any later version.
12 systemd is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 Lesser General Public License for more details.
17 You should have received a copy of the GNU Lesser General Public License
18 along with systemd; If not, see <http://www.gnu.org/licenses/>.
22 #include <linux/magic.h>
27 #include <sys/statvfs.h>
32 #include "dirent-util.h"
36 #include "nspawn-def.h"
37 #include "nspawn-patch-uid.h"
38 #include "stat-util.h"
39 #include "stdio-util.h"
40 #include "string-util.h"
42 #include "user-util.h"
46 static int get_acl(int fd
, const char *name
, acl_type_t type
, acl_t
*ret
) {
47 char procfs_path
[STRLEN("/proc/self/fd/") + DECIMAL_STR_MAX(int) + 1];
54 _cleanup_close_
int child_fd
= -1;
56 child_fd
= openat(fd
, name
, O_PATH
|O_CLOEXEC
|O_NOFOLLOW
);
60 xsprintf(procfs_path
, "/proc/self/fd/%i", child_fd
);
61 acl
= acl_get_file(procfs_path
, type
);
62 } else if (type
== ACL_TYPE_ACCESS
)
65 xsprintf(procfs_path
, "/proc/self/fd/%i", fd
);
66 acl
= acl_get_file(procfs_path
, type
);
75 static int set_acl(int fd
, const char *name
, acl_type_t type
, acl_t acl
) {
76 char procfs_path
[STRLEN("/proc/self/fd/") + DECIMAL_STR_MAX(int) + 1];
83 _cleanup_close_
int child_fd
= -1;
85 child_fd
= openat(fd
, name
, O_PATH
|O_CLOEXEC
|O_NOFOLLOW
);
89 xsprintf(procfs_path
, "/proc/self/fd/%i", child_fd
);
90 r
= acl_set_file(procfs_path
, type
, acl
);
91 } else if (type
== ACL_TYPE_ACCESS
)
92 r
= acl_set_fd(fd
, acl
);
94 xsprintf(procfs_path
, "/proc/self/fd/%i", fd
);
95 r
= acl_set_file(procfs_path
, type
, acl
);
103 static int shift_acl(acl_t acl
, uid_t shift
, acl_t
*ret
) {
104 _cleanup_(acl_freep
) acl_t copy
= NULL
;
111 r
= acl_get_entry(acl
, ACL_FIRST_ENTRY
, &i
);
115 uid_t
*old_uid
, new_uid
;
119 if (acl_get_tag_type(i
, &tag
) < 0)
122 if (IN_SET(tag
, ACL_USER
, ACL_GROUP
)) {
124 /* We don't distinguish here between uid_t and gid_t, let's make sure the compiler checks that
125 * this is actually OK */
126 assert_cc(sizeof(uid_t
) == sizeof(gid_t
));
128 old_uid
= acl_get_qualifier(i
);
132 new_uid
= shift
| (*old_uid
& UINT32_C(0xFFFF));
133 if (!uid_is_valid(new_uid
))
136 modify
= new_uid
!= *old_uid
;
137 if (modify
&& !copy
) {
140 /* There's no copy of the ACL yet? If so, let's create one, and start the loop from the
141 * beginning, so that we copy all entries, starting from the first, this time. */
143 n
= acl_entries(acl
);
151 /* Seek back to the beginning */
152 r
= acl_get_entry(acl
, ACL_FIRST_ENTRY
, &i
);
160 acl_entry_t new_entry
;
162 if (acl_create_entry(&copy
, &new_entry
) < 0)
165 if (acl_copy_entry(new_entry
, i
) < 0)
169 if (acl_set_qualifier(new_entry
, &new_uid
) < 0)
173 r
= acl_get_entry(acl
, ACL_NEXT_ENTRY
, &i
);
178 *ret
= TAKE_PTR(copy
);
183 static int patch_acls(int fd
, const char *name
, const struct stat
*st
, uid_t shift
) {
184 _cleanup_(acl_freep
) acl_t acl
= NULL
, shifted
= NULL
;
185 bool changed
= false;
191 /* ACLs are not supported on symlinks, there's no point in trying */
192 if (S_ISLNK(st
->st_mode
))
195 r
= get_acl(fd
, name
, ACL_TYPE_ACCESS
, &acl
);
196 if (r
== -EOPNOTSUPP
)
201 r
= shift_acl(acl
, shift
, &shifted
);
205 r
= set_acl(fd
, name
, ACL_TYPE_ACCESS
, shifted
);
212 if (S_ISDIR(st
->st_mode
)) {
216 acl
= shifted
= NULL
;
218 r
= get_acl(fd
, name
, ACL_TYPE_DEFAULT
, &acl
);
222 r
= shift_acl(acl
, shift
, &shifted
);
226 r
= set_acl(fd
, name
, ACL_TYPE_DEFAULT
, shifted
);
239 static int patch_acls(int fd
, const char *name
, const struct stat
*st
, uid_t shift
) {
245 static int patch_fd(int fd
, const char *name
, const struct stat
*st
, uid_t shift
) {
248 bool changed
= false;
254 new_uid
= shift
| (st
->st_uid
& UINT32_C(0xFFFF));
255 new_gid
= (gid_t
) shift
| (st
->st_gid
& UINT32_C(0xFFFF));
257 if (!uid_is_valid(new_uid
) || !gid_is_valid(new_gid
))
260 if (st
->st_uid
!= new_uid
|| st
->st_gid
!= new_gid
) {
262 r
= fchownat(fd
, name
, new_uid
, new_gid
, AT_SYMLINK_NOFOLLOW
);
264 r
= fchown(fd
, new_uid
, new_gid
);
268 /* The Linux kernel alters the mode in some cases of chown(). Let's undo this. */
270 if (!S_ISLNK(st
->st_mode
))
271 r
= fchmodat(fd
, name
, st
->st_mode
, 0);
272 else /* AT_SYMLINK_NOFOLLOW is not available for fchmodat() */
275 r
= fchmod(fd
, st
->st_mode
);
282 r
= patch_acls(fd
, name
, st
, shift
);
286 return r
> 0 || changed
;
290 * Check if the filesystem is fully compatible with user namespaces or
291 * UID/GID patching. Some filesystems in this list can be fully mounted inside
292 * user namespaces, however their inodes may relate to host resources or be
293 * valid only in the global user namespace, therefore no patching should be applied.
295 static int is_fs_fully_userns_compatible(const struct statfs
*sfs
) {
299 return F_TYPE_EQUAL(sfs
->f_type
, BINFMTFS_MAGIC
) ||
300 F_TYPE_EQUAL(sfs
->f_type
, CGROUP_SUPER_MAGIC
) ||
301 F_TYPE_EQUAL(sfs
->f_type
, CGROUP2_SUPER_MAGIC
) ||
302 F_TYPE_EQUAL(sfs
->f_type
, DEBUGFS_MAGIC
) ||
303 F_TYPE_EQUAL(sfs
->f_type
, DEVPTS_SUPER_MAGIC
) ||
304 F_TYPE_EQUAL(sfs
->f_type
, EFIVARFS_MAGIC
) ||
305 F_TYPE_EQUAL(sfs
->f_type
, HUGETLBFS_MAGIC
) ||
306 F_TYPE_EQUAL(sfs
->f_type
, MQUEUE_MAGIC
) ||
307 F_TYPE_EQUAL(sfs
->f_type
, PROC_SUPER_MAGIC
) ||
308 F_TYPE_EQUAL(sfs
->f_type
, PSTOREFS_MAGIC
) ||
309 F_TYPE_EQUAL(sfs
->f_type
, SELINUX_MAGIC
) ||
310 F_TYPE_EQUAL(sfs
->f_type
, SMACK_MAGIC
) ||
311 F_TYPE_EQUAL(sfs
->f_type
, SECURITYFS_MAGIC
) ||
312 F_TYPE_EQUAL(sfs
->f_type
, BPF_FS_MAGIC
) ||
313 F_TYPE_EQUAL(sfs
->f_type
, TRACEFS_MAGIC
) ||
314 F_TYPE_EQUAL(sfs
->f_type
, SYSFS_MAGIC
);
317 static int recurse_fd(int fd
, bool donate_fd
, const struct stat
*st
, uid_t shift
, bool is_toplevel
) {
318 _cleanup_closedir_
DIR *d
= NULL
;
319 bool changed
= false;
325 if (fstatfs(fd
, &sfs
) < 0)
328 /* We generally want to permit crossing of mount boundaries when patching the UIDs/GIDs. However, we probably
329 * shouldn't do this for /proc and /sys if that is already mounted into place. Hence, let's stop the recursion
330 * when we hit procfs, sysfs or some other special file systems. */
332 r
= is_fs_fully_userns_compatible(&sfs
);
336 r
= 0; /* don't recurse */
340 /* Also, if we hit a read-only file system, then don't bother, skip the whole subtree */
341 if ((sfs
.f_flags
& ST_RDONLY
) ||
342 access_fd(fd
, W_OK
) == -EROFS
)
345 if (S_ISDIR(st
->st_mode
)) {
351 copy
= fcntl(fd
, F_DUPFD_CLOEXEC
, 3);
368 FOREACH_DIRENT_ALL(de
, d
, r
= -errno
; goto finish
) {
371 if (dot_or_dot_dot(de
->d_name
))
374 if (fstatat(dirfd(d
), de
->d_name
, &fst
, AT_SYMLINK_NOFOLLOW
) < 0) {
379 if (S_ISDIR(fst
.st_mode
)) {
382 subdir_fd
= openat(dirfd(d
), de
->d_name
, O_RDONLY
|O_NONBLOCK
|O_DIRECTORY
|O_CLOEXEC
|O_NOFOLLOW
|O_NOATIME
);
389 r
= recurse_fd(subdir_fd
, true, &fst
, shift
, false);
396 r
= patch_fd(dirfd(d
), de
->d_name
, &fst
, shift
);
405 /* After we descended, also patch the directory itself. It's key to do this in this order so that the top-level
406 * directory is patched as very last object in the tree, so that we can use it as quick indicator whether the
407 * tree is properly chown()ed already. */
408 r
= patch_fd(d
? dirfd(d
) : fd
, NULL
, st
, shift
);
419 _cleanup_free_
char *name
= NULL
;
421 /* When we hit a read-only subtree we simply skip it, but log about it. */
422 (void) fd_get_path(fd
, &name
);
423 log_debug("Skippping read-only file or directory %s.", strna(name
));
434 static int fd_patch_uid_internal(int fd
, bool donate_fd
, uid_t shift
, uid_t range
) {
440 /* Recursively adjusts the UID/GIDs of all files of a directory tree. This is used to automatically fix up an
441 * OS tree to the used user namespace UID range. Note that this automatic adjustment only works for UID ranges
442 * following the concept that the upper 16bit of a UID identify the container, and the lower 16bit are the actual
443 * UID within the container. */
445 if ((shift
& 0xFFFF) != 0) {
446 /* We only support containers where the shift starts at a 2^16 boundary */
451 if (shift
== UID_BUSY_BASE
) {
456 if (range
!= 0x10000) {
457 /* We only support containers with 16bit UID ranges for the patching logic */
462 if (fstat(fd
, &st
) < 0) {
467 if ((uint32_t) st
.st_uid
>> 16 != (uint32_t) st
.st_gid
>> 16) {
468 /* We only support containers where the uid/gid container ID match */
473 /* Try to detect if the range is already right. Of course, this is a pretty drastic optimization, as we assume
474 * that if the top-level dir has the right upper 16bit assigned, then everything below will have too... */
475 if (((uint32_t) (st
.st_uid
^ shift
) >> 16) == 0)
478 /* Before we start recursively chowning, mark the top-level dir as "busy" by chowning it to the "busy"
479 * range. Should we be interrupted in the middle of our work, we'll see it owned by this user and will start
480 * chown()ing it again, unconditionally, as the busy UID is not a valid UID we'd ever pick for ourselves. */
482 if ((st
.st_uid
& UID_BUSY_MASK
) != UID_BUSY_BASE
) {
484 UID_BUSY_BASE
| (st
.st_uid
& ~UID_BUSY_MASK
),
485 (gid_t
) UID_BUSY_BASE
| (st
.st_gid
& ~(gid_t
) UID_BUSY_MASK
)) < 0) {
491 return recurse_fd(fd
, donate_fd
, &st
, shift
, true);
500 int fd_patch_uid(int fd
, uid_t shift
, uid_t range
) {
501 return fd_patch_uid_internal(fd
, false, shift
, range
);
504 int path_patch_uid(const char *path
, uid_t shift
, uid_t range
) {
507 fd
= open(path
, O_RDONLY
|O_NONBLOCK
|O_DIRECTORY
|O_CLOEXEC
|O_NOFOLLOW
|O_NOATIME
);
511 return fd_patch_uid_internal(fd
, true, shift
, range
);