]>
git.ipfire.org Git - thirdparty/systemd.git/blob - src/nspawn/nspawn-patch-uid.c
1 /* SPDX-License-Identifier: LGPL-2.1+ */
3 This file is part of systemd.
5 Copyright 2016 Lennart Poettering
7 systemd is free software; you can redistribute it and/or modify it
8 under the terms of the GNU Lesser General Public License as published by
9 the Free Software Foundation; either version 2.1 of the License, or
10 (at your option) any later version.
12 systemd is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 Lesser General Public License for more details.
17 You should have received a copy of the GNU Lesser General Public License
18 along with systemd; If not, see <http://www.gnu.org/licenses/>.
22 #include <linux/magic.h>
27 #include <sys/statvfs.h>
32 #include "dirent-util.h"
36 #include "nspawn-def.h"
37 #include "nspawn-patch-uid.h"
38 #include "stat-util.h"
39 #include "stdio-util.h"
40 #include "string-util.h"
42 #include "user-util.h"
46 static int get_acl(int fd
, const char *name
, acl_type_t type
, acl_t
*ret
) {
47 char procfs_path
[STRLEN("/proc/self/fd/") + DECIMAL_STR_MAX(int) + 1];
54 _cleanup_close_
int child_fd
= -1;
56 child_fd
= openat(fd
, name
, O_PATH
|O_CLOEXEC
|O_NOFOLLOW
);
60 xsprintf(procfs_path
, "/proc/self/fd/%i", child_fd
);
61 acl
= acl_get_file(procfs_path
, type
);
62 } else if (type
== ACL_TYPE_ACCESS
)
65 xsprintf(procfs_path
, "/proc/self/fd/%i", fd
);
66 acl
= acl_get_file(procfs_path
, type
);
75 static int set_acl(int fd
, const char *name
, acl_type_t type
, acl_t acl
) {
76 char procfs_path
[STRLEN("/proc/self/fd/") + DECIMAL_STR_MAX(int) + 1];
83 _cleanup_close_
int child_fd
= -1;
85 child_fd
= openat(fd
, name
, O_PATH
|O_CLOEXEC
|O_NOFOLLOW
);
89 xsprintf(procfs_path
, "/proc/self/fd/%i", child_fd
);
90 r
= acl_set_file(procfs_path
, type
, acl
);
91 } else if (type
== ACL_TYPE_ACCESS
)
92 r
= acl_set_fd(fd
, acl
);
94 xsprintf(procfs_path
, "/proc/self/fd/%i", fd
);
95 r
= acl_set_file(procfs_path
, type
, acl
);
103 static int shift_acl(acl_t acl
, uid_t shift
, acl_t
*ret
) {
104 _cleanup_(acl_freep
) acl_t copy
= NULL
;
111 r
= acl_get_entry(acl
, ACL_FIRST_ENTRY
, &i
);
115 uid_t
*old_uid
, new_uid
;
119 if (acl_get_tag_type(i
, &tag
) < 0)
122 if (IN_SET(tag
, ACL_USER
, ACL_GROUP
)) {
124 /* We don't distinguish here between uid_t and gid_t, let's make sure the compiler checks that
125 * this is actually OK */
126 assert_cc(sizeof(uid_t
) == sizeof(gid_t
));
128 old_uid
= acl_get_qualifier(i
);
132 new_uid
= shift
| (*old_uid
& UINT32_C(0xFFFF));
133 if (!uid_is_valid(new_uid
))
136 modify
= new_uid
!= *old_uid
;
137 if (modify
&& !copy
) {
140 /* There's no copy of the ACL yet? If so, let's create one, and start the loop from the
141 * beginning, so that we copy all entries, starting from the first, this time. */
143 n
= acl_entries(acl
);
151 /* Seek back to the beginning */
152 r
= acl_get_entry(acl
, ACL_FIRST_ENTRY
, &i
);
160 acl_entry_t new_entry
;
162 if (acl_create_entry(&copy
, &new_entry
) < 0)
165 if (acl_copy_entry(new_entry
, i
) < 0)
169 if (acl_set_qualifier(new_entry
, &new_uid
) < 0)
173 r
= acl_get_entry(acl
, ACL_NEXT_ENTRY
, &i
);
184 static int patch_acls(int fd
, const char *name
, const struct stat
*st
, uid_t shift
) {
185 _cleanup_(acl_freep
) acl_t acl
= NULL
, shifted
= NULL
;
186 bool changed
= false;
192 /* ACLs are not supported on symlinks, there's no point in trying */
193 if (S_ISLNK(st
->st_mode
))
196 r
= get_acl(fd
, name
, ACL_TYPE_ACCESS
, &acl
);
197 if (r
== -EOPNOTSUPP
)
202 r
= shift_acl(acl
, shift
, &shifted
);
206 r
= set_acl(fd
, name
, ACL_TYPE_ACCESS
, shifted
);
213 if (S_ISDIR(st
->st_mode
)) {
217 acl
= shifted
= NULL
;
219 r
= get_acl(fd
, name
, ACL_TYPE_DEFAULT
, &acl
);
223 r
= shift_acl(acl
, shift
, &shifted
);
227 r
= set_acl(fd
, name
, ACL_TYPE_DEFAULT
, shifted
);
240 static int patch_acls(int fd
, const char *name
, const struct stat
*st
, uid_t shift
) {
246 static int patch_fd(int fd
, const char *name
, const struct stat
*st
, uid_t shift
) {
249 bool changed
= false;
255 new_uid
= shift
| (st
->st_uid
& UINT32_C(0xFFFF));
256 new_gid
= (gid_t
) shift
| (st
->st_gid
& UINT32_C(0xFFFF));
258 if (!uid_is_valid(new_uid
) || !gid_is_valid(new_gid
))
261 if (st
->st_uid
!= new_uid
|| st
->st_gid
!= new_gid
) {
263 r
= fchownat(fd
, name
, new_uid
, new_gid
, AT_SYMLINK_NOFOLLOW
);
265 r
= fchown(fd
, new_uid
, new_gid
);
269 /* The Linux kernel alters the mode in some cases of chown(). Let's undo this. */
271 if (!S_ISLNK(st
->st_mode
))
272 r
= fchmodat(fd
, name
, st
->st_mode
, 0);
273 else /* AT_SYMLINK_NOFOLLOW is not available for fchmodat() */
276 r
= fchmod(fd
, st
->st_mode
);
283 r
= patch_acls(fd
, name
, st
, shift
);
287 return r
> 0 || changed
;
291 * Check if the filesystem is fully compatible with user namespaces or
292 * UID/GID patching. Some filesystems in this list can be fully mounted inside
293 * user namespaces, however their inodes may relate to host resources or are only
294 * valid in the global user namespace, therefore no patching should be applied.
296 static int is_fs_fully_userns_compatible(const struct statfs
*sfs
) {
300 return F_TYPE_EQUAL(sfs
->f_type
, BINFMTFS_MAGIC
) ||
301 F_TYPE_EQUAL(sfs
->f_type
, CGROUP_SUPER_MAGIC
) ||
302 F_TYPE_EQUAL(sfs
->f_type
, CGROUP2_SUPER_MAGIC
) ||
303 F_TYPE_EQUAL(sfs
->f_type
, DEBUGFS_MAGIC
) ||
304 F_TYPE_EQUAL(sfs
->f_type
, DEVPTS_SUPER_MAGIC
) ||
305 F_TYPE_EQUAL(sfs
->f_type
, EFIVARFS_MAGIC
) ||
306 F_TYPE_EQUAL(sfs
->f_type
, HUGETLBFS_MAGIC
) ||
307 F_TYPE_EQUAL(sfs
->f_type
, MQUEUE_MAGIC
) ||
308 F_TYPE_EQUAL(sfs
->f_type
, PROC_SUPER_MAGIC
) ||
309 F_TYPE_EQUAL(sfs
->f_type
, PSTOREFS_MAGIC
) ||
310 F_TYPE_EQUAL(sfs
->f_type
, SELINUX_MAGIC
) ||
311 F_TYPE_EQUAL(sfs
->f_type
, SMACK_MAGIC
) ||
312 F_TYPE_EQUAL(sfs
->f_type
, SECURITYFS_MAGIC
) ||
313 F_TYPE_EQUAL(sfs
->f_type
, BPF_FS_MAGIC
) ||
314 F_TYPE_EQUAL(sfs
->f_type
, TRACEFS_MAGIC
) ||
315 F_TYPE_EQUAL(sfs
->f_type
, SYSFS_MAGIC
);
318 static int recurse_fd(int fd
, bool donate_fd
, const struct stat
*st
, uid_t shift
, bool is_toplevel
) {
319 _cleanup_closedir_
DIR *d
= NULL
;
320 bool changed
= false;
326 if (fstatfs(fd
, &sfs
) < 0)
329 /* We generally want to permit crossing of mount boundaries when patching the UIDs/GIDs. However, we probably
330 * shouldn't do this for /proc and /sys if that is already mounted into place. Hence, let's stop the recursion
331 * when we hit procfs, sysfs or some other special file systems. */
333 r
= is_fs_fully_userns_compatible(&sfs
);
337 r
= 0; /* don't recurse */
341 /* Also, if we hit a read-only file system, then don't bother, skip the whole subtree */
342 if ((sfs
.f_flags
& ST_RDONLY
) ||
343 access_fd(fd
, W_OK
) == -EROFS
)
346 if (S_ISDIR(st
->st_mode
)) {
352 copy
= fcntl(fd
, F_DUPFD_CLOEXEC
, 3);
369 FOREACH_DIRENT_ALL(de
, d
, r
= -errno
; goto finish
) {
372 if (dot_or_dot_dot(de
->d_name
))
375 if (fstatat(dirfd(d
), de
->d_name
, &fst
, AT_SYMLINK_NOFOLLOW
) < 0) {
380 if (S_ISDIR(fst
.st_mode
)) {
383 subdir_fd
= openat(dirfd(d
), de
->d_name
, O_RDONLY
|O_NONBLOCK
|O_DIRECTORY
|O_CLOEXEC
|O_NOFOLLOW
|O_NOATIME
);
390 r
= recurse_fd(subdir_fd
, true, &fst
, shift
, false);
397 r
= patch_fd(dirfd(d
), de
->d_name
, &fst
, shift
);
406 /* After we descended, also patch the directory itself. It's key to do this in this order so that the top-level
407 * directory is patched as the very last object in the tree, so that we can use it as a quick indicator whether the
408 * tree is properly chown()ed already. */
409 r
= patch_fd(d
? dirfd(d
) : fd
, NULL
, st
, shift
);
420 _cleanup_free_
char *name
= NULL
;
422 /* When we hit a read-only subtree we simply skip it, but log about it. */
423 (void) fd_get_path(fd
, &name
);
424 log_debug("Skippping read-only file or directory %s.", strna(name
));
435 static int fd_patch_uid_internal(int fd
, bool donate_fd
, uid_t shift
, uid_t range
) {
441 /* Recursively adjusts the UID/GIDs of all files of a directory tree. This is used to automatically fix up an
442 * OS tree to the used user namespace UID range. Note that this automatic adjustment only works for UID ranges
443 * following the concept that the upper 16bit of a UID identify the container, and the lower 16bit are the actual
444 * UID within the container. */
446 if ((shift
& 0xFFFF) != 0) {
447 /* We only support containers where the shift starts at a 2^16 boundary */
452 if (shift
== UID_BUSY_BASE
) {
457 if (range
!= 0x10000) {
458 /* We only support containers with 16bit UID ranges for the patching logic */
463 if (fstat(fd
, &st
) < 0) {
468 if ((uint32_t) st
.st_uid
>> 16 != (uint32_t) st
.st_gid
>> 16) {
469 /* We only support containers where the uid/gid container ID match */
474 /* Try to detect if the range is already right. Of course, this is a pretty drastic optimization, as we assume
475 * that if the top-level dir has the right upper 16bit assigned, then everything below will have too... */
476 if (((uint32_t) (st
.st_uid
^ shift
) >> 16) == 0)
479 /* Before we start recursively chowning, mark the top-level dir as "busy" by chowning it to the "busy"
480 * range. Should we be interrupted in the middle of our work, we'll see it owned by this user and will start
481 * chown()ing it again, unconditionally, as the busy UID is not a valid UID we'd ever pick for ourselves. */
483 if ((st
.st_uid
& UID_BUSY_MASK
) != UID_BUSY_BASE
) {
485 UID_BUSY_BASE
| (st
.st_uid
& ~UID_BUSY_MASK
),
486 (gid_t
) UID_BUSY_BASE
| (st
.st_gid
& ~(gid_t
) UID_BUSY_MASK
)) < 0) {
492 return recurse_fd(fd
, donate_fd
, &st
, shift
, true);
501 int fd_patch_uid(int fd
, uid_t shift
, uid_t range
) {
502 return fd_patch_uid_internal(fd
, false, shift
, range
);
505 int path_patch_uid(const char *path
, uid_t shift
, uid_t range
) {
508 fd
= open(path
, O_RDONLY
|O_NONBLOCK
|O_DIRECTORY
|O_CLOEXEC
|O_NOFOLLOW
|O_NOATIME
);
512 return fd_patch_uid_internal(fd
, true, shift
, range
);