]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/nspawn/nspawn-patch-uid.c
tree-wide: use TAKE_PTR() and TAKE_FD() macros
[thirdparty/systemd.git] / src / nspawn / nspawn-patch-uid.c
CommitLineData
53e1b683 1/* SPDX-License-Identifier: LGPL-2.1+ */
7336138e
LP
2/***
3 This file is part of systemd.
4
5 Copyright 2016 Lennart Poettering
6
7 systemd is free software; you can redistribute it and/or modify it
8 under the terms of the GNU Lesser General Public License as published by
9 the Free Software Foundation; either version 2.1 of the License, or
10 (at your option) any later version.
11
12 systemd is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 Lesser General Public License for more details.
16
17 You should have received a copy of the GNU Lesser General Public License
18 along with systemd; If not, see <http://www.gnu.org/licenses/>.
19***/
20
21#include <fcntl.h>
88cd066e 22#include <linux/magic.h>
349cc4a5 23#if HAVE_ACL
7336138e
LP
24#include <sys/acl.h>
25#endif
26#include <sys/stat.h>
3603efde 27#include <sys/statvfs.h>
88cd066e 28#include <sys/vfs.h>
7336138e
LP
29#include <unistd.h>
30
31#include "acl-util.h"
32#include "dirent-util.h"
33#include "fd-util.h"
3603efde 34#include "fs-util.h"
88cd066e 35#include "missing.h"
3603efde 36#include "nspawn-def.h"
7336138e 37#include "nspawn-patch-uid.h"
88cd066e 38#include "stat-util.h"
7336138e
LP
39#include "stdio-util.h"
40#include "string-util.h"
41#include "strv.h"
42#include "user-util.h"
43
349cc4a5 44#if HAVE_ACL
7336138e
LP
45
46static int get_acl(int fd, const char *name, acl_type_t type, acl_t *ret) {
fbd0b64f 47 char procfs_path[STRLEN("/proc/self/fd/") + DECIMAL_STR_MAX(int) + 1];
7336138e
LP
48 acl_t acl;
49
50 assert(fd >= 0);
51 assert(ret);
52
53 if (name) {
54 _cleanup_close_ int child_fd = -1;
55
56 child_fd = openat(fd, name, O_PATH|O_CLOEXEC|O_NOFOLLOW);
57 if (child_fd < 0)
58 return -errno;
59
60 xsprintf(procfs_path, "/proc/self/fd/%i", child_fd);
61 acl = acl_get_file(procfs_path, type);
62 } else if (type == ACL_TYPE_ACCESS)
63 acl = acl_get_fd(fd);
64 else {
65 xsprintf(procfs_path, "/proc/self/fd/%i", fd);
66 acl = acl_get_file(procfs_path, type);
67 }
68 if (!acl)
69 return -errno;
70
71 *ret = acl;
72 return 0;
73}
74
75static int set_acl(int fd, const char *name, acl_type_t type, acl_t acl) {
fbd0b64f 76 char procfs_path[STRLEN("/proc/self/fd/") + DECIMAL_STR_MAX(int) + 1];
7336138e
LP
77 int r;
78
79 assert(fd >= 0);
80 assert(acl);
81
82 if (name) {
83 _cleanup_close_ int child_fd = -1;
84
85 child_fd = openat(fd, name, O_PATH|O_CLOEXEC|O_NOFOLLOW);
86 if (child_fd < 0)
87 return -errno;
88
89 xsprintf(procfs_path, "/proc/self/fd/%i", child_fd);
90 r = acl_set_file(procfs_path, type, acl);
91 } else if (type == ACL_TYPE_ACCESS)
92 r = acl_set_fd(fd, acl);
93 else {
94 xsprintf(procfs_path, "/proc/self/fd/%i", fd);
95 r = acl_set_file(procfs_path, type, acl);
96 }
97 if (r < 0)
98 return -errno;
99
100 return 0;
101}
102
103static int shift_acl(acl_t acl, uid_t shift, acl_t *ret) {
104 _cleanup_(acl_freep) acl_t copy = NULL;
105 acl_entry_t i;
106 int r;
107
108 assert(acl);
109 assert(ret);
110
111 r = acl_get_entry(acl, ACL_FIRST_ENTRY, &i);
112 if (r < 0)
113 return -errno;
114 while (r > 0) {
115 uid_t *old_uid, new_uid;
116 bool modify = false;
117 acl_tag_t tag;
118
119 if (acl_get_tag_type(i, &tag) < 0)
120 return -errno;
121
122 if (IN_SET(tag, ACL_USER, ACL_GROUP)) {
123
124 /* We don't distuingish here between uid_t and gid_t, let's make sure the compiler checks that
125 * this is actually OK */
126 assert_cc(sizeof(uid_t) == sizeof(gid_t));
127
128 old_uid = acl_get_qualifier(i);
129 if (!old_uid)
130 return -errno;
131
132 new_uid = shift | (*old_uid & UINT32_C(0xFFFF));
133 if (!uid_is_valid(new_uid))
134 return -EINVAL;
135
136 modify = new_uid != *old_uid;
137 if (modify && !copy) {
138 int n;
139
140 /* There's no copy of the ACL yet? if so, let's create one, and start the loop from the
141 * beginning, so that we copy all entries, starting from the first, this time. */
142
143 n = acl_entries(acl);
144 if (n < 0)
145 return -errno;
146
147 copy = acl_init(n);
148 if (!copy)
149 return -errno;
150
151 /* Seek back to the beginning */
152 r = acl_get_entry(acl, ACL_FIRST_ENTRY, &i);
153 if (r < 0)
154 return -errno;
155 continue;
156 }
157 }
158
159 if (copy) {
160 acl_entry_t new_entry;
161
162 if (acl_create_entry(&copy, &new_entry) < 0)
163 return -errno;
164
165 if (acl_copy_entry(new_entry, i) < 0)
166 return -errno;
167
168 if (modify)
169 if (acl_set_qualifier(new_entry, &new_uid) < 0)
170 return -errno;
171 }
172
173 r = acl_get_entry(acl, ACL_NEXT_ENTRY, &i);
174 if (r < 0)
175 return -errno;
176 }
177
1cc6c93a 178 *ret = TAKE_PTR(copy);
7336138e
LP
179
180 return !!*ret;
181}
182
183static int patch_acls(int fd, const char *name, const struct stat *st, uid_t shift) {
184 _cleanup_(acl_freep) acl_t acl = NULL, shifted = NULL;
185 bool changed = false;
186 int r;
187
188 assert(fd >= 0);
189 assert(st);
190
191 /* ACLs are not supported on symlinks, there's no point in trying */
192 if (S_ISLNK(st->st_mode))
193 return 0;
194
195 r = get_acl(fd, name, ACL_TYPE_ACCESS, &acl);
196 if (r == -EOPNOTSUPP)
197 return 0;
198 if (r < 0)
199 return r;
200
201 r = shift_acl(acl, shift, &shifted);
202 if (r < 0)
203 return r;
204 if (r > 0) {
205 r = set_acl(fd, name, ACL_TYPE_ACCESS, shifted);
206 if (r < 0)
207 return r;
208
209 changed = true;
210 }
211
212 if (S_ISDIR(st->st_mode)) {
213 acl_free(acl);
214 acl_free(shifted);
215
216 acl = shifted = NULL;
217
218 r = get_acl(fd, name, ACL_TYPE_DEFAULT, &acl);
219 if (r < 0)
220 return r;
221
222 r = shift_acl(acl, shift, &shifted);
223 if (r < 0)
224 return r;
225 if (r > 0) {
226 r = set_acl(fd, name, ACL_TYPE_DEFAULT, shifted);
227 if (r < 0)
228 return r;
229
230 changed = true;
231 }
232 }
233
234 return changed;
235}
236
237#else
238
/* Variant used when built without ACL support: there is nothing to patch, hence always reports "unchanged". */
static int patch_acls(int fd, const char *name, const struct stat *st, uid_t shift) {
        (void) fd;
        (void) name;
        (void) st;
        (void) shift;

        return 0;
}
242
243#endif
244
245static int patch_fd(int fd, const char *name, const struct stat *st, uid_t shift) {
246 uid_t new_uid;
247 gid_t new_gid;
248 bool changed = false;
249 int r;
250
251 assert(fd >= 0);
252 assert(st);
253
254 new_uid = shift | (st->st_uid & UINT32_C(0xFFFF));
255 new_gid = (gid_t) shift | (st->st_gid & UINT32_C(0xFFFF));
256
257 if (!uid_is_valid(new_uid) || !gid_is_valid(new_gid))
258 return -EINVAL;
259
260 if (st->st_uid != new_uid || st->st_gid != new_gid) {
261 if (name)
262 r = fchownat(fd, name, new_uid, new_gid, AT_SYMLINK_NOFOLLOW);
263 else
264 r = fchown(fd, new_uid, new_gid);
265 if (r < 0)
266 return -errno;
267
268 /* The Linux kernel alters the mode in some cases of chown(). Let's undo this. */
0c6aeb46
LP
269 if (name) {
270 if (!S_ISLNK(st->st_mode))
271 r = fchmodat(fd, name, st->st_mode, 0);
272 else /* AT_SYMLINK_NOFOLLOW is not available for fchmodat() */
273 r = 0;
274 } else
7336138e
LP
275 r = fchmod(fd, st->st_mode);
276 if (r < 0)
277 return -errno;
278
279 changed = true;
280 }
281
282 r = patch_acls(fd, name, st, shift);
283 if (r < 0)
284 return r;
285
286 return r > 0 || changed;
287}
288
231bfb1b
DH
289/*
290 * Check if the filesystem is fully compatible with user namespaces or
291 * UID/GID patching. Some filesystems in this list can be fully mounted inside
292 * user namespaces, however their inodes may relate to host resources or only
293 * valid in the global user namespace, therefore no patching should be applied.
294 */
3603efde
LP
295static int is_fs_fully_userns_compatible(const struct statfs *sfs) {
296
297 assert(sfs);
298
299 return F_TYPE_EQUAL(sfs->f_type, BINFMTFS_MAGIC) ||
300 F_TYPE_EQUAL(sfs->f_type, CGROUP_SUPER_MAGIC) ||
301 F_TYPE_EQUAL(sfs->f_type, CGROUP2_SUPER_MAGIC) ||
302 F_TYPE_EQUAL(sfs->f_type, DEBUGFS_MAGIC) ||
303 F_TYPE_EQUAL(sfs->f_type, DEVPTS_SUPER_MAGIC) ||
304 F_TYPE_EQUAL(sfs->f_type, EFIVARFS_MAGIC) ||
305 F_TYPE_EQUAL(sfs->f_type, HUGETLBFS_MAGIC) ||
306 F_TYPE_EQUAL(sfs->f_type, MQUEUE_MAGIC) ||
307 F_TYPE_EQUAL(sfs->f_type, PROC_SUPER_MAGIC) ||
308 F_TYPE_EQUAL(sfs->f_type, PSTOREFS_MAGIC) ||
309 F_TYPE_EQUAL(sfs->f_type, SELINUX_MAGIC) ||
310 F_TYPE_EQUAL(sfs->f_type, SMACK_MAGIC) ||
311 F_TYPE_EQUAL(sfs->f_type, SECURITYFS_MAGIC) ||
312 F_TYPE_EQUAL(sfs->f_type, BPF_FS_MAGIC) ||
313 F_TYPE_EQUAL(sfs->f_type, TRACEFS_MAGIC) ||
314 F_TYPE_EQUAL(sfs->f_type, SYSFS_MAGIC);
88cd066e
LP
315}
316
4aeb20f5 317static int recurse_fd(int fd, bool donate_fd, const struct stat *st, uid_t shift, bool is_toplevel) {
3603efde 318 _cleanup_closedir_ DIR *d = NULL;
7336138e 319 bool changed = false;
3603efde 320 struct statfs sfs;
7336138e
LP
321 int r;
322
323 assert(fd >= 0);
324
3603efde
LP
325 if (fstatfs(fd, &sfs) < 0)
326 return -errno;
327
328 /* We generally want to permit crossing of mount boundaries when patching the UIDs/GIDs. However, we probably
329 * shouldn't do this for /proc and /sys if that is already mounted into place. Hence, let's stop the recursion
330 * when we hit procfs, sysfs or some other special file systems. */
331
332 r = is_fs_fully_userns_compatible(&sfs);
88cd066e
LP
333 if (r < 0)
334 goto finish;
335 if (r > 0) {
336 r = 0; /* don't recurse */
337 goto finish;
338 }
339
3603efde
LP
340 /* Also, if we hit a read-only file system, then don't bother, skip the whole subtree */
341 if ((sfs.f_flags & ST_RDONLY) ||
342 access_fd(fd, W_OK) == -EROFS)
343 goto read_only;
7336138e
LP
344
345 if (S_ISDIR(st->st_mode)) {
7336138e
LP
346 struct dirent *de;
347
348 if (!donate_fd) {
349 int copy;
350
351 copy = fcntl(fd, F_DUPFD_CLOEXEC, 3);
88cd066e
LP
352 if (copy < 0) {
353 r = -errno;
354 goto finish;
355 }
7336138e
LP
356
357 fd = copy;
358 donate_fd = true;
359 }
360
361 d = fdopendir(fd);
362 if (!d) {
363 r = -errno;
364 goto finish;
365 }
366 fd = -1;
367
368 FOREACH_DIRENT_ALL(de, d, r = -errno; goto finish) {
369 struct stat fst;
370
49bfc877 371 if (dot_or_dot_dot(de->d_name))
7336138e
LP
372 continue;
373
374 if (fstatat(dirfd(d), de->d_name, &fst, AT_SYMLINK_NOFOLLOW) < 0) {
375 r = -errno;
376 goto finish;
377 }
378
379 if (S_ISDIR(fst.st_mode)) {
380 int subdir_fd;
381
382 subdir_fd = openat(dirfd(d), de->d_name, O_RDONLY|O_NONBLOCK|O_DIRECTORY|O_CLOEXEC|O_NOFOLLOW|O_NOATIME);
383 if (subdir_fd < 0) {
384 r = -errno;
385 goto finish;
386
387 }
388
4aeb20f5 389 r = recurse_fd(subdir_fd, true, &fst, shift, false);
7336138e
LP
390 if (r < 0)
391 goto finish;
392 if (r > 0)
393 changed = true;
394
395 } else {
396 r = patch_fd(dirfd(d), de->d_name, &fst, shift);
397 if (r < 0)
398 goto finish;
399 if (r > 0)
400 changed = true;
401 }
402 }
403 }
404
3603efde
LP
405 /* After we descended, also patch the directory itself. It's key to do this in this order so that the top-level
406 * directory is patched as very last object in the tree, so that we can use it as quick indicator whether the
407 * tree is properly chown()ed already. */
408 r = patch_fd(d ? dirfd(d) : fd, NULL, st, shift);
409 if (r == -EROFS)
410 goto read_only;
411 if (r > 0)
412 changed = true;
413
7336138e 414 r = changed;
3603efde
LP
415 goto finish;
416
417read_only:
418 if (!is_toplevel) {
419 _cleanup_free_ char *name = NULL;
420
421 /* When we hit a ready-only subtree we simply skip it, but log about it. */
422 (void) fd_get_path(fd, &name);
423 log_debug("Skippping read-only file or directory %s.", strna(name));
424 r = changed;
425 }
7336138e
LP
426
427finish:
428 if (donate_fd)
429 safe_close(fd);
430
431 return r;
432}
433
434static int fd_patch_uid_internal(int fd, bool donate_fd, uid_t shift, uid_t range) {
435 struct stat st;
436 int r;
437
438 assert(fd >= 0);
439
440 /* Recursively adjusts the UID/GIDs of all files of a directory tree. This is used to automatically fix up an
441 * OS tree to the used user namespace UID range. Note that this automatic adjustment only works for UID ranges
442 * following the concept that the upper 16bit of a UID identify the container, and the lower 16bit are the actual
443 * UID within the container. */
444
445 if ((shift & 0xFFFF) != 0) {
446 /* We only support containers where the shift starts at a 2^16 boundary */
447 r = -EOPNOTSUPP;
448 goto finish;
449 }
450
3603efde
LP
451 if (shift == UID_BUSY_BASE) {
452 r = -EINVAL;
453 goto finish;
454 }
455
7336138e
LP
456 if (range != 0x10000) {
457 /* We only support containers with 16bit UID ranges for the patching logic */
458 r = -EOPNOTSUPP;
459 goto finish;
460 }
461
462 if (fstat(fd, &st) < 0) {
463 r = -errno;
464 goto finish;
465 }
466
467 if ((uint32_t) st.st_uid >> 16 != (uint32_t) st.st_gid >> 16) {
468 /* We only support containers where the uid/gid container ID match */
469 r = -EBADE;
470 goto finish;
471 }
472
473 /* Try to detect if the range is already right. Of course, this a pretty drastic optimization, as we assume
474 * that if the top-level dir has the right upper 16bit assigned, then everything below will have too... */
475 if (((uint32_t) (st.st_uid ^ shift) >> 16) == 0)
476 return 0;
477
3603efde
LP
478 /* Before we start recursively chowning, mark the top-level dir as "busy" by chowning it to the "busy"
479 * range. Should we be interrupted in the middle of our work, we'll see it owned by this user and will start
480 * chown()ing it again, unconditionally, as the busy UID is not a valid UID we'd everpick for ourselves. */
481
482 if ((st.st_uid & UID_BUSY_MASK) != UID_BUSY_BASE) {
483 if (fchown(fd,
484 UID_BUSY_BASE | (st.st_uid & ~UID_BUSY_MASK),
485 (gid_t) UID_BUSY_BASE | (st.st_gid & ~(gid_t) UID_BUSY_MASK)) < 0) {
486 r = -errno;
487 goto finish;
488 }
489 }
490
4aeb20f5 491 return recurse_fd(fd, donate_fd, &st, shift, true);
7336138e
LP
492
493finish:
494 if (donate_fd)
495 safe_close(fd);
496
497 return r;
498}
499
/* Patches the UIDs/GIDs of the directory tree 'fd' refers to. The fd is not donated, i.e. it remains owned by the
 * caller. Return values are those of fd_patch_uid_internal(). */
int fd_patch_uid(int fd, uid_t shift, uid_t range) {
        const bool donate_fd = false;

        return fd_patch_uid_internal(fd, donate_fd, shift, range);
}
503
504int path_patch_uid(const char *path, uid_t shift, uid_t range) {
505 int fd;
506
507 fd = open(path, O_RDONLY|O_NONBLOCK|O_DIRECTORY|O_CLOEXEC|O_NOFOLLOW|O_NOATIME);
508 if (fd < 0)
509 return -errno;
510
511 return fd_patch_uid_internal(fd, true, shift, range);
512}