]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/nspawn/nspawn-patch-uid.c
Merge pull request #11827 from keszybz/pkgconfig-variables
[thirdparty/systemd.git] / src / nspawn / nspawn-patch-uid.c
CommitLineData
53e1b683 1/* SPDX-License-Identifier: LGPL-2.1+ */
7336138e
LP
2
3#include <fcntl.h>
88cd066e 4#include <linux/magic.h>
349cc4a5 5#if HAVE_ACL
7336138e
LP
6#include <sys/acl.h>
7#endif
8#include <sys/stat.h>
3603efde 9#include <sys/statvfs.h>
88cd066e 10#include <sys/vfs.h>
7336138e
LP
11#include <unistd.h>
12
13#include "acl-util.h"
14#include "dirent-util.h"
15#include "fd-util.h"
3603efde 16#include "fs-util.h"
88cd066e 17#include "missing.h"
3603efde 18#include "nspawn-def.h"
7336138e 19#include "nspawn-patch-uid.h"
88cd066e 20#include "stat-util.h"
7336138e
LP
21#include "stdio-util.h"
22#include "string-util.h"
23#include "strv.h"
24#include "user-util.h"
25
349cc4a5 26#if HAVE_ACL
7336138e
LP
27
28static int get_acl(int fd, const char *name, acl_type_t type, acl_t *ret) {
fbd0b64f 29 char procfs_path[STRLEN("/proc/self/fd/") + DECIMAL_STR_MAX(int) + 1];
7336138e
LP
30 acl_t acl;
31
32 assert(fd >= 0);
33 assert(ret);
34
35 if (name) {
36 _cleanup_close_ int child_fd = -1;
37
38 child_fd = openat(fd, name, O_PATH|O_CLOEXEC|O_NOFOLLOW);
39 if (child_fd < 0)
40 return -errno;
41
42 xsprintf(procfs_path, "/proc/self/fd/%i", child_fd);
43 acl = acl_get_file(procfs_path, type);
44 } else if (type == ACL_TYPE_ACCESS)
45 acl = acl_get_fd(fd);
46 else {
47 xsprintf(procfs_path, "/proc/self/fd/%i", fd);
48 acl = acl_get_file(procfs_path, type);
49 }
50 if (!acl)
51 return -errno;
52
53 *ret = acl;
54 return 0;
55}
56
57static int set_acl(int fd, const char *name, acl_type_t type, acl_t acl) {
fbd0b64f 58 char procfs_path[STRLEN("/proc/self/fd/") + DECIMAL_STR_MAX(int) + 1];
7336138e
LP
59 int r;
60
61 assert(fd >= 0);
62 assert(acl);
63
64 if (name) {
65 _cleanup_close_ int child_fd = -1;
66
67 child_fd = openat(fd, name, O_PATH|O_CLOEXEC|O_NOFOLLOW);
68 if (child_fd < 0)
69 return -errno;
70
71 xsprintf(procfs_path, "/proc/self/fd/%i", child_fd);
72 r = acl_set_file(procfs_path, type, acl);
73 } else if (type == ACL_TYPE_ACCESS)
74 r = acl_set_fd(fd, acl);
75 else {
76 xsprintf(procfs_path, "/proc/self/fd/%i", fd);
77 r = acl_set_file(procfs_path, type, acl);
78 }
79 if (r < 0)
80 return -errno;
81
82 return 0;
83}
84
85static int shift_acl(acl_t acl, uid_t shift, acl_t *ret) {
86 _cleanup_(acl_freep) acl_t copy = NULL;
87 acl_entry_t i;
88 int r;
89
90 assert(acl);
91 assert(ret);
92
93 r = acl_get_entry(acl, ACL_FIRST_ENTRY, &i);
94 if (r < 0)
95 return -errno;
96 while (r > 0) {
97 uid_t *old_uid, new_uid;
98 bool modify = false;
99 acl_tag_t tag;
100
101 if (acl_get_tag_type(i, &tag) < 0)
102 return -errno;
103
104 if (IN_SET(tag, ACL_USER, ACL_GROUP)) {
105
106 /* We don't distuingish here between uid_t and gid_t, let's make sure the compiler checks that
107 * this is actually OK */
108 assert_cc(sizeof(uid_t) == sizeof(gid_t));
109
110 old_uid = acl_get_qualifier(i);
111 if (!old_uid)
112 return -errno;
113
114 new_uid = shift | (*old_uid & UINT32_C(0xFFFF));
115 if (!uid_is_valid(new_uid))
116 return -EINVAL;
117
118 modify = new_uid != *old_uid;
119 if (modify && !copy) {
120 int n;
121
122 /* There's no copy of the ACL yet? if so, let's create one, and start the loop from the
123 * beginning, so that we copy all entries, starting from the first, this time. */
124
125 n = acl_entries(acl);
126 if (n < 0)
127 return -errno;
128
129 copy = acl_init(n);
130 if (!copy)
131 return -errno;
132
133 /* Seek back to the beginning */
134 r = acl_get_entry(acl, ACL_FIRST_ENTRY, &i);
135 if (r < 0)
136 return -errno;
137 continue;
138 }
139 }
140
141 if (copy) {
142 acl_entry_t new_entry;
143
144 if (acl_create_entry(&copy, &new_entry) < 0)
145 return -errno;
146
147 if (acl_copy_entry(new_entry, i) < 0)
148 return -errno;
149
150 if (modify)
151 if (acl_set_qualifier(new_entry, &new_uid) < 0)
152 return -errno;
153 }
154
155 r = acl_get_entry(acl, ACL_NEXT_ENTRY, &i);
156 if (r < 0)
157 return -errno;
158 }
159
1cc6c93a 160 *ret = TAKE_PTR(copy);
7336138e
LP
161
162 return !!*ret;
163}
164
165static int patch_acls(int fd, const char *name, const struct stat *st, uid_t shift) {
166 _cleanup_(acl_freep) acl_t acl = NULL, shifted = NULL;
167 bool changed = false;
168 int r;
169
170 assert(fd >= 0);
171 assert(st);
172
173 /* ACLs are not supported on symlinks, there's no point in trying */
174 if (S_ISLNK(st->st_mode))
175 return 0;
176
177 r = get_acl(fd, name, ACL_TYPE_ACCESS, &acl);
178 if (r == -EOPNOTSUPP)
179 return 0;
180 if (r < 0)
181 return r;
182
183 r = shift_acl(acl, shift, &shifted);
184 if (r < 0)
185 return r;
186 if (r > 0) {
187 r = set_acl(fd, name, ACL_TYPE_ACCESS, shifted);
188 if (r < 0)
189 return r;
190
191 changed = true;
192 }
193
194 if (S_ISDIR(st->st_mode)) {
195 acl_free(acl);
196 acl_free(shifted);
197
198 acl = shifted = NULL;
199
200 r = get_acl(fd, name, ACL_TYPE_DEFAULT, &acl);
201 if (r < 0)
202 return r;
203
204 r = shift_acl(acl, shift, &shifted);
205 if (r < 0)
206 return r;
207 if (r > 0) {
208 r = set_acl(fd, name, ACL_TYPE_DEFAULT, shifted);
209 if (r < 0)
210 return r;
211
212 changed = true;
213 }
214 }
215
216 return changed;
217}
218
219#else
220
/* Variant used when built without ACL support: there is nothing to patch, hence always report "unchanged". */
static int patch_acls(int fd, const char *name, const struct stat *st, uid_t shift) {
        (void) fd;
        (void) name;
        (void) st;
        (void) shift;

        return 0;
}
224
225#endif
226
227static int patch_fd(int fd, const char *name, const struct stat *st, uid_t shift) {
228 uid_t new_uid;
229 gid_t new_gid;
230 bool changed = false;
231 int r;
232
233 assert(fd >= 0);
234 assert(st);
235
236 new_uid = shift | (st->st_uid & UINT32_C(0xFFFF));
237 new_gid = (gid_t) shift | (st->st_gid & UINT32_C(0xFFFF));
238
239 if (!uid_is_valid(new_uid) || !gid_is_valid(new_gid))
240 return -EINVAL;
241
242 if (st->st_uid != new_uid || st->st_gid != new_gid) {
243 if (name)
244 r = fchownat(fd, name, new_uid, new_gid, AT_SYMLINK_NOFOLLOW);
245 else
246 r = fchown(fd, new_uid, new_gid);
247 if (r < 0)
248 return -errno;
249
250 /* The Linux kernel alters the mode in some cases of chown(). Let's undo this. */
0c6aeb46
LP
251 if (name) {
252 if (!S_ISLNK(st->st_mode))
253 r = fchmodat(fd, name, st->st_mode, 0);
254 else /* AT_SYMLINK_NOFOLLOW is not available for fchmodat() */
255 r = 0;
256 } else
7336138e
LP
257 r = fchmod(fd, st->st_mode);
258 if (r < 0)
259 return -errno;
260
261 changed = true;
262 }
263
264 r = patch_acls(fd, name, st, shift);
265 if (r < 0)
266 return r;
267
268 return r > 0 || changed;
269}
270
231bfb1b
DH
271/*
272 * Check if the filesystem is fully compatible with user namespaces or
273 * UID/GID patching. Some filesystems in this list can be fully mounted inside
274 * user namespaces, however their inodes may relate to host resources or only
275 * valid in the global user namespace, therefore no patching should be applied.
276 */
3603efde
LP
277static int is_fs_fully_userns_compatible(const struct statfs *sfs) {
278
279 assert(sfs);
280
281 return F_TYPE_EQUAL(sfs->f_type, BINFMTFS_MAGIC) ||
282 F_TYPE_EQUAL(sfs->f_type, CGROUP_SUPER_MAGIC) ||
283 F_TYPE_EQUAL(sfs->f_type, CGROUP2_SUPER_MAGIC) ||
284 F_TYPE_EQUAL(sfs->f_type, DEBUGFS_MAGIC) ||
285 F_TYPE_EQUAL(sfs->f_type, DEVPTS_SUPER_MAGIC) ||
286 F_TYPE_EQUAL(sfs->f_type, EFIVARFS_MAGIC) ||
287 F_TYPE_EQUAL(sfs->f_type, HUGETLBFS_MAGIC) ||
288 F_TYPE_EQUAL(sfs->f_type, MQUEUE_MAGIC) ||
289 F_TYPE_EQUAL(sfs->f_type, PROC_SUPER_MAGIC) ||
290 F_TYPE_EQUAL(sfs->f_type, PSTOREFS_MAGIC) ||
291 F_TYPE_EQUAL(sfs->f_type, SELINUX_MAGIC) ||
292 F_TYPE_EQUAL(sfs->f_type, SMACK_MAGIC) ||
293 F_TYPE_EQUAL(sfs->f_type, SECURITYFS_MAGIC) ||
294 F_TYPE_EQUAL(sfs->f_type, BPF_FS_MAGIC) ||
295 F_TYPE_EQUAL(sfs->f_type, TRACEFS_MAGIC) ||
296 F_TYPE_EQUAL(sfs->f_type, SYSFS_MAGIC);
88cd066e
LP
297}
298
4aeb20f5 299static int recurse_fd(int fd, bool donate_fd, const struct stat *st, uid_t shift, bool is_toplevel) {
3603efde 300 _cleanup_closedir_ DIR *d = NULL;
7336138e 301 bool changed = false;
3603efde 302 struct statfs sfs;
7336138e
LP
303 int r;
304
305 assert(fd >= 0);
306
3603efde
LP
307 if (fstatfs(fd, &sfs) < 0)
308 return -errno;
309
310 /* We generally want to permit crossing of mount boundaries when patching the UIDs/GIDs. However, we probably
311 * shouldn't do this for /proc and /sys if that is already mounted into place. Hence, let's stop the recursion
312 * when we hit procfs, sysfs or some other special file systems. */
313
314 r = is_fs_fully_userns_compatible(&sfs);
88cd066e
LP
315 if (r < 0)
316 goto finish;
317 if (r > 0) {
318 r = 0; /* don't recurse */
319 goto finish;
320 }
321
3603efde
LP
322 /* Also, if we hit a read-only file system, then don't bother, skip the whole subtree */
323 if ((sfs.f_flags & ST_RDONLY) ||
324 access_fd(fd, W_OK) == -EROFS)
325 goto read_only;
7336138e
LP
326
327 if (S_ISDIR(st->st_mode)) {
7336138e
LP
328 struct dirent *de;
329
330 if (!donate_fd) {
331 int copy;
332
333 copy = fcntl(fd, F_DUPFD_CLOEXEC, 3);
88cd066e
LP
334 if (copy < 0) {
335 r = -errno;
336 goto finish;
337 }
7336138e
LP
338
339 fd = copy;
340 donate_fd = true;
341 }
342
343 d = fdopendir(fd);
344 if (!d) {
345 r = -errno;
346 goto finish;
347 }
348 fd = -1;
349
350 FOREACH_DIRENT_ALL(de, d, r = -errno; goto finish) {
351 struct stat fst;
352
49bfc877 353 if (dot_or_dot_dot(de->d_name))
7336138e
LP
354 continue;
355
356 if (fstatat(dirfd(d), de->d_name, &fst, AT_SYMLINK_NOFOLLOW) < 0) {
357 r = -errno;
358 goto finish;
359 }
360
361 if (S_ISDIR(fst.st_mode)) {
362 int subdir_fd;
363
364 subdir_fd = openat(dirfd(d), de->d_name, O_RDONLY|O_NONBLOCK|O_DIRECTORY|O_CLOEXEC|O_NOFOLLOW|O_NOATIME);
365 if (subdir_fd < 0) {
366 r = -errno;
367 goto finish;
368
369 }
370
4aeb20f5 371 r = recurse_fd(subdir_fd, true, &fst, shift, false);
7336138e
LP
372 if (r < 0)
373 goto finish;
374 if (r > 0)
375 changed = true;
376
377 } else {
378 r = patch_fd(dirfd(d), de->d_name, &fst, shift);
379 if (r < 0)
380 goto finish;
381 if (r > 0)
382 changed = true;
383 }
384 }
385 }
386
3603efde
LP
387 /* After we descended, also patch the directory itself. It's key to do this in this order so that the top-level
388 * directory is patched as very last object in the tree, so that we can use it as quick indicator whether the
389 * tree is properly chown()ed already. */
390 r = patch_fd(d ? dirfd(d) : fd, NULL, st, shift);
391 if (r == -EROFS)
392 goto read_only;
393 if (r > 0)
394 changed = true;
395
7336138e 396 r = changed;
3603efde
LP
397 goto finish;
398
399read_only:
400 if (!is_toplevel) {
401 _cleanup_free_ char *name = NULL;
402
403 /* When we hit a ready-only subtree we simply skip it, but log about it. */
404 (void) fd_get_path(fd, &name);
405 log_debug("Skippping read-only file or directory %s.", strna(name));
406 r = changed;
407 }
7336138e
LP
408
409finish:
410 if (donate_fd)
411 safe_close(fd);
412
413 return r;
414}
415
416static int fd_patch_uid_internal(int fd, bool donate_fd, uid_t shift, uid_t range) {
417 struct stat st;
418 int r;
419
420 assert(fd >= 0);
421
422 /* Recursively adjusts the UID/GIDs of all files of a directory tree. This is used to automatically fix up an
423 * OS tree to the used user namespace UID range. Note that this automatic adjustment only works for UID ranges
424 * following the concept that the upper 16bit of a UID identify the container, and the lower 16bit are the actual
425 * UID within the container. */
426
427 if ((shift & 0xFFFF) != 0) {
428 /* We only support containers where the shift starts at a 2^16 boundary */
429 r = -EOPNOTSUPP;
430 goto finish;
431 }
432
3603efde
LP
433 if (shift == UID_BUSY_BASE) {
434 r = -EINVAL;
435 goto finish;
436 }
437
7336138e
LP
438 if (range != 0x10000) {
439 /* We only support containers with 16bit UID ranges for the patching logic */
440 r = -EOPNOTSUPP;
441 goto finish;
442 }
443
444 if (fstat(fd, &st) < 0) {
445 r = -errno;
446 goto finish;
447 }
448
449 if ((uint32_t) st.st_uid >> 16 != (uint32_t) st.st_gid >> 16) {
450 /* We only support containers where the uid/gid container ID match */
451 r = -EBADE;
452 goto finish;
453 }
454
455 /* Try to detect if the range is already right. Of course, this a pretty drastic optimization, as we assume
456 * that if the top-level dir has the right upper 16bit assigned, then everything below will have too... */
457 if (((uint32_t) (st.st_uid ^ shift) >> 16) == 0)
458 return 0;
459
3603efde
LP
460 /* Before we start recursively chowning, mark the top-level dir as "busy" by chowning it to the "busy"
461 * range. Should we be interrupted in the middle of our work, we'll see it owned by this user and will start
462 * chown()ing it again, unconditionally, as the busy UID is not a valid UID we'd everpick for ourselves. */
463
464 if ((st.st_uid & UID_BUSY_MASK) != UID_BUSY_BASE) {
465 if (fchown(fd,
466 UID_BUSY_BASE | (st.st_uid & ~UID_BUSY_MASK),
467 (gid_t) UID_BUSY_BASE | (st.st_gid & ~(gid_t) UID_BUSY_MASK)) < 0) {
468 r = -errno;
469 goto finish;
470 }
471 }
472
4aeb20f5 473 return recurse_fd(fd, donate_fd, &st, shift, true);
7336138e
LP
474
475finish:
476 if (donate_fd)
477 safe_close(fd);
478
479 return r;
480}
481
7336138e
LP
482int path_patch_uid(const char *path, uid_t shift, uid_t range) {
483 int fd;
484
485 fd = open(path, O_RDONLY|O_NONBLOCK|O_DIRECTORY|O_CLOEXEC|O_NOFOLLOW|O_NOATIME);
486 if (fd < 0)
487 return -errno;
488
489 return fd_patch_uid_internal(fd, true, shift, range);
490}