]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/nspawn/nspawn-patch-uid.c
Add SPDX license identifiers to source files under the LGPL
[thirdparty/systemd.git] / src / nspawn / nspawn-patch-uid.c
1 /* SPDX-License-Identifier: LGPL-2.1+ */
2 /***
3 This file is part of systemd.
4
5 Copyright 2016 Lennart Poettering
6
7 systemd is free software; you can redistribute it and/or modify it
8 under the terms of the GNU Lesser General Public License as published by
9 the Free Software Foundation; either version 2.1 of the License, or
10 (at your option) any later version.
11
12 systemd is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 Lesser General Public License for more details.
16
17 You should have received a copy of the GNU Lesser General Public License
18 along with systemd; If not, see <http://www.gnu.org/licenses/>.
19 ***/
20
21 #include <fcntl.h>
22 #include <linux/magic.h>
23 #if HAVE_ACL
24 #include <sys/acl.h>
25 #endif
26 #include <sys/stat.h>
27 #include <sys/statvfs.h>
28 #include <sys/vfs.h>
29 #include <unistd.h>
30
31 #include "acl-util.h"
32 #include "dirent-util.h"
33 #include "fd-util.h"
34 #include "fs-util.h"
35 #include "missing.h"
36 #include "nspawn-def.h"
37 #include "nspawn-patch-uid.h"
38 #include "stat-util.h"
39 #include "stdio-util.h"
40 #include "string-util.h"
41 #include "strv.h"
42 #include "user-util.h"
43
44 #if HAVE_ACL
45
46 static int get_acl(int fd, const char *name, acl_type_t type, acl_t *ret) {
47 char procfs_path[strlen("/proc/self/fd/") + DECIMAL_STR_MAX(int) + 1];
48 acl_t acl;
49
50 assert(fd >= 0);
51 assert(ret);
52
53 if (name) {
54 _cleanup_close_ int child_fd = -1;
55
56 child_fd = openat(fd, name, O_PATH|O_CLOEXEC|O_NOFOLLOW);
57 if (child_fd < 0)
58 return -errno;
59
60 xsprintf(procfs_path, "/proc/self/fd/%i", child_fd);
61 acl = acl_get_file(procfs_path, type);
62 } else if (type == ACL_TYPE_ACCESS)
63 acl = acl_get_fd(fd);
64 else {
65 xsprintf(procfs_path, "/proc/self/fd/%i", fd);
66 acl = acl_get_file(procfs_path, type);
67 }
68 if (!acl)
69 return -errno;
70
71 *ret = acl;
72 return 0;
73 }
74
75 static int set_acl(int fd, const char *name, acl_type_t type, acl_t acl) {
76 char procfs_path[strlen("/proc/self/fd/") + DECIMAL_STR_MAX(int) + 1];
77 int r;
78
79 assert(fd >= 0);
80 assert(acl);
81
82 if (name) {
83 _cleanup_close_ int child_fd = -1;
84
85 child_fd = openat(fd, name, O_PATH|O_CLOEXEC|O_NOFOLLOW);
86 if (child_fd < 0)
87 return -errno;
88
89 xsprintf(procfs_path, "/proc/self/fd/%i", child_fd);
90 r = acl_set_file(procfs_path, type, acl);
91 } else if (type == ACL_TYPE_ACCESS)
92 r = acl_set_fd(fd, acl);
93 else {
94 xsprintf(procfs_path, "/proc/self/fd/%i", fd);
95 r = acl_set_file(procfs_path, type, acl);
96 }
97 if (r < 0)
98 return -errno;
99
100 return 0;
101 }
102
103 static int shift_acl(acl_t acl, uid_t shift, acl_t *ret) {
104 _cleanup_(acl_freep) acl_t copy = NULL;
105 acl_entry_t i;
106 int r;
107
108 assert(acl);
109 assert(ret);
110
111 r = acl_get_entry(acl, ACL_FIRST_ENTRY, &i);
112 if (r < 0)
113 return -errno;
114 while (r > 0) {
115 uid_t *old_uid, new_uid;
116 bool modify = false;
117 acl_tag_t tag;
118
119 if (acl_get_tag_type(i, &tag) < 0)
120 return -errno;
121
122 if (IN_SET(tag, ACL_USER, ACL_GROUP)) {
123
124 /* We don't distuingish here between uid_t and gid_t, let's make sure the compiler checks that
125 * this is actually OK */
126 assert_cc(sizeof(uid_t) == sizeof(gid_t));
127
128 old_uid = acl_get_qualifier(i);
129 if (!old_uid)
130 return -errno;
131
132 new_uid = shift | (*old_uid & UINT32_C(0xFFFF));
133 if (!uid_is_valid(new_uid))
134 return -EINVAL;
135
136 modify = new_uid != *old_uid;
137 if (modify && !copy) {
138 int n;
139
140 /* There's no copy of the ACL yet? if so, let's create one, and start the loop from the
141 * beginning, so that we copy all entries, starting from the first, this time. */
142
143 n = acl_entries(acl);
144 if (n < 0)
145 return -errno;
146
147 copy = acl_init(n);
148 if (!copy)
149 return -errno;
150
151 /* Seek back to the beginning */
152 r = acl_get_entry(acl, ACL_FIRST_ENTRY, &i);
153 if (r < 0)
154 return -errno;
155 continue;
156 }
157 }
158
159 if (copy) {
160 acl_entry_t new_entry;
161
162 if (acl_create_entry(&copy, &new_entry) < 0)
163 return -errno;
164
165 if (acl_copy_entry(new_entry, i) < 0)
166 return -errno;
167
168 if (modify)
169 if (acl_set_qualifier(new_entry, &new_uid) < 0)
170 return -errno;
171 }
172
173 r = acl_get_entry(acl, ACL_NEXT_ENTRY, &i);
174 if (r < 0)
175 return -errno;
176 }
177
178 *ret = copy;
179 copy = NULL;
180
181 return !!*ret;
182 }
183
184 static int patch_acls(int fd, const char *name, const struct stat *st, uid_t shift) {
185 _cleanup_(acl_freep) acl_t acl = NULL, shifted = NULL;
186 bool changed = false;
187 int r;
188
189 assert(fd >= 0);
190 assert(st);
191
192 /* ACLs are not supported on symlinks, there's no point in trying */
193 if (S_ISLNK(st->st_mode))
194 return 0;
195
196 r = get_acl(fd, name, ACL_TYPE_ACCESS, &acl);
197 if (r == -EOPNOTSUPP)
198 return 0;
199 if (r < 0)
200 return r;
201
202 r = shift_acl(acl, shift, &shifted);
203 if (r < 0)
204 return r;
205 if (r > 0) {
206 r = set_acl(fd, name, ACL_TYPE_ACCESS, shifted);
207 if (r < 0)
208 return r;
209
210 changed = true;
211 }
212
213 if (S_ISDIR(st->st_mode)) {
214 acl_free(acl);
215 acl_free(shifted);
216
217 acl = shifted = NULL;
218
219 r = get_acl(fd, name, ACL_TYPE_DEFAULT, &acl);
220 if (r < 0)
221 return r;
222
223 r = shift_acl(acl, shift, &shifted);
224 if (r < 0)
225 return r;
226 if (r > 0) {
227 r = set_acl(fd, name, ACL_TYPE_DEFAULT, shifted);
228 if (r < 0)
229 return r;
230
231 changed = true;
232 }
233 }
234
235 return changed;
236 }
237
238 #else
239
/* Variant used when built without ACL support: there is nothing to shift, hence always report "unchanged". */
static int patch_acls(int fd, const char *name, const struct stat *st, uid_t shift) {
        (void) fd;
        (void) name;
        (void) st;
        (void) shift;

        return 0;
}
243
244 #endif
245
246 static int patch_fd(int fd, const char *name, const struct stat *st, uid_t shift) {
247 uid_t new_uid;
248 gid_t new_gid;
249 bool changed = false;
250 int r;
251
252 assert(fd >= 0);
253 assert(st);
254
255 new_uid = shift | (st->st_uid & UINT32_C(0xFFFF));
256 new_gid = (gid_t) shift | (st->st_gid & UINT32_C(0xFFFF));
257
258 if (!uid_is_valid(new_uid) || !gid_is_valid(new_gid))
259 return -EINVAL;
260
261 if (st->st_uid != new_uid || st->st_gid != new_gid) {
262 if (name)
263 r = fchownat(fd, name, new_uid, new_gid, AT_SYMLINK_NOFOLLOW);
264 else
265 r = fchown(fd, new_uid, new_gid);
266 if (r < 0)
267 return -errno;
268
269 /* The Linux kernel alters the mode in some cases of chown(). Let's undo this. */
270 if (name) {
271 if (!S_ISLNK(st->st_mode))
272 r = fchmodat(fd, name, st->st_mode, 0);
273 else /* AT_SYMLINK_NOFOLLOW is not available for fchmodat() */
274 r = 0;
275 } else
276 r = fchmod(fd, st->st_mode);
277 if (r < 0)
278 return -errno;
279
280 changed = true;
281 }
282
283 r = patch_acls(fd, name, st, shift);
284 if (r < 0)
285 return r;
286
287 return r > 0 || changed;
288 }
289
290 /*
291 * Check if the filesystem is fully compatible with user namespaces or
292 * UID/GID patching. Some filesystems in this list can be fully mounted inside
293 * user namespaces, however their inodes may relate to host resources or only
294 * valid in the global user namespace, therefore no patching should be applied.
295 */
296 static int is_fs_fully_userns_compatible(const struct statfs *sfs) {
297
298 assert(sfs);
299
300 return F_TYPE_EQUAL(sfs->f_type, BINFMTFS_MAGIC) ||
301 F_TYPE_EQUAL(sfs->f_type, CGROUP_SUPER_MAGIC) ||
302 F_TYPE_EQUAL(sfs->f_type, CGROUP2_SUPER_MAGIC) ||
303 F_TYPE_EQUAL(sfs->f_type, DEBUGFS_MAGIC) ||
304 F_TYPE_EQUAL(sfs->f_type, DEVPTS_SUPER_MAGIC) ||
305 F_TYPE_EQUAL(sfs->f_type, EFIVARFS_MAGIC) ||
306 F_TYPE_EQUAL(sfs->f_type, HUGETLBFS_MAGIC) ||
307 F_TYPE_EQUAL(sfs->f_type, MQUEUE_MAGIC) ||
308 F_TYPE_EQUAL(sfs->f_type, PROC_SUPER_MAGIC) ||
309 F_TYPE_EQUAL(sfs->f_type, PSTOREFS_MAGIC) ||
310 F_TYPE_EQUAL(sfs->f_type, SELINUX_MAGIC) ||
311 F_TYPE_EQUAL(sfs->f_type, SMACK_MAGIC) ||
312 F_TYPE_EQUAL(sfs->f_type, SECURITYFS_MAGIC) ||
313 F_TYPE_EQUAL(sfs->f_type, BPF_FS_MAGIC) ||
314 F_TYPE_EQUAL(sfs->f_type, TRACEFS_MAGIC) ||
315 F_TYPE_EQUAL(sfs->f_type, SYSFS_MAGIC);
316 }
317
318 static int recurse_fd(int fd, bool donate_fd, const struct stat *st, uid_t shift, bool is_toplevel) {
319 _cleanup_closedir_ DIR *d = NULL;
320 bool changed = false;
321 struct statfs sfs;
322 int r;
323
324 assert(fd >= 0);
325
326 if (fstatfs(fd, &sfs) < 0)
327 return -errno;
328
329 /* We generally want to permit crossing of mount boundaries when patching the UIDs/GIDs. However, we probably
330 * shouldn't do this for /proc and /sys if that is already mounted into place. Hence, let's stop the recursion
331 * when we hit procfs, sysfs or some other special file systems. */
332
333 r = is_fs_fully_userns_compatible(&sfs);
334 if (r < 0)
335 goto finish;
336 if (r > 0) {
337 r = 0; /* don't recurse */
338 goto finish;
339 }
340
341 /* Also, if we hit a read-only file system, then don't bother, skip the whole subtree */
342 if ((sfs.f_flags & ST_RDONLY) ||
343 access_fd(fd, W_OK) == -EROFS)
344 goto read_only;
345
346 if (S_ISDIR(st->st_mode)) {
347 struct dirent *de;
348
349 if (!donate_fd) {
350 int copy;
351
352 copy = fcntl(fd, F_DUPFD_CLOEXEC, 3);
353 if (copy < 0) {
354 r = -errno;
355 goto finish;
356 }
357
358 fd = copy;
359 donate_fd = true;
360 }
361
362 d = fdopendir(fd);
363 if (!d) {
364 r = -errno;
365 goto finish;
366 }
367 fd = -1;
368
369 FOREACH_DIRENT_ALL(de, d, r = -errno; goto finish) {
370 struct stat fst;
371
372 if (dot_or_dot_dot(de->d_name))
373 continue;
374
375 if (fstatat(dirfd(d), de->d_name, &fst, AT_SYMLINK_NOFOLLOW) < 0) {
376 r = -errno;
377 goto finish;
378 }
379
380 if (S_ISDIR(fst.st_mode)) {
381 int subdir_fd;
382
383 subdir_fd = openat(dirfd(d), de->d_name, O_RDONLY|O_NONBLOCK|O_DIRECTORY|O_CLOEXEC|O_NOFOLLOW|O_NOATIME);
384 if (subdir_fd < 0) {
385 r = -errno;
386 goto finish;
387
388 }
389
390 r = recurse_fd(subdir_fd, true, &fst, shift, false);
391 if (r < 0)
392 goto finish;
393 if (r > 0)
394 changed = true;
395
396 } else {
397 r = patch_fd(dirfd(d), de->d_name, &fst, shift);
398 if (r < 0)
399 goto finish;
400 if (r > 0)
401 changed = true;
402 }
403 }
404 }
405
406 /* After we descended, also patch the directory itself. It's key to do this in this order so that the top-level
407 * directory is patched as very last object in the tree, so that we can use it as quick indicator whether the
408 * tree is properly chown()ed already. */
409 r = patch_fd(d ? dirfd(d) : fd, NULL, st, shift);
410 if (r == -EROFS)
411 goto read_only;
412 if (r > 0)
413 changed = true;
414
415 r = changed;
416 goto finish;
417
418 read_only:
419 if (!is_toplevel) {
420 _cleanup_free_ char *name = NULL;
421
422 /* When we hit a ready-only subtree we simply skip it, but log about it. */
423 (void) fd_get_path(fd, &name);
424 log_debug("Skippping read-only file or directory %s.", strna(name));
425 r = changed;
426 }
427
428 finish:
429 if (donate_fd)
430 safe_close(fd);
431
432 return r;
433 }
434
435 static int fd_patch_uid_internal(int fd, bool donate_fd, uid_t shift, uid_t range) {
436 struct stat st;
437 int r;
438
439 assert(fd >= 0);
440
441 /* Recursively adjusts the UID/GIDs of all files of a directory tree. This is used to automatically fix up an
442 * OS tree to the used user namespace UID range. Note that this automatic adjustment only works for UID ranges
443 * following the concept that the upper 16bit of a UID identify the container, and the lower 16bit are the actual
444 * UID within the container. */
445
446 if ((shift & 0xFFFF) != 0) {
447 /* We only support containers where the shift starts at a 2^16 boundary */
448 r = -EOPNOTSUPP;
449 goto finish;
450 }
451
452 if (shift == UID_BUSY_BASE) {
453 r = -EINVAL;
454 goto finish;
455 }
456
457 if (range != 0x10000) {
458 /* We only support containers with 16bit UID ranges for the patching logic */
459 r = -EOPNOTSUPP;
460 goto finish;
461 }
462
463 if (fstat(fd, &st) < 0) {
464 r = -errno;
465 goto finish;
466 }
467
468 if ((uint32_t) st.st_uid >> 16 != (uint32_t) st.st_gid >> 16) {
469 /* We only support containers where the uid/gid container ID match */
470 r = -EBADE;
471 goto finish;
472 }
473
474 /* Try to detect if the range is already right. Of course, this a pretty drastic optimization, as we assume
475 * that if the top-level dir has the right upper 16bit assigned, then everything below will have too... */
476 if (((uint32_t) (st.st_uid ^ shift) >> 16) == 0)
477 return 0;
478
479 /* Before we start recursively chowning, mark the top-level dir as "busy" by chowning it to the "busy"
480 * range. Should we be interrupted in the middle of our work, we'll see it owned by this user and will start
481 * chown()ing it again, unconditionally, as the busy UID is not a valid UID we'd everpick for ourselves. */
482
483 if ((st.st_uid & UID_BUSY_MASK) != UID_BUSY_BASE) {
484 if (fchown(fd,
485 UID_BUSY_BASE | (st.st_uid & ~UID_BUSY_MASK),
486 (gid_t) UID_BUSY_BASE | (st.st_gid & ~(gid_t) UID_BUSY_MASK)) < 0) {
487 r = -errno;
488 goto finish;
489 }
490 }
491
492 return recurse_fd(fd, donate_fd, &st, shift, true);
493
494 finish:
495 if (donate_fd)
496 safe_close(fd);
497
498 return r;
499 }
500
/* Shifts the UIDs/GIDs of the tree "fd" refers to into the range starting at "shift". The file descriptor
 * remains the property of the caller and is not closed. Returns whatever fd_patch_uid_internal() returns. */
int fd_patch_uid(int fd, uid_t shift, uid_t range) {
        const bool donate_fd = false; /* the caller keeps its fd */

        return fd_patch_uid_internal(fd, donate_fd, shift, range);
}
504
505 int path_patch_uid(const char *path, uid_t shift, uid_t range) {
506 int fd;
507
508 fd = open(path, O_RDONLY|O_NONBLOCK|O_DIRECTORY|O_CLOEXEC|O_NOFOLLOW|O_NOATIME);
509 if (fd < 0)
510 return -errno;
511
512 return fd_patch_uid_internal(fd, true, shift, range);
513 }