/* SPDX-License-Identifier: LGPL-2.1+ */
/***
  This file is part of systemd.

  Copyright 2016 Lennart Poettering

  systemd is free software; you can redistribute it and/or modify it
  under the terms of the GNU Lesser General Public License as published by
  the Free Software Foundation; either version 2.1 of the License, or
  (at your option) any later version.

  systemd is distributed in the hope that it will be useful, but
  WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  Lesser General Public License for more details.

  You should have received a copy of the GNU Lesser General Public License
  along with systemd; If not, see <http://www.gnu.org/licenses/>.
***/

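/* Implements the recursive UID/GID shifting ("patching") of an OS tree that nspawn performs so that a
 * container image can be used with a user namespace UID range; see the comment in fd_patch_uid_internal()
 * below for the UID layout this assumes. */
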
#include <fcntl.h>
#include <linux/magic.h>
#if HAVE_ACL
#include <sys/acl.h>
#endif
#include <sys/stat.h>
#include <sys/statvfs.h>
#include <sys/vfs.h>
#include <unistd.h>

#include "acl-util.h"
#include "dirent-util.h"
#include "fd-util.h"
#include "fs-util.h"
#include "missing.h"
#include "nspawn-def.h"
#include "nspawn-patch-uid.h"
#include "stat-util.h"
#include "stdio-util.h"
#include "string-util.h"
#include "strv.h"
#include "user-util.h"

#if HAVE_ACL

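/* Reads the ACL of the specified type, either of fd itself or of the entry "name" below it. Children are
 * pinned with an O_PATH fd and addressed via /proc/self/fd/, since libacl offers no *at() variants; the
 * default ACL of fd itself also has to go through /proc/self/fd/, as acl_get_fd() only ever returns the
 * access ACL. */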
static int get_acl(int fd, const char *name, acl_type_t type, acl_t *ret) {
        char procfs_path[STRLEN("/proc/self/fd/") + DECIMAL_STR_MAX(int) + 1];
        acl_t acl;

        assert(fd >= 0);
        assert(ret);

        if (name) {
                _cleanup_close_ int child_fd = -1;

                child_fd = openat(fd, name, O_PATH|O_CLOEXEC|O_NOFOLLOW);
                if (child_fd < 0)
                        return -errno;

                xsprintf(procfs_path, "/proc/self/fd/%i", child_fd);
                acl = acl_get_file(procfs_path, type);
        } else if (type == ACL_TYPE_ACCESS)
                acl = acl_get_fd(fd);
        else {
                xsprintf(procfs_path, "/proc/self/fd/%i", fd);
                acl = acl_get_file(procfs_path, type);
        }
        if (!acl)
                return -errno;

        *ret = acl;
        return 0;
}

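/* Counterpart to get_acl(): writes an ACL back to fd or to the entry "name" below it, again going through
 * /proc/self/fd/ wherever acl_set_fd() (which only handles the access ACL) cannot be used. */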
static int set_acl(int fd, const char *name, acl_type_t type, acl_t acl) {
        char procfs_path[STRLEN("/proc/self/fd/") + DECIMAL_STR_MAX(int) + 1];
        int r;

        assert(fd >= 0);
        assert(acl);

        if (name) {
                _cleanup_close_ int child_fd = -1;

                child_fd = openat(fd, name, O_PATH|O_CLOEXEC|O_NOFOLLOW);
                if (child_fd < 0)
                        return -errno;

                xsprintf(procfs_path, "/proc/self/fd/%i", child_fd);
                r = acl_set_file(procfs_path, type, acl);
        } else if (type == ACL_TYPE_ACCESS)
                r = acl_set_fd(fd, acl);
        else {
                xsprintf(procfs_path, "/proc/self/fd/%i", fd);
                r = acl_set_file(procfs_path, type, acl);
        }
        if (r < 0)
                return -errno;

        return 0;
}

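/* Remaps the qualifier of every ACL_USER/ACL_GROUP entry into the UID range starting at "shift": the upper
 * 16 bits are replaced, the lower 16 bits (the UID/GID within the container) are kept. A modified copy is
 * only allocated if at least one entry actually needs changing; returns > 0 with the copy in *ret in that
 * case, otherwise 0 with *ret set to NULL, meaning the original ACL can be used as-is. */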
static int shift_acl(acl_t acl, uid_t shift, acl_t *ret) {
        _cleanup_(acl_freep) acl_t copy = NULL;
        acl_entry_t i;
        int r;

        assert(acl);
        assert(ret);

        r = acl_get_entry(acl, ACL_FIRST_ENTRY, &i);
        if (r < 0)
                return -errno;
        while (r > 0) {
                uid_t *old_uid, new_uid;
                bool modify = false;
                acl_tag_t tag;

                if (acl_get_tag_type(i, &tag) < 0)
                        return -errno;

                if (IN_SET(tag, ACL_USER, ACL_GROUP)) {

                        /* We don't distinguish here between uid_t and gid_t; let's make sure the compiler checks that
                         * this is actually OK */
                        assert_cc(sizeof(uid_t) == sizeof(gid_t));

                        old_uid = acl_get_qualifier(i);
                        if (!old_uid)
                                return -errno;

                        new_uid = shift | (*old_uid & UINT32_C(0xFFFF));
                        if (!uid_is_valid(new_uid))
                                return -EINVAL;

                        modify = new_uid != *old_uid;
                        if (modify && !copy) {
                                int n;

                                /* There's no copy of the ACL yet? If so, let's create one, and start the loop from the
                                 * beginning, so that this time we copy all entries, starting from the first. */

                                n = acl_entries(acl);
                                if (n < 0)
                                        return -errno;

                                copy = acl_init(n);
                                if (!copy)
                                        return -errno;

                                /* Seek back to the beginning */
                                r = acl_get_entry(acl, ACL_FIRST_ENTRY, &i);
                                if (r < 0)
                                        return -errno;
                                continue;
                        }
                }

                if (copy) {
                        acl_entry_t new_entry;

                        if (acl_create_entry(&copy, &new_entry) < 0)
                                return -errno;

                        if (acl_copy_entry(new_entry, i) < 0)
                                return -errno;

                        if (modify)
                                if (acl_set_qualifier(new_entry, &new_uid) < 0)
                                        return -errno;
                }

                r = acl_get_entry(acl, ACL_NEXT_ENTRY, &i);
                if (r < 0)
                        return -errno;
        }

        *ret = TAKE_PTR(copy);

        return !!*ret;
}

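/* Applies shift_acl() to the access ACL of the given file, and for directories also to the default ACL.
 * Returns > 0 if either ACL was updated. (When ACL support is compiled out, the stub below makes this a
 * no-op.) */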
static int patch_acls(int fd, const char *name, const struct stat *st, uid_t shift) {
        _cleanup_(acl_freep) acl_t acl = NULL, shifted = NULL;
        bool changed = false;
        int r;

        assert(fd >= 0);
        assert(st);

        /* ACLs are not supported on symlinks; there's no point in trying */
        if (S_ISLNK(st->st_mode))
                return 0;

        r = get_acl(fd, name, ACL_TYPE_ACCESS, &acl);
        if (r == -EOPNOTSUPP)
                return 0;
        if (r < 0)
                return r;

        r = shift_acl(acl, shift, &shifted);
        if (r < 0)
                return r;
        if (r > 0) {
                r = set_acl(fd, name, ACL_TYPE_ACCESS, shifted);
                if (r < 0)
                        return r;

                changed = true;
        }

        if (S_ISDIR(st->st_mode)) {
                acl_free(acl);
                acl_free(shifted);

                acl = shifted = NULL;

                r = get_acl(fd, name, ACL_TYPE_DEFAULT, &acl);
                if (r < 0)
                        return r;

                r = shift_acl(acl, shift, &shifted);
                if (r < 0)
                        return r;
                if (r > 0) {
                        r = set_acl(fd, name, ACL_TYPE_DEFAULT, shifted);
                        if (r < 0)
                                return r;

                        changed = true;
                }
        }

        return changed;
}

#else

static int patch_acls(int fd, const char *name, const struct stat *st, uid_t shift) {
        return 0;
}

#endif

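/* Chowns a single file (fd itself or the entry "name" below it) into the target UID range, keeping the lower
 * 16 bits of the original owner/group, restores the original mode afterwards (the kernel alters the mode in
 * some cases of chown()), and shifts the file's ACLs along with it. Returns > 0 if anything was modified. */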
static int patch_fd(int fd, const char *name, const struct stat *st, uid_t shift) {
        uid_t new_uid;
        gid_t new_gid;
        bool changed = false;
        int r;

        assert(fd >= 0);
        assert(st);

        new_uid =         shift | (st->st_uid & UINT32_C(0xFFFF));
        new_gid = (gid_t) shift | (st->st_gid & UINT32_C(0xFFFF));

        if (!uid_is_valid(new_uid) || !gid_is_valid(new_gid))
                return -EINVAL;

        if (st->st_uid != new_uid || st->st_gid != new_gid) {
                if (name)
                        r = fchownat(fd, name, new_uid, new_gid, AT_SYMLINK_NOFOLLOW);
                else
                        r = fchown(fd, new_uid, new_gid);
                if (r < 0)
                        return -errno;

                /* The Linux kernel alters the mode in some cases of chown(). Let's undo this. */
                if (name) {
                        if (!S_ISLNK(st->st_mode))
                                r = fchmodat(fd, name, st->st_mode, 0);
                        else /* AT_SYMLINK_NOFOLLOW is not available for fchmodat() */
                                r = 0;
                } else
                        r = fchmod(fd, st->st_mode);
                if (r < 0)
                        return -errno;

                changed = true;
        }

        r = patch_acls(fd, name, st, shift);
        if (r < 0)
                return r;

        return r > 0 || changed;
}

/*
 * Check if the filesystem is fully compatible with user namespaces or
 * UID/GID patching. Some filesystems in this list can be fully mounted inside
 * user namespaces, however their inodes may relate to host resources or may
 * only be valid in the global user namespace; therefore no patching should be
 * applied.
 */
static int is_fs_fully_userns_compatible(const struct statfs *sfs) {

        assert(sfs);

        return F_TYPE_EQUAL(sfs->f_type, BINFMTFS_MAGIC) ||
               F_TYPE_EQUAL(sfs->f_type, CGROUP_SUPER_MAGIC) ||
               F_TYPE_EQUAL(sfs->f_type, CGROUP2_SUPER_MAGIC) ||
               F_TYPE_EQUAL(sfs->f_type, DEBUGFS_MAGIC) ||
               F_TYPE_EQUAL(sfs->f_type, DEVPTS_SUPER_MAGIC) ||
               F_TYPE_EQUAL(sfs->f_type, EFIVARFS_MAGIC) ||
               F_TYPE_EQUAL(sfs->f_type, HUGETLBFS_MAGIC) ||
               F_TYPE_EQUAL(sfs->f_type, MQUEUE_MAGIC) ||
               F_TYPE_EQUAL(sfs->f_type, PROC_SUPER_MAGIC) ||
               F_TYPE_EQUAL(sfs->f_type, PSTOREFS_MAGIC) ||
               F_TYPE_EQUAL(sfs->f_type, SELINUX_MAGIC) ||
               F_TYPE_EQUAL(sfs->f_type, SMACK_MAGIC) ||
               F_TYPE_EQUAL(sfs->f_type, SECURITYFS_MAGIC) ||
               F_TYPE_EQUAL(sfs->f_type, BPF_FS_MAGIC) ||
               F_TYPE_EQUAL(sfs->f_type, TRACEFS_MAGIC) ||
               F_TYPE_EQUAL(sfs->f_type, SYSFS_MAGIC);
}

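/* Recursively patches everything below fd, and then fd itself. If donate_fd is true, ownership of fd passes
 * to this function and it is closed before returning, on success and failure alike. Patching the top-level
 * directory last means its owner only ends up in the target range once the whole tree is done, which
 * fd_patch_uid_internal() below relies on as a quick "already converted" check. */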
static int recurse_fd(int fd, bool donate_fd, const struct stat *st, uid_t shift, bool is_toplevel) {
        _cleanup_closedir_ DIR *d = NULL;
        bool changed = false;
        struct statfs sfs;
        int r;

        assert(fd >= 0);

        if (fstatfs(fd, &sfs) < 0)
                return -errno;

        /* We generally want to permit crossing of mount boundaries when patching the UIDs/GIDs. However, we probably
         * shouldn't do this for /proc and /sys if that is already mounted into place. Hence, let's stop the recursion
         * when we hit procfs, sysfs or some other special file systems. */

        r = is_fs_fully_userns_compatible(&sfs);
        if (r < 0)
                goto finish;
        if (r > 0) {
                r = 0; /* don't recurse */
                goto finish;
        }

        /* Also, if we hit a read-only file system, then don't bother, skip the whole subtree */
        if ((sfs.f_flags & ST_RDONLY) ||
            access_fd(fd, W_OK) == -EROFS)
                goto read_only;

        if (S_ISDIR(st->st_mode)) {
                struct dirent *de;

                if (!donate_fd) {
                        int copy;

                        copy = fcntl(fd, F_DUPFD_CLOEXEC, 3);
                        if (copy < 0) {
                                r = -errno;
                                goto finish;
                        }

                        fd = copy;
                        donate_fd = true;
                }

                d = fdopendir(fd);
                if (!d) {
                        r = -errno;
                        goto finish;
                }
                fd = -1;

                FOREACH_DIRENT_ALL(de, d, r = -errno; goto finish) {
                        struct stat fst;

                        if (dot_or_dot_dot(de->d_name))
                                continue;

                        if (fstatat(dirfd(d), de->d_name, &fst, AT_SYMLINK_NOFOLLOW) < 0) {
                                r = -errno;
                                goto finish;
                        }

                        if (S_ISDIR(fst.st_mode)) {
                                int subdir_fd;

                                subdir_fd = openat(dirfd(d), de->d_name, O_RDONLY|O_NONBLOCK|O_DIRECTORY|O_CLOEXEC|O_NOFOLLOW|O_NOATIME);
                                if (subdir_fd < 0) {
                                        r = -errno;
                                        goto finish;
                                }

                                r = recurse_fd(subdir_fd, true, &fst, shift, false);
                                if (r < 0)
                                        goto finish;
                                if (r > 0)
                                        changed = true;

                        } else {
                                r = patch_fd(dirfd(d), de->d_name, &fst, shift);
                                if (r < 0)
                                        goto finish;
                                if (r > 0)
                                        changed = true;
                        }
                }
        }

        /* After we descended, also patch the directory itself. It's key to do this in this order, so that the
         * top-level directory is patched as the very last object in the tree, which lets us use it as a quick
         * indicator whether the tree is already properly chown()ed. */
        r = patch_fd(d ? dirfd(d) : fd, NULL, st, shift);
        if (r == -EROFS)
                goto read_only;
        if (r > 0)
                changed = true;

        r = changed;
        goto finish;

read_only:
        if (!is_toplevel) {
                _cleanup_free_ char *name = NULL;

                /* When we hit a read-only subtree we simply skip it, but log about it. */
                (void) fd_get_path(fd, &name);
                log_debug("Skipping read-only file or directory %s.", strna(name));
                r = changed;
        }

finish:
        if (donate_fd)
                safe_close(fd);

        return r;
}

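/* Common implementation behind fd_patch_uid() and path_patch_uid(). If donate_fd is true, the passed fd is
 * closed before returning. Returns > 0 if any file was changed, 0 if nothing needed to be (or could be)
 * changed, and a negative errno-style error otherwise. */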
static int fd_patch_uid_internal(int fd, bool donate_fd, uid_t shift, uid_t range) {
        struct stat st;
        int r;

        assert(fd >= 0);

        /* Recursively adjusts the UID/GIDs of all files of a directory tree. This is used to automatically fix up an
         * OS tree to the used user namespace UID range. Note that this automatic adjustment only works for UID ranges
         * following the concept that the upper 16 bits of a UID identify the container, and the lower 16 bits are the
         * actual UID within the container. */

        if ((shift & 0xFFFF) != 0) {
                /* We only support containers where the shift starts at a 2^16 boundary */
                r = -EOPNOTSUPP;
                goto finish;
        }

        if (shift == UID_BUSY_BASE) {
                r = -EINVAL;
                goto finish;
        }

        if (range != 0x10000) {
                /* We only support containers with 16bit UID ranges for the patching logic */
                r = -EOPNOTSUPP;
                goto finish;
        }

        if (fstat(fd, &st) < 0) {
                r = -errno;
                goto finish;
        }

        if ((uint32_t) st.st_uid >> 16 != (uint32_t) st.st_gid >> 16) {
                /* We only support containers where the uid/gid container IDs match */
                r = -EBADE;
                goto finish;
        }

        /* Try to detect if the range is already right. Of course, this is a pretty drastic optimization, as we assume
         * that if the top-level dir has the right upper 16 bits assigned, then everything below will have them too... */
        if (((uint32_t) (st.st_uid ^ shift) >> 16) == 0) {
                r = 0;
                goto finish;
        }

        /* Before we start recursively chowning, mark the top-level dir as "busy" by chowning it to the "busy"
         * range. Should we be interrupted in the middle of our work, we'll see it owned by this user and will start
         * chown()ing it again, unconditionally, as the busy UID is not a valid UID we'd ever pick for ourselves. */

        if ((st.st_uid & UID_BUSY_MASK) != UID_BUSY_BASE) {
                if (fchown(fd,
                           UID_BUSY_BASE | (st.st_uid & ~UID_BUSY_MASK),
                           (gid_t) UID_BUSY_BASE | (st.st_gid & ~(gid_t) UID_BUSY_MASK)) < 0) {
                        r = -errno;
                        goto finish;
                }
        }

        return recurse_fd(fd, donate_fd, &st, shift, true);

finish:
        if (donate_fd)
                safe_close(fd);

        return r;
}

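/* Public entry points: fd_patch_uid() leaves the caller's fd open, while path_patch_uid() opens the
 * directory itself and has the fd closed internally (donate_fd). As a purely illustrative example (the path
 * and UID values are hypothetical), a caller shifting an OS tree into the 64K UID block starting at 0x10000
 * might do:
 *
 *         r = path_patch_uid("/var/lib/machines/mycontainer", UINT32_C(0x10000), UINT32_C(0x10000));
 *         if (r < 0)
 *                 log_error_errno(r, "Failed to shift UIDs: %m");
 *         else if (r > 0)
 *                 log_info("UIDs of the tree were adjusted.");
 */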
int fd_patch_uid(int fd, uid_t shift, uid_t range) {
        return fd_patch_uid_internal(fd, false, shift, range);
}

int path_patch_uid(const char *path, uid_t shift, uid_t range) {
        int fd;

        fd = open(path, O_RDONLY|O_NONBLOCK|O_DIRECTORY|O_CLOEXEC|O_NOFOLLOW|O_NOATIME);
        if (fd < 0)
                return -errno;

        return fd_patch_uid_internal(fd, true, shift, range);
}