]> git.ipfire.org Git - thirdparty/util-linux.git/blob - sys-utils/unshare.c
Merge branch 'apple-midr' of https://github.com/chadmed/util-linux
[thirdparty/util-linux.git] / sys-utils / unshare.c
1 /*
2 * unshare(1) - command-line interface for unshare(2)
3 *
4 * Copyright (C) 2009 Mikhail Gusarov <dottedmag@dottedmag.net>
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License as published by the
8 * Free Software Foundation; either version 2, or (at your option) any
9 * later version.
10 *
11 * This program is distributed in the hope that it will be useful, but
12 * WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License along
17 * with this program; if not, write to the Free Software Foundation, Inc.,
18 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
19 */
20
21 #include <errno.h>
22 #include <getopt.h>
23 #include <poll.h>
24 #include <sched.h>
25 #include <stdio.h>
26 #include <stdlib.h>
27 #include <unistd.h>
28 #include <sys/eventfd.h>
29 #include <sys/wait.h>
30 #include <sys/mount.h>
31 #include <sys/types.h>
32 #include <sys/stat.h>
33 #include <sys/prctl.h>
34 #include <grp.h>
35
36 /* we only need some defines missing in sys/mount.h, no libmount linkage */
37 #include <libmount.h>
38
39 #include "nls.h"
40 #include "c.h"
41 #include "caputils.h"
42 #include "closestream.h"
43 #include "namespace.h"
44 #include "pidfd-utils.h"
45 #include "exec_shell.h"
46 #include "xalloc.h"
47 #include "pathnames.h"
48 #include "all-io.h"
49 #include "signames.h"
50 #include "strutils.h"
51 #include "pwdutils.h"
52
/*
 * Token the parent writes to, and the forked helper expects from, the
 * synchronization descriptor; see fork_and_wait() and sync_with_child().
 * Despite the name, that descriptor is an eventfd rather than a pipe.
 */
#define PIPE_SYNC_BYTE 0x06

/*
 * Propagation applied to "/" in a new mount namespace unless --propagation
 * overrides it. 'private' is kernel default.
 */
#define UNSHARE_PROPAGATION_DEFAULT (MS_REC | MS_PRIVATE)
58
59 /* /proc namespace files and mountpoints for binds */
60 static struct namespace_file {
61 int type; /* CLONE_NEW* */
62 const char *name; /* ns/<type> */
63 const char *target; /* user specified target for bind mount */
64 } namespace_files[] = {
65 { .type = CLONE_NEWUSER, .name = "ns/user" },
66 { .type = CLONE_NEWCGROUP,.name = "ns/cgroup" },
67 { .type = CLONE_NEWIPC, .name = "ns/ipc" },
68 { .type = CLONE_NEWUTS, .name = "ns/uts" },
69 { .type = CLONE_NEWNET, .name = "ns/net" },
70 { .type = CLONE_NEWPID, .name = "ns/pid_for_children" },
71 { .type = CLONE_NEWNS, .name = "ns/mnt" },
72 { .type = CLONE_NEWTIME, .name = "ns/time_for_children" },
73 { .name = NULL }
74 };
75
/*
 * Number of persistent namespaces, i.e. how many times set_ns_target() has
 * stored a bind-mount target in namespace_files[].
 */
static int npersists;
77
/*
 * State of the --setgroups option. SETGROUPS_DENY and SETGROUPS_ALLOW are
 * also used as indexes into setgroups_strings[].
 */
enum {
	SETGROUPS_NONE = -1,	/* initial value in main(): option not given */
	SETGROUPS_DENY = 0,
	SETGROUPS_ALLOW = 1,
};
83
/*
 * Keywords indexed by SETGROUPS_* value. setgroups_str2id() matches the
 * --setgroups argument against them and setgroups_control() writes them to
 * _PATH_PROC_SETGROUPS. SETGROUPS_NONE (-1) has no entry.
 */
static const char *setgroups_strings[] =
{
	[SETGROUPS_DENY] = "deny",
	[SETGROUPS_ALLOW] = "allow"
};
89
90 static int setgroups_str2id(const char *str)
91 {
92 size_t i;
93
94 for (i = 0; i < ARRAY_SIZE(setgroups_strings); i++)
95 if (strcmp(str, setgroups_strings[i]) == 0)
96 return i;
97
98 errx(EXIT_FAILURE, _("unsupported --setgroups argument '%s'"), str);
99 }
100
101 static void setgroups_control(int action)
102 {
103 const char *file = _PATH_PROC_SETGROUPS;
104 const char *cmd;
105 int fd;
106
107 if (action < 0 || (size_t) action >= ARRAY_SIZE(setgroups_strings))
108 return;
109 cmd = setgroups_strings[action];
110
111 fd = open(file, O_WRONLY);
112 if (fd < 0) {
113 if (errno == ENOENT)
114 return;
115 err(EXIT_FAILURE, _("cannot open %s"), file);
116 }
117
118 if (write_all(fd, cmd, strlen(cmd)))
119 err(EXIT_FAILURE, _("write failed %s"), file);
120 close(fd);
121 }
122
/*
 * Write a single-ID mapping to a uid_map/gid_map style file.
 *
 * The line written to @file is "<from> <to> 1". main() passes the ID wanted
 * inside the namespace as @from and the caller's effective ID as @to.
 * Failure to open or write @file terminates the program.
 */
static void map_id(const char *file, uint32_t from, uint32_t to)
{
	char *entry = NULL;
	int mapfd = open(file, O_WRONLY);

	if (mapfd < 0)
		err(EXIT_FAILURE, _("cannot open %s"), file);

	xasprintf(&entry, "%u %u 1", from, to);
	if (write_all(mapfd, entry, strlen(entry)) != 0)
		err(EXIT_FAILURE, _("write failed %s"), file);

	close(mapfd);
	free(entry);
}
138
/*
 * Translate a --propagation keyword into mount(2) flags.
 *
 * "slave", "private" and "shared" yield the matching MS_* flag combined with
 * MS_REC; "unchanged" yields 0, which set_propagation() treats as "do
 * nothing". Any other keyword terminates the program through errx().
 */
static unsigned long parse_propagation(const char *str)
{
	static const struct prop_opts {
		const char *name;
		unsigned long flag;
	} modes[] = {
		{ "slave",	MS_REC | MS_SLAVE },
		{ "private",	MS_REC | MS_PRIVATE },
		{ "shared",	MS_REC | MS_SHARED },
		{ "unchanged",	0 }
	};
	const struct prop_opts *mode;

	for (mode = modes; mode < modes + ARRAY_SIZE(modes); mode++)
		if (!strcmp(mode->name, str))
			return mode->flag;

	errx(EXIT_FAILURE, _("unsupported propagation mode: %s"), str);
}
159
/*
 * Apply the propagation @flags to the root mount ("/").
 *
 * A value of 0 (what parse_propagation() returns for "unchanged") leaves the
 * mount untouched. A failing mount(2) terminates the program.
 */
static void set_propagation(unsigned long flags)
{
	if (flags != 0 && mount("none", "/", NULL, flags, NULL) != 0)
		err(EXIT_FAILURE, _("cannot change root filesystem propagation"));
}
168
169
170 static int set_ns_target(int type, const char *path)
171 {
172 struct namespace_file *ns;
173
174 for (ns = namespace_files; ns->name; ns++) {
175 if (ns->type != type)
176 continue;
177 ns->target = path;
178 npersists++;
179 return 0;
180 }
181
182 return -EINVAL;
183 }
184
185 static int bind_ns_files(pid_t pid)
186 {
187 struct namespace_file *ns;
188 char src[PATH_MAX];
189
190 for (ns = namespace_files; ns->name; ns++) {
191 if (!ns->target)
192 continue;
193
194 snprintf(src, sizeof(src), "/proc/%u/%s", (unsigned) pid, ns->name);
195
196 if (mount(src, ns->target, NULL, MS_BIND, NULL) != 0)
197 err(EXIT_FAILURE, _("mount %s on %s failed"), src, ns->target);
198 }
199
200 return 0;
201 }
202
203 static ino_t get_mnt_ino(pid_t pid)
204 {
205 struct stat st;
206 char path[PATH_MAX];
207
208 snprintf(path, sizeof(path), "/proc/%u/ns/mnt", (unsigned) pid);
209
210 if (stat(path, &st) != 0)
211 err(EXIT_FAILURE, _("stat of %s failed"), path);
212 return st.st_ino;
213 }
214
215 static void settime(time_t offset, clockid_t clk_id)
216 {
217 char buf[sizeof(stringify_value(ULONG_MAX)) * 3];
218 int fd, len;
219
220 len = snprintf(buf, sizeof(buf), "%d %" PRId64 " 0", clk_id, (int64_t) offset);
221
222 fd = open("/proc/self/timens_offsets", O_WRONLY);
223 if (fd < 0)
224 err(EXIT_FAILURE, _("failed to open /proc/self/timens_offsets"));
225
226 if (write(fd, buf, len) != len)
227 err(EXIT_FAILURE, _("failed to write to /proc/self/timens_offsets"));
228
229 close(fd);
230 }
231
/**
 * waitchild() - Wait for a process to exit successfully
 * @pid: PID of the process to wait for
 *
 * Block in waitpid() until @pid has changed state, retrying when the call is
 * interrupted (EINTR). If the process exited with a non-zero code, exit()
 * with the same code. Any other waitpid() error terminates the program.
 *
 * NOTE(review): only WIFEXITED() is examined, so a process that was killed
 * by a signal makes this function return as if it had succeeded.
 */
static void waitchild(int pid)
{
	int status;

	for (;;) {
		if (waitpid(pid, &status, 0) >= 0)
			break;
		if (errno != EINTR)
			err(EXIT_FAILURE, _("waitpid failed"));
	}

	if (WIFEXITED(status) && WEXITSTATUS(status) != EXIT_SUCCESS)
		exit(WEXITSTATUS(status));
}
255
256 /**
257 * sync_with_child() - Tell our child we're ready and wait for it to exit
258 * @pid: The pid of our child
259 * @fd: A file descriptor created with eventfd()
260 *
261 * This tells a child created with fork_and_wait() that we are ready for it to
262 * continue. Once we have done that, wait for our child to exit.
263 */
264 static void sync_with_child(pid_t pid, int fd)
265 {
266 uint64_t ch = PIPE_SYNC_BYTE;
267
268 write_all(fd, &ch, sizeof(ch));
269 close(fd);
270
271 waitchild(pid);
272 }
273
274 /**
275 * fork_and_wait() - Fork and wait to be sync'd with
276 * @fd - A file descriptor created with eventfd() which should be passed to
277 * sync_with_child()
278 *
279 * This creates an eventfd and forks. The parent process returns immediately,
280 * but the child waits for a %PIPE_SYNC_BYTE on the eventfd before returning.
281 * This allows the parent to perform some tasks before the child starts its
282 * work. The parent should call sync_with_child() once it is ready for the
283 * child to continue.
284 *
285 * Return: The pid from fork()
286 */
287 static pid_t fork_and_wait(int *fd)
288 {
289 pid_t pid;
290 uint64_t ch;
291
292 *fd = eventfd(0, 0);
293 if (*fd < 0)
294 err(EXIT_FAILURE, _("eventfd failed"));
295
296 pid = fork();
297 if (pid < 0)
298 err(EXIT_FAILURE, _("fork failed"));
299
300 if (!pid) {
301 /* wait for the our parent to tell us to continue */
302 if (read_all(*fd, (char *)&ch, sizeof(ch)) != sizeof(ch) ||
303 ch != PIPE_SYNC_BYTE)
304 err(EXIT_FAILURE, _("failed to read eventfd"));
305 close(*fd);
306 }
307
308 return pid;
309 }
310
/*
 * Start a helper process that bind-mounts the caller's namespace files.
 *
 * The mount-namespace inode of the calling process is recorded before the
 * fork. The helper created by fork_and_wait() sleeps until the caller runs
 * sync_with_child() on *@fd, then exits with EXIT_FAILURE if the caller's
 * mount-namespace inode is still the recorded one, or calls bind_ns_files()
 * for the caller's pid and exits with EXIT_SUCCESS.
 *
 * Return (in the calling process only): pid of the helper.
 */
static pid_t bind_ns_files_from_child(int *fd)
{
	pid_t parent = getpid();
	ino_t ino_before = get_mnt_ino(parent);
	pid_t helper = fork_and_wait(fd);

	if (helper != 0)
		return helper;

	/* helper process from here on; it never returns to the caller */
	if (get_mnt_ino(parent) != ino_before) {
		bind_ns_files(parent);
		exit(EXIT_SUCCESS);
	}
	exit(EXIT_FAILURE);
}
325
/*
 * Resolve @s to a user ID.
 *
 * @s is first looked up as a user name with xgetpwnam(); when no entry is
 * returned it is parsed as a number by strtoul_or_err(), which receives @err
 * as its error message.
 */
static uid_t get_user(const char *s, const char *err)
{
	char *pwbuf = NULL;
	struct passwd *entry = xgetpwnam(s, &pwbuf);
	uid_t id;

	if (!entry)
		return strtoul_or_err(s, err);

	id = entry->pw_uid;
	free(entry);
	free(pwbuf);
	return id;
}
343
/*
 * Resolve @s to a group ID.
 *
 * @s is first looked up as a group name with xgetgrnam(); when no entry is
 * returned it is parsed as a number by strtoul_or_err(), which receives @err
 * as its error message.
 */
static gid_t get_group(const char *s, const char *err)
{
	char *grbuf = NULL;
	struct group *entry = xgetgrnam(s, &grbuf);
	gid_t id;

	if (!entry)
		return strtoul_or_err(s, err);

	id = entry->gr_gid;
	free(entry);
	free(grbuf);
	return id;
}
361
/**
 * struct map_range - A range of IDs to map
 * @outer: First ID mapped on the outside of the namespace
 * @inner: First ID mapped on the inside of the namespace
 * @count: Length of the inside and outside ranges
 *
 * A range of uids/gids to map using new[gu]idmap. Instances are created by
 * get_map_range() (from an "outer,inner,count" string) and by
 * read_subid_range() (which always sets @inner to 0), and consumed by
 * map_ids().
 */
struct map_range {
	unsigned int outer;
	unsigned int inner;
	unsigned int count;
};
375
/*
 * Size of the buffers that hold one ID in text form (uint_to_id(), map_ids()).
 * NOTE(review): presumably stringify_value() turns the expansion of ULONG_MAX
 * into a string literal, making this an upper bound for a printed unsigned
 * long including its terminating NUL; confirm against c.h.
 */
#define UID_BUFSIZ sizeof(stringify_value(ULONG_MAX))
377
378 /**
379 * uint_to_id() - Convert a string into a user/group ID
380 * @name: The string representation of the ID
381 * @sz: The length of @name, without an (optional) nul-terminator
382 *
383 * This converts a (possibly not nul-terminated_ string into user or group ID.
384 * No name lookup is performed.
385 *
386 * Return: @name as a numeric ID
387 */
388 static int uint_to_id(const char *name, size_t sz)
389 {
390 char buf[UID_BUFSIZ];
391
392 mem2strcpy(buf, name, sz, sizeof(buf));
393 return strtoul_or_err(buf, _("could not parse ID"));
394 }
395
396 /**
397 * get_map_range() - Parse a mapping range from a string
398 * @s: A string of the format outer,inner,count
399 *
400 * Parse a string of the form outer,inner,count into a new mapping range.
401 *
402 * Return: A new &struct map_range
403 */
404 static struct map_range *get_map_range(const char *s)
405 {
406 int n, map[3];
407 struct map_range *ret;
408
409 n = string_to_idarray(s, map, ARRAY_SIZE(map), uint_to_id);
410 if (n < 0)
411 errx(EXIT_FAILURE, _("too many elements for mapping '%s'"), s);
412 if (n != ARRAY_SIZE(map))
413 errx(EXIT_FAILURE, _("mapping '%s' contains only %d elements"),
414 s, n);
415
416 ret = xmalloc(sizeof(*ret));
417 ret->outer = map[0];
418 ret->inner = map[1];
419 ret->count = map[2];
420 return ret;
421 }
422
423 /**
424 * read_subid_range() - Look up a user's sub[gu]id range
425 * @filename: The file to look up the range from. This should be either
426 * ``/etc/subuid`` or ``/etc/subgid``.
427 * @uid: The uid of the user whose range we should look up.
428 *
429 * This finds the first subid range matching @uid in @filename.
430 */
431 static struct map_range *read_subid_range(char *filename, uid_t uid)
432 {
433 char *line = NULL, *pwbuf;
434 FILE *idmap;
435 size_t n = 0;
436 struct passwd *pw;
437 struct map_range *map;
438
439 map = xmalloc(sizeof(*map));
440 map->inner = 0;
441
442 pw = xgetpwuid(uid, &pwbuf);
443 if (!pw)
444 errx(EXIT_FAILURE, _("you (user %d) don't exist."), uid);
445
446 idmap = fopen(filename, "r");
447 if (!idmap)
448 err(EXIT_FAILURE, _("could not open '%s'"), filename);
449
450 /*
451 * Each line in sub[ug]idmap looks like
452 * username:subuid:count
453 * OR
454 * uid:subuid:count
455 */
456 while (getline(&line, &n, idmap) != -1) {
457 char *rest, *s;
458
459 rest = strchr(line, ':');
460 if (!rest)
461 continue;
462 *rest = '\0';
463
464 if (strcmp(line, pw->pw_name) &&
465 strtoul(line, NULL, 10) != pw->pw_uid)
466 continue;
467
468 s = rest + 1;
469 rest = strchr(s, ':');
470 if (!rest)
471 continue;
472 *rest = '\0';
473 map->outer = strtoul_or_err(s, _("failed to parse subid map"));
474
475 s = rest + 1;
476 rest = strchr(s, '\n');
477 if (rest)
478 *rest = '\0';
479 map->count = strtoul_or_err(s, _("failed to parse subid map"));
480
481 fclose(idmap);
482 free(pw);
483 free(pwbuf);
484
485 return map;
486 }
487
488 err(EXIT_FAILURE, _("no line matching user \"%s\" in %s"),
489 pw->pw_name, filename);
490 }
491
/**
 * map_ids() - Create a new uid/gid map
 * @idmapper: Either newuidmap or newgidmap
 * @ppid: Pid to set the map for
 * @outer: ID outside the namespace for a single map.
 * @inner: ID inside the namespace for a single map. May be -1 to only use @map.
 * @map: A range of IDs to map
 *
 * This creates a new uid/gid map for @ppid using @idmapper. The ID @outer in
 * the parent (our) namespace is mapped to the ID @inner in the child (@ppid's)
 * namespace. In addition, the range of IDs beginning at @map->outer is mapped
 * to the range of IDs beginning at @map->inner. The tricky bit is that we
 * cannot let these mappings overlap. We accomplish this by removing a "hole"
 * from @map, if @outer or @inner overlap it. This may result in one less than
 * @map->count IDs being mapped from @map. The unmapped IDs are always the
 * topmost IDs of the mapping (either in the parent or the child namespace).
 *
 * Most of the time, this function will be called with @map->outer as some
 * large ID, @map->inner as 0, and @map->count as a large number (at least
 * 1000, but less than @map->outer). Typically, there will be no conflict with
 * @outer. However, @inner may split the mapping for e.g. --map-current-user.
 *
 * The command line handed to @idmapper is
 *	idmapper ppid [inner outer count]...
 * i.e. every mapping is pushed in the order inside ID, outside ID, length.
 * @map->count is decremented in place when an overlap is detected.
 *
 * This function always exec()s or errors out and does not return.
 */
static void __attribute__((__noreturn__))
map_ids(const char *idmapper, int ppid, unsigned int outer, unsigned int inner,
	struct map_range *map)
{
	/* idmapper + pid + 4 * map + NULL, i.e. 1 + 1 + 4 * 3 + 1 pointers */
	char *argv[15];
	/* argv - idmapper - "1" - NULL: the 12 numbers that need formatting */
	char args[12][UID_BUFSIZ];
	/* i: next free slot in argv[]; j: next free buffer in args[] */
	int i = 0, j = 0;
	struct map_range lo, mid, hi;
	unsigned int inner_offset, outer_offset;

	/* Some helper macros to reduce bookkeeping */
	/* push_str(): append one ready-made string to argv[] */
#define push_str(s) do { \
	argv[i++] = s; \
} while (0)
	/* push_ul(): format x with "%u" into the next args[] buffer, append it */
#define push_ul(x) do { \
	snprintf(args[j], sizeof(args[j]), "%u", x); \
	push_str(args[j++]); \
} while (0)

	push_str(xstrdup(idmapper));
	push_ul(ppid);
	if ((int)inner == -1) {
		/*
		 * If we don't have a "single" mapping, then we can just use
		 * map directly
		 */
		push_ul(map->inner);
		push_ul(map->outer);
		push_ul(map->count);
		push_str(NULL);

		execvp(idmapper, argv);
		errexec(idmapper);
	}

	/*
	 * If the mappings overlap, remove an ID from map.
	 * NOTE(review): the upper bounds are inclusive (<=), so an ID that is
	 * exactly one past the end of the range also shrinks the map.
	 */
	if ((outer >= map->outer && outer <= map->outer + map->count) ||
	    (inner >= map->inner && inner <= map->inner + map->count))
		map->count--;

	/*
	 * Determine where the splits between lo, mid, and hi will be: the
	 * distance of outer/inner from the start of map, clamped to
	 * [0, map->count]
	 */
	outer_offset = min(outer > map->outer ? outer - map->outer : 0,
			   map->count);
	inner_offset = min(inner > map->inner ? inner - map->inner : 0,
			   map->count);

	/*
	 * In the worst case, we need three mappings:
	 * From the bottom of map to either inner or outer
	 */
	lo.outer = map->outer;
	lo.inner = map->inner;
	lo.count = min(inner_offset, outer_offset);

	/*
	 * From the lower of inner or outer to the higher; a start that lands
	 * exactly on the single-mapped ID is moved up by one to step over it
	 */
	mid.outer = lo.outer + lo.count;
	mid.outer += mid.outer == outer;
	mid.inner = lo.inner + lo.count;
	mid.inner += mid.inner == inner;
	mid.count = abs_diff(outer_offset, inner_offset);

	/* And from the higher of inner or outer to the end of the map */
	hi.outer = mid.outer + mid.count;
	hi.outer += hi.outer == outer;
	hi.inner = mid.inner + mid.count;
	hi.inner += hi.inner == inner;
	hi.count = map->count - lo.count - mid.count;

	/* the single mapping goes first and always has length 1 */
	push_ul(inner);
	push_ul(outer);
	push_str("1");
	/* new[gu]idmap doesn't like zero-length mappings, so skip them */
	if (lo.count) {
		push_ul(lo.inner);
		push_ul(lo.outer);
		push_ul(lo.count);
	}
	if (mid.count) {
		push_ul(mid.inner);
		push_ul(mid.outer);
		push_ul(mid.count);
	}
	if (hi.count) {
		push_ul(hi.inner);
		push_ul(hi.outer);
		push_ul(hi.count);
	}
	push_str(NULL);
	execvp(idmapper, argv);
	errexec(idmapper);
}
609
/**
 * map_ids_from_child() - Set up a new uid/gid map
 * @fd: Output; receives the eventfd to pass to sync_with_child()
 * @mapuser: The user to map the current user to (or -1)
 * @usermap: The range of UIDs to map (or %NULL)
 * @mapgroup: The group to map the current group to (or -1)
 * @groupmap: The range of GIDs to map (or %NULL)
 *
 * fork_and_wait() for our parent to call sync_with_child() on @fd. Upon
 * receiving the go-ahead, use newuidmap and newgidmap to set the uid/gid map
 * for our parent's PID. When both maps are requested the helper forks once
 * more: the new process runs newuidmap while the helper waits for it and
 * then runs newgidmap itself. The helper never returns to the caller.
 *
 * Return: The pid of the child.
 */
static pid_t map_ids_from_child(int *fd, uid_t mapuser,
				struct map_range *usermap, gid_t mapgroup,
				struct map_range *groupmap)
{
	pid_t parent = getpid();
	pid_t helper, worker = 0;

	helper = fork_and_wait(fd);
	if (helper != 0)
		return helper;

	/* Avoid forking more than we need to */
	if (usermap && groupmap) {
		worker = fork();
		if (worker < 0)
			err(EXIT_FAILURE, _("fork failed"));
		if (worker > 0)
			waitchild(worker);
	}

	/* worker == 0: either no second fork happened or we are its child */
	if (usermap && worker == 0)
		map_ids("newuidmap", parent, geteuid(), mapuser, usermap);
	if (groupmap)
		map_ids("newgidmap", parent, getegid(), mapgroup, groupmap);
	exit(EXIT_SUCCESS);
}
650
/*
 * Print the help text to stdout and exit with EXIT_SUCCESS.
 *
 * The text is emitted in groups separated by USAGE_SEPARATOR: namespace
 * selection, fork and ID-mapping options, child/mount/capability handling,
 * and root/cwd/ID/clock settings, followed by the common help/version lines
 * and the man page pointer.
 */
static void __attribute__((__noreturn__)) usage(void)
{
	FILE *out = stdout;

	fputs(USAGE_HEADER, out);
	fprintf(out, _(" %s [options] [<program> [<argument>...]]\n"),
		program_invocation_short_name);

	fputs(USAGE_SEPARATOR, out);
	fputs(_("Run a program with some namespaces unshared from the parent.\n"), out);

	/* namespaces; each takes an optional file to bind-mount the namespace on */
	fputs(USAGE_OPTIONS, out);
	fputs(_(" -m, --mount[=<file>] unshare mounts namespace\n"), out);
	fputs(_(" -u, --uts[=<file>] unshare UTS namespace (hostname etc)\n"), out);
	fputs(_(" -i, --ipc[=<file>] unshare System V IPC namespace\n"), out);
	fputs(_(" -n, --net[=<file>] unshare network namespace\n"), out);
	fputs(_(" -p, --pid[=<file>] unshare pid namespace\n"), out);
	fputs(_(" -U, --user[=<file>] unshare user namespace\n"), out);
	fputs(_(" -C, --cgroup[=<file>] unshare cgroup namespace\n"), out);
	fputs(_(" -T, --time[=<file>] unshare time namespace\n"), out);
	fputs(USAGE_SEPARATOR, out);
	/* forking and uid/gid mapping */
	fputs(_(" -f, --fork fork before launching <program>\n"), out);
	fputs(_(" --map-user=<uid>|<name> map current user to uid (implies --user)\n"), out);
	fputs(_(" --map-group=<gid>|<name> map current group to gid (implies --user)\n"), out);
	fputs(_(" -r, --map-root-user map current user to root (implies --user)\n"), out);
	fputs(_(" -c, --map-current-user map current user to itself (implies --user)\n"), out);
	fputs(_(" --map-auto map users and groups automatically (implies --user)\n"), out);
	fputs(_(" --map-users=<outeruid>,<inneruid>,<count>\n"
		" map count users from outeruid to inneruid (implies --user)\n"), out);
	fputs(_(" --map-groups=<outergid>,<innergid>,<count>\n"
		" map count groups from outergid to innergid (implies --user)\n"), out);
	fputs(USAGE_SEPARATOR, out);
	/* child handling, mounts and capabilities */
	fputs(_(" --kill-child[=<signame>] when dying, kill the forked child (implies --fork)\n"
		" defaults to SIGKILL\n"), out);
	fputs(_(" --mount-proc[=<dir>] mount proc filesystem first (implies --mount)\n"), out);
	fputs(_(" --propagation slave|shared|private|unchanged\n"
		" modify mount propagation in mount namespace\n"), out);
	fputs(_(" --setgroups allow|deny control the setgroups syscall in user namespaces\n"), out);
	fputs(_(" --keep-caps retain capabilities granted in user namespaces\n"), out);
	fputs(USAGE_SEPARATOR, out);
	/* environment of the launched program */
	fputs(_(" -R, --root=<dir> run the command with root directory set to <dir>\n"), out);
	fputs(_(" -w, --wd=<dir> change working directory to <dir>\n"), out);
	fputs(_(" -S, --setuid <uid> set uid in entered namespace\n"), out);
	fputs(_(" -G, --setgid <gid> set gid in entered namespace\n"), out);
	fputs(_(" --monotonic <offset> set clock monotonic offset (seconds) in time namespaces\n"), out);
	fputs(_(" --boottime <offset> set clock boottime offset (seconds) in time namespaces\n"), out);

	fputs(USAGE_SEPARATOR, out);
	printf(USAGE_HELP_OPTIONS(27));
	printf(USAGE_MAN_TAIL("unshare(1)"));

	exit(EXIT_SUCCESS);
}
704
705 int main(int argc, char *argv[])
706 {
707 enum {
708 OPT_MOUNTPROC = CHAR_MAX + 1,
709 OPT_PROPAGATION,
710 OPT_SETGROUPS,
711 OPT_KILLCHILD,
712 OPT_KEEPCAPS,
713 OPT_MONOTONIC,
714 OPT_BOOTTIME,
715 OPT_MAPUSER,
716 OPT_MAPUSERS,
717 OPT_MAPGROUP,
718 OPT_MAPGROUPS,
719 OPT_MAPAUTO,
720 };
721 static const struct option longopts[] = {
722 { "help", no_argument, NULL, 'h' },
723 { "version", no_argument, NULL, 'V' },
724
725 { "mount", optional_argument, NULL, 'm' },
726 { "uts", optional_argument, NULL, 'u' },
727 { "ipc", optional_argument, NULL, 'i' },
728 { "net", optional_argument, NULL, 'n' },
729 { "pid", optional_argument, NULL, 'p' },
730 { "user", optional_argument, NULL, 'U' },
731 { "cgroup", optional_argument, NULL, 'C' },
732 { "time", optional_argument, NULL, 'T' },
733
734 { "fork", no_argument, NULL, 'f' },
735 { "kill-child", optional_argument, NULL, OPT_KILLCHILD },
736 { "mount-proc", optional_argument, NULL, OPT_MOUNTPROC },
737 { "map-user", required_argument, NULL, OPT_MAPUSER },
738 { "map-users", required_argument, NULL, OPT_MAPUSERS },
739 { "map-group", required_argument, NULL, OPT_MAPGROUP },
740 { "map-groups", required_argument, NULL, OPT_MAPGROUPS },
741 { "map-root-user", no_argument, NULL, 'r' },
742 { "map-current-user", no_argument, NULL, 'c' },
743 { "map-auto", no_argument, NULL, OPT_MAPAUTO },
744 { "propagation", required_argument, NULL, OPT_PROPAGATION },
745 { "setgroups", required_argument, NULL, OPT_SETGROUPS },
746 { "keep-caps", no_argument, NULL, OPT_KEEPCAPS },
747 { "setuid", required_argument, NULL, 'S' },
748 { "setgid", required_argument, NULL, 'G' },
749 { "root", required_argument, NULL, 'R' },
750 { "wd", required_argument, NULL, 'w' },
751 { "monotonic", required_argument, NULL, OPT_MONOTONIC },
752 { "boottime", required_argument, NULL, OPT_BOOTTIME },
753 { NULL, 0, NULL, 0 }
754 };
755
756 int setgrpcmd = SETGROUPS_NONE;
757 int unshare_flags = 0;
758 int c, forkit = 0;
759 uid_t mapuser = -1;
760 gid_t mapgroup = -1;
761 struct map_range *usermap = NULL;
762 struct map_range *groupmap = NULL;
763 int kill_child_signo = 0; /* 0 means --kill-child was not used */
764 const char *procmnt = NULL;
765 const char *newroot = NULL;
766 const char *newdir = NULL;
767 pid_t pid_bind = 0, pid_idmap = 0;
768 pid_t pid = 0;
769 #ifdef UL_HAVE_PIDFD
770 int fd_parent_pid = -1;
771 #endif
772 int fd_idmap, fd_bind = -1;
773 sigset_t sigset, oldsigset;
774 int status;
775 unsigned long propagation = UNSHARE_PROPAGATION_DEFAULT;
776 int force_uid = 0, force_gid = 0;
777 uid_t uid = 0, real_euid = geteuid();
778 gid_t gid = 0, real_egid = getegid();
779 int keepcaps = 0;
780 time_t monotonic = 0;
781 time_t boottime = 0;
782 int force_monotonic = 0;
783 int force_boottime = 0;
784
785 setlocale(LC_ALL, "");
786 bindtextdomain(PACKAGE, LOCALEDIR);
787 textdomain(PACKAGE);
788 close_stdout_atexit();
789
790 while ((c = getopt_long(argc, argv, "+fhVmuinpCTUrR:w:S:G:c", longopts, NULL)) != -1) {
791 switch (c) {
792 case 'f':
793 forkit = 1;
794 break;
795 case 'm':
796 unshare_flags |= CLONE_NEWNS;
797 if (optarg)
798 set_ns_target(CLONE_NEWNS, optarg);
799 break;
800 case 'u':
801 unshare_flags |= CLONE_NEWUTS;
802 if (optarg)
803 set_ns_target(CLONE_NEWUTS, optarg);
804 break;
805 case 'i':
806 unshare_flags |= CLONE_NEWIPC;
807 if (optarg)
808 set_ns_target(CLONE_NEWIPC, optarg);
809 break;
810 case 'n':
811 unshare_flags |= CLONE_NEWNET;
812 if (optarg)
813 set_ns_target(CLONE_NEWNET, optarg);
814 break;
815 case 'p':
816 unshare_flags |= CLONE_NEWPID;
817 if (optarg)
818 set_ns_target(CLONE_NEWPID, optarg);
819 break;
820 case 'U':
821 unshare_flags |= CLONE_NEWUSER;
822 if (optarg)
823 set_ns_target(CLONE_NEWUSER, optarg);
824 break;
825 case 'C':
826 unshare_flags |= CLONE_NEWCGROUP;
827 if (optarg)
828 set_ns_target(CLONE_NEWCGROUP, optarg);
829 break;
830 case 'T':
831 unshare_flags |= CLONE_NEWTIME;
832 if (optarg)
833 set_ns_target(CLONE_NEWTIME, optarg);
834 break;
835 case OPT_MOUNTPROC:
836 unshare_flags |= CLONE_NEWNS;
837 procmnt = optarg ? optarg : "/proc";
838 break;
839 case OPT_MAPUSER:
840 unshare_flags |= CLONE_NEWUSER;
841 mapuser = get_user(optarg, _("failed to parse uid"));
842 break;
843 case OPT_MAPGROUP:
844 unshare_flags |= CLONE_NEWUSER;
845 mapgroup = get_group(optarg, _("failed to parse gid"));
846 break;
847 case 'r':
848 unshare_flags |= CLONE_NEWUSER;
849 mapuser = 0;
850 mapgroup = 0;
851 break;
852 case 'c':
853 unshare_flags |= CLONE_NEWUSER;
854 mapuser = real_euid;
855 mapgroup = real_egid;
856 break;
857 case OPT_MAPUSERS:
858 unshare_flags |= CLONE_NEWUSER;
859 if (!strcmp(optarg, "auto"))
860 usermap = read_subid_range(_PATH_SUBUID, real_euid);
861 else
862 usermap = get_map_range(optarg);
863 break;
864 case OPT_MAPGROUPS:
865 unshare_flags |= CLONE_NEWUSER;
866 if (!strcmp(optarg, "auto"))
867 groupmap = read_subid_range(_PATH_SUBGID, real_euid);
868 else
869 groupmap = get_map_range(optarg);
870 break;
871 case OPT_MAPAUTO:
872 unshare_flags |= CLONE_NEWUSER;
873 usermap = read_subid_range(_PATH_SUBUID, real_euid);
874 groupmap = read_subid_range(_PATH_SUBGID, real_euid);
875 break;
876 case OPT_SETGROUPS:
877 setgrpcmd = setgroups_str2id(optarg);
878 break;
879 case OPT_PROPAGATION:
880 propagation = parse_propagation(optarg);
881 break;
882 case OPT_KILLCHILD:
883 forkit = 1;
884 if (optarg) {
885 if ((kill_child_signo = signame_to_signum(optarg)) < 0)
886 errx(EXIT_FAILURE, _("unknown signal: %s"),
887 optarg);
888 } else {
889 kill_child_signo = SIGKILL;
890 }
891 break;
892 case OPT_KEEPCAPS:
893 keepcaps = 1;
894 cap_last_cap(); /* Force last cap to be cached before we fork. */
895 break;
896 case 'S':
897 uid = strtoul_or_err(optarg, _("failed to parse uid"));
898 force_uid = 1;
899 break;
900 case 'G':
901 gid = strtoul_or_err(optarg, _("failed to parse gid"));
902 force_gid = 1;
903 break;
904 case 'R':
905 newroot = optarg;
906 break;
907 case 'w':
908 newdir = optarg;
909 break;
910 case OPT_MONOTONIC:
911 monotonic = strtoul_or_err(optarg, _("failed to parse monotonic offset"));
912 force_monotonic = 1;
913 break;
914 case OPT_BOOTTIME:
915 boottime = strtoul_or_err(optarg, _("failed to parse boottime offset"));
916 force_boottime = 1;
917 break;
918
919 case 'h':
920 usage();
921 case 'V':
922 print_version(EXIT_SUCCESS);
923 default:
924 errtryhelp(EXIT_FAILURE);
925 }
926 }
927
928 if ((force_monotonic || force_boottime) && !(unshare_flags & CLONE_NEWTIME))
929 errx(EXIT_FAILURE, _("options --monotonic and --boottime require "
930 "unsharing of a time namespace (-t)"));
931
932 /* clear any inherited settings */
933 signal(SIGCHLD, SIG_DFL);
934
935 if (npersists && (unshare_flags & CLONE_NEWNS))
936 pid_bind = bind_ns_files_from_child(&fd_bind);
937
938 if (usermap || groupmap)
939 pid_idmap = map_ids_from_child(&fd_idmap, mapuser, usermap,
940 mapgroup, groupmap);
941
942 if (-1 == unshare(unshare_flags))
943 err(EXIT_FAILURE, _("unshare failed"));
944
945 /* Tell child we've called unshare() */
946 if (usermap || groupmap)
947 sync_with_child(pid_idmap, fd_idmap);
948
949 if (force_boottime)
950 settime(boottime, CLOCK_BOOTTIME);
951
952 if (force_monotonic)
953 settime(monotonic, CLOCK_MONOTONIC);
954
955 if (forkit) {
956 if (sigemptyset(&sigset) != 0 ||
957 sigaddset(&sigset, SIGINT) != 0 ||
958 sigaddset(&sigset, SIGTERM) != 0 ||
959 sigprocmask(SIG_BLOCK, &sigset, &oldsigset) != 0)
960 err(EXIT_FAILURE, _("sigprocmask block failed"));
961 #ifdef UL_HAVE_PIDFD
962 if (kill_child_signo != 0) {
963 /* make a connection to the original process (parent) */
964 fd_parent_pid = pidfd_open(getpid(), 0);
965 if (0 > fd_parent_pid)
966 err(EXIT_FAILURE, _("pidfd_open failed"));
967 }
968 #endif
969 /* force child forking before mountspace binding so
970 * pid_for_children is populated */
971 pid = fork();
972
973 switch(pid) {
974 case -1:
975 err(EXIT_FAILURE, _("fork failed"));
976 case 0: /* child */
977 if (sigprocmask(SIG_SETMASK, &oldsigset, NULL))
978 err(EXIT_FAILURE,
979 _("sigprocmask restore failed"));
980 if (npersists && (unshare_flags & CLONE_NEWNS))
981 close(fd_bind);
982 break;
983 default: /* parent */
984 break;
985 }
986 }
987
988 if (npersists && (pid || !forkit)) {
989 /* run in parent */
990 if (pid_bind && (unshare_flags & CLONE_NEWNS))
991 sync_with_child(pid_bind, fd_bind);
992 else
993 /* simple way, just bind */
994 bind_ns_files(getpid());
995 }
996
997 if (pid) {
998 if (waitpid(pid, &status, 0) == -1)
999 err(EXIT_FAILURE, _("waitpid failed"));
1000
1001 if (WIFEXITED(status))
1002 return WEXITSTATUS(status);
1003 if (WIFSIGNALED(status)) {
1004
1005 /* Ensure the signal that terminated the child will
1006 * also terminate the parent. */
1007
1008 int termsig = WTERMSIG(status);
1009
1010 if (signal(termsig, SIG_DFL) == SIG_ERR ||
1011 sigemptyset(&sigset) != 0 ||
1012 sigaddset(&sigset, termsig) != 0 ||
1013 sigprocmask(SIG_UNBLOCK, &sigset, NULL) != 0)
1014 err(EXIT_FAILURE,
1015 _("sigprocmask unblock failed"));
1016
1017 kill(getpid(), termsig);
1018 }
1019 err(EXIT_FAILURE, _("child exit failed"));
1020 }
1021
1022 if (kill_child_signo != 0) {
1023 if (prctl(PR_SET_PDEATHSIG, kill_child_signo) < 0)
1024 err(EXIT_FAILURE, "prctl failed");
1025 #ifdef UL_HAVE_PIDFD
1026 /* Use poll() to check that there is still the original parent. */
1027 if (fd_parent_pid != -1) {
1028 struct pollfd pollfds[1] = {
1029 { .fd = fd_parent_pid, .events = POLLIN }
1030 };
1031 int nfds = poll(pollfds, 1, 0);
1032
1033 if (0 > nfds)
1034 err(EXIT_FAILURE, "poll parent pidfd failed");
1035
1036 /* If the child was re-parented before prctl(2) was called, the
1037 * new parent will likely not be interested in the precise exit
1038 * status of the orphan.
1039 */
1040 if (nfds)
1041 exit(EXIT_FAILURE);
1042
1043 close(fd_parent_pid);
1044 fd_parent_pid = -1;
1045 }
1046 #endif
1047 }
1048
1049 if (mapuser != (uid_t) -1 && !usermap)
1050 map_id(_PATH_PROC_UIDMAP, mapuser, real_euid);
1051
1052 /* Since Linux 3.19 unprivileged writing of /proc/self/gid_map
1053 * has been disabled unless /proc/self/setgroups is written
1054 * first to permanently disable the ability to call setgroups
1055 * in that user namespace. */
1056 if (mapgroup != (gid_t) -1 && !groupmap) {
1057 if (setgrpcmd == SETGROUPS_ALLOW)
1058 errx(EXIT_FAILURE, _("options --setgroups=allow and "
1059 "--map-group are mutually exclusive"));
1060 setgroups_control(SETGROUPS_DENY);
1061 map_id(_PATH_PROC_GIDMAP, mapgroup, real_egid);
1062 }
1063
1064 if (setgrpcmd != SETGROUPS_NONE)
1065 setgroups_control(setgrpcmd);
1066
1067 if ((unshare_flags & CLONE_NEWNS) && propagation)
1068 set_propagation(propagation);
1069
1070 if (newroot) {
1071 if (chroot(newroot) != 0)
1072 err(EXIT_FAILURE,
1073 _("cannot change root directory to '%s'"), newroot);
1074 newdir = newdir ?: "/";
1075 }
1076 if (newdir && chdir(newdir))
1077 err(EXIT_FAILURE, _("cannot chdir to '%s'"), newdir);
1078
1079 if (procmnt) {
1080 /* When not changing root and using the default propagation flags
1081 then the recursive propagation change of root will
1082 automatically change that of an existing proc mount. */
1083 if (!newroot && propagation != (MS_PRIVATE|MS_REC)) {
1084 int rc = mount("none", procmnt, NULL, MS_PRIVATE|MS_REC, NULL);
1085
1086 /* Custom procmnt means that proc is very likely not mounted, causing EINVAL.
1087 Ignoring the error in this specific instance is considered safe. */
1088 if(rc != 0 && errno != EINVAL)
1089 err(EXIT_FAILURE, _("cannot change %s filesystem propagation"), procmnt);
1090 }
1091
1092 if (mount("proc", procmnt, "proc", MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL) != 0)
1093 err(EXIT_FAILURE, _("mount %s failed"), procmnt);
1094 }
1095
1096 if (force_gid) {
1097 if (setgroups(0, NULL) != 0) /* drop supplementary groups */
1098 err(EXIT_FAILURE, _("setgroups failed"));
1099 if (setgid(gid) < 0) /* change GID */
1100 err(EXIT_FAILURE, _("setgid failed"));
1101 }
1102 if (force_uid && setuid(uid) < 0) /* change UID */
1103 err(EXIT_FAILURE, _("setuid failed"));
1104
1105 /* We use capabilities system calls to propagate the permitted
1106 * capabilities into the ambient set because we have already
1107 * forked so are in async-signal-safe context. */
1108 if (keepcaps && (unshare_flags & CLONE_NEWUSER)) {
1109 struct __user_cap_header_struct header = {
1110 .version = _LINUX_CAPABILITY_VERSION_3,
1111 .pid = 0,
1112 };
1113
1114 struct __user_cap_data_struct payload[_LINUX_CAPABILITY_U32S_3] = {{ 0 }};
1115 uint64_t effective, cap;
1116
1117 if (capget(&header, payload) < 0)
1118 err(EXIT_FAILURE, _("capget failed"));
1119
1120 /* In order the make capabilities ambient, we first need to ensure
1121 * that they are all inheritable. */
1122 payload[0].inheritable = payload[0].permitted;
1123 payload[1].inheritable = payload[1].permitted;
1124
1125 if (capset(&header, payload) < 0)
1126 err(EXIT_FAILURE, _("capset failed"));
1127
1128 effective = ((uint64_t)payload[1].effective << 32) | (uint64_t)payload[0].effective;
1129
1130 for (cap = 0; cap < (sizeof(effective) * 8); cap++) {
1131 /* This is the same check as cap_valid(), but using
1132 * the runtime value for the last valid cap. */
1133 if (cap > (uint64_t) cap_last_cap())
1134 continue;
1135
1136 if ((effective & (1 << cap))
1137 && prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE, cap, 0, 0) < 0)
1138 err(EXIT_FAILURE, _("prctl(PR_CAP_AMBIENT) failed"));
1139 }
1140 }
1141
1142 if (optind < argc) {
1143 execvp(argv[optind], argv + optind);
1144 errexec(argv[optind]);
1145 }
1146 exec_shell();
1147 }