]> git.ipfire.org Git - thirdparty/util-linux.git/blob - sys-utils/unshare.c
Merge branch 'rename-basename' of https://github.com/t-8ch/util-linux
[thirdparty/util-linux.git] / sys-utils / unshare.c
1 /*
2 * unshare(1) - command-line interface for unshare(2)
3 *
4 * Copyright (C) 2009 Mikhail Gusarov <dottedmag@dottedmag.net>
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License as published by the
8 * Free Software Foundation; either version 2, or (at your option) any
9 * later version.
10 *
11 * This program is distributed in the hope that it will be useful, but
12 * WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License along
17 * with this program; if not, write to the Free Software Foundation, Inc.,
18 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
19 */
20
21 #include <errno.h>
22 #include <getopt.h>
23 #include <poll.h>
24 #include <sched.h>
25 #include <stdio.h>
26 #include <stdlib.h>
27 #include <unistd.h>
28 #include <sys/eventfd.h>
29 #include <sys/wait.h>
30 #include <sys/mount.h>
31 #include <sys/types.h>
32 #include <sys/stat.h>
33 #include <sys/prctl.h>
34 #include <grp.h>
35
36 /* we only need some defines missing in sys/mount.h, no libmount linkage */
37 #include <libmount.h>
38
39 #include "nls.h"
40 #include "c.h"
41 #include "caputils.h"
42 #include "closestream.h"
43 #include "namespace.h"
44 #include "pidfd-utils.h"
45 #include "exec_shell.h"
46 #include "xalloc.h"
47 #include "pathnames.h"
48 #include "all-io.h"
49 #include "signames.h"
50 #include "strutils.h"
51 #include "pwdutils.h"
52
/*
 * Value the parent writes to the eventfd to release a child that is
 * blocked in fork_and_wait(); the child checks that it reads back exactly
 * this value.
 */
#define PIPE_SYNC_BYTE	0x06

/* 'private' is kernel default */
#define UNSHARE_PROPAGATION_DEFAULT	(MS_REC | MS_PRIVATE)
58
59 /* /proc namespace files and mountpoints for binds */
60 static struct namespace_file {
61 int type; /* CLONE_NEW* */
62 const char *name; /* ns/<type> */
63 const char *target; /* user specified target for bind mount */
64 } namespace_files[] = {
65 { .type = CLONE_NEWUSER, .name = "ns/user" },
66 { .type = CLONE_NEWCGROUP,.name = "ns/cgroup" },
67 { .type = CLONE_NEWIPC, .name = "ns/ipc" },
68 { .type = CLONE_NEWUTS, .name = "ns/uts" },
69 { .type = CLONE_NEWNET, .name = "ns/net" },
70 { .type = CLONE_NEWPID, .name = "ns/pid_for_children" },
71 { .type = CLONE_NEWNS, .name = "ns/mnt" },
72 { .type = CLONE_NEWTIME, .name = "ns/time_for_children" },
73 { .name = NULL }
74 };
75
/* number of persistent namespaces; incremented by set_ns_target() */
static int npersists;
77
/*
 * Requested --setgroups action.  The non-negative values double as indexes
 * into setgroups_strings[]; SETGROUPS_NONE means "nothing requested".
 */
enum {
	SETGROUPS_NONE  = -1,
	SETGROUPS_DENY  = 0,
	SETGROUPS_ALLOW = 1,
};
83
84 static const char *setgroups_strings[] =
85 {
86 [SETGROUPS_DENY] = "deny",
87 [SETGROUPS_ALLOW] = "allow"
88 };
89
90 static int setgroups_str2id(const char *str)
91 {
92 size_t i;
93
94 for (i = 0; i < ARRAY_SIZE(setgroups_strings); i++)
95 if (strcmp(str, setgroups_strings[i]) == 0)
96 return i;
97
98 errx(EXIT_FAILURE, _("unsupported --setgroups argument '%s'"), str);
99 }
100
101 static void setgroups_control(int action)
102 {
103 const char *file = _PATH_PROC_SETGROUPS;
104 const char *cmd;
105 int fd;
106
107 if (action < 0 || (size_t) action >= ARRAY_SIZE(setgroups_strings))
108 return;
109 cmd = setgroups_strings[action];
110
111 fd = open(file, O_WRONLY);
112 if (fd < 0) {
113 if (errno == ENOENT)
114 return;
115 err(EXIT_FAILURE, _("cannot open %s"), file);
116 }
117
118 if (write_all(fd, cmd, strlen(cmd)))
119 err(EXIT_FAILURE, _("write failed %s"), file);
120 close(fd);
121 }
122
/*
 * Write the one-entry mapping "<from> <to> 1" into @file.  Terminates the
 * program if the file cannot be opened or written.
 */
static void map_id(const char *file, uint32_t from, uint32_t to)
{
	char *entry;
	int fd = open(file, O_WRONLY);

	if (fd < 0)
		err(EXIT_FAILURE, _("cannot open %s"), file);

	xasprintf(&entry, "%u %u 1", from, to);
	if (write_all(fd, entry, strlen(entry)) != 0)
		err(EXIT_FAILURE, _("write failed %s"), file);

	free(entry);
	close(fd);
}
138
/*
 * Translate a --propagation argument into mount(2) flags.  "unchanged" maps
 * to 0; an unknown mode terminates the program.
 */
static unsigned long parse_propagation(const char *str)
{
	static const struct prop_opts {
		const char *name;
		unsigned long flag;
	} opts[] = {
		{ "slave",	MS_REC | MS_SLAVE },
		{ "private",	MS_REC | MS_PRIVATE },
		{ "shared",	MS_REC | MS_SHARED },
		{ "unchanged",	0 }
	};
	const struct prop_opts *opt;

	for (opt = opts; opt < opts + ARRAY_SIZE(opts); opt++)
		if (!strcmp(opt->name, str))
			return opt->flag;

	errx(EXIT_FAILURE, _("unsupported propagation mode: %s"), str);
}
159
/*
 * Apply the propagation @flags to "/".  A value of 0 leaves the propagation
 * untouched; a failing mount(2) terminates the program.
 */
static void set_propagation(unsigned long flags)
{
	if (flags != 0 && mount("none", "/", NULL, flags, NULL) != 0)
		err(EXIT_FAILURE, _("cannot change root filesystem propagation"));
}
168
169
170 static int set_ns_target(int type, const char *path)
171 {
172 struct namespace_file *ns;
173
174 for (ns = namespace_files; ns->name; ns++) {
175 if (ns->type != type)
176 continue;
177 ns->target = path;
178 npersists++;
179 return 0;
180 }
181
182 return -EINVAL;
183 }
184
185 static int bind_ns_files(pid_t pid)
186 {
187 struct namespace_file *ns;
188 char src[PATH_MAX];
189
190 for (ns = namespace_files; ns->name; ns++) {
191 if (!ns->target)
192 continue;
193
194 snprintf(src, sizeof(src), "/proc/%u/%s", (unsigned) pid, ns->name);
195
196 if (mount(src, ns->target, NULL, MS_BIND, NULL) != 0)
197 err(EXIT_FAILURE, _("mount %s on %s failed"), src, ns->target);
198 }
199
200 return 0;
201 }
202
203 static ino_t get_mnt_ino(pid_t pid)
204 {
205 struct stat st;
206 char path[PATH_MAX];
207
208 snprintf(path, sizeof(path), "/proc/%u/ns/mnt", (unsigned) pid);
209
210 if (stat(path, &st) != 0)
211 err(EXIT_FAILURE, _("stat of %s failed"), path);
212 return st.st_ino;
213 }
214
215 static void settime(int64_t offset, clockid_t clk_id)
216 {
217 char buf[sizeof(stringify_value(ULONG_MAX)) * 3];
218 int fd, len;
219
220 len = snprintf(buf, sizeof(buf), "%d %" PRId64 " 0", clk_id, offset);
221
222 fd = open("/proc/self/timens_offsets", O_WRONLY);
223 if (fd < 0)
224 err(EXIT_FAILURE, _("failed to open /proc/self/timens_offsets"));
225
226 if (write(fd, buf, len) != len)
227 err(EXIT_FAILURE, _("failed to write to /proc/self/timens_offsets"));
228
229 close(fd);
230 }
231
/**
 * waitchild() - Wait for a process to exit successfully
 * @pid: PID of the process to wait for
 *
 * Wait for a process to exit successfully. waitpid() is restarted when it is
 * interrupted by a signal (EINTR); any other waitpid() failure terminates the
 * program. If the process exits with a non-zero return code, then exit() with
 * the same status. If the process was terminated by a signal it did not exit
 * successfully either, so the program is terminated with an error instead of
 * silently carrying on.
 */
static void waitchild(int pid)
{
	int rc, status;

	do {
		rc = waitpid(pid, &status, 0);
	} while (rc < 0 && errno == EINTR);

	if (rc < 0)
		err(EXIT_FAILURE, _("waitpid failed"));

	if (WIFEXITED(status)) {
		if (WEXITSTATUS(status) != EXIT_SUCCESS)
			exit(WEXITSTATUS(status));
		return;
	}

	/* not a normal exit: do not mistake a killed helper for success */
	if (WIFSIGNALED(status))
		errx(EXIT_FAILURE, _("child killed by signal %d"),
		     WTERMSIG(status));
}
255
256 /**
257 * sync_with_child() - Tell our child we're ready and wait for it to exit
258 * @pid: The pid of our child
259 * @fd: A file descriptor created with eventfd()
260 *
261 * This tells a child created with fork_and_wait() that we are ready for it to
262 * continue. Once we have done that, wait for our child to exit.
263 */
264 static void sync_with_child(pid_t pid, int fd)
265 {
266 uint64_t ch = PIPE_SYNC_BYTE;
267
268 write_all(fd, &ch, sizeof(ch));
269 close(fd);
270
271 waitchild(pid);
272 }
273
274 /**
275 * fork_and_wait() - Fork and wait to be sync'd with
276 * @fd - A file descriptor created with eventfd() which should be passed to
277 * sync_with_child()
278 *
279 * This creates an eventfd and forks. The parent process returns immediately,
280 * but the child waits for a %PIPE_SYNC_BYTE on the eventfd before returning.
281 * This allows the parent to perform some tasks before the child starts its
282 * work. The parent should call sync_with_child() once it is ready for the
283 * child to continue.
284 *
285 * Return: The pid from fork()
286 */
287 static pid_t fork_and_wait(int *fd)
288 {
289 pid_t pid;
290 uint64_t ch;
291
292 *fd = eventfd(0, 0);
293 if (*fd < 0)
294 err(EXIT_FAILURE, _("eventfd failed"));
295
296 pid = fork();
297 if (pid < 0)
298 err(EXIT_FAILURE, _("fork failed"));
299
300 if (!pid) {
301 /* wait for the our parent to tell us to continue */
302 if (read_all(*fd, (char *)&ch, sizeof(ch)) != sizeof(ch) ||
303 ch != PIPE_SYNC_BYTE)
304 err(EXIT_FAILURE, _("failed to read eventfd"));
305 close(*fd);
306 }
307
308 return pid;
309 }
310
/*
 * Fork a helper (see fork_and_wait()) that bind-mounts the namespace files
 * of the calling process once it has been released through @fd.
 *
 * The parent gets the helper's pid back.  The helper never returns: it exits
 * with EXIT_FAILURE if the inode of the parent's ns/mnt file is still the one
 * recorded before the fork, otherwise it calls bind_ns_files() for the parent
 * and exits with EXIT_SUCCESS.
 */
static pid_t bind_ns_files_from_child(int *fd)
{
	pid_t parent = getpid();
	ino_t ino_before = get_mnt_ino(parent);
	pid_t child = fork_and_wait(fd);

	if (child != 0)
		return child;

	/* helper process from here on */
	if (get_mnt_ino(parent) == ino_before)
		exit(EXIT_FAILURE);

	bind_ns_files(parent);
	exit(EXIT_SUCCESS);
}
325
/*
 * Resolve @s to a uid: first as a user name via xgetpwnam(); if that finds
 * nothing, @s is parsed as a number with strtoul_or_err(), which is given
 * @err as its error message.
 */
static uid_t get_user(const char *s, const char *err)
{
	char *buf = NULL;
	struct passwd *pw = xgetpwnam(s, &buf);
	uid_t id;

	if (!pw)
		return strtoul_or_err(s, err);

	id = pw->pw_uid;
	free(pw);
	free(buf);

	return id;
}
343
/*
 * Resolve @s to a gid: first as a group name via xgetgrnam(); if that finds
 * nothing, @s is parsed as a number with strtoul_or_err(), which is given
 * @err as its error message.
 */
static gid_t get_group(const char *s, const char *err)
{
	char *buf = NULL;
	struct group *gr = xgetgrnam(s, &buf);
	gid_t id;

	if (!gr)
		return strtoul_or_err(s, err);

	id = gr->gr_gid;
	free(gr);
	free(buf);

	return id;
}
361
/**
 * struct map_range - A range of IDs to map
 * @outer: First ID mapped on the outside of the namespace
 * @inner: First ID mapped on the inside of the namespace
 * @count: Length of the inside and outside ranges
 * @next: Next range of IDs in the chain
 *
 * One node of a singly linked list of uid/gid ranges to map using
 * new[gu]idmap.
 */
struct map_range {
	unsigned int		outer;
	unsigned int		inner;
	unsigned int		count;
	struct map_range	*next;
};
377
378 static void insert_map_range(struct map_range **chain, struct map_range map)
379 {
380 struct map_range *tail = *chain;
381 *chain = xmalloc(sizeof(**chain));
382 memcpy(*chain, &map, sizeof(**chain));
383 (*chain)->next = tail;
384 }
385
386 /**
387 * get_map_range() - Parse a mapping range from a string
388 * @s: A string of the format inner:outer:count or outer,inner,count
389 *
390 * Parse a string of the form inner:outer:count or outer,inner,count into
391 * a new mapping range.
392 *
393 * Return: A struct map_range
394 */
395 static struct map_range get_map_range(const char *s)
396 {
397 int end;
398 struct map_range ret;
399
400 ret.next = NULL;
401
402 if (sscanf(s, "%u:%u:%u%n", &ret.inner, &ret.outer, &ret.count,
403 &end) >= 3 && !s[end])
404 return ret; /* inner:outer:count */
405
406 if (sscanf(s, "%u,%u,%u%n", &ret.outer, &ret.inner, &ret.count,
407 &end) >= 3 && !s[end])
408 return ret; /* outer,inner,count */
409
410 errx(EXIT_FAILURE, _("invalid mapping '%s'"), s);
411 }
412
413 /**
414 * read_subid_range() - Look up a user's sub[gu]id range
415 * @filename: The file to look up the range from. This should be either
416 * ``/etc/subuid`` or ``/etc/subgid``.
417 * @uid: The uid of the user whose range we should look up.
418 *
419 * This finds the first subid range matching @uid in @filename.
420 */
421 static struct map_range read_subid_range(char *filename, uid_t uid)
422 {
423 char *line = NULL, *pwbuf;
424 FILE *idmap;
425 size_t n = 0;
426 struct passwd *pw;
427 struct map_range map;
428
429 map.inner = -1;
430 map.next = NULL;
431
432 pw = xgetpwuid(uid, &pwbuf);
433 if (!pw)
434 errx(EXIT_FAILURE, _("you (user %d) don't exist."), uid);
435
436 idmap = fopen(filename, "r");
437 if (!idmap)
438 err(EXIT_FAILURE, _("could not open '%s'"), filename);
439
440 /*
441 * Each line in sub[ug]idmap looks like
442 * username:subuid:count
443 * OR
444 * uid:subuid:count
445 */
446 while (getline(&line, &n, idmap) != -1) {
447 char *rest, *s;
448
449 rest = strchr(line, ':');
450 if (!rest)
451 continue;
452 *rest = '\0';
453
454 if (strcmp(line, pw->pw_name) &&
455 strtoul(line, NULL, 10) != pw->pw_uid)
456 continue;
457
458 s = rest + 1;
459 rest = strchr(s, ':');
460 if (!rest)
461 continue;
462 *rest = '\0';
463 map.outer = strtoul_or_err(s, _("failed to parse subid map"));
464
465 s = rest + 1;
466 rest = strchr(s, '\n');
467 if (rest)
468 *rest = '\0';
469 map.count = strtoul_or_err(s, _("failed to parse subid map"));
470
471 fclose(idmap);
472 free(pw);
473 free(pwbuf);
474
475 return map;
476 }
477
478 errx(EXIT_FAILURE, _("no line matching user \"%s\" in %s"),
479 pw->pw_name, filename);
480 }
481
482 /**
483 * read_kernel_map() - Read all available IDs from the kernel
484 * @chain: destination list to receive pass-through ID mappings
485 * @filename: either /proc/self/uid_map or /proc/self/gid_map
486 *
487 * This is used by --map-users=all and --map-groups=all to construct
488 * pass-through mappings for all IDs available in the parent namespace.
489 */
490 static void read_kernel_map(struct map_range **chain, char *filename)
491 {
492 char *line = NULL;
493 size_t size = 0;
494 FILE *idmap;
495
496 idmap = fopen(filename, "r");
497 if (!idmap)
498 err(EXIT_FAILURE, _("could not open '%s'"), filename);
499
500 while (getline(&line, &size, idmap) != -1) {
501 unsigned int start, count;
502 if (sscanf(line, " %u %*u %u", &start, &count) < 2)
503 continue;
504 insert_map_range(chain, (struct map_range) {
505 .inner = start,
506 .outer = start,
507 .count = count
508 });
509 }
510
511 fclose(idmap);
512 free(line);
513 }
514
515 /**
516 * add_single_map_range() - Add a single-ID map into a list without overlap
517 * @chain: A linked list of ID range mappings
518 * @outer: ID outside the namespace for a single map.
519 * @inner: ID inside the namespace for a single map, or -1 for no map.
520 *
521 * Prepend a mapping to @chain for the single ID @outer to the single ID
522 * @inner. The tricky bit is that we cannot let existing mappings overlap it.
523 * We accomplish this by removing a "hole" from each existing range @map, if
524 * @outer or @inner overlap it. This may result in one less than @map->count
525 * IDs being mapped from @map. The unmapped IDs are always the topmost IDs
526 * of the mapping (either in the parent or the child namespace).
527 *
528 * Most of the time, this function will be called with a single mapping range
529 * @map, @map->outer as some large ID, @map->inner as 0, and @map->count as a
530 * large number (at least 1000, but less than @map->outer). Typically, there
531 * will be no conflict with @outer. However, @inner may split the mapping for
532 * e.g. --map-current-user.
533 */
534
535 static void add_single_map_range(struct map_range **chain, unsigned int outer,
536 unsigned int inner)
537 {
538 struct map_range *map = *chain;
539
540 if (inner + 1 == 0)
541 outer = (unsigned int) -1;
542 *chain = NULL;
543
544 while (map) {
545 struct map_range lo, mid, hi, *next = map->next;
546 unsigned int inner_offset, outer_offset;
547
548 /*
549 * Start inner IDs from zero for an auto mapping; otherwise, if
550 * the single mapping exists and overlaps the range, remove an ID
551 */
552 if (map->inner + 1 == 0)
553 map->inner = 0;
554 else if (inner + 1 != 0 &&
555 ((outer >= map->outer && outer <= map->outer + map->count) ||
556 (inner >= map->inner && inner <= map->inner + map->count)))
557 map->count--;
558
559 /* Determine where the splits between lo, mid, and hi will be */
560 outer_offset = min(outer > map->outer ? outer - map->outer : 0,
561 map->count);
562 inner_offset = min(inner > map->inner ? inner - map->inner : 0,
563 map->count);
564
565 /*
566 * In the worst case, we need three mappings:
567 * From the bottom of map to either inner or outer
568 */
569 lo.outer = map->outer;
570 lo.inner = map->inner;
571 lo.count = min(inner_offset, outer_offset);
572
573 /* From the lower of inner or outer to the higher */
574 mid.outer = lo.outer + lo.count;
575 mid.outer += mid.outer == outer;
576 mid.inner = lo.inner + lo.count;
577 mid.inner += mid.inner == inner;
578 mid.count = abs_diff(outer_offset, inner_offset);
579
580 /* And from the higher of inner or outer to the end of the map */
581 hi.outer = mid.outer + mid.count;
582 hi.outer += hi.outer == outer;
583 hi.inner = mid.inner + mid.count;
584 hi.inner += hi.inner == inner;
585 hi.count = map->count - lo.count - mid.count;
586
587 /* Insert non-empty mappings into the output chain */
588 if (hi.count)
589 insert_map_range(chain, hi);
590 if (mid.count)
591 insert_map_range(chain, mid);
592 if (lo.count)
593 insert_map_range(chain, lo);
594
595 free(map);
596 map = next;
597 }
598
599 if (inner + 1 != 0) {
600 /* Insert single ID mapping as the first entry in the chain */
601 insert_map_range(chain, (struct map_range) {
602 .inner = inner,
603 .outer = outer,
604 .count = 1
605 });
606 }
607 }
608
609 /**
610 * map_ids_external() - Create a new uid/gid map using setuid helper
611 * @idmapper: Either newuidmap or newgidmap
612 * @ppid: Pid to set the map for
613 * @chain: A linked list of ID range mappings
614 *
615 * This creates a new uid/gid map for @ppid using @idmapper to set the
616 * mapping for each of the ranges in @chain.
617 *
618 * This function always exec()s or errors out and does not return.
619 */
620 static void __attribute__((__noreturn__))
621 map_ids_external(const char *idmapper, int ppid, struct map_range *chain)
622 {
623 unsigned int i = 0, length = 3;
624 char **argv;
625
626 for (struct map_range *map = chain; map; map = map->next)
627 length += 3;
628 argv = xcalloc(length, sizeof(*argv));
629 argv[i++] = xstrdup(idmapper);
630 xasprintf(&argv[i++], "%u", ppid);
631
632 for (struct map_range *map = chain; map; map = map->next) {
633 xasprintf(&argv[i++], "%u", map->inner);
634 xasprintf(&argv[i++], "%u", map->outer);
635 xasprintf(&argv[i++], "%u", map->count);
636 }
637
638 argv[i] = NULL;
639 execvp(idmapper, argv);
640 errexec(idmapper);
641 }
642
643 /**
644 * map_ids_internal() - Create a new uid/gid map using root privilege
645 * @type: Either uid_map or gid_map
646 * @ppid: Pid to set the map for
647 * @chain: A linked list of ID range mappings
648 *
649 * This creates a new uid/gid map for @ppid using a privileged write to
650 * /proc/@ppid/@type to set a mapping for each of the ranges in @chain.
651 */
652 static void map_ids_internal(const char *type, int ppid, struct map_range *chain)
653 {
654 int count, fd;
655 unsigned int length = 0;
656 char buffer[4096], *path;
657
658 xasprintf(&path, "/proc/%u/%s", ppid, type);
659 for (struct map_range *map = chain; map; map = map->next) {
660 count = snprintf(buffer + length, sizeof(buffer) - length,
661 "%u %u %u\n",
662 map->inner, map->outer, map->count);
663 if (count < 0 || count + length > sizeof(buffer))
664 errx(EXIT_FAILURE,
665 _("%s too large for kernel 4k limit"), path);
666 length += count;
667 }
668
669 fd = open(path, O_WRONLY | O_CLOEXEC | O_NOCTTY);
670 if (fd < 0)
671 err(EXIT_FAILURE, _("failed to open %s"), path);
672 if (write_all(fd, buffer, length) < 0)
673 err(EXIT_FAILURE, _("failed to write %s"), path);
674 close(fd);
675 free(path);
676 }
677
/**
 * map_ids_from_child() - Set up a new uid/gid map
 * @fd: The eventfd to wait on
 * @mapuser: The user to map the current user to (or -1)
 * @usermap: The range of UIDs to map (or %NULL)
 * @mapgroup: The group to map the current group to (or -1)
 * @groupmap: The range of GIDs to map (or %NULL)
 *
 * fork_and_wait() for our parent to call sync_with_child() on @fd. Upon
 * receiving the go-ahead, the child merges the single user/group mapping
 * into the ranges and sets the uid/gid map for our parent's PID: by writing
 * the map files directly when its effective uid is 0, otherwise by running
 * newuidmap and newgidmap. The child never returns from this function.
 *
 * Return: The pid of the child.
 */
static pid_t map_ids_from_child(int *fd, uid_t mapuser,
				struct map_range *usermap, gid_t mapgroup,
				struct map_range *groupmap)
{
	const pid_t parent = getpid();
	pid_t helper;
	pid_t child = fork_and_wait(fd);

	if (child != 0)
		return child;

	if (usermap)
		add_single_map_range(&usermap, geteuid(), mapuser);
	if (groupmap)
		add_single_map_range(&groupmap, getegid(), mapgroup);

	if (geteuid() == 0) {
		if (usermap)
			map_ids_internal("uid_map", parent, usermap);
		if (groupmap)
			map_ids_internal("gid_map", parent, groupmap);
		exit(EXIT_SUCCESS);
	}

	/*
	 * map_ids_external() does not return, so a second process is needed
	 * only when both maps have to be set: it runs newuidmap while we wait
	 * for it and then run newgidmap ourselves.
	 */
	if (usermap && groupmap) {
		helper = fork();
		if (helper < 0)
			err(EXIT_FAILURE, _("fork failed"));
		if (helper == 0)
			map_ids_external("newuidmap", parent, usermap);
		waitchild(helper);
	} else if (usermap) {
		map_ids_external("newuidmap", parent, usermap);
	}

	if (groupmap)
		map_ids_external("newgidmap", parent, groupmap);
	exit(EXIT_SUCCESS);
}
731
732 static void __attribute__((__noreturn__)) usage(void)
733 {
734 FILE *out = stdout;
735
736 fputs(USAGE_HEADER, out);
737 fprintf(out, _(" %s [options] [<program> [<argument>...]]\n"),
738 program_invocation_short_name);
739
740 fputs(USAGE_SEPARATOR, out);
741 fputs(_("Run a program with some namespaces unshared from the parent.\n"), out);
742
743 fputs(USAGE_OPTIONS, out);
744 fputs(_(" -m, --mount[=<file>] unshare mounts namespace\n"), out);
745 fputs(_(" -u, --uts[=<file>] unshare UTS namespace (hostname etc)\n"), out);
746 fputs(_(" -i, --ipc[=<file>] unshare System V IPC namespace\n"), out);
747 fputs(_(" -n, --net[=<file>] unshare network namespace\n"), out);
748 fputs(_(" -p, --pid[=<file>] unshare pid namespace\n"), out);
749 fputs(_(" -U, --user[=<file>] unshare user namespace\n"), out);
750 fputs(_(" -C, --cgroup[=<file>] unshare cgroup namespace\n"), out);
751 fputs(_(" -T, --time[=<file>] unshare time namespace\n"), out);
752 fputs(USAGE_SEPARATOR, out);
753 fputs(_(" -f, --fork fork before launching <program>\n"), out);
754 fputs(_(" --map-user=<uid>|<name> map current user to uid (implies --user)\n"), out);
755 fputs(_(" --map-group=<gid>|<name> map current group to gid (implies --user)\n"), out);
756 fputs(_(" -r, --map-root-user map current user to root (implies --user)\n"), out);
757 fputs(_(" -c, --map-current-user map current user to itself (implies --user)\n"), out);
758 fputs(_(" --map-auto map users and groups automatically (implies --user)\n"), out);
759 fputs(_(" --map-users=<inneruid>:<outeruid>:<count>\n"
760 " map count users from outeruid to inneruid (implies --user)\n"), out);
761 fputs(_(" --map-groups=<innergid>:<outergid>:<count>\n"
762 " map count groups from outergid to innergid (implies --user)\n"), out);
763 fputs(USAGE_SEPARATOR, out);
764 fputs(_(" --kill-child[=<signame>] when dying, kill the forked child (implies --fork)\n"
765 " defaults to SIGKILL\n"), out);
766 fputs(_(" --mount-proc[=<dir>] mount proc filesystem first (implies --mount)\n"), out);
767 fputs(_(" --propagation slave|shared|private|unchanged\n"
768 " modify mount propagation in mount namespace\n"), out);
769 fputs(_(" --setgroups allow|deny control the setgroups syscall in user namespaces\n"), out);
770 fputs(_(" --keep-caps retain capabilities granted in user namespaces\n"), out);
771 fputs(USAGE_SEPARATOR, out);
772 fputs(_(" -R, --root=<dir> run the command with root directory set to <dir>\n"), out);
773 fputs(_(" -w, --wd=<dir> change working directory to <dir>\n"), out);
774 fputs(_(" -S, --setuid <uid> set uid in entered namespace\n"), out);
775 fputs(_(" -G, --setgid <gid> set gid in entered namespace\n"), out);
776 fputs(_(" --monotonic <offset> set clock monotonic offset (seconds) in time namespaces\n"), out);
777 fputs(_(" --boottime <offset> set clock boottime offset (seconds) in time namespaces\n"), out);
778
779 fputs(USAGE_SEPARATOR, out);
780 fprintf(out, USAGE_HELP_OPTIONS(27));
781 fprintf(out, USAGE_MAN_TAIL("unshare(1)"));
782
783 exit(EXIT_SUCCESS);
784 }
785
786 int main(int argc, char *argv[])
787 {
788 enum {
789 OPT_MOUNTPROC = CHAR_MAX + 1,
790 OPT_PROPAGATION,
791 OPT_SETGROUPS,
792 OPT_KILLCHILD,
793 OPT_KEEPCAPS,
794 OPT_MONOTONIC,
795 OPT_BOOTTIME,
796 OPT_MAPUSER,
797 OPT_MAPUSERS,
798 OPT_MAPGROUP,
799 OPT_MAPGROUPS,
800 OPT_MAPAUTO,
801 };
802 static const struct option longopts[] = {
803 { "help", no_argument, NULL, 'h' },
804 { "version", no_argument, NULL, 'V' },
805
806 { "mount", optional_argument, NULL, 'm' },
807 { "uts", optional_argument, NULL, 'u' },
808 { "ipc", optional_argument, NULL, 'i' },
809 { "net", optional_argument, NULL, 'n' },
810 { "pid", optional_argument, NULL, 'p' },
811 { "user", optional_argument, NULL, 'U' },
812 { "cgroup", optional_argument, NULL, 'C' },
813 { "time", optional_argument, NULL, 'T' },
814
815 { "fork", no_argument, NULL, 'f' },
816 { "kill-child", optional_argument, NULL, OPT_KILLCHILD },
817 { "mount-proc", optional_argument, NULL, OPT_MOUNTPROC },
818 { "map-user", required_argument, NULL, OPT_MAPUSER },
819 { "map-users", required_argument, NULL, OPT_MAPUSERS },
820 { "map-group", required_argument, NULL, OPT_MAPGROUP },
821 { "map-groups", required_argument, NULL, OPT_MAPGROUPS },
822 { "map-root-user", no_argument, NULL, 'r' },
823 { "map-current-user", no_argument, NULL, 'c' },
824 { "map-auto", no_argument, NULL, OPT_MAPAUTO },
825 { "propagation", required_argument, NULL, OPT_PROPAGATION },
826 { "setgroups", required_argument, NULL, OPT_SETGROUPS },
827 { "keep-caps", no_argument, NULL, OPT_KEEPCAPS },
828 { "setuid", required_argument, NULL, 'S' },
829 { "setgid", required_argument, NULL, 'G' },
830 { "root", required_argument, NULL, 'R' },
831 { "wd", required_argument, NULL, 'w' },
832 { "monotonic", required_argument, NULL, OPT_MONOTONIC },
833 { "boottime", required_argument, NULL, OPT_BOOTTIME },
834 { NULL, 0, NULL, 0 }
835 };
836
837 int setgrpcmd = SETGROUPS_NONE;
838 int unshare_flags = 0;
839 int c, forkit = 0;
840 uid_t mapuser = -1;
841 gid_t mapgroup = -1;
842 struct map_range *usermap = NULL;
843 struct map_range *groupmap = NULL;
844 int kill_child_signo = 0; /* 0 means --kill-child was not used */
845 const char *procmnt = NULL;
846 const char *newroot = NULL;
847 const char *newdir = NULL;
848 pid_t pid_bind = 0, pid_idmap = 0;
849 pid_t pid = 0;
850 #ifdef UL_HAVE_PIDFD
851 int fd_parent_pid = -1;
852 #endif
853 int fd_idmap, fd_bind = -1;
854 sigset_t sigset, oldsigset;
855 int status;
856 unsigned long propagation = UNSHARE_PROPAGATION_DEFAULT;
857 int force_uid = 0, force_gid = 0;
858 uid_t uid = 0, real_euid = geteuid();
859 gid_t gid = 0, real_egid = getegid();
860 int keepcaps = 0;
861 int64_t monotonic = 0;
862 int64_t boottime = 0;
863 int force_monotonic = 0;
864 int force_boottime = 0;
865
866 setlocale(LC_ALL, "");
867 bindtextdomain(PACKAGE, LOCALEDIR);
868 textdomain(PACKAGE);
869 close_stdout_atexit();
870
871 while ((c = getopt_long(argc, argv, "+fhVmuinpCTUrR:w:S:G:c", longopts, NULL)) != -1) {
872 switch (c) {
873 case 'f':
874 forkit = 1;
875 break;
876 case 'm':
877 unshare_flags |= CLONE_NEWNS;
878 if (optarg)
879 set_ns_target(CLONE_NEWNS, optarg);
880 break;
881 case 'u':
882 unshare_flags |= CLONE_NEWUTS;
883 if (optarg)
884 set_ns_target(CLONE_NEWUTS, optarg);
885 break;
886 case 'i':
887 unshare_flags |= CLONE_NEWIPC;
888 if (optarg)
889 set_ns_target(CLONE_NEWIPC, optarg);
890 break;
891 case 'n':
892 unshare_flags |= CLONE_NEWNET;
893 if (optarg)
894 set_ns_target(CLONE_NEWNET, optarg);
895 break;
896 case 'p':
897 unshare_flags |= CLONE_NEWPID;
898 if (optarg)
899 set_ns_target(CLONE_NEWPID, optarg);
900 break;
901 case 'U':
902 unshare_flags |= CLONE_NEWUSER;
903 if (optarg)
904 set_ns_target(CLONE_NEWUSER, optarg);
905 break;
906 case 'C':
907 unshare_flags |= CLONE_NEWCGROUP;
908 if (optarg)
909 set_ns_target(CLONE_NEWCGROUP, optarg);
910 break;
911 case 'T':
912 unshare_flags |= CLONE_NEWTIME;
913 if (optarg)
914 set_ns_target(CLONE_NEWTIME, optarg);
915 break;
916 case OPT_MOUNTPROC:
917 unshare_flags |= CLONE_NEWNS;
918 procmnt = optarg ? optarg : "/proc";
919 break;
920 case OPT_MAPUSER:
921 unshare_flags |= CLONE_NEWUSER;
922 mapuser = get_user(optarg, _("failed to parse uid"));
923 break;
924 case OPT_MAPGROUP:
925 unshare_flags |= CLONE_NEWUSER;
926 mapgroup = get_group(optarg, _("failed to parse gid"));
927 break;
928 case 'r':
929 unshare_flags |= CLONE_NEWUSER;
930 mapuser = 0;
931 mapgroup = 0;
932 break;
933 case 'c':
934 unshare_flags |= CLONE_NEWUSER;
935 mapuser = real_euid;
936 mapgroup = real_egid;
937 break;
938 case OPT_MAPUSERS:
939 unshare_flags |= CLONE_NEWUSER;
940 if (!strcmp(optarg, "auto"))
941 insert_map_range(&usermap,
942 read_subid_range(_PATH_SUBUID, real_euid));
943 else if (!strcmp(optarg, "all"))
944 read_kernel_map(&usermap, _PATH_PROC_UIDMAP);
945 else
946 insert_map_range(&usermap, get_map_range(optarg));
947 break;
948 case OPT_MAPGROUPS:
949 unshare_flags |= CLONE_NEWUSER;
950 if (!strcmp(optarg, "auto"))
951 insert_map_range(&groupmap,
952 read_subid_range(_PATH_SUBGID, real_euid));
953 else if (!strcmp(optarg, "all"))
954 read_kernel_map(&groupmap, _PATH_PROC_GIDMAP);
955 else
956 insert_map_range(&groupmap, get_map_range(optarg));
957 break;
958 case OPT_MAPAUTO:
959 unshare_flags |= CLONE_NEWUSER;
960 insert_map_range(&usermap, read_subid_range(_PATH_SUBUID, real_euid));
961 insert_map_range(&groupmap, read_subid_range(_PATH_SUBGID, real_euid));
962 break;
963 case OPT_SETGROUPS:
964 setgrpcmd = setgroups_str2id(optarg);
965 break;
966 case OPT_PROPAGATION:
967 propagation = parse_propagation(optarg);
968 break;
969 case OPT_KILLCHILD:
970 forkit = 1;
971 if (optarg) {
972 if ((kill_child_signo = signame_to_signum(optarg)) < 0)
973 errx(EXIT_FAILURE, _("unknown signal: %s"),
974 optarg);
975 } else {
976 kill_child_signo = SIGKILL;
977 }
978 break;
979 case OPT_KEEPCAPS:
980 keepcaps = 1;
981 cap_last_cap(); /* Force last cap to be cached before we fork. */
982 break;
983 case 'S':
984 uid = strtoul_or_err(optarg, _("failed to parse uid"));
985 force_uid = 1;
986 break;
987 case 'G':
988 gid = strtoul_or_err(optarg, _("failed to parse gid"));
989 force_gid = 1;
990 break;
991 case 'R':
992 newroot = optarg;
993 break;
994 case 'w':
995 newdir = optarg;
996 break;
997 case OPT_MONOTONIC:
998 monotonic = strtos64_or_err(optarg, _("failed to parse monotonic offset"));
999 force_monotonic = 1;
1000 break;
1001 case OPT_BOOTTIME:
1002 boottime = strtos64_or_err(optarg, _("failed to parse boottime offset"));
1003 force_boottime = 1;
1004 break;
1005
1006 case 'h':
1007 usage();
1008 case 'V':
1009 print_version(EXIT_SUCCESS);
1010 default:
1011 errtryhelp(EXIT_FAILURE);
1012 }
1013 }
1014
1015 if ((force_monotonic || force_boottime) && !(unshare_flags & CLONE_NEWTIME))
1016 errx(EXIT_FAILURE, _("options --monotonic and --boottime require "
1017 "unsharing of a time namespace (-T)"));
1018
1019 /* clear any inherited settings */
1020 signal(SIGCHLD, SIG_DFL);
1021
1022 if (npersists && (unshare_flags & CLONE_NEWNS))
1023 pid_bind = bind_ns_files_from_child(&fd_bind);
1024
1025 if (usermap || groupmap)
1026 pid_idmap = map_ids_from_child(&fd_idmap, mapuser, usermap,
1027 mapgroup, groupmap);
1028
1029 if (-1 == unshare(unshare_flags))
1030 err(EXIT_FAILURE, _("unshare failed"));
1031
1032 /* Tell child we've called unshare() */
1033 if (usermap || groupmap)
1034 sync_with_child(pid_idmap, fd_idmap);
1035
1036 if (force_boottime)
1037 settime(boottime, CLOCK_BOOTTIME);
1038
1039 if (force_monotonic)
1040 settime(monotonic, CLOCK_MONOTONIC);
1041
1042 if (forkit) {
1043 if (sigemptyset(&sigset) != 0 ||
1044 sigaddset(&sigset, SIGINT) != 0 ||
1045 sigaddset(&sigset, SIGTERM) != 0 ||
1046 sigprocmask(SIG_BLOCK, &sigset, &oldsigset) != 0)
1047 err(EXIT_FAILURE, _("sigprocmask block failed"));
1048 #ifdef UL_HAVE_PIDFD
1049 if (kill_child_signo != 0) {
1050 /* make a connection to the original process (parent) */
1051 fd_parent_pid = pidfd_open(getpid(), 0);
1052 if (0 > fd_parent_pid)
1053 err(EXIT_FAILURE, _("pidfd_open failed"));
1054 }
1055 #endif
1056 /* force child forking before mountspace binding so
1057 * pid_for_children is populated */
1058 pid = fork();
1059
1060 switch(pid) {
1061 case -1:
1062 err(EXIT_FAILURE, _("fork failed"));
1063 case 0: /* child */
1064 if (sigprocmask(SIG_SETMASK, &oldsigset, NULL))
1065 err(EXIT_FAILURE,
1066 _("sigprocmask restore failed"));
1067 if (npersists && (unshare_flags & CLONE_NEWNS))
1068 close(fd_bind);
1069 break;
1070 default: /* parent */
1071 break;
1072 }
1073 }
1074
1075 if (npersists && (pid || !forkit)) {
1076 /* run in parent */
1077 if (pid_bind && (unshare_flags & CLONE_NEWNS))
1078 sync_with_child(pid_bind, fd_bind);
1079 else
1080 /* simple way, just bind */
1081 bind_ns_files(getpid());
1082 }
1083
1084 if (pid) {
1085 if (waitpid(pid, &status, 0) == -1)
1086 err(EXIT_FAILURE, _("waitpid failed"));
1087
1088 if (WIFEXITED(status))
1089 return WEXITSTATUS(status);
1090 if (WIFSIGNALED(status)) {
1091
1092 /* Ensure the signal that terminated the child will
1093 * also terminate the parent. */
1094
1095 int termsig = WTERMSIG(status);
1096
1097 if (termsig != SIGKILL && signal(termsig, SIG_DFL) == SIG_ERR)
1098 err(EXIT_FAILURE,
1099 _("signal handler reset failed"));
1100 if (sigemptyset(&sigset) != 0 ||
1101 sigaddset(&sigset, termsig) != 0 ||
1102 sigprocmask(SIG_UNBLOCK, &sigset, NULL) != 0)
1103 err(EXIT_FAILURE,
1104 _("sigprocmask unblock failed"));
1105
1106 kill(getpid(), termsig);
1107 }
1108 err(EXIT_FAILURE, _("child exit failed"));
1109 }
1110
1111 if (kill_child_signo != 0) {
1112 if (prctl(PR_SET_PDEATHSIG, kill_child_signo) < 0)
1113 err(EXIT_FAILURE, "prctl failed");
1114 #ifdef UL_HAVE_PIDFD
1115 /* Use poll() to check that there is still the original parent. */
1116 if (fd_parent_pid != -1) {
1117 struct pollfd pollfds[1] = {
1118 { .fd = fd_parent_pid, .events = POLLIN }
1119 };
1120 int nfds = poll(pollfds, 1, 0);
1121
1122 if (0 > nfds)
1123 err(EXIT_FAILURE, "poll parent pidfd failed");
1124
1125 /* If the child was re-parented before prctl(2) was called, the
1126 * new parent will likely not be interested in the precise exit
1127 * status of the orphan.
1128 */
1129 if (nfds)
1130 exit(EXIT_FAILURE);
1131
1132 close(fd_parent_pid);
1133 fd_parent_pid = -1;
1134 }
1135 #endif
1136 }
1137
1138 if (mapuser != (uid_t) -1 && !usermap)
1139 map_id(_PATH_PROC_UIDMAP, mapuser, real_euid);
1140
1141 /* Since Linux 3.19 unprivileged writing of /proc/self/gid_map
1142 * has been disabled unless /proc/self/setgroups is written
1143 * first to permanently disable the ability to call setgroups
1144 * in that user namespace. */
1145 if (mapgroup != (gid_t) -1 && !groupmap) {
1146 if (setgrpcmd == SETGROUPS_ALLOW)
1147 errx(EXIT_FAILURE, _("options --setgroups=allow and "
1148 "--map-group are mutually exclusive"));
1149 setgroups_control(SETGROUPS_DENY);
1150 map_id(_PATH_PROC_GIDMAP, mapgroup, real_egid);
1151 }
1152
1153 if (setgrpcmd != SETGROUPS_NONE)
1154 setgroups_control(setgrpcmd);
1155
1156 if ((unshare_flags & CLONE_NEWNS) && propagation)
1157 set_propagation(propagation);
1158
1159 if (newroot) {
1160 if (chroot(newroot) != 0)
1161 err(EXIT_FAILURE,
1162 _("cannot change root directory to '%s'"), newroot);
1163 newdir = newdir ?: "/";
1164 }
1165 if (newdir && chdir(newdir))
1166 err(EXIT_FAILURE, _("cannot chdir to '%s'"), newdir);
1167
1168 if (procmnt) {
1169 /* When not changing root and using the default propagation flags
1170 then the recursive propagation change of root will
1171 automatically change that of an existing proc mount. */
1172 if (!newroot && propagation != (MS_PRIVATE|MS_REC)) {
1173 int rc = mount("none", procmnt, NULL, MS_PRIVATE|MS_REC, NULL);
1174
1175 /* Custom procmnt means that proc is very likely not mounted, causing EINVAL.
1176 Ignoring the error in this specific instance is considered safe. */
1177 if(rc != 0 && errno != EINVAL)
1178 err(EXIT_FAILURE, _("cannot change %s filesystem propagation"), procmnt);
1179 }
1180
1181 if (mount("proc", procmnt, "proc", MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL) != 0)
1182 err(EXIT_FAILURE, _("mount %s failed"), procmnt);
1183 }
1184
1185 if (force_gid) {
1186 if (setgroups(0, NULL) != 0) /* drop supplementary groups */
1187 err(EXIT_FAILURE, _("setgroups failed"));
1188 if (setgid(gid) < 0) /* change GID */
1189 err(EXIT_FAILURE, _("setgid failed"));
1190 }
1191 if (force_uid && setuid(uid) < 0) /* change UID */
1192 err(EXIT_FAILURE, _("setuid failed"));
1193
1194 if (keepcaps && (unshare_flags & CLONE_NEWUSER))
1195 cap_permitted_to_ambient();
1196
1197 if (optind < argc) {
1198 execvp(argv[optind], argv + optind);
1199 errexec(argv[optind]);
1200 }
1201 exec_shell();
1202 }