2 * unshare(1) - command-line interface for unshare(2)
4 * Copyright (C) 2009 Mikhail Gusarov <dottedmag@dottedmag.net>
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License as published by the
8 * Free Software Foundation; either version 2, or (at your option) any
11 * This program is distributed in the hope that it will be useful, but
12 * WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * General Public License for more details.
16 * You should have received a copy of the GNU General Public License along
17 * with this program; if not, write to the Free Software Foundation, Inc.,
18 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
28 #include <sys/eventfd.h>
30 #include <sys/mount.h>
31 #include <sys/types.h>
33 #include <sys/prctl.h>
36 /* we only need some defines missing in sys/mount.h, no libmount linkage */
42 #include "closestream.h"
43 #include "namespace.h"
44 #include "pidfd-utils.h"
45 #include "exec_shell.h"
47 #include "pathnames.h"
53 /* synchronize parent and child by pipe */
54 #define PIPE_SYNC_BYTE 0x06
56 /* 'private' is kernel default */
57 #define UNSHARE_PROPAGATION_DEFAULT (MS_REC | MS_PRIVATE)
59 /* /proc namespace files and mountpoints for binds */
60 static struct namespace_file
{
61 int type
; /* CLONE_NEW* */
62 const char *name
; /* ns/<type> */
63 const char *target
; /* user specified target for bind mount */
64 } namespace_files
[] = {
65 { .type
= CLONE_NEWUSER
, .name
= "ns/user" },
66 { .type
= CLONE_NEWCGROUP
,.name
= "ns/cgroup" },
67 { .type
= CLONE_NEWIPC
, .name
= "ns/ipc" },
68 { .type
= CLONE_NEWUTS
, .name
= "ns/uts" },
69 { .type
= CLONE_NEWNET
, .name
= "ns/net" },
70 { .type
= CLONE_NEWPID
, .name
= "ns/pid_for_children" },
71 { .type
= CLONE_NEWNS
, .name
= "ns/mnt" },
72 { .type
= CLONE_NEWTIME
, .name
= "ns/time_for_children" },
76 static int npersists
; /* number of persistent namespaces */
84 static const char *setgroups_strings
[] =
86 [SETGROUPS_DENY
] = "deny",
87 [SETGROUPS_ALLOW
] = "allow"
90 static int setgroups_str2id(const char *str
)
94 for (i
= 0; i
< ARRAY_SIZE(setgroups_strings
); i
++)
95 if (strcmp(str
, setgroups_strings
[i
]) == 0)
98 errx(EXIT_FAILURE
, _("unsupported --setgroups argument '%s'"), str
);
101 static void setgroups_control(int action
)
103 const char *file
= _PATH_PROC_SETGROUPS
;
107 if (action
< 0 || (size_t) action
>= ARRAY_SIZE(setgroups_strings
))
109 cmd
= setgroups_strings
[action
];
111 fd
= open(file
, O_WRONLY
);
115 err(EXIT_FAILURE
, _("cannot open %s"), file
);
118 if (write_all(fd
, cmd
, strlen(cmd
)))
119 err(EXIT_FAILURE
, _("write failed %s"), file
);
123 static void map_id(const char *file
, uint32_t from
, uint32_t to
)
128 fd
= open(file
, O_WRONLY
);
130 err(EXIT_FAILURE
, _("cannot open %s"), file
);
132 xasprintf(&buf
, "%u %u 1", from
, to
);
133 if (write_all(fd
, buf
, strlen(buf
)))
134 err(EXIT_FAILURE
, _("write failed %s"), file
);
139 static unsigned long parse_propagation(const char *str
)
142 static const struct prop_opts
{
146 { "slave", MS_REC
| MS_SLAVE
},
147 { "private", MS_REC
| MS_PRIVATE
},
148 { "shared", MS_REC
| MS_SHARED
},
152 for (i
= 0; i
< ARRAY_SIZE(opts
); i
++) {
153 if (strcmp(opts
[i
].name
, str
) == 0)
157 errx(EXIT_FAILURE
, _("unsupported propagation mode: %s"), str
);
160 static void set_propagation(unsigned long flags
)
165 if (mount("none", "/", NULL
, flags
, NULL
) != 0)
166 err(EXIT_FAILURE
, _("cannot change root filesystem propagation"));
170 static int set_ns_target(int type
, const char *path
)
172 struct namespace_file
*ns
;
174 for (ns
= namespace_files
; ns
->name
; ns
++) {
175 if (ns
->type
!= type
)
185 static int bind_ns_files(pid_t pid
)
187 struct namespace_file
*ns
;
190 for (ns
= namespace_files
; ns
->name
; ns
++) {
194 snprintf(src
, sizeof(src
), "/proc/%u/%s", (unsigned) pid
, ns
->name
);
196 if (mount(src
, ns
->target
, NULL
, MS_BIND
, NULL
) != 0)
197 err(EXIT_FAILURE
, _("mount %s on %s failed"), src
, ns
->target
);
203 static ino_t
get_mnt_ino(pid_t pid
)
208 snprintf(path
, sizeof(path
), "/proc/%u/ns/mnt", (unsigned) pid
);
210 if (stat(path
, &st
) != 0)
211 err(EXIT_FAILURE
, _("stat of %s failed"), path
);
215 static void settime(int64_t offset
, clockid_t clk_id
)
217 char buf
[sizeof(stringify_value(ULONG_MAX
)) * 3];
220 len
= snprintf(buf
, sizeof(buf
), "%d %" PRId64
" 0", clk_id
, offset
);
222 fd
= open("/proc/self/timens_offsets", O_WRONLY
);
224 err(EXIT_FAILURE
, _("failed to open /proc/self/timens_offsets"));
226 if (write(fd
, buf
, len
) != len
)
227 err(EXIT_FAILURE
, _("failed to write to /proc/self/timens_offsets"));
233 * waitchild() - Wait for a process to exit successfully
234 * @pid: PID of the process to wait for
236 * Wait for a process to exit successfully. If it exits with a non-zero return
237 * code, then exit() with the same status.
239 static void waitchild(int pid
)
244 rc
= waitpid(pid
, &status
, 0);
248 err(EXIT_FAILURE
, _("waitpid failed"));
250 if (WIFEXITED(status
) &&
251 WEXITSTATUS(status
) != EXIT_SUCCESS
)
252 exit(WEXITSTATUS(status
));
257 * sync_with_child() - Tell our child we're ready and wait for it to exit
258 * @pid: The pid of our child
259 * @fd: A file descriptor created with eventfd()
261 * This tells a child created with fork_and_wait() that we are ready for it to
262 * continue. Once we have done that, wait for our child to exit.
264 static void sync_with_child(pid_t pid
, int fd
)
266 uint64_t ch
= PIPE_SYNC_BYTE
;
268 write_all(fd
, &ch
, sizeof(ch
));
275 * fork_and_wait() - Fork and wait to be sync'd with
276 * @fd - A file descriptor created with eventfd() which should be passed to
279 * This creates an eventfd and forks. The parent process returns immediately,
280 * but the child waits for a %PIPE_SYNC_BYTE on the eventfd before returning.
281 * This allows the parent to perform some tasks before the child starts its
282 * work. The parent should call sync_with_child() once it is ready for the
285 * Return: The pid from fork()
287 static pid_t
fork_and_wait(int *fd
)
294 err(EXIT_FAILURE
, _("eventfd failed"));
298 err(EXIT_FAILURE
, _("fork failed"));
301 /* wait for our parent to tell us to continue */
302 if (read_all(*fd
, (char *)&ch
, sizeof(ch
)) != sizeof(ch
) ||
303 ch
!= PIPE_SYNC_BYTE
)
304 err(EXIT_FAILURE
, _("failed to read eventfd"));
311 static pid_t
bind_ns_files_from_child(int *fd
)
313 pid_t child
, ppid
= getpid();
314 ino_t ino
= get_mnt_ino(ppid
);
316 child
= fork_and_wait(fd
);
320 if (get_mnt_ino(ppid
) == ino
)
326 static uid_t
get_user(const char *s
, const char *err
)
332 pw
= xgetpwnam(s
, &buf
);
338 ret
= strtoul_or_err(s
, err
);
344 static gid_t
get_group(const char *s
, const char *err
)
350 gr
= xgetgrnam(s
, &buf
);
356 ret
= strtoul_or_err(s
, err
);
363 * struct map_range - A range of IDs to map
364 * @outer: First ID mapped on the outside of the namespace
365 * @inner: First ID mapped on the inside of the namespace
366 * @count: Length of the inside and outside ranges
367 * @next: Next range of IDs in the chain
369 * A range of uids/gids to map using new[gu]idmap.
375 struct map_range
*next
;
378 static void insert_map_range(struct map_range
**chain
, struct map_range map
)
380 struct map_range
*tail
= *chain
;
381 *chain
= xmalloc(sizeof(**chain
));
382 memcpy(*chain
, &map
, sizeof(**chain
));
383 (*chain
)->next
= tail
;
387 * get_map_range() - Parse a mapping range from a string
388 * @s: A string of the format inner:outer:count or outer,inner,count
390 * Parse a string of the form inner:outer:count or outer,inner,count into
391 * a new mapping range.
393 * Return: A struct map_range
395 static struct map_range
get_map_range(const char *s
)
398 struct map_range ret
;
402 if (sscanf(s
, "%u:%u:%u%n", &ret
.inner
, &ret
.outer
, &ret
.count
,
403 &end
) >= 3 && !s
[end
])
404 return ret
; /* inner:outer:count */
406 if (sscanf(s
, "%u,%u,%u%n", &ret
.outer
, &ret
.inner
, &ret
.count
,
407 &end
) >= 3 && !s
[end
])
408 return ret
; /* outer,inner,count */
410 errx(EXIT_FAILURE
, _("invalid mapping '%s'"), s
);
414 * read_subid_range() - Look up a user's sub[gu]id range
415 * @filename: The file to look up the range from. This should be either
416 * ``/etc/subuid`` or ``/etc/subgid``.
417 * @uid: The uid of the user whose range we should look up.
419 * This finds the first subid range matching @uid in @filename.
421 static struct map_range
read_subid_range(char *filename
, uid_t uid
)
423 char *line
= NULL
, *pwbuf
;
427 struct map_range map
;
432 pw
= xgetpwuid(uid
, &pwbuf
);
434 errx(EXIT_FAILURE
, _("you (user %d) don't exist."), uid
);
436 idmap
= fopen(filename
, "r");
438 err(EXIT_FAILURE
, _("could not open '%s'"), filename
);
441 * Each line in sub[ug]idmap looks like
442 * username:subuid:count
446 while (getline(&line
, &n
, idmap
) != -1) {
449 rest
= strchr(line
, ':');
454 if (strcmp(line
, pw
->pw_name
) &&
455 strtoul(line
, NULL
, 10) != pw
->pw_uid
)
459 rest
= strchr(s
, ':');
463 map
.outer
= strtoul_or_err(s
, _("failed to parse subid map"));
466 rest
= strchr(s
, '\n');
469 map
.count
= strtoul_or_err(s
, _("failed to parse subid map"));
478 errx(EXIT_FAILURE
, _("no line matching user \"%s\" in %s"),
479 pw
->pw_name
, filename
);
483 * read_kernel_map() - Read all available IDs from the kernel
484 * @chain: destination list to receive pass-through ID mappings
485 * @filename: either /proc/self/uid_map or /proc/self/gid_map
487 * This is used by --map-users=all and --map-groups=all to construct
488 * pass-through mappings for all IDs available in the parent namespace.
490 static void read_kernel_map(struct map_range
**chain
, char *filename
)
496 idmap
= fopen(filename
, "r");
498 err(EXIT_FAILURE
, _("could not open '%s'"), filename
);
500 while (getline(&line
, &size
, idmap
) != -1) {
501 unsigned int start
, count
;
502 if (sscanf(line
, " %u %*u %u", &start
, &count
) < 2)
504 insert_map_range(chain
, (struct map_range
) {
516 * add_single_map_range() - Add a single-ID map into a list without overlap
517 * @chain: A linked list of ID range mappings
518 * @outer: ID outside the namespace for a single map.
519 * @inner: ID inside the namespace for a single map, or -1 for no map.
521 * Prepend a mapping to @chain for the single ID @outer to the single ID
522 * @inner. The tricky bit is that we cannot let existing mappings overlap it.
523 * We accomplish this by removing a "hole" from each existing range @map, if
524 * @outer or @inner overlap it. This may result in one less than @map->count
525 * IDs being mapped from @map. The unmapped IDs are always the topmost IDs
526 * of the mapping (either in the parent or the child namespace).
528 * Most of the time, this function will be called with a single mapping range
529 * @map, @map->outer as some large ID, @map->inner as 0, and @map->count as a
530 * large number (at least 1000, but less than @map->outer). Typically, there
531 * will be no conflict with @outer. However, @inner may split the mapping for
532 * e.g. --map-current-user.
535 static void add_single_map_range(struct map_range
**chain
, unsigned int outer
,
538 struct map_range
*map
= *chain
;
541 outer
= (unsigned int) -1;
545 struct map_range lo
, mid
, hi
, *next
= map
->next
;
546 unsigned int inner_offset
, outer_offset
;
549 * Start inner IDs from zero for an auto mapping; otherwise, if
550 * the single mapping exists and overlaps the range, remove an ID
552 if (map
->inner
+ 1 == 0)
554 else if (inner
+ 1 != 0 &&
555 ((outer
>= map
->outer
&& outer
<= map
->outer
+ map
->count
) ||
556 (inner
>= map
->inner
&& inner
<= map
->inner
+ map
->count
)))
559 /* Determine where the splits between lo, mid, and hi will be */
560 outer_offset
= min(outer
> map
->outer
? outer
- map
->outer
: 0,
562 inner_offset
= min(inner
> map
->inner
? inner
- map
->inner
: 0,
566 * In the worst case, we need three mappings:
567 * From the bottom of map to either inner or outer
569 lo
.outer
= map
->outer
;
570 lo
.inner
= map
->inner
;
571 lo
.count
= min(inner_offset
, outer_offset
);
573 /* From the lower of inner or outer to the higher */
574 mid
.outer
= lo
.outer
+ lo
.count
;
575 mid
.outer
+= mid
.outer
== outer
;
576 mid
.inner
= lo
.inner
+ lo
.count
;
577 mid
.inner
+= mid
.inner
== inner
;
578 mid
.count
= abs_diff(outer_offset
, inner_offset
);
580 /* And from the higher of inner or outer to the end of the map */
581 hi
.outer
= mid
.outer
+ mid
.count
;
582 hi
.outer
+= hi
.outer
== outer
;
583 hi
.inner
= mid
.inner
+ mid
.count
;
584 hi
.inner
+= hi
.inner
== inner
;
585 hi
.count
= map
->count
- lo
.count
- mid
.count
;
587 /* Insert non-empty mappings into the output chain */
589 insert_map_range(chain
, hi
);
591 insert_map_range(chain
, mid
);
593 insert_map_range(chain
, lo
);
599 if (inner
+ 1 != 0) {
600 /* Insert single ID mapping as the first entry in the chain */
601 insert_map_range(chain
, (struct map_range
) {
610 * map_ids_external() - Create a new uid/gid map using setuid helper
611 * @idmapper: Either newuidmap or newgidmap
612 * @ppid: Pid to set the map for
613 * @chain: A linked list of ID range mappings
615 * This creates a new uid/gid map for @ppid using @idmapper to set the
616 * mapping for each of the ranges in @chain.
618 * This function always exec()s or errors out and does not return.
620 static void __attribute__((__noreturn__
))
621 map_ids_external(const char *idmapper
, int ppid
, struct map_range
*chain
)
623 unsigned int i
= 0, length
= 3;
626 for (struct map_range
*map
= chain
; map
; map
= map
->next
)
628 argv
= xcalloc(length
, sizeof(*argv
));
629 argv
[i
++] = xstrdup(idmapper
);
630 xasprintf(&argv
[i
++], "%u", ppid
);
632 for (struct map_range
*map
= chain
; map
; map
= map
->next
) {
633 xasprintf(&argv
[i
++], "%u", map
->inner
);
634 xasprintf(&argv
[i
++], "%u", map
->outer
);
635 xasprintf(&argv
[i
++], "%u", map
->count
);
639 execvp(idmapper
, argv
);
644 * map_ids_internal() - Create a new uid/gid map using root privilege
645 * @type: Either uid_map or gid_map
646 * @ppid: Pid to set the map for
647 * @chain: A linked list of ID range mappings
649 * This creates a new uid/gid map for @ppid using a privileged write to
650 * /proc/@ppid/@type to set a mapping for each of the ranges in @chain.
652 static void map_ids_internal(const char *type
, int ppid
, struct map_range
*chain
)
655 unsigned int length
= 0;
656 char buffer
[4096], *path
;
658 xasprintf(&path
, "/proc/%u/%s", ppid
, type
);
659 for (struct map_range
*map
= chain
; map
; map
= map
->next
) {
660 count
= snprintf(buffer
+ length
, sizeof(buffer
) - length
,
662 map
->inner
, map
->outer
, map
->count
);
663 if (count
< 0 || count
+ length
> sizeof(buffer
))
665 _("%s too large for kernel 4k limit"), path
);
669 fd
= open(path
, O_WRONLY
| O_CLOEXEC
| O_NOCTTY
);
671 err(EXIT_FAILURE
, _("failed to open %s"), path
);
672 if (write_all(fd
, buffer
, length
) < 0)
673 err(EXIT_FAILURE
, _("failed to write %s"), path
);
679 * map_ids_from_child() - Set up a new uid/gid map
680 * @fd: The eventfd to wait on
681 * @mapuser: The user to map the current user to (or -1)
682 * @usermap: The range of UIDs to map (or %NULL)
683 * @mapgroup: The group to map the current group to (or -1)
684 * @groupmap: The range of GIDs to map (or %NULL)
686 * fork_and_wait() for our parent to call sync_with_child() on @fd. Upon
687 * receiving the go-ahead, use newuidmap and newgidmap to set the uid/gid map
688 * for our parent's PID.
690 * Return: The pid of the child.
692 static pid_t
map_ids_from_child(int *fd
, uid_t mapuser
,
693 struct map_range
*usermap
, gid_t mapgroup
,
694 struct map_range
*groupmap
)
696 pid_t child
, pid
= 0;
697 pid_t ppid
= getpid();
699 child
= fork_and_wait(fd
);
704 add_single_map_range(&usermap
, geteuid(), mapuser
);
706 add_single_map_range(&groupmap
, getegid(), mapgroup
);
708 if (geteuid() == 0) {
710 map_ids_internal("uid_map", ppid
, usermap
);
712 map_ids_internal("gid_map", ppid
, groupmap
);
716 /* Avoid forking more than we need to */
717 if (usermap
&& groupmap
) {
720 err(EXIT_FAILURE
, _("fork failed"));
726 map_ids_external("newuidmap", ppid
, usermap
);
728 map_ids_external("newgidmap", ppid
, groupmap
);
732 static void __attribute__((__noreturn__
)) usage(void)
736 fputs(USAGE_HEADER
, out
);
737 fprintf(out
, _(" %s [options] [<program> [<argument>...]]\n"),
738 program_invocation_short_name
);
740 fputs(USAGE_SEPARATOR
, out
);
741 fputs(_("Run a program with some namespaces unshared from the parent.\n"), out
);
743 fputs(USAGE_OPTIONS
, out
);
744 fputs(_(" -m, --mount[=<file>] unshare mounts namespace\n"), out
);
745 fputs(_(" -u, --uts[=<file>] unshare UTS namespace (hostname etc)\n"), out
);
746 fputs(_(" -i, --ipc[=<file>] unshare System V IPC namespace\n"), out
);
747 fputs(_(" -n, --net[=<file>] unshare network namespace\n"), out
);
748 fputs(_(" -p, --pid[=<file>] unshare pid namespace\n"), out
);
749 fputs(_(" -U, --user[=<file>] unshare user namespace\n"), out
);
750 fputs(_(" -C, --cgroup[=<file>] unshare cgroup namespace\n"), out
);
751 fputs(_(" -T, --time[=<file>] unshare time namespace\n"), out
);
752 fputs(USAGE_SEPARATOR
, out
);
753 fputs(_(" -f, --fork fork before launching <program>\n"), out
);
754 fputs(_(" --map-user=<uid>|<name> map current user to uid (implies --user)\n"), out
);
755 fputs(_(" --map-group=<gid>|<name> map current group to gid (implies --user)\n"), out
);
756 fputs(_(" -r, --map-root-user map current user to root (implies --user)\n"), out
);
757 fputs(_(" -c, --map-current-user map current user to itself (implies --user)\n"), out
);
758 fputs(_(" --map-auto map users and groups automatically (implies --user)\n"), out
);
759 fputs(_(" --map-users=<inneruid>:<outeruid>:<count>\n"
760 " map count users from outeruid to inneruid (implies --user)\n"), out
);
761 fputs(_(" --map-groups=<innergid>:<outergid>:<count>\n"
762 " map count groups from outergid to innergid (implies --user)\n"), out
);
763 fputs(USAGE_SEPARATOR
, out
);
764 fputs(_(" --kill-child[=<signame>] when dying, kill the forked child (implies --fork)\n"
765 " defaults to SIGKILL\n"), out
);
766 fputs(_(" --mount-proc[=<dir>] mount proc filesystem first (implies --mount)\n"), out
);
767 fputs(_(" --propagation slave|shared|private|unchanged\n"
768 " modify mount propagation in mount namespace\n"), out
);
769 fputs(_(" --setgroups allow|deny control the setgroups syscall in user namespaces\n"), out
);
770 fputs(_(" --keep-caps retain capabilities granted in user namespaces\n"), out
);
771 fputs(USAGE_SEPARATOR
, out
);
772 fputs(_(" -R, --root=<dir> run the command with root directory set to <dir>\n"), out
);
773 fputs(_(" -w, --wd=<dir> change working directory to <dir>\n"), out
);
774 fputs(_(" -S, --setuid <uid> set uid in entered namespace\n"), out
);
775 fputs(_(" -G, --setgid <gid> set gid in entered namespace\n"), out
);
776 fputs(_(" --monotonic <offset> set clock monotonic offset (seconds) in time namespaces\n"), out
);
777 fputs(_(" --boottime <offset> set clock boottime offset (seconds) in time namespaces\n"), out
);
779 fputs(USAGE_SEPARATOR
, out
);
780 fprintf(out
, USAGE_HELP_OPTIONS(27));
781 fprintf(out
, USAGE_MAN_TAIL("unshare(1)"));
786 int main(int argc
, char *argv
[])
789 OPT_MOUNTPROC
= CHAR_MAX
+ 1,
802 static const struct option longopts
[] = {
803 { "help", no_argument
, NULL
, 'h' },
804 { "version", no_argument
, NULL
, 'V' },
806 { "mount", optional_argument
, NULL
, 'm' },
807 { "uts", optional_argument
, NULL
, 'u' },
808 { "ipc", optional_argument
, NULL
, 'i' },
809 { "net", optional_argument
, NULL
, 'n' },
810 { "pid", optional_argument
, NULL
, 'p' },
811 { "user", optional_argument
, NULL
, 'U' },
812 { "cgroup", optional_argument
, NULL
, 'C' },
813 { "time", optional_argument
, NULL
, 'T' },
815 { "fork", no_argument
, NULL
, 'f' },
816 { "kill-child", optional_argument
, NULL
, OPT_KILLCHILD
},
817 { "mount-proc", optional_argument
, NULL
, OPT_MOUNTPROC
},
818 { "map-user", required_argument
, NULL
, OPT_MAPUSER
},
819 { "map-users", required_argument
, NULL
, OPT_MAPUSERS
},
820 { "map-group", required_argument
, NULL
, OPT_MAPGROUP
},
821 { "map-groups", required_argument
, NULL
, OPT_MAPGROUPS
},
822 { "map-root-user", no_argument
, NULL
, 'r' },
823 { "map-current-user", no_argument
, NULL
, 'c' },
824 { "map-auto", no_argument
, NULL
, OPT_MAPAUTO
},
825 { "propagation", required_argument
, NULL
, OPT_PROPAGATION
},
826 { "setgroups", required_argument
, NULL
, OPT_SETGROUPS
},
827 { "keep-caps", no_argument
, NULL
, OPT_KEEPCAPS
},
828 { "setuid", required_argument
, NULL
, 'S' },
829 { "setgid", required_argument
, NULL
, 'G' },
830 { "root", required_argument
, NULL
, 'R' },
831 { "wd", required_argument
, NULL
, 'w' },
832 { "monotonic", required_argument
, NULL
, OPT_MONOTONIC
},
833 { "boottime", required_argument
, NULL
, OPT_BOOTTIME
},
837 int setgrpcmd
= SETGROUPS_NONE
;
838 int unshare_flags
= 0;
842 struct map_range
*usermap
= NULL
;
843 struct map_range
*groupmap
= NULL
;
844 int kill_child_signo
= 0; /* 0 means --kill-child was not used */
845 const char *procmnt
= NULL
;
846 const char *newroot
= NULL
;
847 const char *newdir
= NULL
;
848 pid_t pid_bind
= 0, pid_idmap
= 0;
851 int fd_parent_pid
= -1;
853 int fd_idmap
, fd_bind
= -1;
854 sigset_t sigset
, oldsigset
;
856 unsigned long propagation
= UNSHARE_PROPAGATION_DEFAULT
;
857 int force_uid
= 0, force_gid
= 0;
858 uid_t uid
= 0, real_euid
= geteuid();
859 gid_t gid
= 0, real_egid
= getegid();
861 int64_t monotonic
= 0;
862 int64_t boottime
= 0;
863 int force_monotonic
= 0;
864 int force_boottime
= 0;
866 setlocale(LC_ALL
, "");
867 bindtextdomain(PACKAGE
, LOCALEDIR
);
869 close_stdout_atexit();
871 while ((c
= getopt_long(argc
, argv
, "+fhVmuinpCTUrR:w:S:G:c", longopts
, NULL
)) != -1) {
877 unshare_flags
|= CLONE_NEWNS
;
879 set_ns_target(CLONE_NEWNS
, optarg
);
882 unshare_flags
|= CLONE_NEWUTS
;
884 set_ns_target(CLONE_NEWUTS
, optarg
);
887 unshare_flags
|= CLONE_NEWIPC
;
889 set_ns_target(CLONE_NEWIPC
, optarg
);
892 unshare_flags
|= CLONE_NEWNET
;
894 set_ns_target(CLONE_NEWNET
, optarg
);
897 unshare_flags
|= CLONE_NEWPID
;
899 set_ns_target(CLONE_NEWPID
, optarg
);
902 unshare_flags
|= CLONE_NEWUSER
;
904 set_ns_target(CLONE_NEWUSER
, optarg
);
907 unshare_flags
|= CLONE_NEWCGROUP
;
909 set_ns_target(CLONE_NEWCGROUP
, optarg
);
912 unshare_flags
|= CLONE_NEWTIME
;
914 set_ns_target(CLONE_NEWTIME
, optarg
);
917 unshare_flags
|= CLONE_NEWNS
;
918 procmnt
= optarg
? optarg
: "/proc";
921 unshare_flags
|= CLONE_NEWUSER
;
922 mapuser
= get_user(optarg
, _("failed to parse uid"));
925 unshare_flags
|= CLONE_NEWUSER
;
926 mapgroup
= get_group(optarg
, _("failed to parse gid"));
929 unshare_flags
|= CLONE_NEWUSER
;
934 unshare_flags
|= CLONE_NEWUSER
;
936 mapgroup
= real_egid
;
939 unshare_flags
|= CLONE_NEWUSER
;
940 if (!strcmp(optarg
, "auto"))
941 insert_map_range(&usermap
,
942 read_subid_range(_PATH_SUBUID
, real_euid
));
943 else if (!strcmp(optarg
, "all"))
944 read_kernel_map(&usermap
, _PATH_PROC_UIDMAP
);
946 insert_map_range(&usermap
, get_map_range(optarg
));
949 unshare_flags
|= CLONE_NEWUSER
;
950 if (!strcmp(optarg
, "auto"))
951 insert_map_range(&groupmap
,
952 read_subid_range(_PATH_SUBGID
, real_euid
));
953 else if (!strcmp(optarg
, "all"))
954 read_kernel_map(&groupmap
, _PATH_PROC_GIDMAP
);
956 insert_map_range(&groupmap
, get_map_range(optarg
));
959 unshare_flags
|= CLONE_NEWUSER
;
960 insert_map_range(&usermap
, read_subid_range(_PATH_SUBUID
, real_euid
));
961 insert_map_range(&groupmap
, read_subid_range(_PATH_SUBGID
, real_euid
));
964 setgrpcmd
= setgroups_str2id(optarg
);
966 case OPT_PROPAGATION
:
967 propagation
= parse_propagation(optarg
);
972 if ((kill_child_signo
= signame_to_signum(optarg
)) < 0)
973 errx(EXIT_FAILURE
, _("unknown signal: %s"),
976 kill_child_signo
= SIGKILL
;
981 cap_last_cap(); /* Force last cap to be cached before we fork. */
984 uid
= strtoul_or_err(optarg
, _("failed to parse uid"));
988 gid
= strtoul_or_err(optarg
, _("failed to parse gid"));
998 monotonic
= strtos64_or_err(optarg
, _("failed to parse monotonic offset"));
1002 boottime
= strtos64_or_err(optarg
, _("failed to parse boottime offset"));
1009 print_version(EXIT_SUCCESS
);
1011 errtryhelp(EXIT_FAILURE
);
1015 if ((force_monotonic
|| force_boottime
) && !(unshare_flags
& CLONE_NEWTIME
))
1016 errx(EXIT_FAILURE
, _("options --monotonic and --boottime require "
1017 "unsharing of a time namespace (-T)"));
1019 /* clear any inherited settings */
1020 signal(SIGCHLD
, SIG_DFL
);
1022 if (npersists
&& (unshare_flags
& CLONE_NEWNS
))
1023 pid_bind
= bind_ns_files_from_child(&fd_bind
);
1025 if (usermap
|| groupmap
)
1026 pid_idmap
= map_ids_from_child(&fd_idmap
, mapuser
, usermap
,
1027 mapgroup
, groupmap
);
1029 if (-1 == unshare(unshare_flags
))
1030 err(EXIT_FAILURE
, _("unshare failed"));
1032 /* Tell child we've called unshare() */
1033 if (usermap
|| groupmap
)
1034 sync_with_child(pid_idmap
, fd_idmap
);
1037 settime(boottime
, CLOCK_BOOTTIME
);
1039 if (force_monotonic
)
1040 settime(monotonic
, CLOCK_MONOTONIC
);
1043 if (sigemptyset(&sigset
) != 0 ||
1044 sigaddset(&sigset
, SIGINT
) != 0 ||
1045 sigaddset(&sigset
, SIGTERM
) != 0 ||
1046 sigprocmask(SIG_BLOCK
, &sigset
, &oldsigset
) != 0)
1047 err(EXIT_FAILURE
, _("sigprocmask block failed"));
1048 #ifdef UL_HAVE_PIDFD
1049 if (kill_child_signo
!= 0) {
1050 /* make a connection to the original process (parent) */
1051 fd_parent_pid
= pidfd_open(getpid(), 0);
1052 if (0 > fd_parent_pid
)
1053 err(EXIT_FAILURE
, _("pidfd_open failed"));
1056 /* force child forking before mountspace binding so
1057 * pid_for_children is populated */
1062 err(EXIT_FAILURE
, _("fork failed"));
1064 if (sigprocmask(SIG_SETMASK
, &oldsigset
, NULL
))
1066 _("sigprocmask restore failed"));
1067 if (npersists
&& (unshare_flags
& CLONE_NEWNS
))
1070 default: /* parent */
1075 if (npersists
&& (pid
|| !forkit
)) {
1077 if (pid_bind
&& (unshare_flags
& CLONE_NEWNS
))
1078 sync_with_child(pid_bind
, fd_bind
);
1080 /* simple way, just bind */
1081 bind_ns_files(getpid());
1085 if (waitpid(pid
, &status
, 0) == -1)
1086 err(EXIT_FAILURE
, _("waitpid failed"));
1088 if (WIFEXITED(status
))
1089 return WEXITSTATUS(status
);
1090 if (WIFSIGNALED(status
)) {
1092 /* Ensure the signal that terminated the child will
1093 * also terminate the parent. */
1095 int termsig
= WTERMSIG(status
);
1097 if (termsig
!= SIGKILL
&& signal(termsig
, SIG_DFL
) == SIG_ERR
)
1099 _("signal handler reset failed"));
1100 if (sigemptyset(&sigset
) != 0 ||
1101 sigaddset(&sigset
, termsig
) != 0 ||
1102 sigprocmask(SIG_UNBLOCK
, &sigset
, NULL
) != 0)
1104 _("sigprocmask unblock failed"));
1106 kill(getpid(), termsig
);
1108 err(EXIT_FAILURE
, _("child exit failed"));
1111 if (kill_child_signo
!= 0) {
1112 if (prctl(PR_SET_PDEATHSIG
, kill_child_signo
) < 0)
1113 err(EXIT_FAILURE
, "prctl failed");
1114 #ifdef UL_HAVE_PIDFD
1115 /* Use poll() to check that there is still the original parent. */
1116 if (fd_parent_pid
!= -1) {
1117 struct pollfd pollfds
[1] = {
1118 { .fd
= fd_parent_pid
, .events
= POLLIN
}
1120 int nfds
= poll(pollfds
, 1, 0);
1123 err(EXIT_FAILURE
, "poll parent pidfd failed");
1125 /* If the child was re-parented before prctl(2) was called, the
1126 * new parent will likely not be interested in the precise exit
1127 * status of the orphan.
1132 close(fd_parent_pid
);
1138 if (mapuser
!= (uid_t
) -1 && !usermap
)
1139 map_id(_PATH_PROC_UIDMAP
, mapuser
, real_euid
);
1141 /* Since Linux 3.19 unprivileged writing of /proc/self/gid_map
1142 * has been disabled unless /proc/self/setgroups is written
1143 * first to permanently disable the ability to call setgroups
1144 * in that user namespace. */
1145 if (mapgroup
!= (gid_t
) -1 && !groupmap
) {
1146 if (setgrpcmd
== SETGROUPS_ALLOW
)
1147 errx(EXIT_FAILURE
, _("options --setgroups=allow and "
1148 "--map-group are mutually exclusive"));
1149 setgroups_control(SETGROUPS_DENY
);
1150 map_id(_PATH_PROC_GIDMAP
, mapgroup
, real_egid
);
1153 if (setgrpcmd
!= SETGROUPS_NONE
)
1154 setgroups_control(setgrpcmd
);
1156 if ((unshare_flags
& CLONE_NEWNS
) && propagation
)
1157 set_propagation(propagation
);
1160 if (chroot(newroot
) != 0)
1162 _("cannot change root directory to '%s'"), newroot
);
1163 newdir
= newdir
?: "/";
1165 if (newdir
&& chdir(newdir
))
1166 err(EXIT_FAILURE
, _("cannot chdir to '%s'"), newdir
);
1169 /* When not changing root and using the default propagation flags
1170 then the recursive propagation change of root will
1171 automatically change that of an existing proc mount. */
1172 if (!newroot
&& propagation
!= (MS_PRIVATE
|MS_REC
)) {
1173 int rc
= mount("none", procmnt
, NULL
, MS_PRIVATE
|MS_REC
, NULL
);
1175 /* Custom procmnt means that proc is very likely not mounted, causing EINVAL.
1176 Ignoring the error in this specific instance is considered safe. */
1177 if(rc
!= 0 && errno
!= EINVAL
)
1178 err(EXIT_FAILURE
, _("cannot change %s filesystem propagation"), procmnt
);
1181 if (mount("proc", procmnt
, "proc", MS_NOSUID
|MS_NOEXEC
|MS_NODEV
, NULL
) != 0)
1182 err(EXIT_FAILURE
, _("mount %s failed"), procmnt
);
1186 if (setgroups(0, NULL
) != 0) /* drop supplementary groups */
1187 err(EXIT_FAILURE
, _("setgroups failed"));
1188 if (setgid(gid
) < 0) /* change GID */
1189 err(EXIT_FAILURE
, _("setgid failed"));
1191 if (force_uid
&& setuid(uid
) < 0) /* change UID */
1192 err(EXIT_FAILURE
, _("setuid failed"));
1194 if (keepcaps
&& (unshare_flags
& CLONE_NEWUSER
))
1195 cap_permitted_to_ambient();
1197 if (optind
< argc
) {
1198 execvp(argv
[optind
], argv
+ optind
);
1199 errexec(argv
[optind
]);