2 * unshare(1) - command-line interface for unshare(2)
4 * Copyright (C) 2009 Mikhail Gusarov <dottedmag@dottedmag.net>
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License as published by the
8 * Free Software Foundation; either version 2, or (at your option) any
11 * This program is distributed in the hope that it will be useful, but
12 * WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * General Public License for more details.
16 * You should have received a copy of the GNU General Public License along
17 * with this program; if not, write to the Free Software Foundation, Inc.,
18 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
27 #include <sys/eventfd.h>
29 #include <sys/mount.h>
30 #include <sys/types.h>
32 #include <sys/prctl.h>
35 /* we only need some defines missing in sys/mount.h, no libmount linkage */
41 #include "closestream.h"
42 #include "namespace.h"
43 #include "exec_shell.h"
45 #include "pathnames.h"
51 /* synchronize parent and child by pipe */
52 #define PIPE_SYNC_BYTE 0x06
54 /* 'private' is kernel default */
55 #define UNSHARE_PROPAGATION_DEFAULT (MS_REC | MS_PRIVATE)
57 /* /proc namespace files and mountpoints for binds */
58 static struct namespace_file
{
59 int type
; /* CLONE_NEW* */
60 const char *name
; /* ns/<type> */
61 const char *target
; /* user specified target for bind mount */
62 } namespace_files
[] = {
63 { .type
= CLONE_NEWUSER
, .name
= "ns/user" },
64 { .type
= CLONE_NEWCGROUP
,.name
= "ns/cgroup" },
65 { .type
= CLONE_NEWIPC
, .name
= "ns/ipc" },
66 { .type
= CLONE_NEWUTS
, .name
= "ns/uts" },
67 { .type
= CLONE_NEWNET
, .name
= "ns/net" },
68 { .type
= CLONE_NEWPID
, .name
= "ns/pid_for_children" },
69 { .type
= CLONE_NEWNS
, .name
= "ns/mnt" },
70 { .type
= CLONE_NEWTIME
, .name
= "ns/time_for_children" },
74 static int npersists
; /* number of persistent namespaces */
82 static const char *setgroups_strings
[] =
84 [SETGROUPS_DENY
] = "deny",
85 [SETGROUPS_ALLOW
] = "allow"
88 static int setgroups_str2id(const char *str
)
92 for (i
= 0; i
< ARRAY_SIZE(setgroups_strings
); i
++)
93 if (strcmp(str
, setgroups_strings
[i
]) == 0)
96 errx(EXIT_FAILURE
, _("unsupported --setgroups argument '%s'"), str
);
99 static void setgroups_control(int action
)
101 const char *file
= _PATH_PROC_SETGROUPS
;
105 if (action
< 0 || (size_t) action
>= ARRAY_SIZE(setgroups_strings
))
107 cmd
= setgroups_strings
[action
];
109 fd
= open(file
, O_WRONLY
);
113 err(EXIT_FAILURE
, _("cannot open %s"), file
);
116 if (write_all(fd
, cmd
, strlen(cmd
)))
117 err(EXIT_FAILURE
, _("write failed %s"), file
);
121 static void map_id(const char *file
, uint32_t from
, uint32_t to
)
126 fd
= open(file
, O_WRONLY
);
128 err(EXIT_FAILURE
, _("cannot open %s"), file
);
130 xasprintf(&buf
, "%u %u 1", from
, to
);
131 if (write_all(fd
, buf
, strlen(buf
)))
132 err(EXIT_FAILURE
, _("write failed %s"), file
);
137 static unsigned long parse_propagation(const char *str
)
140 static const struct prop_opts
{
144 { "slave", MS_REC
| MS_SLAVE
},
145 { "private", MS_REC
| MS_PRIVATE
},
146 { "shared", MS_REC
| MS_SHARED
},
150 for (i
= 0; i
< ARRAY_SIZE(opts
); i
++) {
151 if (strcmp(opts
[i
].name
, str
) == 0)
155 errx(EXIT_FAILURE
, _("unsupported propagation mode: %s"), str
);
158 static void set_propagation(unsigned long flags
)
163 if (mount("none", "/", NULL
, flags
, NULL
) != 0)
164 err(EXIT_FAILURE
, _("cannot change root filesystem propagation"));
168 static int set_ns_target(int type
, const char *path
)
170 struct namespace_file
*ns
;
172 for (ns
= namespace_files
; ns
->name
; ns
++) {
173 if (ns
->type
!= type
)
183 static int bind_ns_files(pid_t pid
)
185 struct namespace_file
*ns
;
188 for (ns
= namespace_files
; ns
->name
; ns
++) {
192 snprintf(src
, sizeof(src
), "/proc/%u/%s", (unsigned) pid
, ns
->name
);
194 if (mount(src
, ns
->target
, NULL
, MS_BIND
, NULL
) != 0)
195 err(EXIT_FAILURE
, _("mount %s on %s failed"), src
, ns
->target
);
201 static ino_t
get_mnt_ino(pid_t pid
)
206 snprintf(path
, sizeof(path
), "/proc/%u/ns/mnt", (unsigned) pid
);
208 if (stat(path
, &st
) != 0)
209 err(EXIT_FAILURE
, _("stat of %s failed"), path
);
213 static void settime(time_t offset
, clockid_t clk_id
)
215 char buf
[sizeof(stringify_value(ULONG_MAX
)) * 3];
218 len
= snprintf(buf
, sizeof(buf
), "%d %ld 0", clk_id
, offset
);
220 fd
= open("/proc/self/timens_offsets", O_WRONLY
);
222 err(EXIT_FAILURE
, _("failed to open /proc/self/timens_offsets"));
224 if (write(fd
, buf
, len
) != len
)
225 err(EXIT_FAILURE
, _("failed to write to /proc/self/timens_offsets"));
231 * waitchild() - Wait for a process to exit successfully
232 * @pid: PID of the process to wait for
234 * Wait for a process to exit successfully. If it exits with a non-zero return
235 * code, then exit() with the same status.
237 static void waitchild(int pid
)
242 rc
= waitpid(pid
, &status
, 0);
246 err(EXIT_FAILURE
, _("waitpid failed"));
248 if (WIFEXITED(status
) &&
249 WEXITSTATUS(status
) != EXIT_SUCCESS
)
250 exit(WEXITSTATUS(status
));
255 * sync_with_child() - Tell our child we're ready and wait for it to exit
256 * @pid: The pid of our child
257 * @fd: A file descriptor created with eventfd()
259 * This tells a child created with fork_and_wait() that we are ready for it to
260 * continue. Once we have done that, wait for our child to exit.
262 static void sync_with_child(pid_t pid
, int fd
)
264 uint64_t ch
= PIPE_SYNC_BYTE
;
266 write_all(fd
, &ch
, sizeof(ch
));
273 * fork_and_wait() - Fork and wait to be sync'd with
274 * @fd - A file descriptor created with eventfd() which should be passed to
277 * This creates an eventfd and forks. The parent process returns immediately,
278 * but the child waits for a %PIPE_SYNC_BYTE on the eventfd before returning.
279 * This allows the parent to perform some tasks before the child starts its
280 * work. The parent should call sync_with_child() once it is ready for the
283 * Return: The pid from fork()
285 static pid_t
fork_and_wait(int *fd
)
292 err(EXIT_FAILURE
, _("eventfd failed"));
296 err(EXIT_FAILURE
, _("fork failed"));
299 /* wait for our parent to tell us to continue */
300 if (read_all(*fd
, (char *)&ch
, sizeof(ch
)) != sizeof(ch
) ||
301 ch
!= PIPE_SYNC_BYTE
)
302 err(EXIT_FAILURE
, _("failed to read eventfd"));
309 static pid_t
bind_ns_files_from_child(int *fd
)
311 pid_t child
, ppid
= getpid();
312 ino_t ino
= get_mnt_ino(ppid
);
314 child
= fork_and_wait(fd
);
318 if (get_mnt_ino(ppid
) == ino
)
324 static uid_t
get_user(const char *s
, const char *err
)
330 pw
= xgetpwnam(s
, &buf
);
336 ret
= strtoul_or_err(s
, err
);
342 static gid_t
get_group(const char *s
, const char *err
)
348 gr
= xgetgrnam(s
, &buf
);
354 ret
= strtoul_or_err(s
, err
);
361 * struct map_range - A range of IDs to map
362 * @outer: First ID mapped on the outside of the namespace
363 * @inner: First ID mapped on the inside of the namespace
364 * @count: Length of the inside and outside ranges
366 * A range of uids/gids to map using new[gu]idmap.
374 #define UID_BUFSIZ sizeof(stringify_value(ULONG_MAX))
377 * uint_to_id() - Convert a string into a user/group ID
378 * @name: The string representation of the ID
379 * @sz: The length of @name, without an (optional) nul-terminator
381 * This converts a (possibly not nul-terminated) string into a user or group ID.
382 * No name lookup is performed.
384 * Return: @name as a numeric ID
386 static int uint_to_id(const char *name
, size_t sz
)
388 char buf
[UID_BUFSIZ
];
390 mem2strcpy(buf
, name
, sz
, sizeof(buf
));
391 return strtoul_or_err(name
, _("could not parse ID"));
395 * get_map_range() - Parse a mapping range from a string
396 * @s: A string of the format outer,inner,count
398 * Parse a string of the form outer,inner,count into a new mapping range.
400 * Return: A new &struct map_range
402 static struct map_range
*get_map_range(const char *s
)
405 struct map_range
*ret
;
407 n
= string_to_idarray(s
, map
, ARRAY_SIZE(map
), uint_to_id
);
409 errx(EXIT_FAILURE
, _("too many elements for mapping '%s'"), s
);
410 if (n
!= ARRAY_SIZE(map
))
411 errx(EXIT_FAILURE
, _("mapping '%s' contains only %d elements"),
414 ret
= xmalloc(sizeof(*ret
));
422 * read_subid_range() - Look up a user's sub[gu]id range
423 * @filename: The file to look up the range from. This should be either
424 * ``/etc/subuid`` or ``/etc/subgid``.
425 * @uid: The uid of the user whose range we should look up.
427 * This finds the first subid range matching @uid in @filename.
429 static struct map_range
*read_subid_range(char *filename
, uid_t uid
)
431 char *line
= NULL
, *pwbuf
;
435 struct map_range
*map
;
437 map
= xmalloc(sizeof(*map
));
440 pw
= xgetpwuid(uid
, &pwbuf
);
442 errx(EXIT_FAILURE
, _("you (user %d) don't exist."), uid
);
444 idmap
= fopen(filename
, "r");
446 err(EXIT_FAILURE
, _("could not open '%s'"), filename
);
449 * Each line in sub[ug]idmap looks like
450 * username:subuid:count
454 while (getline(&line
, &n
, idmap
) != -1) {
457 rest
= strchr(line
, ':');
462 if (strcmp(line
, pw
->pw_name
) &&
463 strtoul(line
, NULL
, 10) != pw
->pw_uid
)
467 rest
= strchr(s
, ':');
471 map
->outer
= strtoul_or_err(s
, _("failed to parse subid map"));
474 rest
= strchr(s
, '\n');
477 map
->count
= strtoul_or_err(s
, _("failed to parse subid map"));
486 err(EXIT_FAILURE
, _("no line matching user \"%s\" in %s"),
487 pw
->pw_name
, filename
);
491 * map_ids() - Create a new uid/gid map
492 * @idmapper: Either newuidmap or newgidmap
493 * @ppid: Pid to set the map for
494 * @outer: ID outside the namespace for a single map.
495 * @inner: ID inside the namespace for a single map. May be -1 to only use @map.
496 * @map: A range of IDs to map
498 * This creates a new uid/gid map for @ppid using @idmapper. The ID @outer in
499 * the parent (our) namespace is mapped to the ID @inner in the child (@ppid's)
500 * namespace. In addition, the range of IDs beginning at @map->outer is mapped
501 * to the range of IDs beginning at @map->inner. The tricky bit is that we
502 * cannot let these mappings overlap. We accomplish this by removing a "hole"
503 * from @map, if @outer or @inner overlap it. This may result in one less than
504 * @map->count IDs being mapped from @map. The unmapped IDs are always the
505 * topmost IDs of the mapping (either in the parent or the child namespace).
507 * Most of the time, this function will be called with @map->outer as some
508 * large ID, @map->inner as 0, and @map->count as a large number (at least
509 * 1000, but less than @map->outer). Typically, there will be no conflict with
510 * @outer. However, @inner may split the mapping for e.g. --map-current-user.
512 * This function always exec()s or errors out and does not return.
514 static void __attribute__((__noreturn__
))
515 map_ids(const char *idmapper
, int ppid
, unsigned int outer
, unsigned int inner
,
516 struct map_range
*map
)
518 /* idmapper + pid + 4 * map + NULL */
520 /* argv - idmapper - "1" - NULL */
521 char args
[12][UID_BUFSIZ
];
523 struct map_range lo
, mid
, hi
;
524 unsigned int inner_offset
, outer_offset
;
526 /* Some helper macros to reduce bookkeeping */
527 #define push_str(s) do { \
530 #define push_ul(x) do { \
531 snprintf(args[j], sizeof(args[j]), "%u", x); \
532 push_str(args[j++]); \
535 push_str(xstrdup(idmapper
));
537 if ((int)inner
== -1) {
539 * If we don't have a "single" mapping, then we can just use
547 execvp(idmapper
, argv
);
551 /* If the mappings overlap, remove an ID from map */
552 if ((outer
>= map
->outer
&& outer
<= map
->outer
+ map
->count
) ||
553 (inner
>= map
->inner
&& inner
<= map
->inner
+ map
->count
))
556 /* Determine where the splits between lo, mid, and hi will be */
557 outer_offset
= min(outer
> map
->outer
? outer
- map
->outer
: 0,
559 inner_offset
= min(inner
> map
->inner
? inner
- map
->inner
: 0,
563 * In the worst case, we need three mappings:
564 * From the bottom of map to either inner or outer
566 lo
.outer
= map
->outer
;
567 lo
.inner
= map
->inner
;
568 lo
.count
= min(inner_offset
, outer_offset
);
570 /* From the lower of inner or outer to the higher */
571 mid
.outer
= lo
.outer
+ lo
.count
;
572 mid
.outer
+= mid
.outer
== outer
;
573 mid
.inner
= lo
.inner
+ lo
.count
;
574 mid
.inner
+= mid
.inner
== inner
;
575 mid
.count
= abs_diff(outer_offset
, inner_offset
);
577 /* And from the higher of inner or outer to the end of the map */
578 hi
.outer
= mid
.outer
+ mid
.count
;
579 hi
.outer
+= hi
.outer
== outer
;
580 hi
.inner
= mid
.inner
+ mid
.count
;
581 hi
.inner
+= hi
.inner
== inner
;
582 hi
.count
= map
->count
- lo
.count
- mid
.count
;
587 /* new[gu]idmap doesn't like zero-length mappings, so skip them */
604 execvp(idmapper
, argv
);
609 * map_ids_from_child() - Set up a new uid/gid map
610 * @fd: The eventfd to wait on
611 * @mapuser: The user to map the current user to (or -1)
612 * @usermap: The range of UIDs to map (or %NULL)
613 * @mapgroup: The group to map the current group to (or -1)
614 * @groupmap: The range of GIDs to map (or %NULL)
616 * fork_and_wait() for our parent to call sync_with_child() on @fd. Upon
617 * receiving the go-ahead, use newuidmap and newgidmap to set the uid/gid map
618 * for our parent's PID.
620 * Return: The pid of the child.
622 static pid_t
map_ids_from_child(int *fd
, uid_t mapuser
,
623 struct map_range
*usermap
, gid_t mapgroup
,
624 struct map_range
*groupmap
)
626 pid_t child
, pid
= 0;
627 pid_t ppid
= getpid();
629 child
= fork_and_wait(fd
);
633 /* Avoid forking more than we need to */
634 if (usermap
&& groupmap
) {
637 err(EXIT_FAILURE
, _("fork failed"));
643 map_ids("newuidmap", ppid
, geteuid(), mapuser
, usermap
);
645 map_ids("newgidmap", ppid
, getegid(), mapgroup
, groupmap
);
649 static void __attribute__((__noreturn__
)) usage(void)
653 fputs(USAGE_HEADER
, out
);
654 fprintf(out
, _(" %s [options] [<program> [<argument>...]]\n"),
655 program_invocation_short_name
);
657 fputs(USAGE_SEPARATOR
, out
);
658 fputs(_("Run a program with some namespaces unshared from the parent.\n"), out
);
660 fputs(USAGE_OPTIONS
, out
);
661 fputs(_(" -m, --mount[=<file>] unshare mounts namespace\n"), out
);
662 fputs(_(" -u, --uts[=<file>] unshare UTS namespace (hostname etc)\n"), out
);
663 fputs(_(" -i, --ipc[=<file>] unshare System V IPC namespace\n"), out
);
664 fputs(_(" -n, --net[=<file>] unshare network namespace\n"), out
);
665 fputs(_(" -p, --pid[=<file>] unshare pid namespace\n"), out
);
666 fputs(_(" -U, --user[=<file>] unshare user namespace\n"), out
);
667 fputs(_(" -C, --cgroup[=<file>] unshare cgroup namespace\n"), out
);
668 fputs(_(" -T, --time[=<file>] unshare time namespace\n"), out
);
669 fputs(USAGE_SEPARATOR
, out
);
670 fputs(_(" -f, --fork fork before launching <program>\n"), out
);
671 fputs(_(" --map-user=<uid>|<name> map current user to uid (implies --user)\n"), out
);
672 fputs(_(" --map-group=<gid>|<name> map current group to gid (implies --user)\n"), out
);
673 fputs(_(" -r, --map-root-user map current user to root (implies --user)\n"), out
);
674 fputs(_(" -c, --map-current-user map current user to itself (implies --user)\n"), out
);
675 fputs(_(" --map-auto map users and groups automatically (implies --user)\n"), out
);
676 fputs(_(" --map-users=<outeruid>,<inneruid>,<count>\n"
677 " map count users from outeruid to inneruid (implies --user)\n"), out
);
678 fputs(_(" --map-groups=<outergid>,<innergid>,<count>\n"
679 " map count groups from outergid to innergid (implies --user)\n"), out
);
680 fputs(USAGE_SEPARATOR
, out
);
681 fputs(_(" --kill-child[=<signame>] when dying, kill the forked child (implies --fork)\n"
682 " defaults to SIGKILL\n"), out
);
683 fputs(_(" --mount-proc[=<dir>] mount proc filesystem first (implies --mount)\n"), out
);
684 fputs(_(" --propagation slave|shared|private|unchanged\n"
685 " modify mount propagation in mount namespace\n"), out
);
686 fputs(_(" --setgroups allow|deny control the setgroups syscall in user namespaces\n"), out
);
687 fputs(_(" --keep-caps retain capabilities granted in user namespaces\n"), out
);
688 fputs(USAGE_SEPARATOR
, out
);
689 fputs(_(" -R, --root=<dir> run the command with root directory set to <dir>\n"), out
);
690 fputs(_(" -w, --wd=<dir> change working directory to <dir>\n"), out
);
691 fputs(_(" -S, --setuid <uid> set uid in entered namespace\n"), out
);
692 fputs(_(" -G, --setgid <gid> set gid in entered namespace\n"), out
);
693 fputs(_(" --monotonic <offset> set clock monotonic offset (seconds) in time namespaces\n"), out
);
694 fputs(_(" --boottime <offset> set clock boottime offset (seconds) in time namespaces\n"), out
);
696 fputs(USAGE_SEPARATOR
, out
);
697 printf(USAGE_HELP_OPTIONS(27));
698 printf(USAGE_MAN_TAIL("unshare(1)"));
703 int main(int argc
, char *argv
[])
706 OPT_MOUNTPROC
= CHAR_MAX
+ 1,
719 static const struct option longopts
[] = {
720 { "help", no_argument
, NULL
, 'h' },
721 { "version", no_argument
, NULL
, 'V' },
723 { "mount", optional_argument
, NULL
, 'm' },
724 { "uts", optional_argument
, NULL
, 'u' },
725 { "ipc", optional_argument
, NULL
, 'i' },
726 { "net", optional_argument
, NULL
, 'n' },
727 { "pid", optional_argument
, NULL
, 'p' },
728 { "user", optional_argument
, NULL
, 'U' },
729 { "cgroup", optional_argument
, NULL
, 'C' },
730 { "time", optional_argument
, NULL
, 'T' },
732 { "fork", no_argument
, NULL
, 'f' },
733 { "kill-child", optional_argument
, NULL
, OPT_KILLCHILD
},
734 { "mount-proc", optional_argument
, NULL
, OPT_MOUNTPROC
},
735 { "map-user", required_argument
, NULL
, OPT_MAPUSER
},
736 { "map-users", required_argument
, NULL
, OPT_MAPUSERS
},
737 { "map-group", required_argument
, NULL
, OPT_MAPGROUP
},
738 { "map-groups", required_argument
, NULL
, OPT_MAPGROUPS
},
739 { "map-root-user", no_argument
, NULL
, 'r' },
740 { "map-current-user", no_argument
, NULL
, 'c' },
741 { "map-auto", no_argument
, NULL
, OPT_MAPAUTO
},
742 { "propagation", required_argument
, NULL
, OPT_PROPAGATION
},
743 { "setgroups", required_argument
, NULL
, OPT_SETGROUPS
},
744 { "keep-caps", no_argument
, NULL
, OPT_KEEPCAPS
},
745 { "setuid", required_argument
, NULL
, 'S' },
746 { "setgid", required_argument
, NULL
, 'G' },
747 { "root", required_argument
, NULL
, 'R' },
748 { "wd", required_argument
, NULL
, 'w' },
749 { "monotonic", required_argument
, NULL
, OPT_MONOTONIC
},
750 { "boottime", required_argument
, NULL
, OPT_BOOTTIME
},
754 int setgrpcmd
= SETGROUPS_NONE
;
755 int unshare_flags
= 0;
759 struct map_range
*usermap
= NULL
;
760 struct map_range
*groupmap
= NULL
;
761 int kill_child_signo
= 0; /* 0 means --kill-child was not used */
762 const char *procmnt
= NULL
;
763 const char *newroot
= NULL
;
764 const char *newdir
= NULL
;
765 pid_t pid_bind
= 0, pid_idmap
= 0;
766 pid_t pid
= 0, pid_parent
= 0;
767 int fd_idmap
, fd_bind
= -1;
768 sigset_t sigset
, oldsigset
;
770 unsigned long propagation
= UNSHARE_PROPAGATION_DEFAULT
;
771 int force_uid
= 0, force_gid
= 0;
772 uid_t uid
= 0, real_euid
= geteuid();
773 gid_t gid
= 0, real_egid
= getegid();
775 time_t monotonic
= 0;
777 int force_monotonic
= 0;
778 int force_boottime
= 0;
780 setlocale(LC_ALL
, "");
781 bindtextdomain(PACKAGE
, LOCALEDIR
);
783 close_stdout_atexit();
785 while ((c
= getopt_long(argc
, argv
, "+fhVmuinpCTUrR:w:S:G:c", longopts
, NULL
)) != -1) {
791 unshare_flags
|= CLONE_NEWNS
;
793 set_ns_target(CLONE_NEWNS
, optarg
);
796 unshare_flags
|= CLONE_NEWUTS
;
798 set_ns_target(CLONE_NEWUTS
, optarg
);
801 unshare_flags
|= CLONE_NEWIPC
;
803 set_ns_target(CLONE_NEWIPC
, optarg
);
806 unshare_flags
|= CLONE_NEWNET
;
808 set_ns_target(CLONE_NEWNET
, optarg
);
811 unshare_flags
|= CLONE_NEWPID
;
813 set_ns_target(CLONE_NEWPID
, optarg
);
816 unshare_flags
|= CLONE_NEWUSER
;
818 set_ns_target(CLONE_NEWUSER
, optarg
);
821 unshare_flags
|= CLONE_NEWCGROUP
;
823 set_ns_target(CLONE_NEWCGROUP
, optarg
);
826 unshare_flags
|= CLONE_NEWTIME
;
828 set_ns_target(CLONE_NEWTIME
, optarg
);
831 unshare_flags
|= CLONE_NEWNS
;
832 procmnt
= optarg
? optarg
: "/proc";
835 unshare_flags
|= CLONE_NEWUSER
;
836 mapuser
= get_user(optarg
, _("failed to parse uid"));
839 unshare_flags
|= CLONE_NEWUSER
;
840 mapgroup
= get_group(optarg
, _("failed to parse gid"));
843 unshare_flags
|= CLONE_NEWUSER
;
848 unshare_flags
|= CLONE_NEWUSER
;
850 mapgroup
= real_egid
;
853 unshare_flags
|= CLONE_NEWUSER
;
854 if (!strcmp(optarg
, "auto"))
855 usermap
= read_subid_range(_PATH_SUBUID
, real_euid
);
857 usermap
= get_map_range(optarg
);
860 unshare_flags
|= CLONE_NEWUSER
;
861 if (!strcmp(optarg
, "auto"))
862 groupmap
= read_subid_range(_PATH_SUBGID
, real_egid
);
864 groupmap
= get_map_range(optarg
);
867 unshare_flags
|= CLONE_NEWUSER
;
868 usermap
= read_subid_range(_PATH_SUBUID
, real_euid
);
869 groupmap
= read_subid_range(_PATH_SUBGID
, real_egid
);
872 setgrpcmd
= setgroups_str2id(optarg
);
874 case OPT_PROPAGATION
:
875 propagation
= parse_propagation(optarg
);
880 if ((kill_child_signo
= signame_to_signum(optarg
)) < 0)
881 errx(EXIT_FAILURE
, _("unknown signal: %s"),
884 kill_child_signo
= SIGKILL
;
889 cap_last_cap(); /* Force last cap to be cached before we fork. */
892 uid
= strtoul_or_err(optarg
, _("failed to parse uid"));
896 gid
= strtoul_or_err(optarg
, _("failed to parse gid"));
906 monotonic
= strtoul_or_err(optarg
, _("failed to parse monotonic offset"));
910 boottime
= strtoul_or_err(optarg
, _("failed to parse boottime offset"));
917 print_version(EXIT_SUCCESS
);
919 errtryhelp(EXIT_FAILURE
);
923 if ((force_monotonic
|| force_boottime
) && !(unshare_flags
& CLONE_NEWTIME
))
924 errx(EXIT_FAILURE
, _("options --monotonic and --boottime require "
925 "unsharing of a time namespace (-t)"));
927 /* clear any inherited settings */
928 signal(SIGCHLD
, SIG_DFL
);
930 if (npersists
&& (unshare_flags
& CLONE_NEWNS
))
931 pid_bind
= bind_ns_files_from_child(&fd_bind
);
933 if (usermap
|| groupmap
)
934 pid_idmap
= map_ids_from_child(&fd_idmap
, mapuser
, usermap
,
937 if (-1 == unshare(unshare_flags
))
938 err(EXIT_FAILURE
, _("unshare failed"));
940 /* Tell child we've called unshare() */
941 if (usermap
|| groupmap
)
942 sync_with_child(pid_idmap
, fd_idmap
);
945 settime(boottime
, CLOCK_BOOTTIME
);
948 settime(monotonic
, CLOCK_MONOTONIC
);
951 if (sigemptyset(&sigset
) != 0 ||
952 sigaddset(&sigset
, SIGINT
) != 0 ||
953 sigaddset(&sigset
, SIGTERM
) != 0 ||
954 sigprocmask(SIG_BLOCK
, &sigset
, &oldsigset
) != 0)
955 err(EXIT_FAILURE
, _("sigprocmask block failed"));
957 /* force child forking before mountspace binding
958 * so pid_for_children is populated */
959 pid_parent
= getpid();
964 err(EXIT_FAILURE
, _("fork failed"));
966 if (sigprocmask(SIG_SETMASK
, &oldsigset
, NULL
))
968 _("sigprocmask restore failed"));
969 if (npersists
&& (unshare_flags
& CLONE_NEWNS
))
972 default: /* parent */
977 if (npersists
&& (pid
|| !forkit
)) {
979 if (pid_bind
&& (unshare_flags
& CLONE_NEWNS
))
980 sync_with_child(pid_bind
, fd_bind
);
982 /* simple way, just bind */
983 bind_ns_files(getpid());
987 if (waitpid(pid
, &status
, 0) == -1)
988 err(EXIT_FAILURE
, _("waitpid failed"));
990 if (WIFEXITED(status
))
991 return WEXITSTATUS(status
);
992 if (WIFSIGNALED(status
)) {
994 /* Ensure the signal that terminated the child will
995 * also terminate the parent. */
997 int termsig
= WTERMSIG(status
);
999 if (signal(termsig
, SIG_DFL
) == SIG_ERR
||
1000 sigemptyset(&sigset
) != 0 ||
1001 sigaddset(&sigset
, termsig
) != 0 ||
1002 sigprocmask(SIG_UNBLOCK
, &sigset
, NULL
) != 0)
1004 _("sigprocmask unblock failed"));
1006 kill(getpid(), termsig
);
1008 err(EXIT_FAILURE
, _("child exit failed"));
1011 if (kill_child_signo
!= 0) {
1012 if (prctl(PR_SET_PDEATHSIG
, kill_child_signo
) < 0)
1013 err(EXIT_FAILURE
, "prctl failed");
1015 if (getppid() != pid_parent
) {
1016 if (kill(getpid(), kill_child_signo
) != 0)
1017 err(EXIT_FAILURE
, _("child kill failed"));
1018 /* The selected kill_child_signo may be blocked, or
1019 * might not cause termination. */
1023 if (mapuser
!= (uid_t
) -1 && !usermap
)
1024 map_id(_PATH_PROC_UIDMAP
, mapuser
, real_euid
);
1026 /* Since Linux 3.19 unprivileged writing of /proc/self/gid_map
1027 * has been disabled unless /proc/self/setgroups is written
1028 * first to permanently disable the ability to call setgroups
1029 * in that user namespace. */
1030 if (mapgroup
!= (gid_t
) -1 && !groupmap
) {
1031 if (setgrpcmd
== SETGROUPS_ALLOW
)
1032 errx(EXIT_FAILURE
, _("options --setgroups=allow and "
1033 "--map-group are mutually exclusive"));
1034 setgroups_control(SETGROUPS_DENY
);
1035 map_id(_PATH_PROC_GIDMAP
, mapgroup
, real_egid
);
1038 if (setgrpcmd
!= SETGROUPS_NONE
)
1039 setgroups_control(setgrpcmd
);
1041 if ((unshare_flags
& CLONE_NEWNS
) && propagation
)
1042 set_propagation(propagation
);
1045 if (chroot(newroot
) != 0)
1047 _("cannot change root directory to '%s'"), newroot
);
1048 newdir
= newdir
?: "/";
1050 if (newdir
&& chdir(newdir
))
1051 err(EXIT_FAILURE
, _("cannot chdir to '%s'"), newdir
);
1054 /* When not changing root and using the default propagation flags
1055 then the recursive propagation change of root will
1056 automatically change that of an existing proc mount. */
1057 if (!newroot
&& propagation
!= (MS_PRIVATE
|MS_REC
)) {
1058 int rc
= mount("none", procmnt
, NULL
, MS_PRIVATE
|MS_REC
, NULL
);
1060 /* Custom procmnt means that proc is very likely not mounted, causing EINVAL.
1061 Ignoring the error in this specific instance is considered safe. */
1062 if(rc
!= 0 && errno
!= EINVAL
)
1063 err(EXIT_FAILURE
, _("cannot change %s filesystem propagation"), procmnt
);
1066 if (mount("proc", procmnt
, "proc", MS_NOSUID
|MS_NOEXEC
|MS_NODEV
, NULL
) != 0)
1067 err(EXIT_FAILURE
, _("mount %s failed"), procmnt
);
1071 if (setgroups(0, NULL
) != 0) /* drop supplementary groups */
1072 err(EXIT_FAILURE
, _("setgroups failed"));
1073 if (setgid(gid
) < 0) /* change GID */
1074 err(EXIT_FAILURE
, _("setgid failed"));
1076 if (force_uid
&& setuid(uid
) < 0) /* change UID */
1077 err(EXIT_FAILURE
, _("setuid failed"));
1079 /* We use capabilities system calls to propagate the permitted
1080 * capabilities into the ambient set because we have already
1081 * forked so are in async-signal-safe context. */
1082 if (keepcaps
&& (unshare_flags
& CLONE_NEWUSER
)) {
1083 struct __user_cap_header_struct header
= {
1084 .version
= _LINUX_CAPABILITY_VERSION_3
,
1088 struct __user_cap_data_struct payload
[_LINUX_CAPABILITY_U32S_3
] = {{ 0 }};
1089 uint64_t effective
, cap
;
1091 if (capget(&header
, payload
) < 0)
1092 err(EXIT_FAILURE
, _("capget failed"));
1094 /* In order to make capabilities ambient, we first need to ensure
1095 * that they are all inheritable. */
1096 payload
[0].inheritable
= payload
[0].permitted
;
1097 payload
[1].inheritable
= payload
[1].permitted
;
1099 if (capset(&header
, payload
) < 0)
1100 err(EXIT_FAILURE
, _("capset failed"));
1102 effective
= ((uint64_t)payload
[1].effective
<< 32) | (uint64_t)payload
[0].effective
;
1104 for (cap
= 0; cap
< (sizeof(effective
) * 8); cap
++) {
1105 /* This is the same check as cap_valid(), but using
1106 * the runtime value for the last valid cap. */
1107 if (cap
> (uint64_t) cap_last_cap())
1110 if ((effective
& (1 << cap
))
1111 && prctl(PR_CAP_AMBIENT
, PR_CAP_AMBIENT_RAISE
, cap
, 0, 0) < 0)
1112 err(EXIT_FAILURE
, _("prctl(PR_CAP_AMBIENT) failed"));
1116 if (optind
< argc
) {
1117 execvp(argv
[optind
], argv
+ optind
);
1118 errexec(argv
[optind
]);