2 * unshare(1) - command-line interface for unshare(2)
4 * Copyright (C) 2009 Mikhail Gusarov <dottedmag@dottedmag.net>
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License as published by the
8 * Free Software Foundation; either version 2, or (at your option) any
11 * This program is distributed in the hope that it will be useful, but
12 * WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * General Public License for more details.
16 * You should have received a copy of the GNU General Public License along
17 * with this program; if not, write to the Free Software Foundation, Inc.,
18 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
28 #include <sys/mount.h>
29 #include <sys/types.h>
31 #include <sys/prctl.h>
34 /* we only need some defines missing in sys/mount.h, no libmount linkage */
40 #include "closestream.h"
41 #include "namespace.h"
42 #include "exec_shell.h"
44 #include "pathnames.h"
50 /* single byte the parent writes to a pipe and the child reads to synchronize the two */
51 #define PIPE_SYNC_BYTE 0x06
53 /* initial value of main()'s propagation setting; 'private' is kernel default */
54 #define UNSHARE_PROPAGATION_DEFAULT (MS_REC | MS_PRIVATE)
/*
 * Table of the namespaces handled by this tool. Each initializer pairs a
 * CLONE_NEW* flag with the path component appended to /proc/<pid>/ when a
 * namespace file is bind-mounted; .target is left unset by the initializers.
 * Loops over this table stop at the first entry whose .name is NULL.
 */
56 /* /proc namespace files and mountpoints for binds */
57 static struct namespace_file
{
58 int type
; /* CLONE_NEW* */
59 const char *name
; /* ns/<type> */
60 const char *target
; /* user specified target for bind mount */
61 } namespace_files
[] = {
62 { .type
= CLONE_NEWUSER
, .name
= "ns/user" },
63 { .type
= CLONE_NEWCGROUP
,.name
= "ns/cgroup" },
64 { .type
= CLONE_NEWIPC
, .name
= "ns/ipc" },
65 { .type
= CLONE_NEWUTS
, .name
= "ns/uts" },
66 { .type
= CLONE_NEWNET
, .name
= "ns/net" },
67 { .type
= CLONE_NEWPID
, .name
= "ns/pid" },
68 { .type
= CLONE_NEWNS
, .name
= "ns/mnt" },
69 { .type
= CLONE_NEWTIME
, .name
= "ns/time" },
/* main() only starts the bind helper child when this counter is non-zero. */
73 static int npersists
; /* number of persistent namespaces */
/*
 * Keyword for each SETGROUPS_* id. The same strings are matched against the
 * --setgroups argument and written to the setgroups proc file.
 */
81 static const char *setgroups_strings
[] =
83 [SETGROUPS_DENY
] = "deny",
84 [SETGROUPS_ALLOW
] = "allow"
/*
 * Look up @str in setgroups_strings[] using an exact strcmp() match.
 * When no entry matches, the program exits through errx() with the
 * "unsupported --setgroups argument" message.
 * NOTE(review): the statement executed on a match is not visible in this
 * view; presumably the matching index is returned -- confirm.
 */
87 static int setgroups_str2id(const char *str
)
91 for (i
= 0; i
< ARRAY_SIZE(setgroups_strings
); i
++)
92 if (strcmp(str
, setgroups_strings
[i
]) == 0)
95 errx(EXIT_FAILURE
, _("unsupported --setgroups argument '%s'"), str
);
/*
 * Write the keyword for @action (an index into setgroups_strings[]) to
 * _PATH_PROC_SETGROUPS. @action is range-checked against the table before it
 * is used as an index. Failures to open or write the file are fatal (err()).
 * NOTE(review): what happens for an out-of-range @action, and the handling of
 * a failed open(), are on lines not visible in this view.
 */
98 static void setgroups_control(int action
)
100 const char *file
= _PATH_PROC_SETGROUPS
;
/* reject ids that do not index the keyword table */
104 if (action
< 0 || (size_t) action
>= ARRAY_SIZE(setgroups_strings
))
106 cmd
= setgroups_strings
[action
];
108 fd
= open(file
, O_WRONLY
);
112 err(EXIT_FAILURE
, _("cannot open %s"), file
);
115 if (write_all(fd
, cmd
, strlen(cmd
)))
116 err(EXIT_FAILURE
, _("write failed %s"), file
);
/*
 * Write a one-entry ID mapping to @file: the text "<from> <to> 1" is
 * formatted with xasprintf() and written with write_all().
 * Failure to open or write @file is fatal (err()).
 */
120 static void map_id(const char *file
, uint32_t from
, uint32_t to
)
125 fd
= open(file
, O_WRONLY
);
127 err(EXIT_FAILURE
, _("cannot open %s"), file
);
/* the trailing "1" is the third field of the mapping line */
129 xasprintf(&buf
, "%u %u 1", from
, to
);
130 if (write_all(fd
, buf
, strlen(buf
)))
131 err(EXIT_FAILURE
, _("write failed %s"), file
);
/*
 * Translate a --propagation keyword into mount(2) flags by an exact strcmp()
 * match against a local table. The visible rows map "slave", "private" and
 * "shared" to MS_REC combined with the corresponding MS_* flag. An unknown
 * keyword is fatal (errx()).
 * NOTE(review): the usage text also advertises "unchanged"; no row for it is
 * visible in this view -- confirm it is present in the full table.
 */
136 static unsigned long parse_propagation(const char *str
)
139 static const struct prop_opts
{
143 { "slave", MS_REC
| MS_SLAVE
},
144 { "private", MS_REC
| MS_PRIVATE
},
145 { "shared", MS_REC
| MS_SHARED
},
149 for (i
= 0; i
< ARRAY_SIZE(opts
); i
++) {
150 if (strcmp(opts
[i
].name
, str
) == 0)
154 errx(EXIT_FAILURE
, _("unsupported propagation mode: %s"), str
);
/*
 * Apply @flags to the root mount "/" through mount(2) with source "none" and
 * no filesystem type or data. A failing mount() is fatal (err()).
 * NOTE(review): any early-return guard on @flags is not visible in this view.
 */
157 static void set_propagation(unsigned long flags
)
162 if (mount("none", "/", NULL
, flags
, NULL
) != 0)
163 err(EXIT_FAILURE
, _("cannot change root filesystem propagation"));
/*
 * Walk namespace_files[] (terminated by an entry with a NULL name) looking
 * for the entry whose .type equals @type.
 * NOTE(review): the statements run for the matching entry and the return
 * values are not visible in this view; presumably @path is stored as the
 * entry's bind-mount target -- confirm.
 */
167 static int set_ns_target(int type
, const char *path
)
169 struct namespace_file
*ns
;
171 for (ns
= namespace_files
; ns
->name
; ns
++) {
172 if (ns
->type
!= type
)
/*
 * For entries of namespace_files[], build the source path
 * "/proc/<pid>/<ns->name>" and bind-mount (MS_BIND) it onto ns->target.
 * A failing mount() is fatal (err()).
 * NOTE(review): any per-entry filter (e.g. skipping entries with no target)
 * and the return value are on lines not visible in this view.
 */
182 static int bind_ns_files(pid_t pid
)
184 struct namespace_file
*ns
;
187 for (ns
= namespace_files
; ns
->name
; ns
++) {
191 snprintf(src
, sizeof(src
), "/proc/%u/%s", (unsigned) pid
, ns
->name
);
193 if (mount(src
, ns
->target
, NULL
, MS_BIND
, NULL
) != 0)
194 err(EXIT_FAILURE
, _("mount %s on %s failed"), src
, ns
->target
);
/*
 * stat(2) the path "/proc/<pid>/ns/mnt" for @pid; a failing stat() is fatal
 * (err()).
 * NOTE(review): the return statement is not visible in this view; presumably
 * the inode number from the stat result is returned -- confirm.
 */
200 static ino_t
get_mnt_ino(pid_t pid
)
205 snprintf(path
, sizeof(path
), "/proc/%u/ns/mnt", (unsigned) pid
);
207 if (stat(path
, &st
) != 0)
208 err(EXIT_FAILURE
, _("cannot stat %s"), path
);
/*
 * Write the line "<clk_id> <offset> 0" to /proc/self/timens_offsets.
 * Failure to open the file, or a write() that does not transfer the whole
 * formatted line, is fatal (err()).
 * NOTE(review): the trailing 0 is presumably the nanoseconds field of the
 * offsets file -- confirm against time_namespaces(7).
 */
212 static void settime(time_t offset
, clockid_t clk_id
)
214 char buf
[sizeof(stringify_value(ULONG_MAX
)) * 3];
/* NOTE(review): "%d" / "%ld" assume clockid_t is int and time_t is long -- confirm for all targets */
217 len
= snprintf(buf
, sizeof(buf
), "%d %ld 0", clk_id
, offset
);
219 fd
= open("/proc/self/timens_offsets", O_WRONLY
);
221 err(EXIT_FAILURE
, _("failed to open /proc/self/timens_offsets"));
223 if (write(fd
, buf
, len
) != len
)
224 err(EXIT_FAILURE
, _("failed to write to /proc/self/timens_offsets"));
/*
 * Set up a helper process for binding namespace files. Before anything else
 * the caller's pid and the inode of its /proc/<pid>/ns/mnt are recorded.
 * Pipe or fork failures are fatal (err()). In the child branch, one byte is
 * read from fds[0] ("wait for parent") and the parent's mount-namespace inode
 * is then compared with the recorded one.
 * NOTE(review): the pipe()/fork() calls, the switch labels other than the
 * parent's, and the actions taken after the inode comparison are on lines not
 * visible in this view.
 */
229 static void bind_ns_files_from_child(pid_t
*child
, int fds
[2])
232 pid_t ppid
= getpid();
233 ino_t ino
= get_mnt_ino(ppid
);
236 err(EXIT_FAILURE
, _("pipe failed"));
242 err(EXIT_FAILURE
, _("fork failed"));
248 /* wait for parent */
/*
 * NOTE(review): with '&&' the error path is taken only when the read fails
 * AND ch differs from PIPE_SYNC_BYTE; after a failed read ch may not have
 * been written, and a successfully read wrong byte is accepted. '||' looks
 * like the intended operator -- confirm.
 */
249 if (read_all(fds
[0], &ch
, 1) != 1 && ch
!= PIPE_SYNC_BYTE
)
250 err(EXIT_FAILURE
, _("failed to read pipe"));
/* same inode as before means the parent's mount namespace has not changed */
251 if (get_mnt_ino(ppid
) == ino
)
257 default: /* parent */
/*
 * Resolve @s to a uid. The visible statements look @s up with xgetpwnam()
 * and parse @s as a number with strtoul_or_err(), passing @err as the
 * failure message.
 * NOTE(review): the conditions selecting between the two paths and the
 * return statement are not visible in this view. Also note that the
 * parameter named err hides the err() function inside this body.
 */
264 static uid_t
get_user(const char *s
, const char *err
)
270 pw
= xgetpwnam(s
, &buf
);
276 ret
= strtoul_or_err(s
, err
);
/*
 * Resolve @s to a gid. The visible statements look @s up with xgetgrnam()
 * and parse @s as a number with strtoul_or_err(), passing @err as the
 * failure message.
 * NOTE(review): the conditions selecting between the two paths and the
 * return statement are not visible in this view. Also note that the
 * parameter named err hides the err() function inside this body.
 */
282 static gid_t
get_group(const char *s
, const char *err
)
288 gr
= xgetgrnam(s
, &buf
);
294 ret
= strtoul_or_err(s
, err
);
/*
 * Print the help text: synopsis, one-line description, then the option list
 * grouped by USAGE_SEPARATOR (namespace selection, fork/ID mapping,
 * child/mount/capability controls, root/cwd/uid/gid/clock settings), followed
 * by the common help options and the man page pointer.
 * Declared noreturn.
 * NOTE(review): the declaration of `out` and the terminating call are on
 * lines not visible in this view.
 */
300 static void __attribute__((__noreturn__
)) usage(void)
304 fputs(USAGE_HEADER
, out
);
305 fprintf(out
, _(" %s [options] [<program> [<argument>...]]\n"),
306 program_invocation_short_name
);
308 fputs(USAGE_SEPARATOR
, out
);
309 fputs(_("Run a program with some namespaces unshared from the parent.\n"), out
);
311 fputs(USAGE_OPTIONS
, out
);
/* namespace selection; each takes an optional bind-mount target */
312 fputs(_(" -m, --mount[=<file>] unshare mounts namespace\n"), out
);
313 fputs(_(" -u, --uts[=<file>] unshare UTS namespace (hostname etc)\n"), out
);
314 fputs(_(" -i, --ipc[=<file>] unshare System V IPC namespace\n"), out
);
315 fputs(_(" -n, --net[=<file>] unshare network namespace\n"), out
);
316 fputs(_(" -p, --pid[=<file>] unshare pid namespace\n"), out
);
317 fputs(_(" -U, --user[=<file>] unshare user namespace\n"), out
);
318 fputs(_(" -C, --cgroup[=<file>] unshare cgroup namespace\n"), out
);
319 fputs(_(" -T, --time[=<file>] unshare time namespace\n"), out
);
320 fputs(USAGE_SEPARATOR
, out
);
/* fork and ID mapping */
321 fputs(_(" -f, --fork fork before launching <program>\n"), out
);
322 fputs(_(" --map-user=<uid>|<name> map current user to uid (implies --user)\n"), out
);
323 fputs(_(" --map-group=<gid>|<name> map current group to gid (implies --user)\n"), out
);
324 fputs(_(" -r, --map-root-user map current user to root (implies --user)\n"), out
);
325 fputs(_(" -c, --map-current-user map current user to itself (implies --user)\n"), out
);
326 fputs(USAGE_SEPARATOR
, out
);
327 fputs(_(" --kill-child[=<signame>] when dying, kill the forked child (implies --fork)\n"
328 " defaults to SIGKILL\n"), out
);
329 fputs(_(" --mount-proc[=<dir>] mount proc filesystem first (implies --mount)\n"), out
);
330 fputs(_(" --propagation slave|shared|private|unchanged\n"
331 " modify mount propagation in mount namespace\n"), out
);
332 fputs(_(" --setgroups allow|deny control the setgroups syscall in user namespaces\n"), out
);
333 fputs(_(" --keep-caps retain capabilities granted in user namespaces\n"), out
);
334 fputs(USAGE_SEPARATOR
, out
);
335 fputs(_(" -R, --root=<dir> run the command with root directory set to <dir>\n"), out
);
336 fputs(_(" -w, --wd=<dir> change working directory to <dir>\n"), out
);
337 fputs(_(" -S, --setuid <uid> set uid in entered namespace\n"), out
);
338 fputs(_(" -G, --setgid <gid> set gid in entered namespace\n"), out
);
339 fputs(_(" --monotonic <offset> set clock monotonic offset (seconds) in time namespaces\n"), out
);
340 fputs(_(" --boottime <offset> set clock boottime offset (seconds) in time namespaces\n"), out
);
342 fputs(USAGE_SEPARATOR
, out
);
/* these two go through printf() rather than `out` */
343 printf(USAGE_HELP_OPTIONS(27));
344 printf(USAGE_MAN_TAIL("unshare(1)"));
/*
 * Entry point. From the statements visible here, main():
 *  - parses options with getopt_long(), OR-ing CLONE_NEW* bits into
 *    unshare_flags and passing optional bind-mount targets to
 *    set_ns_target();
 *  - refuses --monotonic/--boottime unless CLONE_NEWTIME was requested;
 *  - calls unshare(unshare_flags), binds namespace files (through the helper
 *    started by bind_ns_files_from_child() or directly with
 *    bind_ns_files(getpid())) and writes clock offsets with settime();
 *  - in the parent of a fork, waits for the child and returns its exit
 *    status or sends itself the signal that terminated the child;
 *  - writes the uid/gid maps and the setgroups policy, applies mount
 *    propagation, chroot()s/chdir()s, mounts proc, switches gid/uid, raises
 *    effective capabilities into the ambient set when keepcaps is set, and
 *    finally execvp()s argv[optind] (errexec() reports a failed exec).
 * NOTE(review): several declarations, case labels and guarding conditions are
 * on lines not visible in this view; confirm the exact conditions of each
 * step against the full source.
 */
349 int main(int argc
, char *argv
[])
/* long-only options get values above the char range so they cannot clash with short option letters */
352 OPT_MOUNTPROC
= CHAR_MAX
+ 1,
362 static const struct option longopts
[] = {
363 { "help", no_argument
, NULL
, 'h' },
364 { "version", no_argument
, NULL
, 'V' },
366 { "mount", optional_argument
, NULL
, 'm' },
367 { "uts", optional_argument
, NULL
, 'u' },
368 { "ipc", optional_argument
, NULL
, 'i' },
369 { "net", optional_argument
, NULL
, 'n' },
370 { "pid", optional_argument
, NULL
, 'p' },
371 { "user", optional_argument
, NULL
, 'U' },
372 { "cgroup", optional_argument
, NULL
, 'C' },
373 { "time", optional_argument
, NULL
, 'T' },
375 { "fork", no_argument
, NULL
, 'f' },
376 { "kill-child", optional_argument
, NULL
, OPT_KILLCHILD
},
377 { "mount-proc", optional_argument
, NULL
, OPT_MOUNTPROC
},
378 { "map-user", required_argument
, NULL
, OPT_MAPUSER
},
379 { "map-group", required_argument
, NULL
, OPT_MAPGROUP
},
380 { "map-root-user", no_argument
, NULL
, 'r' },
381 { "map-current-user", no_argument
, NULL
, 'c' },
382 { "propagation", required_argument
, NULL
, OPT_PROPAGATION
},
383 { "setgroups", required_argument
, NULL
, OPT_SETGROUPS
},
384 { "keep-caps", no_argument
, NULL
, OPT_KEEPCAPS
},
385 { "setuid", required_argument
, NULL
, 'S' },
386 { "setgid", required_argument
, NULL
, 'G' },
387 { "root", required_argument
, NULL
, 'R' },
388 { "wd", required_argument
, NULL
, 'w' },
389 { "monotonic", required_argument
, NULL
, OPT_MONOTONIC
},
390 { "boottime", required_argument
, NULL
, OPT_BOOTTIME
},
394 int setgrpcmd
= SETGROUPS_NONE
;
395 int unshare_flags
= 0;
399 int kill_child_signo
= 0; /* 0 means --kill-child was not used */
400 const char *procmnt
= NULL
;
401 const char *newroot
= NULL
;
402 const char *newdir
= NULL
;
406 unsigned long propagation
= UNSHARE_PROPAGATION_DEFAULT
;
407 int force_uid
= 0, force_gid
= 0;
/* effective ids are captured up front; they are used later as the mapping targets */
408 uid_t uid
= 0, real_euid
= geteuid();
409 gid_t gid
= 0, real_egid
= getegid();
411 time_t monotonic
= 0;
413 int force_monotonic
= 0;
414 int force_boottime
= 0;
416 setlocale(LC_ALL
, "");
417 bindtextdomain(PACKAGE
, LOCALEDIR
);
419 close_stdout_atexit();
/* the leading '+' in the option string stops parsing at the first non-option argument */
421 while ((c
= getopt_long(argc
, argv
, "+fhVmuinpCTUrR:w:S:G:c", longopts
, NULL
)) != -1) {
427 unshare_flags
|= CLONE_NEWNS
;
429 set_ns_target(CLONE_NEWNS
, optarg
);
432 unshare_flags
|= CLONE_NEWUTS
;
434 set_ns_target(CLONE_NEWUTS
, optarg
);
437 unshare_flags
|= CLONE_NEWIPC
;
439 set_ns_target(CLONE_NEWIPC
, optarg
);
442 unshare_flags
|= CLONE_NEWNET
;
444 set_ns_target(CLONE_NEWNET
, optarg
);
447 unshare_flags
|= CLONE_NEWPID
;
449 set_ns_target(CLONE_NEWPID
, optarg
);
452 unshare_flags
|= CLONE_NEWUSER
;
454 set_ns_target(CLONE_NEWUSER
, optarg
);
457 unshare_flags
|= CLONE_NEWCGROUP
;
459 set_ns_target(CLONE_NEWCGROUP
, optarg
);
462 unshare_flags
|= CLONE_NEWTIME
;
464 set_ns_target(CLONE_NEWTIME
, optarg
);
467 unshare_flags
|= CLONE_NEWNS
;
468 procmnt
= optarg
? optarg
: "/proc";
471 unshare_flags
|= CLONE_NEWUSER
;
472 mapuser
= get_user(optarg
, _("failed to parse uid"));
475 unshare_flags
|= CLONE_NEWUSER
;
476 mapgroup
= get_group(optarg
, _("failed to parse gid"));
479 unshare_flags
|= CLONE_NEWUSER
;
484 unshare_flags
|= CLONE_NEWUSER
;
486 mapgroup
= real_egid
;
489 setgrpcmd
= setgroups_str2id(optarg
);
491 case OPT_PROPAGATION
:
492 propagation
= parse_propagation(optarg
);
497 if ((kill_child_signo
= signame_to_signum(optarg
)) < 0)
498 errx(EXIT_FAILURE
, _("unknown signal: %s"),
501 kill_child_signo
= SIGKILL
;
506 cap_last_cap(); /* Force last cap to be cached before we fork. */
509 uid
= strtoul_or_err(optarg
, _("failed to parse uid"));
513 gid
= strtoul_or_err(optarg
, _("failed to parse gid"));
/*
 * NOTE(review): both clock offsets are parsed with strtoul_or_err(), an
 * unsigned parser, before being handed to settime() as time_t -- confirm
 * whether negative offsets are meant to be rejected.
 */
523 monotonic
= strtoul_or_err(optarg
, _("failed to parse monotonic offset"));
527 boottime
= strtoul_or_err(optarg
, _("failed to parse boottime offset"));
534 print_version(EXIT_SUCCESS
);
536 errtryhelp(EXIT_FAILURE
);
/*
 * NOTE(review): the message below names "-t", but the getopt_long() option
 * string and the usage text spell the time-namespace option "-T".
 */
540 if ((force_monotonic
|| force_boottime
) && !(unshare_flags
& CLONE_NEWTIME
))
541 errx(EXIT_FAILURE
, _("options --monotonic and --boottime require "
542 "unsharing of a time namespace (-t)"));
/* persistent mount namespace: start the helper before unsharing */
544 if (npersists
&& (unshare_flags
& CLONE_NEWNS
))
545 bind_ns_files_from_child(&pid
, fds
);
547 if (-1 == unshare(unshare_flags
))
548 err(EXIT_FAILURE
, _("unshare failed"));
551 if (pid
&& (unshare_flags
& CLONE_NEWNS
)) {
553 char ch
= PIPE_SYNC_BYTE
;
555 /* signal child we are ready */
/* NOTE(review): the return value of write_all() is not checked here */
556 write_all(fds
[1], &ch
, 1);
560 /* wait for bind_ns_files_from_child() */
562 rc
= waitpid(pid
, &status
, 0);
566 err(EXIT_FAILURE
, _("waitpid failed"));
/* propagate a non-zero exit status of the helper */
568 if (WIFEXITED(status
) &&
569 WEXITSTATUS(status
) != EXIT_SUCCESS
)
570 return WEXITSTATUS(status
);
573 /* simple way, just bind */
574 bind_ns_files(getpid());
578 settime(boottime
, CLOCK_BOOTTIME
);
581 settime(monotonic
, CLOCK_MONOTONIC
);
588 err(EXIT_FAILURE
, _("fork failed"));
591 default: /* parent */
592 if (waitpid(pid
, &status
, 0) == -1)
593 err(EXIT_FAILURE
, _("waitpid failed"));
594 if (WIFEXITED(status
))
595 return WEXITSTATUS(status
);
/* child was killed by a signal: send the same signal to ourselves */
596 if (WIFSIGNALED(status
))
597 kill(getpid(), WTERMSIG(status
));
598 err(EXIT_FAILURE
, _("child exit failed"));
602 if (kill_child_signo
!= 0 && prctl(PR_SET_PDEATHSIG
, kill_child_signo
) < 0)
603 err(EXIT_FAILURE
, "prctl failed");
/* (uid_t) -1 is treated as "no uid mapping requested" */
605 if (mapuser
!= (uid_t
) -1)
606 map_id(_PATH_PROC_UIDMAP
, mapuser
, real_euid
);
608 /* Since Linux 3.19 unprivileged writing of /proc/self/gid_map
609 * has been disabled unless /proc/self/setgroups is written
610 * first to permanently disable the ability to call setgroups
611 * in that user namespace. */
612 if (mapgroup
!= (gid_t
) -1) {
613 if (setgrpcmd
== SETGROUPS_ALLOW
)
614 errx(EXIT_FAILURE
, _("options --setgroups=allow and "
615 "--map-group are mutually exclusive"));
616 setgroups_control(SETGROUPS_DENY
);
617 map_id(_PATH_PROC_GIDMAP
, mapgroup
, real_egid
);
620 if (setgrpcmd
!= SETGROUPS_NONE
)
621 setgroups_control(setgrpcmd
);
/* a zero propagation value skips the call */
623 if ((unshare_flags
& CLONE_NEWNS
) && propagation
)
624 set_propagation(propagation
);
627 if (chroot(newroot
) != 0)
629 _("cannot change root directory to '%s'"), newroot
);
/* default the working directory to "/" when none was given */
630 newdir
= newdir
?: "/";
632 if (newdir
&& chdir(newdir
))
633 err(EXIT_FAILURE
, _("cannot chdir to '%s'"), newdir
);
/* NOTE(review): this is a mount() call, yet its failure is reported as "umount %s failed" */
636 if (!newroot
&& mount("none", procmnt
, NULL
, MS_PRIVATE
|MS_REC
, NULL
) != 0)
637 err(EXIT_FAILURE
, _("umount %s failed"), procmnt
);
638 if (mount("proc", procmnt
, "proc", MS_NOSUID
|MS_NOEXEC
|MS_NODEV
, NULL
) != 0)
639 err(EXIT_FAILURE
, _("mount %s failed"), procmnt
);
643 if (setgroups(0, NULL
) != 0) /* drop supplementary groups */
644 err(EXIT_FAILURE
, _("setgroups failed"));
645 if (setgid(gid
) < 0) /* change GID */
646 err(EXIT_FAILURE
, _("setgid failed"));
648 if (force_uid
&& setuid(uid
) < 0) /* change UID */
649 err(EXIT_FAILURE
, _("setuid failed"));
651 /* We use capabilities system calls to propagate the permitted
652 * capabilities into the ambient set because we have already
653 * forked so are in async-signal-safe context. */
654 if (keepcaps
&& (unshare_flags
& CLONE_NEWUSER
)) {
655 struct __user_cap_header_struct header
= {
656 .version
= _LINUX_CAPABILITY_VERSION_3
,
660 struct __user_cap_data_struct payload
[_LINUX_CAPABILITY_U32S_3
] = { 0 };
664 if (capget(&header
, payload
) < 0)
665 err(EXIT_FAILURE
, _("capget failed"));
667 /* In order to make capabilities ambient, we first need to ensure
668 * that they are all inheritable. */
669 payload
[0].inheritable
= payload
[0].permitted
;
670 payload
[1].inheritable
= payload
[1].permitted
;
672 if (capset(&header
, payload
) < 0)
673 err(EXIT_FAILURE
, _("capset failed"));
/* combine the two 32-bit effective words into one 64-bit mask */
675 effective
= ((uint64_t)payload
[1].effective
<< 32) | (uint64_t)payload
[0].effective
;
677 for (cap
= 0; cap
< 64; cap
++) {
678 /* This is the same check as cap_valid(), but using
679 * the runtime value for the last valid cap. */
680 if (cap
> cap_last_cap())
/*
 * NOTE(review): `1 << cap` shifts an int while cap ranges up to 63 and
 * effective is a 64-bit mask; shifting an int by 31 or more is undefined or
 * overflows -- a 64-bit constant looks intended. Confirm.
 */
683 if ((effective
& (1 << cap
))
684 && prctl(PR_CAP_AMBIENT
, PR_CAP_AMBIENT_RAISE
, cap
, 0, 0) < 0)
685 err(EXIT_FAILURE
, _("prctl(PR_CAP_AMBIENT) failed"));
690 execvp(argv
[optind
], argv
+ optind
);
691 errexec(argv
[optind
]);