/*
 * unshare(1) - command-line interface for unshare(2)
 *
 * Copyright (C) 2009 Mikhail Gusarov <dottedmag@dottedmag.net>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the
 * Free Software Foundation; either version 2, or (at your option) any
 * later version.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 */
28 #include <sys/mount.h>
29 #include <sys/types.h>
31 #include <sys/prctl.h>
34 /* we only need some defines missing in sys/mount.h, no libmount linkage */
40 #include "closestream.h"
41 #include "namespace.h"
42 #include "exec_shell.h"
44 #include "pathnames.h"
/* Byte written by the parent to the sync pipe to tell the child to proceed. */
#define PIPE_SYNC_BYTE	0x06

/* Default for --propagation: recursive private ('private' is kernel default). */
#define UNSHARE_PROPAGATION_DEFAULT	(MS_REC | MS_PRIVATE)
55 /* /proc namespace files and mountpoints for binds */
56 static struct namespace_file
{
57 int type
; /* CLONE_NEW* */
58 const char *name
; /* ns/<type> */
59 const char *target
; /* user specified target for bind mount */
60 } namespace_files
[] = {
61 { .type
= CLONE_NEWUSER
, .name
= "ns/user" },
62 { .type
= CLONE_NEWCGROUP
,.name
= "ns/cgroup" },
63 { .type
= CLONE_NEWIPC
, .name
= "ns/ipc" },
64 { .type
= CLONE_NEWUTS
, .name
= "ns/uts" },
65 { .type
= CLONE_NEWNET
, .name
= "ns/net" },
66 { .type
= CLONE_NEWPID
, .name
= "ns/pid" },
67 { .type
= CLONE_NEWNS
, .name
= "ns/mnt" },
68 { .type
= CLONE_NEWTIME
, .name
= "ns/time" },
72 static int npersists
; /* number of persistent namespaces */
80 static const char *setgroups_strings
[] =
82 [SETGROUPS_DENY
] = "deny",
83 [SETGROUPS_ALLOW
] = "allow"
86 static int setgroups_str2id(const char *str
)
90 for (i
= 0; i
< ARRAY_SIZE(setgroups_strings
); i
++)
91 if (strcmp(str
, setgroups_strings
[i
]) == 0)
94 errx(EXIT_FAILURE
, _("unsupported --setgroups argument '%s'"), str
);
97 static void setgroups_control(int action
)
99 const char *file
= _PATH_PROC_SETGROUPS
;
103 if (action
< 0 || (size_t) action
>= ARRAY_SIZE(setgroups_strings
))
105 cmd
= setgroups_strings
[action
];
107 fd
= open(file
, O_WRONLY
);
111 err(EXIT_FAILURE
, _("cannot open %s"), file
);
114 if (write_all(fd
, cmd
, strlen(cmd
)))
115 err(EXIT_FAILURE
, _("write failed %s"), file
);
/*
 * Write a single-entry id mapping "<from> <to> 1" to @file.
 *
 * @file: map file to open for writing (callers pass the uid/gid map paths)
 * @from: first field of the mapping line
 * @to:   second field of the mapping line
 *
 * Any open or write failure terminates the program.
 */
static void map_id(const char *file, uint32_t from, uint32_t to)
{
	char *buf;
	int fd;

	fd = open(file, O_WRONLY);
	if (fd < 0)
		err(EXIT_FAILURE, _("cannot open %s"), file);

	xasprintf(&buf, "%u %u 1", from, to);
	if (write_all(fd, buf, strlen(buf)))
		err(EXIT_FAILURE, _("write failed %s"), file);

	free(buf);
	close(fd);
}
/*
 * Convert a --propagation keyword into mount(2) flags.
 *
 * "slave", "private" and "shared" map to the recursive variant of the
 * corresponding MS_* flag; "unchanged" maps to 0, which callers treat as
 * "do not touch propagation".  Unknown keywords terminate the program.
 */
static unsigned long parse_propagation(const char *str)
{
	static const struct prop_opts {
		const char	*name;
		unsigned long	flags;
	} opts[] = {
		{ "slave",     MS_REC | MS_SLAVE },
		{ "private",   MS_REC | MS_PRIVATE },
		{ "shared",    MS_REC | MS_SHARED },
		{ "unchanged", 0 }
	};
	size_t i;

	for (i = 0; i < ARRAY_SIZE(opts); i++) {
		if (strcmp(opts[i].name, str) == 0)
			return opts[i].flags;
	}

	errx(EXIT_FAILURE, _("unsupported propagation mode: %s"), str);
}
/*
 * Apply propagation @flags to the root mount ("/").
 *
 * A value of 0 means "leave propagation unchanged" and is a no-op.
 * A mount(2) failure terminates the program.
 */
static void set_propagation(unsigned long flags)
{
	if (flags == 0)
		return;

	if (mount("none", "/", NULL, flags, NULL) != 0)
		err(EXIT_FAILURE, _("cannot change root filesystem propagation"));
}
166 static int set_ns_target(int type
, const char *path
)
168 struct namespace_file
*ns
;
170 for (ns
= namespace_files
; ns
->name
; ns
++) {
171 if (ns
->type
!= type
)
181 static int bind_ns_files(pid_t pid
)
183 struct namespace_file
*ns
;
186 for (ns
= namespace_files
; ns
->name
; ns
++) {
190 snprintf(src
, sizeof(src
), "/proc/%u/%s", (unsigned) pid
, ns
->name
);
192 if (mount(src
, ns
->target
, NULL
, MS_BIND
, NULL
) != 0)
193 err(EXIT_FAILURE
, _("mount %s on %s failed"), src
, ns
->target
);
199 static ino_t
get_mnt_ino(pid_t pid
)
204 snprintf(path
, sizeof(path
), "/proc/%u/ns/mnt", (unsigned) pid
);
206 if (stat(path
, &st
) != 0)
207 err(EXIT_FAILURE
, _("cannot stat %s"), path
);
211 static void settime(time_t offset
, clockid_t clk_id
)
213 char buf
[sizeof(stringify_value(ULONG_MAX
)) * 3];
216 len
= snprintf(buf
, sizeof(buf
), "%d %ld 0", clk_id
, offset
);
218 fd
= open("/proc/self/timens_offsets", O_WRONLY
);
220 err(EXIT_FAILURE
, _("failed to open /proc/self/timens_offsets"));
222 if (write(fd
, buf
, len
) != len
)
223 err(EXIT_FAILURE
, _("failed to write to /proc/self/timens_offsets"));
228 static void bind_ns_files_from_child(pid_t
*child
, int fds
[2])
231 pid_t ppid
= getpid();
232 ino_t ino
= get_mnt_ino(ppid
);
235 err(EXIT_FAILURE
, _("pipe failed"));
241 err(EXIT_FAILURE
, _("fork failed"));
247 /* wait for parent */
248 if (read_all(fds
[0], &ch
, 1) != 1 && ch
!= PIPE_SYNC_BYTE
)
249 err(EXIT_FAILURE
, _("failed to read pipe"));
250 if (get_mnt_ino(ppid
) == ino
)
256 default: /* parent */
263 static void __attribute__((__noreturn__
)) usage(void)
267 fputs(USAGE_HEADER
, out
);
268 fprintf(out
, _(" %s [options] [<program> [<argument>...]]\n"),
269 program_invocation_short_name
);
271 fputs(USAGE_SEPARATOR
, out
);
272 fputs(_("Run a program with some namespaces unshared from the parent.\n"), out
);
274 fputs(USAGE_OPTIONS
, out
);
275 fputs(_(" -m, --mount[=<file>] unshare mounts namespace\n"), out
);
276 fputs(_(" -u, --uts[=<file>] unshare UTS namespace (hostname etc)\n"), out
);
277 fputs(_(" -i, --ipc[=<file>] unshare System V IPC namespace\n"), out
);
278 fputs(_(" -n, --net[=<file>] unshare network namespace\n"), out
);
279 fputs(_(" -p, --pid[=<file>] unshare pid namespace\n"), out
);
280 fputs(_(" -U, --user[=<file>] unshare user namespace\n"), out
);
281 fputs(_(" -C, --cgroup[=<file>] unshare cgroup namespace\n"), out
);
282 fputs(_(" -T, --time[=<file>] unshare time namespace\n"), out
);
283 fputs(USAGE_SEPARATOR
, out
);
284 fputs(_(" -f, --fork fork before launching <program>\n"), out
);
285 fputs(_(" --map-user=<uid> map current user to uid (implies --user)\n"), out
);
286 fputs(_(" --map-group=<gid> map current group to gid (implies --user)\n"), out
);
287 fputs(_(" -r, --map-root-user map current user to root (implies --user)\n"), out
);
288 fputs(_(" -c, --map-current-user map current user to itself (implies --user)\n"), out
);
289 fputs(USAGE_SEPARATOR
, out
);
290 fputs(_(" --kill-child[=<signame>] when dying, kill the forked child (implies --fork)\n"
291 " defaults to SIGKILL\n"), out
);
292 fputs(_(" --mount-proc[=<dir>] mount proc filesystem first (implies --mount)\n"), out
);
293 fputs(_(" --propagation slave|shared|private|unchanged\n"
294 " modify mount propagation in mount namespace\n"), out
);
295 fputs(_(" --setgroups allow|deny control the setgroups syscall in user namespaces\n"), out
);
296 fputs(_(" --keep-caps retain capabilities granted in user namespaces\n"), out
);
297 fputs(USAGE_SEPARATOR
, out
);
298 fputs(_(" -R, --root=<dir> run the command with root directory set to <dir>\n"), out
);
299 fputs(_(" -w, --wd=<dir> change working directory to <dir>\n"), out
);
300 fputs(_(" -S, --setuid <uid> set uid in entered namespace\n"), out
);
301 fputs(_(" -G, --setgid <gid> set gid in entered namespace\n"), out
);
302 fputs(_(" --monotonic <offset> set clock monotonic offset (seconds) in time namespaces\n"), out
);
303 fputs(_(" --boottime <offset> set clock boottime offset (seconds) in time namespaces\n"), out
);
305 fputs(USAGE_SEPARATOR
, out
);
306 printf(USAGE_HELP_OPTIONS(27));
307 printf(USAGE_MAN_TAIL("unshare(1)"));
312 int main(int argc
, char *argv
[])
315 OPT_MOUNTPROC
= CHAR_MAX
+ 1,
325 static const struct option longopts
[] = {
326 { "help", no_argument
, NULL
, 'h' },
327 { "version", no_argument
, NULL
, 'V' },
329 { "mount", optional_argument
, NULL
, 'm' },
330 { "uts", optional_argument
, NULL
, 'u' },
331 { "ipc", optional_argument
, NULL
, 'i' },
332 { "net", optional_argument
, NULL
, 'n' },
333 { "pid", optional_argument
, NULL
, 'p' },
334 { "user", optional_argument
, NULL
, 'U' },
335 { "cgroup", optional_argument
, NULL
, 'C' },
336 { "time", optional_argument
, NULL
, 'T' },
338 { "fork", no_argument
, NULL
, 'f' },
339 { "kill-child", optional_argument
, NULL
, OPT_KILLCHILD
},
340 { "mount-proc", optional_argument
, NULL
, OPT_MOUNTPROC
},
341 { "map-user", required_argument
, NULL
, OPT_MAPUSER
},
342 { "map-group", required_argument
, NULL
, OPT_MAPGROUP
},
343 { "map-root-user", no_argument
, NULL
, 'r' },
344 { "map-current-user", no_argument
, NULL
, 'c' },
345 { "propagation", required_argument
, NULL
, OPT_PROPAGATION
},
346 { "setgroups", required_argument
, NULL
, OPT_SETGROUPS
},
347 { "keep-caps", no_argument
, NULL
, OPT_KEEPCAPS
},
348 { "setuid", required_argument
, NULL
, 'S' },
349 { "setgid", required_argument
, NULL
, 'G' },
350 { "root", required_argument
, NULL
, 'R' },
351 { "wd", required_argument
, NULL
, 'w' },
352 { "monotonic", required_argument
, NULL
, OPT_MONOTONIC
},
353 { "boottime", required_argument
, NULL
, OPT_BOOTTIME
},
357 int setgrpcmd
= SETGROUPS_NONE
;
358 int unshare_flags
= 0;
362 int kill_child_signo
= 0; /* 0 means --kill-child was not used */
363 const char *procmnt
= NULL
;
364 const char *newroot
= NULL
;
365 const char *newdir
= NULL
;
369 unsigned long propagation
= UNSHARE_PROPAGATION_DEFAULT
;
370 int force_uid
= 0, force_gid
= 0;
371 uid_t uid
= 0, real_euid
= geteuid();
372 gid_t gid
= 0, real_egid
= getegid();
374 time_t monotonic
= 0;
376 int force_monotonic
= 0;
377 int force_boottime
= 0;
379 setlocale(LC_ALL
, "");
380 bindtextdomain(PACKAGE
, LOCALEDIR
);
382 close_stdout_atexit();
384 while ((c
= getopt_long(argc
, argv
, "+fhVmuinpCTUrR:w:S:G:c", longopts
, NULL
)) != -1) {
390 unshare_flags
|= CLONE_NEWNS
;
392 set_ns_target(CLONE_NEWNS
, optarg
);
395 unshare_flags
|= CLONE_NEWUTS
;
397 set_ns_target(CLONE_NEWUTS
, optarg
);
400 unshare_flags
|= CLONE_NEWIPC
;
402 set_ns_target(CLONE_NEWIPC
, optarg
);
405 unshare_flags
|= CLONE_NEWNET
;
407 set_ns_target(CLONE_NEWNET
, optarg
);
410 unshare_flags
|= CLONE_NEWPID
;
412 set_ns_target(CLONE_NEWPID
, optarg
);
415 unshare_flags
|= CLONE_NEWUSER
;
417 set_ns_target(CLONE_NEWUSER
, optarg
);
420 unshare_flags
|= CLONE_NEWCGROUP
;
422 set_ns_target(CLONE_NEWCGROUP
, optarg
);
425 unshare_flags
|= CLONE_NEWTIME
;
427 set_ns_target(CLONE_NEWTIME
, optarg
);
430 unshare_flags
|= CLONE_NEWNS
;
431 procmnt
= optarg
? optarg
: "/proc";
434 unshare_flags
|= CLONE_NEWUSER
;
435 mapuser
= strtoul_or_err(optarg
, _("failed to parse uid"));
438 unshare_flags
|= CLONE_NEWUSER
;
439 mapgroup
= strtoul_or_err(optarg
, _("failed to parse gid"));
442 unshare_flags
|= CLONE_NEWUSER
;
447 unshare_flags
|= CLONE_NEWUSER
;
449 mapgroup
= real_egid
;
452 setgrpcmd
= setgroups_str2id(optarg
);
454 case OPT_PROPAGATION
:
455 propagation
= parse_propagation(optarg
);
460 if ((kill_child_signo
= signame_to_signum(optarg
)) < 0)
461 errx(EXIT_FAILURE
, _("unknown signal: %s"),
464 kill_child_signo
= SIGKILL
;
469 cap_last_cap(); /* Force last cap to be cached before we fork. */
472 uid
= strtoul_or_err(optarg
, _("failed to parse uid"));
476 gid
= strtoul_or_err(optarg
, _("failed to parse gid"));
486 monotonic
= strtoul_or_err(optarg
, _("failed to parse monotonic offset"));
490 boottime
= strtoul_or_err(optarg
, _("failed to parse boottime offset"));
497 print_version(EXIT_SUCCESS
);
499 errtryhelp(EXIT_FAILURE
);
503 if ((force_monotonic
|| force_boottime
) && !(unshare_flags
& CLONE_NEWTIME
))
504 errx(EXIT_FAILURE
, _("options --monotonic and --boottime require "
505 "unsharing of a time namespace (-t)"));
507 if (npersists
&& (unshare_flags
& CLONE_NEWNS
))
508 bind_ns_files_from_child(&pid
, fds
);
510 if (-1 == unshare(unshare_flags
))
511 err(EXIT_FAILURE
, _("unshare failed"));
514 if (pid
&& (unshare_flags
& CLONE_NEWNS
)) {
516 char ch
= PIPE_SYNC_BYTE
;
518 /* signal child we are ready */
519 write_all(fds
[1], &ch
, 1);
523 /* wait for bind_ns_files_from_child() */
525 rc
= waitpid(pid
, &status
, 0);
529 err(EXIT_FAILURE
, _("waitpid failed"));
531 if (WIFEXITED(status
) &&
532 WEXITSTATUS(status
) != EXIT_SUCCESS
)
533 return WEXITSTATUS(status
);
536 /* simple way, just bind */
537 bind_ns_files(getpid());
541 settime(boottime
, CLOCK_BOOTTIME
);
544 settime(monotonic
, CLOCK_MONOTONIC
);
551 err(EXIT_FAILURE
, _("fork failed"));
554 default: /* parent */
555 if (waitpid(pid
, &status
, 0) == -1)
556 err(EXIT_FAILURE
, _("waitpid failed"));
557 if (WIFEXITED(status
))
558 return WEXITSTATUS(status
);
559 else if (WIFSIGNALED(status
))
560 kill(getpid(), WTERMSIG(status
));
561 err(EXIT_FAILURE
, _("child exit failed"));
565 if (kill_child_signo
!= 0 && prctl(PR_SET_PDEATHSIG
, kill_child_signo
) < 0)
566 err(EXIT_FAILURE
, "prctl failed");
568 if (mapuser
!= (uid_t
) -1)
569 map_id(_PATH_PROC_UIDMAP
, mapuser
, real_euid
);
571 /* Since Linux 3.19 unprivileged writing of /proc/self/gid_map
572 * has been disabled unless /proc/self/setgroups is written
573 * first to permanently disable the ability to call setgroups
574 * in that user namespace. */
575 if (mapgroup
!= (gid_t
) -1) {
576 if (setgrpcmd
== SETGROUPS_ALLOW
)
577 errx(EXIT_FAILURE
, _("options --setgroups=allow and "
578 "--map-group are mutually exclusive"));
579 setgroups_control(SETGROUPS_DENY
);
580 map_id(_PATH_PROC_GIDMAP
, mapgroup
, real_egid
);
583 if (setgrpcmd
!= SETGROUPS_NONE
)
584 setgroups_control(setgrpcmd
);
586 if ((unshare_flags
& CLONE_NEWNS
) && propagation
)
587 set_propagation(propagation
);
590 if (chroot(newroot
) != 0)
592 _("cannot change root directory to '%s'"), newroot
);
593 newdir
= newdir
?: "/";
595 if (newdir
&& chdir(newdir
))
596 err(EXIT_FAILURE
, _("cannot chdir to '%s'"), newdir
);
599 if (!newroot
&& mount("none", procmnt
, NULL
, MS_PRIVATE
|MS_REC
, NULL
) != 0)
600 err(EXIT_FAILURE
, _("umount %s failed"), procmnt
);
601 if (mount("proc", procmnt
, "proc", MS_NOSUID
|MS_NOEXEC
|MS_NODEV
, NULL
) != 0)
602 err(EXIT_FAILURE
, _("mount %s failed"), procmnt
);
606 if (setgroups(0, NULL
) != 0) /* drop supplementary groups */
607 err(EXIT_FAILURE
, _("setgroups failed"));
608 if (setgid(gid
) < 0) /* change GID */
609 err(EXIT_FAILURE
, _("setgid failed"));
611 if (force_uid
&& setuid(uid
) < 0) /* change UID */
612 err(EXIT_FAILURE
, _("setuid failed"));
614 /* We use capabilities system calls to propagate the permitted
615 * capabilities into the ambient set because we have already
616 * forked so are in async-signal-safe context. */
617 if (keepcaps
&& (unshare_flags
& CLONE_NEWUSER
)) {
618 struct __user_cap_header_struct header
= {
619 .version
= _LINUX_CAPABILITY_VERSION_3
,
623 struct __user_cap_data_struct payload
[_LINUX_CAPABILITY_U32S_3
] = { 0 };
627 if (capget(&header
, payload
) < 0)
628 err(EXIT_FAILURE
, _("capget failed"));
630 /* In order the make capabilities ambient, we first need to ensure
631 * that they are all inheritable. */
632 payload
[0].inheritable
= payload
[0].permitted
;
633 payload
[1].inheritable
= payload
[1].permitted
;
635 if (capset(&header
, payload
) < 0)
636 err(EXIT_FAILURE
, _("capset failed"));
638 effective
= ((uint64_t)payload
[1].effective
<< 32) | (uint64_t)payload
[0].effective
;
640 for (cap
= 0; cap
< 64; cap
++) {
641 /* This is the same check as cap_valid(), but using
642 * the runtime value for the last valid cap. */
643 if (cap
> cap_last_cap())
646 if ((effective
& (1 << cap
))
647 && prctl(PR_CAP_AMBIENT
, PR_CAP_AMBIENT_RAISE
, cap
, 0, 0) < 0)
648 err(EXIT_FAILURE
, _("prctl(PR_CAP_AMBIENT) failed"));
653 execvp(argv
[optind
], argv
+ optind
);
654 errexec(argv
[optind
]);