]> git.ipfire.org Git - thirdparty/util-linux.git/blobdiff - sys-utils/unshare.c
docs: update year in libs docs
[thirdparty/util-linux.git] / sys-utils / unshare.c
index 83c4a005998524547c3e7e05c975c34f0a741307..8d33f2273524657fe4f48da4765f80ca34098e0a 100644 (file)
 #include <unistd.h>
 #include <sys/wait.h>
 #include <sys/mount.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/prctl.h>
+#include <grp.h>
 
 /* we only need some defines missing in sys/mount.h, no libmount linkage */
 #include <libmount.h>
 
 #include "nls.h"
 #include "c.h"
+#include "caputils.h"
 #include "closestream.h"
 #include "namespace.h"
 #include "exec_shell.h"
 #include "xalloc.h"
 #include "pathnames.h"
 #include "all-io.h"
+#include "signames.h"
+#include "strutils.h"
+
+/* synchronize parent and child by pipe */
+#define PIPE_SYNC_BYTE 0x06
+
+/* 'private' is kernel default */
+#define UNSHARE_PROPAGATION_DEFAULT    (MS_REC | MS_PRIVATE)
+
+/*
+ * /proc namespace files and mountpoints for binds.
+ *
+ * One entry per supported namespace type; the array is terminated by an
+ * entry whose .name is NULL.  .target is left unset (NULL) here and is only
+ * filled in by set_ns_target(); bind_ns_files() bind-mounts
+ * /proc/<pid>/<name> on every entry with a non-NULL .target.
+ */
+static struct namespace_file {
+       int             type;           /* CLONE_NEW* */
+       const char      *name;          /* ns/<type> */
+       const char      *target;        /* user specified target for bind mount */
+} namespace_files[] = {
+       { .type = CLONE_NEWUSER,  .name = "ns/user" },
+       { .type = CLONE_NEWCGROUP,.name = "ns/cgroup" },
+       { .type = CLONE_NEWIPC,   .name = "ns/ipc"  },
+       { .type = CLONE_NEWUTS,   .name = "ns/uts"  },
+       { .type = CLONE_NEWNET,   .name = "ns/net"  },
+       { .type = CLONE_NEWPID,   .name = "ns/pid"  },
+       { .type = CLONE_NEWNS,    .name = "ns/mnt"  },
+       { .name = NULL }
+};
+
+static int npersists;  /* number of persistent namespaces */
 
 enum {
        SETGROUPS_NONE = -1,
@@ -45,6 +76,12 @@ enum {
        SETGROUPS_ALLOW = 1,
 };
 
+enum {
+       MAP_USER_NONE,          /* initial state: no ID mapping requested */
+       MAP_USER_ROOT,          /* -r, --map-root-user */
+       MAP_USER_CURRENT,       /* -c, --map-current-user */
+};
+
 static const char *setgroups_strings[] =
 {
        [SETGROUPS_DENY] = "deny",
@@ -76,7 +113,7 @@ static void setgroups_control(int action)
        if (fd < 0) {
                if (errno == ENOENT)
                        return;
-                err(EXIT_FAILURE, _("cannot open %s"), file);
+               err(EXIT_FAILURE, _("cannot open %s"), file);
        }
 
        if (write_all(fd, cmd, strlen(cmd)))
@@ -100,121 +137,357 @@ static void map_id(const char *file, uint32_t from, uint32_t to)
        close(fd);
 }
 
-static void usage(int status)
+/*
+ * Translate a --propagation argument into mount(2) flags.
+ *
+ * Recognized names are "slave", "private" and "shared" (each combined with
+ * MS_REC) and "unchanged", which yields 0.  Any other string is reported
+ * through errx() with EXIT_FAILURE.
+ */
+static unsigned long parse_propagation(const char *str)
+{
+       static const struct prop_opts {
+               const char *name;
+               unsigned long flag;
+       } modes[] = {
+               { "slave",      MS_REC | MS_SLAVE },
+               { "private",    MS_REC | MS_PRIVATE },
+               { "shared",     MS_REC | MS_SHARED },
+               { "unchanged",  0 }
+       };
+       const struct prop_opts *mode;
+
+       for (mode = modes; mode < modes + ARRAY_SIZE(modes); mode++) {
+               if (!strcmp(mode->name, str))
+                       return mode->flag;
+       }
+
+       errx(EXIT_FAILURE, _("unsupported propagation mode: %s"), str);
+}
+
+/*
+ * Apply the given propagation flags to the "/" mount point.  A value of
+ * zero means there is nothing to change and no mount(2) call is made.
+ */
+static void set_propagation(unsigned long flags)
+{
+       if (flags != 0 && mount("none", "/", NULL, flags, NULL) != 0)
+               err(EXIT_FAILURE, _("cannot change root filesystem propagation"));
+}
+
+
+/*
+ * Remember 'path' as the bind-mount target for the namespace of the given
+ * CLONE_NEW* type and count it in npersists.
+ *
+ * Returns 0 on success, -EINVAL if 'type' is not in namespace_files[].
+ * The path pointer is stored as-is (not copied).
+ */
+static int set_ns_target(int type, const char *path)
+{
+       struct namespace_file *entry = namespace_files;
+
+       /* stop at the matching entry or at the NULL-name terminator */
+       while (entry->name && entry->type != type)
+               entry++;
+
+       if (!entry->name)
+               return -EINVAL;
+
+       entry->target = path;
+       npersists++;
+       return 0;
+}
+
+/*
+ * Bind-mount /proc/<pid>/<name> onto the recorded target of every entry in
+ * namespace_files[] that has one.  A failing mount(2) is reported through
+ * err() with EXIT_FAILURE; otherwise the function returns 0.
+ */
+static int bind_ns_files(pid_t pid)
+{
+       char procpath[PATH_MAX];
+       struct namespace_file *entry;
+
+       for (entry = namespace_files; entry->name; entry++) {
+               if (entry->target) {
+                       snprintf(procpath, sizeof(procpath), "/proc/%u/%s",
+                                (unsigned) pid, entry->name);
+
+                       if (mount(procpath, entry->target, NULL, MS_BIND, NULL) != 0)
+                               err(EXIT_FAILURE, _("mount %s on %s failed"),
+                                   procpath, entry->target);
+               }
+       }
+
+       return 0;
+}
+
+/*
+ * Return the inode number of /proc/<pid>/ns/mnt as reported by stat(2).
+ * A failing stat(2) is reported through err() with EXIT_FAILURE.
+ */
+static ino_t get_mnt_ino(pid_t pid)
+{
+       char nspath[PATH_MAX];
+       struct stat sb;
+
+       snprintf(nspath, sizeof(nspath), "/proc/%u/ns/mnt", (unsigned) pid);
+       if (stat(nspath, &sb) != 0)
+               err(EXIT_FAILURE, _("cannot stat %s"), nspath);
+
+       return sb.st_ino;
+}
+
+/*
+ * Fork a helper process that bind-mounts the namespace files of the calling
+ * process after the caller signals that it is ready.
+ *
+ * @child: receives the fork() result (the helper's PID in the parent)
+ * @fds:   receives the synchronization pipe; when this function returns in
+ *         the parent, the read end is closed and fds[0] is set to -1, so
+ *         only the write end fds[1] remains usable
+ *
+ * The helper blocks until PIPE_SYNC_BYTE arrives on the pipe, then compares
+ * the inode of the parent's /proc/<pid>/ns/mnt with the value sampled before
+ * the fork.  If the inode is unchanged it exits with EXIT_FAILURE; otherwise
+ * it calls bind_ns_files() for the parent and exits with EXIT_SUCCESS.  The
+ * helper never returns from this function.
+ */
+static void bind_ns_files_from_child(pid_t *child, int fds[2])
+{
+       char ch;
+       pid_t ppid = getpid();
+       ino_t ino = get_mnt_ino(ppid);
+
+       if (pipe(fds) < 0)
+               err(EXIT_FAILURE, _("pipe failed"));
+
+       *child = fork();
+
+       switch (*child) {
+       case -1:
+               err(EXIT_FAILURE, _("fork failed"));
+
+       case 0: /* child */
+               close(fds[1]);
+               fds[1] = -1;
+
+               /* wait for parent; fail on a short read OR on an unexpected
+                * byte -- 'ch' is only inspected after a successful read */
+               if (read_all(fds[0], &ch, 1) != 1 || ch != PIPE_SYNC_BYTE)
+                       err(EXIT_FAILURE, _("failed to read pipe"));
+               if (get_mnt_ino(ppid) == ino)
+                       exit(EXIT_FAILURE);
+               bind_ns_files(ppid);
+               exit(EXIT_SUCCESS);
+               break;
+
+       default: /* parent */
+               close(fds[0]);
+               fds[0] = -1;
+               break;
+       }
+}
+
+static void __attribute__((__noreturn__)) usage(void)
 {
-       FILE *out = status == EXIT_SUCCESS ? stdout : stderr;
+       FILE *out = stdout;
 
        fputs(USAGE_HEADER, out);
-       fprintf(out, _(" %s [options] <program> [<argument>...]\n"),
+       fprintf(out, _(" %s [options] [<program> [<argument>...]]\n"),
                program_invocation_short_name);
 
        fputs(USAGE_SEPARATOR, out);
        fputs(_("Run a program with some namespaces unshared from the parent.\n"), out);
 
        fputs(USAGE_OPTIONS, out);
-       fputs(_(" -m, --mount               unshare mounts namespace\n"), out);
-       fputs(_(" -u, --uts                 unshare UTS namespace (hostname etc)\n"), out);
-       fputs(_(" -i, --ipc                 unshare System V IPC namespace\n"), out);
-       fputs(_(" -n, --net                 unshare network namespace\n"), out);
-       fputs(_(" -p, --pid                 unshare pid namespace\n"), out);
-       fputs(_(" -U, --user                unshare user namespace\n"), out);
+       fputs(_(" -m, --mount[=<file>]      unshare mounts namespace\n"), out);
+       fputs(_(" -u, --uts[=<file>]        unshare UTS namespace (hostname etc)\n"), out);
+       fputs(_(" -i, --ipc[=<file>]        unshare System V IPC namespace\n"), out);
+       fputs(_(" -n, --net[=<file>]        unshare network namespace\n"), out);
+       fputs(_(" -p, --pid[=<file>]        unshare pid namespace\n"), out);
+       fputs(_(" -U, --user[=<file>]       unshare user namespace\n"), out);
+       fputs(_(" -C, --cgroup[=<file>]     unshare cgroup namespace\n"), out);
+       fputs(USAGE_SEPARATOR, out);
        fputs(_(" -f, --fork                fork before launching <program>\n"), out);
-       fputs(_("     --mount-proc[=<dir>]  mount proc filesystem first (implies --mount)\n"), out);
        fputs(_(" -r, --map-root-user       map current user to root (implies --user)\n"), out);
-       fputs(_(" -s, --setgroups <allow|deny>  control setgroups syscall in user namespaces\n"), out);
+       fputs(_(" -c, --map-current-user    map current user to itself (implies --user)\n"), out);
+       fputs(USAGE_SEPARATOR, out);
+       fputs(_(" --kill-child[=<signame>]  when dying, kill the forked child (implies --fork)\n"
+               "                             defaults to SIGKILL\n"), out);
+       fputs(_(" --mount-proc[=<dir>]      mount proc filesystem first (implies --mount)\n"), out);
+       fputs(_(" --propagation slave|shared|private|unchanged\n"
+               "                           modify mount propagation in mount namespace\n"), out);
+       fputs(_(" --setgroups allow|deny    control the setgroups syscall in user namespaces\n"), out);
+       fputs(_(" --keep-caps               retain capabilities granted in user namespaces\n"), out);
+       fputs(USAGE_SEPARATOR, out);
+       fputs(_(" -R, --root=<dir>          run the command with root directory set to <dir>\n"), out);
+       fputs(_(" -w, --wd=<dir>            change working directory to <dir>\n"), out);
+       fputs(_(" -S, --setuid <uid>        set uid in entered namespace\n"), out);
+       fputs(_(" -G, --setgid <gid>        set gid in entered namespace\n"), out);
 
        fputs(USAGE_SEPARATOR, out);
-       fputs(USAGE_HELP, out);
-       fputs(USAGE_VERSION, out);
-       fprintf(out, USAGE_MAN_TAIL("unshare(1)"));
+       printf(USAGE_HELP_OPTIONS(27));
+       printf(USAGE_MAN_TAIL("unshare(1)"));
 
-       exit(status);
+       exit(EXIT_SUCCESS);
 }
 
 int main(int argc, char *argv[])
 {
        enum {
                OPT_MOUNTPROC = CHAR_MAX + 1,
-               OPT_SETGROUPS
+               OPT_PROPAGATION,
+               OPT_SETGROUPS,
+               OPT_KILLCHILD,
+               OPT_KEEPCAPS,
        };
        static const struct option longopts[] = {
-               { "help", no_argument, 0, 'h' },
-               { "version", no_argument, 0, 'V'},
-               { "mount", no_argument, 0, 'm' },
-               { "uts", no_argument, 0, 'u' },
-               { "ipc", no_argument, 0, 'i' },
-               { "net", no_argument, 0, 'n' },
-               { "pid", no_argument, 0, 'p' },
-               { "user", no_argument, 0, 'U' },
-               { "fork", no_argument, 0, 'f' },
-               { "mount-proc", optional_argument, 0, OPT_MOUNTPROC },
-               { "map-root-user", no_argument, 0, 'r' },
-               { "setgroups", required_argument, 0, OPT_SETGROUPS },
-               { NULL, 0, 0, 0 }
+               { "help",          no_argument,       NULL, 'h'             },
+               { "version",       no_argument,       NULL, 'V'             },
+
+               { "mount",         optional_argument, NULL, 'm'             },
+               { "uts",           optional_argument, NULL, 'u'             },
+               { "ipc",           optional_argument, NULL, 'i'             },
+               { "net",           optional_argument, NULL, 'n'             },
+               { "pid",           optional_argument, NULL, 'p'             },
+               { "user",          optional_argument, NULL, 'U'             },
+               { "cgroup",        optional_argument, NULL, 'C'             },
+
+               { "fork",          no_argument,       NULL, 'f'             },
+               { "kill-child",    optional_argument, NULL, OPT_KILLCHILD   },
+               { "mount-proc",    optional_argument, NULL, OPT_MOUNTPROC   },
+               { "map-root-user", no_argument,       NULL, 'r'             },
+               { "map-current-user", no_argument,    NULL, 'c'             },
+               { "propagation",   required_argument, NULL, OPT_PROPAGATION },
+               { "setgroups",     required_argument, NULL, OPT_SETGROUPS   },
+               { "keep-caps",     no_argument,       NULL, OPT_KEEPCAPS    },
+               { "setuid",        required_argument, NULL, 'S'             },
+               { "setgid",        required_argument, NULL, 'G'             },
+               { "root",          required_argument, NULL, 'R'             },
+               { "wd",            required_argument, NULL, 'w'             },
+               { NULL, 0, NULL, 0 }
        };
 
        int setgrpcmd = SETGROUPS_NONE;
        int unshare_flags = 0;
-       int c, forkit = 0, maproot = 0;
+       int c, forkit = 0, mapuser = MAP_USER_NONE;
+       int kill_child_signo = 0; /* 0 means --kill-child was not used */
        const char *procmnt = NULL;
-       uid_t real_euid = geteuid();
-       gid_t real_egid = getegid();;
+       const char *newroot = NULL;
+       const char *newdir = NULL;
+       pid_t pid = 0;
+       int fds[2];
+       int status;
+       unsigned long propagation = UNSHARE_PROPAGATION_DEFAULT;
+       int force_uid = 0, force_gid = 0;
+       uid_t uid = 0, real_euid = geteuid();
+       gid_t gid = 0, real_egid = getegid();
+       int keepcaps = 0;
 
        setlocale(LC_ALL, "");
        bindtextdomain(PACKAGE, LOCALEDIR);
        textdomain(PACKAGE);
-       atexit(close_stdout);
+       close_stdout_atexit();
 
-       while ((c = getopt_long(argc, argv, "+fhVmuinpUr", longopts, NULL)) != -1) {
+       while ((c = getopt_long(argc, argv, "+fhVmuinpCUrR:w:S:G:c", longopts, NULL)) != -1) {
                switch (c) {
                case 'f':
                        forkit = 1;
                        break;
-               case 'h':
-                       usage(EXIT_SUCCESS);
-               case 'V':
-                       printf(UTIL_LINUX_VERSION);
-                       return EXIT_SUCCESS;
                case 'm':
                        unshare_flags |= CLONE_NEWNS;
+                       if (optarg)
+                               set_ns_target(CLONE_NEWNS, optarg);
                        break;
                case 'u':
                        unshare_flags |= CLONE_NEWUTS;
+                       if (optarg)
+                               set_ns_target(CLONE_NEWUTS, optarg);
                        break;
                case 'i':
                        unshare_flags |= CLONE_NEWIPC;
+                       if (optarg)
+                               set_ns_target(CLONE_NEWIPC, optarg);
                        break;
                case 'n':
                        unshare_flags |= CLONE_NEWNET;
+                       if (optarg)
+                               set_ns_target(CLONE_NEWNET, optarg);
                        break;
                case 'p':
                        unshare_flags |= CLONE_NEWPID;
+                       if (optarg)
+                               set_ns_target(CLONE_NEWPID, optarg);
                        break;
                case 'U':
                        unshare_flags |= CLONE_NEWUSER;
+                       if (optarg)
+                               set_ns_target(CLONE_NEWUSER, optarg);
+                       break;
+               case 'C':
+                       unshare_flags |= CLONE_NEWCGROUP;
+                       if (optarg)
+                               set_ns_target(CLONE_NEWCGROUP, optarg);
                        break;
                case OPT_MOUNTPROC:
                        unshare_flags |= CLONE_NEWNS;
                        procmnt = optarg ? optarg : "/proc";
                        break;
                case 'r':
+                       if (mapuser == MAP_USER_CURRENT)
+                               errx(EXIT_FAILURE, _("options --map-root-user and "
+                                       "--map-current-user are mutually exclusive"));
+
+                       unshare_flags |= CLONE_NEWUSER;
+                       mapuser = MAP_USER_ROOT;
+                       break;
+               case 'c':
+                       if (mapuser == MAP_USER_ROOT)
+                               errx(EXIT_FAILURE, _("options --map-root-user and "
+                                       "--map-current-user are mutually exclusive"));
+
                        unshare_flags |= CLONE_NEWUSER;
-                       maproot = 1;
+                       mapuser = MAP_USER_CURRENT;
                        break;
                case OPT_SETGROUPS:
                        setgrpcmd = setgroups_str2id(optarg);
                        break;
+               case OPT_PROPAGATION:
+                       propagation = parse_propagation(optarg);
+                       break;
+               case OPT_KILLCHILD:
+                       forkit = 1;
+                       if (optarg) {
+                               if ((kill_child_signo = signame_to_signum(optarg)) < 0)
+                                       errx(EXIT_FAILURE, _("unknown signal: %s"),
+                                            optarg);
+                       } else {
+                               kill_child_signo = SIGKILL;
+                       }
+                       break;
+                case OPT_KEEPCAPS:
+                       keepcaps = 1;
+                       cap_last_cap(); /* Force last cap to be cached before we fork. */
+                       break;
+               case 'S':
+                       uid = strtoul_or_err(optarg, _("failed to parse uid"));
+                       force_uid = 1;
+                       break;
+               case 'G':
+                       gid = strtoul_or_err(optarg, _("failed to parse gid"));
+                       force_gid = 1;
+                       break;
+               case 'R':
+                       newroot = optarg;
+                       break;
+               case 'w':
+                       newdir = optarg;
+                       break;
+
+               case 'h':
+                       usage();
+               case 'V':
+                       print_version(EXIT_SUCCESS);
                default:
-                       usage(EXIT_FAILURE);
+                       errtryhelp(EXIT_FAILURE);
                }
        }
 
+       if (npersists && (unshare_flags & CLONE_NEWNS))
+               bind_ns_files_from_child(&pid, fds);
+
        if (-1 == unshare(unshare_flags))
                err(EXIT_FAILURE, _("unshare failed"));
 
+       if (npersists) {
+               if (pid && (unshare_flags & CLONE_NEWNS)) {
+                       int rc;
+                       char ch = PIPE_SYNC_BYTE;
+
+                       /* signal child we are ready */
+                       write_all(fds[1], &ch, 1);
+                       close(fds[1]);
+                       fds[1] = -1;
+
+                       /* wait for bind_ns_files_from_child() */
+                       do {
+                               rc = waitpid(pid, &status, 0);
+                               if (rc < 0) {
+                                       if (errno == EINTR)
+                                               continue;
+                                       err(EXIT_FAILURE, _("waitpid failed"));
+                               }
+                               if (WIFEXITED(status) &&
+                                   WEXITSTATUS(status) != EXIT_SUCCESS)
+                                       return WEXITSTATUS(status);
+                       } while (rc < 0);
+               } else
+                       /* simple way, just bind */
+                       bind_ns_files(getpid());
+       }
+
        if (forkit) {
-               int status;
-               pid_t pid = fork();
+               pid = fork();
 
                switch(pid) {
                case -1:
@@ -232,30 +505,106 @@ int main(int argc, char *argv[])
                }
        }
 
-       if (maproot) {
+       if (kill_child_signo != 0 && prctl(PR_SET_PDEATHSIG, kill_child_signo) < 0)
+               err(EXIT_FAILURE, "prctl failed");
+
+        /* Since Linux 3.19 unprivileged writing of /proc/self/gid_map
+         * has been disabled unless /proc/self/setgroups is written
+         * first to permanently disable the ability to call setgroups
+         * in that user namespace. */
+        switch (mapuser) {
+        case MAP_USER_ROOT:
                if (setgrpcmd == SETGROUPS_ALLOW)
                        errx(EXIT_FAILURE, _("options --setgroups=allow and "
-                                       "--map-root-user are mutually exclusive."));
+                                       "--map-root-user are mutually exclusive"));
 
-               /* since Linux 3.19 unprivileged writing of /proc/self/gid_map
-                * has s been disabled unless /proc/self/setgroups is written
-                * first to permanently disable the ability to call setgroups
-                * in that user namespace. */
                setgroups_control(SETGROUPS_DENY);
                map_id(_PATH_PROC_UIDMAP, 0, real_euid);
                map_id(_PATH_PROC_GIDMAP, 0, real_egid);
+                break;
+        case MAP_USER_CURRENT:
+               if (setgrpcmd == SETGROUPS_ALLOW)
+                       errx(EXIT_FAILURE, _("options --setgroups=allow and "
+                                       "--map-current-user are mutually exclusive"));
 
-       } else if (setgrpcmd != SETGROUPS_NONE)
-               setgroups_control(setgrpcmd);
+               setgroups_control(SETGROUPS_DENY);
+               map_id(_PATH_PROC_UIDMAP, real_euid, real_euid);
+               map_id(_PATH_PROC_GIDMAP, real_egid, real_egid);
+                break;
+        case MAP_USER_NONE:
+               if (setgrpcmd != SETGROUPS_NONE)
+                       setgroups_control(setgrpcmd);
+        }
+
+       if ((unshare_flags & CLONE_NEWNS) && propagation)
+               set_propagation(propagation);
+
+       if (newroot) {
+               if (chroot(newroot) != 0)
+                       err(EXIT_FAILURE,
+                           _("cannot change root directory to '%s'"), newroot);
+               newdir = newdir ?: "/";
+       }
+       if (newdir && chdir(newdir))
+               err(EXIT_FAILURE, _("cannot chdir to '%s'"), newdir);
 
-       if (procmnt &&
-           (mount("none", procmnt, NULL, MS_PRIVATE|MS_REC, NULL) != 0 ||
-            mount("proc", procmnt, "proc", MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL) != 0))
+       if (procmnt) {
+               if (!newroot && mount("none", procmnt, NULL, MS_PRIVATE|MS_REC, NULL) != 0)
+                       err(EXIT_FAILURE, _("umount %s failed"), procmnt);
+               if (mount("proc", procmnt, "proc", MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL) != 0)
                        err(EXIT_FAILURE, _("mount %s failed"), procmnt);
+       }
+
+       if (force_gid) {
+               if (setgroups(0, NULL) != 0)    /* drop supplementary groups */
+                       err(EXIT_FAILURE, _("setgroups failed"));
+               if (setgid(gid) < 0)            /* change GID */
+                       err(EXIT_FAILURE, _("setgid failed"));
+       }
+       if (force_uid && setuid(uid) < 0)       /* change UID */
+               err(EXIT_FAILURE, _("setuid failed"));
+
+       /* We use capabilities system calls to propagate the permitted
+        * capabilities into the ambient set because we have already
+        * forked so are in async-signal-safe context. */
+       if (keepcaps && (unshare_flags & CLONE_NEWUSER)) {
+               struct __user_cap_header_struct header = {
+                       .version = _LINUX_CAPABILITY_VERSION_3,
+                       .pid = 0,
+               };
+
+               struct __user_cap_data_struct payload[_LINUX_CAPABILITY_U32S_3] = { 0 };
+               int cap;
+               uint64_t effective;
+
+               if (capget(&header, payload) < 0)
+                       err(EXIT_FAILURE, _("capget failed"));
+
+               /* In order the make capabilities ambient, we first need to ensure
+                * that they are all inheritable. */
+               payload[0].inheritable = payload[0].permitted;
+               payload[1].inheritable = payload[1].permitted;
+
+               if (capset(&header, payload) < 0)
+                       err(EXIT_FAILURE, _("capset failed"));
+
+               effective = ((uint64_t)payload[1].effective << 32) |  (uint64_t)payload[0].effective;
+
+               for (cap = 0; cap < 64; cap++) {
+                       /* This is the same check as cap_valid(), but using
+                        * the runtime value for the last valid cap. */
+                       if (cap > cap_last_cap())
+                               continue;
+
+                       if ((effective & (1 << cap))
+                           && prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE, cap, 0, 0) < 0)
+                                       err(EXIT_FAILURE, _("prctl(PR_CAP_AMBIENT) failed"));
+                }
+        }
 
        if (optind < argc) {
                execvp(argv[optind], argv + optind);
-               err(EXIT_FAILURE, _("failed to execute %s"), argv[optind]);
+               errexec(argv[optind]);
        }
        exec_shell();
 }