]> git.ipfire.org Git - thirdparty/util-linux.git/blob - sys-utils/unshare.c
Merge branch 'kill-pidfd' of https://github.com/kerolasa/util-linux
[thirdparty/util-linux.git] / sys-utils / unshare.c
1 /*
2 * unshare(1) - command-line interface for unshare(2)
3 *
4 * Copyright (C) 2009 Mikhail Gusarov <dottedmag@dottedmag.net>
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License as published by the
8 * Free Software Foundation; either version 2, or (at your option) any
9 * later version.
10 *
11 * This program is distributed in the hope that it will be useful, but
12 * WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License along
17 * with this program; if not, write to the Free Software Foundation, Inc.,
18 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
19 */
20
21 #include <errno.h>
22 #include <getopt.h>
23 #include <sched.h>
24 #include <stdio.h>
25 #include <stdlib.h>
26 #include <unistd.h>
27 #include <sys/wait.h>
28 #include <sys/mount.h>
29 #include <sys/types.h>
30 #include <sys/stat.h>
31 #include <sys/prctl.h>
32 #include <grp.h>
33
34 /* we only need some defines missing in sys/mount.h, no libmount linkage */
35 #include <libmount.h>
36
37 #include "nls.h"
38 #include "c.h"
39 #include "caputils.h"
40 #include "closestream.h"
41 #include "namespace.h"
42 #include "exec_shell.h"
43 #include "xalloc.h"
44 #include "pathnames.h"
45 #include "all-io.h"
46 #include "signames.h"
47 #include "strutils.h"
48
/*
 * Byte sent through the synchronization pipe: the parent writes it once it
 * has unshared, the helper child waits for it before binding namespace files.
 */
#define PIPE_SYNC_BYTE	0x06

/*
 * Mount propagation applied to "/" when --propagation is not given;
 * 'private' is the kernel default.
 */
#define UNSHARE_PROPAGATION_DEFAULT	(MS_PRIVATE | MS_REC)
54
55 /* /proc namespace files and mountpoints for binds */
56 static struct namespace_file {
57 int type; /* CLONE_NEW* */
58 const char *name; /* ns/<type> */
59 const char *target; /* user specified target for bind mount */
60 } namespace_files[] = {
61 { .type = CLONE_NEWUSER, .name = "ns/user" },
62 { .type = CLONE_NEWCGROUP,.name = "ns/cgroup" },
63 { .type = CLONE_NEWIPC, .name = "ns/ipc" },
64 { .type = CLONE_NEWUTS, .name = "ns/uts" },
65 { .type = CLONE_NEWNET, .name = "ns/net" },
66 { .type = CLONE_NEWPID, .name = "ns/pid" },
67 { .type = CLONE_NEWNS, .name = "ns/mnt" },
68 { .name = NULL }
69 };
70
71 static int npersists; /* number of persistent namespaces */
72
/*
 * --setgroups state. The non-negative values are also used as indexes
 * into setgroups_strings[]; SETGROUPS_NONE means the option was not given.
 */
enum {
	SETGROUPS_NONE = -1,
	SETGROUPS_DENY,		/* == 0 */
	SETGROUPS_ALLOW,	/* == 1 */
};

/* Which uid/gid mapping (if any) to install in a new user namespace. */
enum {
	MAP_USER_NONE = 0,
	MAP_USER_ROOT = 1,
	MAP_USER_CURRENT = 2,
};

/* Keywords written to the setgroups control file, indexed by SETGROUPS_*. */
static const char *setgroups_strings[] = {
	[SETGROUPS_ALLOW] = "allow",
	[SETGROUPS_DENY]  = "deny",
};
90
91 static int setgroups_str2id(const char *str)
92 {
93 size_t i;
94
95 for (i = 0; i < ARRAY_SIZE(setgroups_strings); i++)
96 if (strcmp(str, setgroups_strings[i]) == 0)
97 return i;
98
99 errx(EXIT_FAILURE, _("unsupported --setgroups argument '%s'"), str);
100 }
101
102 static void setgroups_control(int action)
103 {
104 const char *file = _PATH_PROC_SETGROUPS;
105 const char *cmd;
106 int fd;
107
108 if (action < 0 || (size_t) action >= ARRAY_SIZE(setgroups_strings))
109 return;
110 cmd = setgroups_strings[action];
111
112 fd = open(file, O_WRONLY);
113 if (fd < 0) {
114 if (errno == ENOENT)
115 return;
116 err(EXIT_FAILURE, _("cannot open %s"), file);
117 }
118
119 if (write_all(fd, cmd, strlen(cmd)))
120 err(EXIT_FAILURE, _("write failed %s"), file);
121 close(fd);
122 }
123
/*
 * Write a single-ID mapping line "<from> <to> 1" to the map file @file.
 *
 * @file: path of the uid or gid map to write
 * @from: first number of the mapping line
 * @to:   second number of the mapping line
 *
 * Terminates the program if the file cannot be opened or written.
 */
static void map_id(const char *file, uint32_t from, uint32_t to)
{
	char *mapping;
	int map_fd = open(file, O_WRONLY);

	if (map_fd < 0)
		err(EXIT_FAILURE, _("cannot open %s"), file);

	xasprintf(&mapping, "%u %u 1", from, to);

	if (write_all(map_fd, mapping, strlen(mapping)) != 0)
		err(EXIT_FAILURE, _("write failed %s"), file);

	free(mapping);
	close(map_fd);
}
139
/*
 * Translate a --propagation keyword into mount(2) flags.
 *
 * Returns MS_REC combined with MS_SLAVE, MS_PRIVATE or MS_SHARED for
 * "slave", "private" and "shared", and 0 for "unchanged". Any other
 * keyword terminates the program with an error message.
 */
static unsigned long parse_propagation(const char *str)
{
	static const struct prop_opts {
		const char *name;
		unsigned long flag;
	} opts[] = {
		{ "slave",	MS_REC | MS_SLAVE },
		{ "private",	MS_REC | MS_PRIVATE },
		{ "shared",	MS_REC | MS_SHARED },
		{ "unchanged",	0 }
	};
	const struct prop_opts *opt;

	for (opt = opts; opt < opts + ARRAY_SIZE(opts); opt++)
		if (!strcmp(opt->name, str))
			return opt->flag;

	errx(EXIT_FAILURE, _("unsupported propagation mode: %s"), str);
}
160
/*
 * Apply the propagation @flags to the root filesystem ("/").
 * A value of 0 means "leave unchanged" and results in no mount(2) call;
 * a failing mount(2) terminates the program.
 */
static void set_propagation(unsigned long flags)
{
	if (flags != 0 && mount("none", "/", NULL, flags, NULL) != 0)
		err(EXIT_FAILURE, _("cannot change root filesystem propagation"));
}
169
170
171 static int set_ns_target(int type, const char *path)
172 {
173 struct namespace_file *ns;
174
175 for (ns = namespace_files; ns->name; ns++) {
176 if (ns->type != type)
177 continue;
178 ns->target = path;
179 npersists++;
180 return 0;
181 }
182
183 return -EINVAL;
184 }
185
186 static int bind_ns_files(pid_t pid)
187 {
188 struct namespace_file *ns;
189 char src[PATH_MAX];
190
191 for (ns = namespace_files; ns->name; ns++) {
192 if (!ns->target)
193 continue;
194
195 snprintf(src, sizeof(src), "/proc/%u/%s", (unsigned) pid, ns->name);
196
197 if (mount(src, ns->target, NULL, MS_BIND, NULL) != 0)
198 err(EXIT_FAILURE, _("mount %s on %s failed"), src, ns->target);
199 }
200
201 return 0;
202 }
203
204 static ino_t get_mnt_ino(pid_t pid)
205 {
206 struct stat st;
207 char path[PATH_MAX];
208
209 snprintf(path, sizeof(path), "/proc/%u/ns/mnt", (unsigned) pid);
210
211 if (stat(path, &st) != 0)
212 err(EXIT_FAILURE, _("cannot stat %s"), path);
213 return st.st_ino;
214 }
215
216 static void bind_ns_files_from_child(pid_t *child, int fds[2])
217 {
218 char ch;
219 pid_t ppid = getpid();
220 ino_t ino = get_mnt_ino(ppid);
221
222 if (pipe(fds) < 0)
223 err(EXIT_FAILURE, _("pipe failed"));
224
225 *child = fork();
226
227 switch (*child) {
228 case -1:
229 err(EXIT_FAILURE, _("fork failed"));
230
231 case 0: /* child */
232 close(fds[1]);
233 fds[1] = -1;
234
235 /* wait for parent */
236 if (read_all(fds[0], &ch, 1) != 1 && ch != PIPE_SYNC_BYTE)
237 err(EXIT_FAILURE, _("failed to read pipe"));
238 if (get_mnt_ino(ppid) == ino)
239 exit(EXIT_FAILURE);
240 bind_ns_files(ppid);
241 exit(EXIT_SUCCESS);
242 break;
243
244 default: /* parent */
245 close(fds[0]);
246 fds[0] = -1;
247 break;
248 }
249 }
250
251 static void __attribute__((__noreturn__)) usage(void)
252 {
253 FILE *out = stdout;
254
255 fputs(USAGE_HEADER, out);
256 fprintf(out, _(" %s [options] [<program> [<argument>...]]\n"),
257 program_invocation_short_name);
258
259 fputs(USAGE_SEPARATOR, out);
260 fputs(_("Run a program with some namespaces unshared from the parent.\n"), out);
261
262 fputs(USAGE_OPTIONS, out);
263 fputs(_(" -m, --mount[=<file>] unshare mounts namespace\n"), out);
264 fputs(_(" -u, --uts[=<file>] unshare UTS namespace (hostname etc)\n"), out);
265 fputs(_(" -i, --ipc[=<file>] unshare System V IPC namespace\n"), out);
266 fputs(_(" -n, --net[=<file>] unshare network namespace\n"), out);
267 fputs(_(" -p, --pid[=<file>] unshare pid namespace\n"), out);
268 fputs(_(" -U, --user[=<file>] unshare user namespace\n"), out);
269 fputs(_(" -C, --cgroup[=<file>] unshare cgroup namespace\n"), out);
270 fputs(USAGE_SEPARATOR, out);
271 fputs(_(" -f, --fork fork before launching <program>\n"), out);
272 fputs(_(" -r, --map-root-user map current user to root (implies --user)\n"), out);
273 fputs(_(" -c, --map-current-user map current user to itself (implies --user)\n"), out);
274 fputs(USAGE_SEPARATOR, out);
275 fputs(_(" --kill-child[=<signame>] when dying, kill the forked child (implies --fork)\n"
276 " defaults to SIGKILL\n"), out);
277 fputs(_(" --mount-proc[=<dir>] mount proc filesystem first (implies --mount)\n"), out);
278 fputs(_(" --propagation slave|shared|private|unchanged\n"
279 " modify mount propagation in mount namespace\n"), out);
280 fputs(_(" --setgroups allow|deny control the setgroups syscall in user namespaces\n"), out);
281 fputs(_(" --keep-caps retain capabilities granted in user namespaces\n"), out);
282 fputs(USAGE_SEPARATOR, out);
283 fputs(_(" -R, --root=<dir> run the command with root directory set to <dir>\n"), out);
284 fputs(_(" -w, --wd=<dir> change working directory to <dir>\n"), out);
285 fputs(_(" -S, --setuid <uid> set uid in entered namespace\n"), out);
286 fputs(_(" -G, --setgid <gid> set gid in entered namespace\n"), out);
287
288 fputs(USAGE_SEPARATOR, out);
289 printf(USAGE_HELP_OPTIONS(27));
290 printf(USAGE_MAN_TAIL("unshare(1)"));
291
292 exit(EXIT_SUCCESS);
293 }
294
295 int main(int argc, char *argv[])
296 {
297 enum {
298 OPT_MOUNTPROC = CHAR_MAX + 1,
299 OPT_PROPAGATION,
300 OPT_SETGROUPS,
301 OPT_KILLCHILD,
302 OPT_KEEPCAPS,
303 };
304 static const struct option longopts[] = {
305 { "help", no_argument, NULL, 'h' },
306 { "version", no_argument, NULL, 'V' },
307
308 { "mount", optional_argument, NULL, 'm' },
309 { "uts", optional_argument, NULL, 'u' },
310 { "ipc", optional_argument, NULL, 'i' },
311 { "net", optional_argument, NULL, 'n' },
312 { "pid", optional_argument, NULL, 'p' },
313 { "user", optional_argument, NULL, 'U' },
314 { "cgroup", optional_argument, NULL, 'C' },
315
316 { "fork", no_argument, NULL, 'f' },
317 { "kill-child", optional_argument, NULL, OPT_KILLCHILD },
318 { "mount-proc", optional_argument, NULL, OPT_MOUNTPROC },
319 { "map-root-user", no_argument, NULL, 'r' },
320 { "map-current-user", no_argument, NULL, 'c' },
321 { "propagation", required_argument, NULL, OPT_PROPAGATION },
322 { "setgroups", required_argument, NULL, OPT_SETGROUPS },
323 { "keep-caps", no_argument, NULL, OPT_KEEPCAPS },
324 { "setuid", required_argument, NULL, 'S' },
325 { "setgid", required_argument, NULL, 'G' },
326 { "root", required_argument, NULL, 'R' },
327 { "wd", required_argument, NULL, 'w' },
328 { NULL, 0, NULL, 0 }
329 };
330
331 int setgrpcmd = SETGROUPS_NONE;
332 int unshare_flags = 0;
333 int c, forkit = 0, mapuser = MAP_USER_NONE;
334 int kill_child_signo = 0; /* 0 means --kill-child was not used */
335 const char *procmnt = NULL;
336 const char *newroot = NULL;
337 const char *newdir = NULL;
338 pid_t pid = 0;
339 int fds[2];
340 int status;
341 unsigned long propagation = UNSHARE_PROPAGATION_DEFAULT;
342 int force_uid = 0, force_gid = 0;
343 uid_t uid = 0, real_euid = geteuid();
344 gid_t gid = 0, real_egid = getegid();
345 int keepcaps = 0;
346
347 setlocale(LC_ALL, "");
348 bindtextdomain(PACKAGE, LOCALEDIR);
349 textdomain(PACKAGE);
350 close_stdout_atexit();
351
352 while ((c = getopt_long(argc, argv, "+fhVmuinpCUrR:w:S:G:", longopts, NULL)) != -1) {
353 switch (c) {
354 case 'f':
355 forkit = 1;
356 break;
357 case 'm':
358 unshare_flags |= CLONE_NEWNS;
359 if (optarg)
360 set_ns_target(CLONE_NEWNS, optarg);
361 break;
362 case 'u':
363 unshare_flags |= CLONE_NEWUTS;
364 if (optarg)
365 set_ns_target(CLONE_NEWUTS, optarg);
366 break;
367 case 'i':
368 unshare_flags |= CLONE_NEWIPC;
369 if (optarg)
370 set_ns_target(CLONE_NEWIPC, optarg);
371 break;
372 case 'n':
373 unshare_flags |= CLONE_NEWNET;
374 if (optarg)
375 set_ns_target(CLONE_NEWNET, optarg);
376 break;
377 case 'p':
378 unshare_flags |= CLONE_NEWPID;
379 if (optarg)
380 set_ns_target(CLONE_NEWPID, optarg);
381 break;
382 case 'U':
383 unshare_flags |= CLONE_NEWUSER;
384 if (optarg)
385 set_ns_target(CLONE_NEWUSER, optarg);
386 break;
387 case 'C':
388 unshare_flags |= CLONE_NEWCGROUP;
389 if (optarg)
390 set_ns_target(CLONE_NEWCGROUP, optarg);
391 break;
392 case OPT_MOUNTPROC:
393 unshare_flags |= CLONE_NEWNS;
394 procmnt = optarg ? optarg : "/proc";
395 break;
396 case 'r':
397 if (mapuser == MAP_USER_CURRENT)
398 errx(EXIT_FAILURE, _("options --map-root-user and "
399 "--map-current-user are mutually exclusive"));
400
401 unshare_flags |= CLONE_NEWUSER;
402 mapuser = MAP_USER_ROOT;
403 break;
404 case 'c':
405 if (mapuser == MAP_USER_ROOT)
406 errx(EXIT_FAILURE, _("options --map-root-user and "
407 "--map-current-user are mutually exclusive"));
408
409 unshare_flags |= CLONE_NEWUSER;
410 mapuser = MAP_USER_CURRENT;
411 break;
412 case OPT_SETGROUPS:
413 setgrpcmd = setgroups_str2id(optarg);
414 break;
415 case OPT_PROPAGATION:
416 propagation = parse_propagation(optarg);
417 break;
418 case OPT_KILLCHILD:
419 forkit = 1;
420 if (optarg) {
421 if ((kill_child_signo = signame_to_signum(optarg)) < 0)
422 errx(EXIT_FAILURE, _("unknown signal: %s"),
423 optarg);
424 } else {
425 kill_child_signo = SIGKILL;
426 }
427 break;
428 case OPT_KEEPCAPS:
429 keepcaps = 1;
430 cap_last_cap(); /* Force last cap to be cached before we fork. */
431 break;
432 case 'S':
433 uid = strtoul_or_err(optarg, _("failed to parse uid"));
434 force_uid = 1;
435 break;
436 case 'G':
437 gid = strtoul_or_err(optarg, _("failed to parse gid"));
438 force_gid = 1;
439 break;
440 case 'R':
441 newroot = optarg;
442 break;
443 case 'w':
444 newdir = optarg;
445 break;
446
447 case 'h':
448 usage();
449 case 'V':
450 print_version(EXIT_SUCCESS);
451 default:
452 errtryhelp(EXIT_FAILURE);
453 }
454 }
455
456 if (npersists && (unshare_flags & CLONE_NEWNS))
457 bind_ns_files_from_child(&pid, fds);
458
459 if (-1 == unshare(unshare_flags))
460 err(EXIT_FAILURE, _("unshare failed"));
461
462 if (npersists) {
463 if (pid && (unshare_flags & CLONE_NEWNS)) {
464 int rc;
465 char ch = PIPE_SYNC_BYTE;
466
467 /* signal child we are ready */
468 write_all(fds[1], &ch, 1);
469 close(fds[1]);
470 fds[1] = -1;
471
472 /* wait for bind_ns_files_from_child() */
473 do {
474 rc = waitpid(pid, &status, 0);
475 if (rc < 0) {
476 if (errno == EINTR)
477 continue;
478 err(EXIT_FAILURE, _("waitpid failed"));
479 }
480 if (WIFEXITED(status) &&
481 WEXITSTATUS(status) != EXIT_SUCCESS)
482 return WEXITSTATUS(status);
483 } while (rc < 0);
484 } else
485 /* simple way, just bind */
486 bind_ns_files(getpid());
487 }
488
489 if (forkit) {
490 pid = fork();
491
492 switch(pid) {
493 case -1:
494 err(EXIT_FAILURE, _("fork failed"));
495 case 0: /* child */
496 break;
497 default: /* parent */
498 if (waitpid(pid, &status, 0) == -1)
499 err(EXIT_FAILURE, _("waitpid failed"));
500 if (WIFEXITED(status))
501 return WEXITSTATUS(status);
502 else if (WIFSIGNALED(status))
503 kill(getpid(), WTERMSIG(status));
504 err(EXIT_FAILURE, _("child exit failed"));
505 }
506 }
507
508 if (kill_child_signo != 0 && prctl(PR_SET_PDEATHSIG, kill_child_signo) < 0)
509 err(EXIT_FAILURE, "prctl failed");
510
511 /* Since Linux 3.19 unprivileged writing of /proc/self/gid_map
512 * has been disabled unless /proc/self/setgroups is written
513 * first to permanently disable the ability to call setgroups
514 * in that user namespace. */
515 switch (mapuser) {
516 case MAP_USER_ROOT:
517 if (setgrpcmd == SETGROUPS_ALLOW)
518 errx(EXIT_FAILURE, _("options --setgroups=allow and "
519 "--map-root-user are mutually exclusive"));
520
521 setgroups_control(SETGROUPS_DENY);
522 map_id(_PATH_PROC_UIDMAP, 0, real_euid);
523 map_id(_PATH_PROC_GIDMAP, 0, real_egid);
524 break;
525 case MAP_USER_CURRENT:
526 if (setgrpcmd == SETGROUPS_ALLOW)
527 errx(EXIT_FAILURE, _("options --setgroups=allow and "
528 "--map-current-user are mutually exclusive"));
529
530 setgroups_control(SETGROUPS_DENY);
531 map_id(_PATH_PROC_UIDMAP, real_euid, real_euid);
532 map_id(_PATH_PROC_GIDMAP, real_egid, real_egid);
533 break;
534 case MAP_USER_NONE:
535 if (setgrpcmd != SETGROUPS_NONE)
536 setgroups_control(setgrpcmd);
537 }
538
539 if ((unshare_flags & CLONE_NEWNS) && propagation)
540 set_propagation(propagation);
541
542 if (newroot) {
543 if (chroot(newroot) != 0)
544 err(EXIT_FAILURE,
545 _("cannot change root directory to '%s'"), newroot);
546 newdir = newdir ?: "/";
547 }
548 if (newdir && chdir(newdir))
549 err(EXIT_FAILURE, _("cannot chdir to '%s'"), newdir);
550
551 if (procmnt) {
552 if (!newroot && mount("none", procmnt, NULL, MS_PRIVATE|MS_REC, NULL) != 0)
553 err(EXIT_FAILURE, _("umount %s failed"), procmnt);
554 if (mount("proc", procmnt, "proc", MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL) != 0)
555 err(EXIT_FAILURE, _("mount %s failed"), procmnt);
556 }
557
558 if (force_gid) {
559 if (setgroups(0, NULL) != 0) /* drop supplementary groups */
560 err(EXIT_FAILURE, _("setgroups failed"));
561 if (setgid(gid) < 0) /* change GID */
562 err(EXIT_FAILURE, _("setgid failed"));
563 }
564 if (force_uid && setuid(uid) < 0) /* change UID */
565 err(EXIT_FAILURE, _("setuid failed"));
566
567 /* We use capabilities system calls to propagate the permitted
568 * capabilities into the ambient set because we have already
569 * forked so are in async-signal-safe context. */
570 if (keepcaps && (unshare_flags & CLONE_NEWUSER)) {
571 struct __user_cap_header_struct header = {
572 .version = _LINUX_CAPABILITY_VERSION_3,
573 .pid = 0,
574 };
575
576 struct __user_cap_data_struct payload[_LINUX_CAPABILITY_U32S_3] = { 0 };
577 int cap;
578 uint64_t effective;
579
580 if (capget(&header, payload) < 0)
581 err(EXIT_FAILURE, _("capget failed"));
582
583 /* In order the make capabilities ambient, we first need to ensure
584 * that they are all inheritable. */
585 payload[0].inheritable = payload[0].permitted;
586 payload[1].inheritable = payload[1].permitted;
587
588 if (capset(&header, payload) < 0)
589 err(EXIT_FAILURE, _("capset failed"));
590
591 effective = ((uint64_t)payload[1].effective << 32) | (uint64_t)payload[0].effective;
592
593 for (cap = 0; cap < 64; cap++) {
594 /* This is the same check as cap_valid(), but using
595 * the runtime value for the last valid cap. */
596 if (cap > cap_last_cap())
597 continue;
598
599 if ((effective & (1 << cap))
600 && prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE, cap, 0, 0) < 0)
601 err(EXIT_FAILURE, _("prctl(PR_CAP_AMBIENT) failed"));
602 }
603 }
604
605 if (optind < argc) {
606 execvp(argv[optind], argv + optind);
607 errexec(argv[optind]);
608 }
609 exec_shell();
610 }