]> git.ipfire.org Git - thirdparty/util-linux.git/blame - sys-utils/unshare.c
unshare: Propagate inherited signal handling to forked child
[thirdparty/util-linux.git] / sys-utils / unshare.c
CommitLineData
4205f1fd
MG
1/*
2 * unshare(1) - command-line interface for unshare(2)
3 *
4 * Copyright (C) 2009 Mikhail Gusarov <dottedmag@dottedmag.net>
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License as published by the
8 * Free Software Foundation; either version 2, or (at your option) any
9 * later version.
10 *
11 * This program is distributed in the hope that it will be useful, but
12 * WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License along
17 * with this program; if not, write to the Free Software Foundation, Inc.,
7cebf0bb 18 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
4205f1fd
MG
19 */
20
4205f1fd
MG
21#include <errno.h>
22#include <getopt.h>
23#include <sched.h>
24#include <stdio.h>
25#include <stdlib.h>
26#include <unistd.h>
ff5dc96e 27#include <sys/eventfd.h>
5088ec33 28#include <sys/wait.h>
6728ca10 29#include <sys/mount.h>
c84f2590
KZ
30#include <sys/types.h>
31#include <sys/stat.h>
8e8f0fa5 32#include <sys/prctl.h>
f0af42b5 33#include <grp.h>
c84f2590 34
d754315c
RM
35/* we only need some defines missing in sys/mount.h, no libmount linkage */
36#include <libmount.h>
37
4205f1fd 38#include "nls.h"
eb76ca98 39#include "c.h"
cef4decf 40#include "caputils.h"
efb8854f 41#include "closestream.h"
c91280a4 42#include "namespace.h"
57580694 43#include "exec_shell.h"
4da21e37
LR
44#include "xalloc.h"
45#include "pathnames.h"
46#include "all-io.h"
8b39a17c 47#include "signames.h"
f0af42b5 48#include "strutils.h"
987550cb 49#include "pwdutils.h"
4da21e37 50
99fcafdf
YK
51/* synchronize parent and child by pipe */
52#define PIPE_SYNC_BYTE 0x06
53
f0f22e9c
KZ
54/* 'private' is kernel default */
55#define UNSHARE_PROPAGATION_DEFAULT (MS_REC | MS_PRIVATE)
56
0490a6ca
KZ
/*
 * Namespace files below /proc/<pid>/ and the optional bind-mount
 * destination the user requested for each of them.
 */
struct namespace_file {
	int		type;		/* CLONE_NEW* */
	const char	*name;		/* ns/<type> */
	const char	*target;	/* user specified target for bind mount */
};

/* The table ends with an entry whose .name is NULL. */
static struct namespace_file namespace_files[] = {
	{ .type = CLONE_NEWUSER,   .name = "ns/user" },
	{ .type = CLONE_NEWCGROUP, .name = "ns/cgroup" },
	{ .type = CLONE_NEWIPC,    .name = "ns/ipc" },
	{ .type = CLONE_NEWUTS,    .name = "ns/uts" },
	{ .type = CLONE_NEWNET,    .name = "ns/net" },
	{ .type = CLONE_NEWPID,    .name = "ns/pid_for_children" },
	{ .type = CLONE_NEWNS,     .name = "ns/mnt" },
	{ .type = CLONE_NEWTIME,   .name = "ns/time_for_children" },
	{ .name = NULL }
};
73
74static int npersists; /* number of persistent namespaces */
75
fbceefde
KZ
/* Actions for /proc/self/setgroups; SETGROUPS_NONE means "leave it alone". */
enum {
	SETGROUPS_NONE	= -1,
	SETGROUPS_DENY	= 0,
	SETGROUPS_ALLOW	= 1,
};

/* Keyword for each non-negative SETGROUPS_* action, indexed by the action. */
static const char *setgroups_strings[] = {
	[SETGROUPS_DENY]  = "deny",
	[SETGROUPS_ALLOW] = "allow"
};
87
88static int setgroups_str2id(const char *str)
89{
90 size_t i;
91
92 for (i = 0; i < ARRAY_SIZE(setgroups_strings); i++)
93 if (strcmp(str, setgroups_strings[i]) == 0)
94 return i;
95
96 errx(EXIT_FAILURE, _("unsupported --setgroups argument '%s'"), str);
97}
98
99static void setgroups_control(int action)
0bf15941
EB
100{
101 const char *file = _PATH_PROC_SETGROUPS;
fbceefde 102 const char *cmd;
0bf15941
EB
103 int fd;
104
fbceefde
KZ
105 if (action < 0 || (size_t) action >= ARRAY_SIZE(setgroups_strings))
106 return;
107 cmd = setgroups_strings[action];
108
0bf15941
EB
109 fd = open(file, O_WRONLY);
110 if (fd < 0) {
111 if (errno == ENOENT)
112 return;
7ff635bf 113 err(EXIT_FAILURE, _("cannot open %s"), file);
0bf15941
EB
114 }
115
fbceefde 116 if (write_all(fd, cmd, strlen(cmd)))
0bf15941
EB
117 err(EXIT_FAILURE, _("write failed %s"), file);
118 close(fd);
119}
120
4da21e37
LR
/*
 * Write the single-ID mapping "<from> <to> 1" to @file.  Failing to open or
 * write the file terminates the program.
 */
static void map_id(const char *file, uint32_t from, uint32_t to)
{
	char *entry;
	int fd = open(file, O_WRONLY);

	if (fd < 0)
		err(EXIT_FAILURE, _("cannot open %s"), file);

	xasprintf(&entry, "%u %u 1", from, to);
	if (write_all(fd, entry, strlen(entry)) != 0)
		err(EXIT_FAILURE, _("write failed %s"), file);

	free(entry);
	close(fd);
}
4205f1fd 136
f0f22e9c
KZ
/*
 * Convert a --propagation keyword into mount(2) flags.  "unchanged" yields 0;
 * an unknown keyword terminates the program.
 */
static unsigned long parse_propagation(const char *str)
{
	static const struct prop_opts {
		const char *name;
		unsigned long flag;
	} opts[] = {
		{ "slave",	MS_REC | MS_SLAVE },
		{ "private",	MS_REC | MS_PRIVATE },
		{ "shared",	MS_REC | MS_SHARED },
		{ "unchanged",	0 }
	};
	const struct prop_opts *opt;

	for (opt = opts; opt < opts + ARRAY_SIZE(opts); opt++)
		if (!strcmp(opt->name, str))
			return opt->flag;

	errx(EXIT_FAILURE, _("unsupported propagation mode: %s"), str);
}
157
/*
 * Apply the propagation @flags to "/".  A value of 0 means "leave the
 * propagation alone"; a failing mount(2) terminates the program.
 */
static void set_propagation(unsigned long flags)
{
	if (flags != 0 && mount("none", "/", NULL, flags, NULL) != 0)
		err(EXIT_FAILURE, _("cannot change root filesystem propagation"));
}
166
0490a6ca
KZ
167
168static int set_ns_target(int type, const char *path)
169{
170 struct namespace_file *ns;
171
172 for (ns = namespace_files; ns->name; ns++) {
173 if (ns->type != type)
174 continue;
175 ns->target = path;
176 npersists++;
177 return 0;
178 }
179
180 return -EINVAL;
181}
182
183static int bind_ns_files(pid_t pid)
184{
185 struct namespace_file *ns;
186 char src[PATH_MAX];
187
188 for (ns = namespace_files; ns->name; ns++) {
189 if (!ns->target)
190 continue;
191
192 snprintf(src, sizeof(src), "/proc/%u/%s", (unsigned) pid, ns->name);
193
194 if (mount(src, ns->target, NULL, MS_BIND, NULL) != 0)
195 err(EXIT_FAILURE, _("mount %s on %s failed"), src, ns->target);
196 }
197
198 return 0;
199}
200
c84f2590
KZ
201static ino_t get_mnt_ino(pid_t pid)
202{
203 struct stat st;
204 char path[PATH_MAX];
205
206 snprintf(path, sizeof(path), "/proc/%u/ns/mnt", (unsigned) pid);
207
208 if (stat(path, &st) != 0)
1293b0f6 209 err(EXIT_FAILURE, _("stat of %s failed"), path);
c84f2590
KZ
210 return st.st_ino;
211}
212
be7df01a
AR
213static void settime(time_t offset, clockid_t clk_id)
214{
215 char buf[sizeof(stringify_value(ULONG_MAX)) * 3];
216 int fd, len;
217
218 len = snprintf(buf, sizeof(buf), "%d %ld 0", clk_id, offset);
219
220 fd = open("/proc/self/timens_offsets", O_WRONLY);
221 if (fd < 0)
222 err(EXIT_FAILURE, _("failed to open /proc/self/timens_offsets"));
223
224 if (write(fd, buf, len) != len)
225 err(EXIT_FAILURE, _("failed to write to /proc/self/timens_offsets"));
226
227 close(fd);
228}
229
82ea6298
SA
/**
 * waitchild() - Wait for a process to exit successfully
 * @pid: PID of the process to wait for
 *
 * Wait for a process to exit successfully. If it exits with a non-zero return
 * code, then exit() with the same status. If it is terminated by a signal,
 * report that and exit with %EXIT_FAILURE, because the work the child was
 * supposed to do has not been completed.
 */
static void waitchild(int pid)
{
	int status;

	/* restart waitpid() when it is interrupted by a signal handler */
	while (waitpid(pid, &status, 0) < 0) {
		if (errno != EINTR)
			err(EXIT_FAILURE, _("waitpid failed"));
	}

	if (WIFEXITED(status)) {
		if (WEXITSTATUS(status) != EXIT_SUCCESS)
			exit(WEXITSTATUS(status));
		return;
	}

	if (WIFSIGNALED(status))
		errx(EXIT_FAILURE, _("child process terminated by signal %d"),
				WTERMSIG(status));
}
253
783bb52a
SA
254/**
255 * sync_with_child() - Tell our child we're ready and wait for it to exit
256 * @pid: The pid of our child
257 * @fd: A file descriptor created with eventfd()
258 *
259 * This tells a child created with fork_and_wait() that we are ready for it to
260 * continue. Once we have done that, wait for our child to exit.
261 */
262static void sync_with_child(pid_t pid, int fd)
c84f2590 263{
783bb52a
SA
264 uint64_t ch = PIPE_SYNC_BYTE;
265
266 write_all(fd, &ch, sizeof(ch));
267 close(fd);
c84f2590 268
783bb52a
SA
269 waitchild(pid);
270}
99fcafdf 271
783bb52a
SA
272/**
273 * fork_and_wait() - Fork and wait to be sync'd with
274 * @fd - A file descriptor created with eventfd() which should be passed to
275 * sync_with_child()
276 *
277 * This creates an eventfd and forks. The parent process returns immediately,
278 * but the child waits for a %PIPE_SYNC_BYTE on the eventfd before returning.
279 * This allows the parent to perform some tasks before the child starts its
280 * work. The parent should call sync_with_child() once it is ready for the
281 * child to continue.
282 *
283 * Return: The pid from fork()
284 */
285static pid_t fork_and_wait(int *fd)
286{
287 pid_t pid;
288 uint64_t ch;
c84f2590 289
783bb52a
SA
290 *fd = eventfd(0, 0);
291 if (*fd < 0)
292 err(EXIT_FAILURE, _("eventfd failed"));
293
294 pid = fork();
295 if (pid < 0)
c84f2590 296 err(EXIT_FAILURE, _("fork failed"));
99fcafdf 297
783bb52a
SA
298 if (!pid) {
299 /* wait for the our parent to tell us to continue */
300 if (read_all(*fd, (char *)&ch, sizeof(ch)) != sizeof(ch) ||
301 ch != PIPE_SYNC_BYTE)
302 err(EXIT_FAILURE, _("failed to read eventfd"));
303 close(*fd);
c84f2590 304 }
783bb52a
SA
305
306 return pid;
307}
308
/*
 * Fork a helper (see fork_and_wait()) that bind-mounts the caller's
 * namespace files once the caller has released it with sync_with_child().
 *
 * The parent gets the helper's pid back and the eventfd in @fd.  The helper
 * never returns: it exits with EXIT_FAILURE if the caller's ns/mnt inode is
 * still the one seen before the fork, otherwise it calls bind_ns_files() for
 * the caller's pid and exits with EXIT_SUCCESS.
 */
static pid_t bind_ns_files_from_child(int *fd)
{
	pid_t parent = getpid();
	ino_t before = get_mnt_ino(parent);
	pid_t child = fork_and_wait(fd);

	if (child != 0)
		return child;

	/* helper process from here on */
	if (get_mnt_ino(parent) == before)
		exit(EXIT_FAILURE);

	bind_ns_files(parent);
	exit(EXIT_SUCCESS);
}
323
987550cb
MHB
/*
 * Resolve @s to a UID: first as a user name via xgetpwnam(), and if that
 * finds nothing, as a number via strtoul_or_err() with @err as the error
 * message.
 */
static uid_t get_user(const char *s, const char *err)
{
	char *buf = NULL;
	struct passwd *pw = xgetpwnam(s, &buf);
	uid_t id;

	if (!pw)
		return strtoul_or_err(s, err);

	id = pw->pw_uid;
	free(pw);
	free(buf);
	return id;
}
341
/*
 * Resolve @s to a GID: first as a group name via xgetgrnam(), and if that
 * finds nothing, as a number via strtoul_or_err() with @err as the error
 * message.
 */
static gid_t get_group(const char *s, const char *err)
{
	char *buf = NULL;
	struct group *gr = xgetgrnam(s, &buf);
	gid_t id;

	if (!gr)
		return strtoul_or_err(s, err);

	id = gr->gr_gid;
	free(gr);
	free(buf);
	return id;
}
359
ff5dc96e
SA
/**
 * struct map_range - A range of IDs to map
 * @outer: First ID of the range outside the namespace (parent side)
 * @inner: First ID of the range inside the namespace (child side)
 * @count: Length of the inside and outside ranges
 *
 * A range of uids/gids to map using new[gu]idmap.
 */
struct map_range {
	unsigned int outer;
	unsigned int inner;
	unsigned int count;
};
373
374#define UID_BUFSIZ sizeof(stringify_value(ULONG_MAX))
375
376/**
377 * uint_to_id() - Convert a string into a user/group ID
378 * @name: The string representation of the ID
379 * @sz: The length of @name, without an (optional) nul-terminator
380 *
381 * This converts a (possibly not nul-terminated_ string into user or group ID.
382 * No name lookup is performed.
383 *
384 * Return: @name as a numeric ID
385 */
386static int uint_to_id(const char *name, size_t sz)
387{
388 char buf[UID_BUFSIZ];
389
390 mem2strcpy(buf, name, sz, sizeof(buf));
391 return strtoul_or_err(name, _("could not parse ID"));
392}
393
394/**
395 * get_map_range() - Parse a mapping range from a string
396 * @s: A string of the format upper,lower,count
397 *
398 * Parse a string of the form upper,lower,count into a new mapping range.
399 *
400 * Return: A new &struct map_range
401 */
402static struct map_range *get_map_range(const char *s)
403{
404 int n, map[3];
405 struct map_range *ret;
406
407 n = string_to_idarray(s, map, ARRAY_SIZE(map), uint_to_id);
408 if (n < 0)
409 errx(EXIT_FAILURE, _("too many elements for mapping '%s'"), s);
410 if (n != ARRAY_SIZE(map))
411 errx(EXIT_FAILURE, _("mapping '%s' contains only %d elements"),
412 s, n);
413
414 ret = xmalloc(sizeof(*ret));
415 ret->outer = map[0];
416 ret->inner = map[1];
417 ret->count = map[2];
418 return ret;
419}
420
e67b0ba3
SA
421/**
422 * read_subid_range() - Look up a user's sub[gu]id range
423 * @filename: The file to look up the range from. This should be either
424 * ``/etc/subuid`` or ``/etc/subgid``.
425 * @uid: The uid of the user whose range we should look up.
426 *
427 * This finds the first subid range matching @uid in @filename.
428 */
429static struct map_range *read_subid_range(char *filename, uid_t uid)
430{
431 char *line = NULL, *pwbuf;
432 FILE *idmap;
433 size_t n;
434 struct passwd *pw;
435 struct map_range *map;
436
437 map = xmalloc(sizeof(*map));
438 map->inner = 0;
439
440 pw = xgetpwuid(uid, &pwbuf);
441 if (!pw)
442 errx(EXIT_FAILURE, _("you (user %d) don't exist."), uid);
443
444 idmap = fopen(filename, "r");
445 if (!idmap)
446 err(EXIT_FAILURE, _("could not open '%s'"), filename);
447
448 /*
449 * Each line in sub[ug]idmap looks like
450 * username:subuid:count
451 * OR
452 * uid:subuid:count
453 */
454 while (getline(&line, &n, idmap) != -1) {
455 char *rest, *s;
456
457 rest = strchr(line, ':');
458 if (!rest)
459 continue;
460 *rest = '\0';
461
462 if (strcmp(line, pw->pw_name) &&
463 strtoul(line, NULL, 10) != pw->pw_uid)
464 continue;
465
466 s = rest + 1;
467 rest = strchr(s, ':');
468 if (!rest)
469 continue;
470 *rest = '\0';
471 map->outer = strtoul_or_err(s, _("failed to parse subid map"));
472
473 s = rest + 1;
474 rest = strchr(s, '\n');
475 if (rest)
476 *rest = '\0';
477 map->count = strtoul_or_err(s, _("failed to parse subid map"));
478
479 fclose(idmap);
d504b862
KZ
480 free(pw);
481 free(pwbuf);
482
e67b0ba3
SA
483 return map;
484 }
485
486 err(EXIT_FAILURE, _("no line matching user \"%s\" in %s"),
487 pw->pw_name, filename);
488}
489
ff5dc96e
SA
490/**
491 * map_ids() - Create a new uid/gid map
492 * @idmapper: Either newuidmap or newgidmap
493 * @ppid: Pid to set the map for
494 * @outer: ID outside the namespace for a single map.
495 * @inner: ID inside the namespace for a single map. May be -1 to only use @map.
496 * @map: A range of IDs to map
497 *
498 * This creates a new uid/gid map for @ppid using @idmapper. The ID @outer in
499 * the parent (our) namespace is mapped to the ID @inner in the child (@ppid's)
500 * namespace. In addition, the range of IDs beginning at @map->outer is mapped
501 * to the range of IDs beginning at @map->inner. The tricky bit is that we
502 * cannot let these mappings overlap. We accomplish this by removing a "hole"
503 * from @map, if @outer or @inner overlap it. This may result in one less than
504 * @map->count IDs being mapped from @map. The unmapped IDs are always the
505 * topmost IDs of the mapping (either in the parent or the child namespace).
506 *
507 * Most of the time, this function will be called with @map->outer as some
508 * large ID, @map->inner as 0, and @map->count as a large number (at least
509 * 1000, but less than @map->outer). Typically, there will be no conflict with
510 * @outer. However, @inner may split the mapping for e.g. --map-current-user.
511 *
512 * This function always exec()s or errors out and does not return.
513 */
514static void __attribute__((__noreturn__))
515map_ids(const char *idmapper, int ppid, unsigned int outer, unsigned int inner,
516 struct map_range *map)
517{
518 /* idmapper + pid + 4 * map + NULL */
519 char *argv[15];
520 /* argv - idmapper - "1" - NULL */
521 char args[12][UID_BUFSIZ];
522 int i = 0, j = 0;
523 struct map_range lo, mid, hi;
524 unsigned int inner_offset, outer_offset;
525
526 /* Some helper macros to reduce bookkeeping */
527#define push_str(s) do { \
528 argv[i++] = s; \
529} while (0)
530#define push_ul(x) do { \
531 snprintf(args[j], sizeof(args[j]), "%u", x); \
532 push_str(args[j++]); \
533} while (0)
534
535 push_str(xstrdup(idmapper));
536 push_ul(ppid);
537 if ((int)inner == -1) {
538 /*
539 * If we don't have a "single" mapping, then we can just use
540 * map directly
541 */
542 push_ul(map->inner);
543 push_ul(map->outer);
544 push_ul(map->count);
545 push_str(NULL);
546
547 execvp(idmapper, argv);
548 errexec(idmapper);
549 }
550
551 /* If the mappings overlap, remove an ID from map */
552 if ((outer >= map->outer && outer <= map->outer + map->count) ||
553 (inner >= map->inner && inner <= map->inner + map->count))
554 map->count--;
555
556 /* Determine where the splits between lo, mid, and hi will be */
557 outer_offset = min(outer > map->outer ? outer - map->outer : 0,
558 map->count);
559 inner_offset = min(inner > map->inner ? inner - map->inner : 0,
560 map->count);
561
562 /*
563 * In the worst case, we need three mappings:
564 * From the bottom of map to either inner or outer
565 */
566 lo.outer = map->outer;
567 lo.inner = map->inner;
568 lo.count = min(inner_offset, outer_offset);
569
570 /* From the lower of inner or outer to the higher */
571 mid.outer = lo.outer + lo.count;
572 mid.outer += mid.outer == outer;
573 mid.inner = lo.inner + lo.count;
574 mid.inner += mid.inner == inner;
575 mid.count = abs_diff(outer_offset, inner_offset);
576
577 /* And from the higher of inner or outer to the end of the map */
578 hi.outer = mid.outer + mid.count;
579 hi.outer += hi.outer == outer;
580 hi.inner = mid.inner + mid.count;
581 hi.inner += hi.inner == inner;
582 hi.count = map->count - lo.count - mid.count;
583
584 push_ul(inner);
585 push_ul(outer);
586 push_str("1");
587 /* new[gu]idmap doesn't like zero-length mappings, so skip them */
588 if (lo.count) {
589 push_ul(lo.inner);
590 push_ul(lo.outer);
591 push_ul(lo.count);
592 }
593 if (mid.count) {
594 push_ul(mid.inner);
595 push_ul(mid.outer);
596 push_ul(mid.count);
597 }
598 if (hi.count) {
599 push_ul(hi.inner);
600 push_ul(hi.outer);
601 push_ul(hi.count);
602 }
603 push_str(NULL);
604 execvp(idmapper, argv);
605 errexec(idmapper);
606}
607
/**
 * map_ids_from_child() - Set up a new uid/gid map
 * @fd: The eventfd to wait on
 * @mapuser: The user to map the current user to (or -1)
 * @usermap: The range of UIDs to map (or %NULL)
 * @mapgroup: The group to map the current group to (or -1)
 * @groupmap: The range of GIDs to map (or %NULL)
 *
 * fork_and_wait() for our parent to call sync_with_child() on @fd. Upon
 * receiving the go-ahead, use newuidmap and newgidmap to set the uid/gid map
 * for our parent's PID. When both maps are requested, a second process is
 * forked to run newuidmap, and newgidmap is run after it has finished.
 *
 * Return: The pid of the child.
 */
static pid_t map_ids_from_child(int *fd, uid_t mapuser,
				struct map_range *usermap, gid_t mapgroup,
				struct map_range *groupmap)
{
	pid_t ppid = getpid();
	pid_t child = fork_and_wait(fd);
	pid_t helper = 0;

	if (child)
		return child;

	/* Avoid forking more than we need to */
	if (usermap && groupmap) {
		helper = fork();
		if (helper < 0)
			err(EXIT_FAILURE, _("fork failed"));
		if (helper > 0)
			waitchild(helper);
	}

	/* map_ids() does not return */
	if (usermap && helper == 0)
		map_ids("newuidmap", ppid, geteuid(), mapuser, usermap);
	if (groupmap)
		map_ids("newgidmap", ppid, getegid(), mapgroup, groupmap);
	exit(EXIT_SUCCESS);
}
648
fa2cd89a 649static void __attribute__((__noreturn__)) usage(void)
4205f1fd 650{
fa2cd89a 651 FILE *out = stdout;
4205f1fd 652
6a87798a 653 fputs(USAGE_HEADER, out);
b5672517 654 fprintf(out, _(" %s [options] [<program> [<argument>...]]\n"),
298dc4ff 655 program_invocation_short_name);
4205f1fd 656
451dbcfa
BS
657 fputs(USAGE_SEPARATOR, out);
658 fputs(_("Run a program with some namespaces unshared from the parent.\n"), out);
659
6a87798a 660 fputs(USAGE_OPTIONS, out);
0490a6ca
KZ
661 fputs(_(" -m, --mount[=<file>] unshare mounts namespace\n"), out);
662 fputs(_(" -u, --uts[=<file>] unshare UTS namespace (hostname etc)\n"), out);
663 fputs(_(" -i, --ipc[=<file>] unshare System V IPC namespace\n"), out);
664 fputs(_(" -n, --net[=<file>] unshare network namespace\n"), out);
665 fputs(_(" -p, --pid[=<file>] unshare pid namespace\n"), out);
666 fputs(_(" -U, --user[=<file>] unshare user namespace\n"), out);
f9e7b66d 667 fputs(_(" -C, --cgroup[=<file>] unshare cgroup namespace\n"), out);
f218fd97 668 fputs(_(" -T, --time[=<file>] unshare time namespace\n"), out);
da639217 669 fputs(USAGE_SEPARATOR, out);
6728ca10 670 fputs(_(" -f, --fork fork before launching <program>\n"), out);
987550cb
MHB
671 fputs(_(" --map-user=<uid>|<name> map current user to uid (implies --user)\n"), out);
672 fputs(_(" --map-group=<gid>|<name> map current group to gid (implies --user)\n"), out);
4da21e37 673 fputs(_(" -r, --map-root-user map current user to root (implies --user)\n"), out);
4175f29e 674 fputs(_(" -c, --map-current-user map current user to itself (implies --user)\n"), out);
e67b0ba3 675 fputs(_(" --map-auto map users and groups automatically (implies --user)\n"), out);
ff5dc96e
SA
676 fputs(_(" --map-users=<outeruid>,<inneruid>,<count>\n"
677 " map count users from outeruid to inneruid (implies --user)\n"), out);
678 fputs(_(" --map-groups=<outergid>,<innergid>,<count>\n"
679 " map count groups from outergid to innergid (implies --user)\n"), out);
da639217
KZ
680 fputs(USAGE_SEPARATOR, out);
681 fputs(_(" --kill-child[=<signame>] when dying, kill the forked child (implies --fork)\n"
682 " defaults to SIGKILL\n"), out);
683 fputs(_(" --mount-proc[=<dir>] mount proc filesystem first (implies --mount)\n"), out);
684 fputs(_(" --propagation slave|shared|private|unchanged\n"
f0f22e9c 685 " modify mount propagation in mount namespace\n"), out);
da639217 686 fputs(_(" --setgroups allow|deny control the setgroups syscall in user namespaces\n"), out);
cef4decf 687 fputs(_(" --keep-caps retain capabilities granted in user namespaces\n"), out);
bf8834d4 688 fputs(USAGE_SEPARATOR, out);
6671501c
AR
689 fputs(_(" -R, --root=<dir> run the command with root directory set to <dir>\n"), out);
690 fputs(_(" -w, --wd=<dir> change working directory to <dir>\n"), out);
691 fputs(_(" -S, --setuid <uid> set uid in entered namespace\n"), out);
692 fputs(_(" -G, --setgid <gid> set gid in entered namespace\n"), out);
be7df01a
AR
693 fputs(_(" --monotonic <offset> set clock monotonic offset (seconds) in time namespaces\n"), out);
694 fputs(_(" --boottime <offset> set clock boottime offset (seconds) in time namespaces\n"), out);
4205f1fd 695
6a87798a 696 fputs(USAGE_SEPARATOR, out);
f45f3ec3
RM
697 printf(USAGE_HELP_OPTIONS(27));
698 printf(USAGE_MAN_TAIL("unshare(1)"));
6a87798a 699
fa2cd89a 700 exit(EXIT_SUCCESS);
4205f1fd
MG
701}
702
703int main(int argc, char *argv[])
704{
6728ca10 705 enum {
fbceefde 706 OPT_MOUNTPROC = CHAR_MAX + 1,
f0f22e9c 707 OPT_PROPAGATION,
8e8f0fa5 708 OPT_SETGROUPS,
bf8834d4 709 OPT_KILLCHILD,
cef4decf 710 OPT_KEEPCAPS,
be7df01a
AR
711 OPT_MONOTONIC,
712 OPT_BOOTTIME,
6e837b5a 713 OPT_MAPUSER,
ff5dc96e 714 OPT_MAPUSERS,
6e837b5a 715 OPT_MAPGROUP,
ff5dc96e 716 OPT_MAPGROUPS,
e67b0ba3 717 OPT_MAPAUTO,
6728ca10 718 };
6c7d5ae9 719 static const struct option longopts[] = {
87918040
SK
720 { "help", no_argument, NULL, 'h' },
721 { "version", no_argument, NULL, 'V' },
722
723 { "mount", optional_argument, NULL, 'm' },
724 { "uts", optional_argument, NULL, 'u' },
725 { "ipc", optional_argument, NULL, 'i' },
726 { "net", optional_argument, NULL, 'n' },
727 { "pid", optional_argument, NULL, 'p' },
728 { "user", optional_argument, NULL, 'U' },
729 { "cgroup", optional_argument, NULL, 'C' },
f218fd97 730 { "time", optional_argument, NULL, 'T' },
87918040
SK
731
732 { "fork", no_argument, NULL, 'f' },
8b39a17c 733 { "kill-child", optional_argument, NULL, OPT_KILLCHILD },
87918040 734 { "mount-proc", optional_argument, NULL, OPT_MOUNTPROC },
6e837b5a 735 { "map-user", required_argument, NULL, OPT_MAPUSER },
ff5dc96e 736 { "map-users", required_argument, NULL, OPT_MAPUSERS },
6e837b5a 737 { "map-group", required_argument, NULL, OPT_MAPGROUP },
ff5dc96e 738 { "map-groups", required_argument, NULL, OPT_MAPGROUPS },
87918040 739 { "map-root-user", no_argument, NULL, 'r' },
4175f29e 740 { "map-current-user", no_argument, NULL, 'c' },
e67b0ba3 741 { "map-auto", no_argument, NULL, OPT_MAPAUTO },
87918040
SK
742 { "propagation", required_argument, NULL, OPT_PROPAGATION },
743 { "setgroups", required_argument, NULL, OPT_SETGROUPS },
cef4decf 744 { "keep-caps", no_argument, NULL, OPT_KEEPCAPS },
f0af42b5
LV
745 { "setuid", required_argument, NULL, 'S' },
746 { "setgid", required_argument, NULL, 'G' },
bf8834d4
LV
747 { "root", required_argument, NULL, 'R' },
748 { "wd", required_argument, NULL, 'w' },
be7df01a
AR
749 { "monotonic", required_argument, NULL, OPT_MONOTONIC },
750 { "boottime", required_argument, NULL, OPT_BOOTTIME },
87918040 751 { NULL, 0, NULL, 0 }
4205f1fd
MG
752 };
753
fbceefde 754 int setgrpcmd = SETGROUPS_NONE;
4205f1fd 755 int unshare_flags = 0;
6e837b5a
MHB
756 int c, forkit = 0;
757 uid_t mapuser = -1;
758 gid_t mapgroup = -1;
ff5dc96e
SA
759 struct map_range *usermap = NULL;
760 struct map_range *groupmap = NULL;
8b39a17c 761 int kill_child_signo = 0; /* 0 means --kill-child was not used */
6728ca10 762 const char *procmnt = NULL;
bf8834d4
LV
763 const char *newroot = NULL;
764 const char *newdir = NULL;
ff5dc96e 765 pid_t pid_bind = 0, pid_idmap = 0;
c84f2590 766 pid_t pid = 0;
ff5dc96e 767 int fd_idmap, fd_bind = -1;
f2f98017 768 sigset_t sigset, oldsigset;
c84f2590 769 int status;
f0f22e9c 770 unsigned long propagation = UNSHARE_PROPAGATION_DEFAULT;
f0af42b5
LV
771 int force_uid = 0, force_gid = 0;
772 uid_t uid = 0, real_euid = geteuid();
773 gid_t gid = 0, real_egid = getegid();
cef4decf 774 int keepcaps = 0;
be7df01a
AR
775 time_t monotonic = 0;
776 time_t boottime = 0;
777 int force_monotonic = 0;
778 int force_boottime = 0;
4205f1fd 779
999ac5e2 780 setlocale(LC_ALL, "");
4205f1fd
MG
781 bindtextdomain(PACKAGE, LOCALEDIR);
782 textdomain(PACKAGE);
2c308875 783 close_stdout_atexit();
4205f1fd 784
f218fd97 785 while ((c = getopt_long(argc, argv, "+fhVmuinpCTUrR:w:S:G:c", longopts, NULL)) != -1) {
2eefe517 786 switch (c) {
5088ec33
MF
787 case 'f':
788 forkit = 1;
789 break;
4205f1fd 790 case 'm':
ef6acdb8 791 unshare_flags |= CLONE_NEWNS;
0490a6ca
KZ
792 if (optarg)
793 set_ns_target(CLONE_NEWNS, optarg);
4205f1fd
MG
794 break;
795 case 'u':
ef6acdb8 796 unshare_flags |= CLONE_NEWUTS;
0490a6ca
KZ
797 if (optarg)
798 set_ns_target(CLONE_NEWUTS, optarg);
4205f1fd
MG
799 break;
800 case 'i':
ef6acdb8 801 unshare_flags |= CLONE_NEWIPC;
0490a6ca
KZ
802 if (optarg)
803 set_ns_target(CLONE_NEWIPC, optarg);
4205f1fd
MG
804 break;
805 case 'n':
ef6acdb8 806 unshare_flags |= CLONE_NEWNET;
0490a6ca
KZ
807 if (optarg)
808 set_ns_target(CLONE_NEWNET, optarg);
4205f1fd 809 break;
bc7f9b95
EB
810 case 'p':
811 unshare_flags |= CLONE_NEWPID;
0490a6ca
KZ
812 if (optarg)
813 set_ns_target(CLONE_NEWPID, optarg);
bc7f9b95
EB
814 break;
815 case 'U':
816 unshare_flags |= CLONE_NEWUSER;
0490a6ca
KZ
817 if (optarg)
818 set_ns_target(CLONE_NEWUSER, optarg);
bc7f9b95 819 break;
f9e7b66d
SH
820 case 'C':
821 unshare_flags |= CLONE_NEWCGROUP;
822 if (optarg)
823 set_ns_target(CLONE_NEWCGROUP, optarg);
824 break;
f218fd97 825 case 'T':
be7df01a
AR
826 unshare_flags |= CLONE_NEWTIME;
827 if (optarg)
828 set_ns_target(CLONE_NEWTIME, optarg);
829 break;
6728ca10
KZ
830 case OPT_MOUNTPROC:
831 unshare_flags |= CLONE_NEWNS;
832 procmnt = optarg ? optarg : "/proc";
833 break;
6e837b5a
MHB
834 case OPT_MAPUSER:
835 unshare_flags |= CLONE_NEWUSER;
987550cb 836 mapuser = get_user(optarg, _("failed to parse uid"));
6e837b5a
MHB
837 break;
838 case OPT_MAPGROUP:
839 unshare_flags |= CLONE_NEWUSER;
987550cb 840 mapgroup = get_group(optarg, _("failed to parse gid"));
6e837b5a 841 break;
4da21e37 842 case 'r':
4175f29e 843 unshare_flags |= CLONE_NEWUSER;
6e837b5a
MHB
844 mapuser = 0;
845 mapgroup = 0;
4175f29e
JP
846 break;
847 case 'c':
4da21e37 848 unshare_flags |= CLONE_NEWUSER;
6e837b5a
MHB
849 mapuser = real_euid;
850 mapgroup = real_egid;
4da21e37 851 break;
ff5dc96e
SA
852 case OPT_MAPUSERS:
853 unshare_flags |= CLONE_NEWUSER;
e67b0ba3
SA
854 if (!strcmp(optarg, "auto"))
855 usermap = read_subid_range(_PATH_SUBUID, real_euid);
856 else
857 usermap = get_map_range(optarg);
ff5dc96e
SA
858 break;
859 case OPT_MAPGROUPS:
860 unshare_flags |= CLONE_NEWUSER;
e67b0ba3
SA
861 if (!strcmp(optarg, "auto"))
862 groupmap = read_subid_range(_PATH_SUBGID, real_egid);
863 else
864 groupmap = get_map_range(optarg);
865 break;
866 case OPT_MAPAUTO:
867 unshare_flags |= CLONE_NEWUSER;
868 usermap = read_subid_range(_PATH_SUBUID, real_euid);
869 groupmap = read_subid_range(_PATH_SUBGID, real_egid);
ff5dc96e 870 break;
fbceefde
KZ
871 case OPT_SETGROUPS:
872 setgrpcmd = setgroups_str2id(optarg);
873 break;
f0f22e9c
KZ
874 case OPT_PROPAGATION:
875 propagation = parse_propagation(optarg);
876 break;
8e8f0fa5 877 case OPT_KILLCHILD:
8e8f0fa5 878 forkit = 1;
8b39a17c
NH
879 if (optarg) {
880 if ((kill_child_signo = signame_to_signum(optarg)) < 0)
881 errx(EXIT_FAILURE, _("unknown signal: %s"),
882 optarg);
883 } else {
884 kill_child_signo = SIGKILL;
885 }
8e8f0fa5 886 break;
cef4decf
JP
887 case OPT_KEEPCAPS:
888 keepcaps = 1;
889 cap_last_cap(); /* Force last cap to be cached before we fork. */
890 break;
f0af42b5
LV
891 case 'S':
892 uid = strtoul_or_err(optarg, _("failed to parse uid"));
893 force_uid = 1;
894 break;
895 case 'G':
896 gid = strtoul_or_err(optarg, _("failed to parse gid"));
897 force_gid = 1;
898 break;
bf8834d4
LV
899 case 'R':
900 newroot = optarg;
901 break;
902 case 'w':
903 newdir = optarg;
904 break;
be7df01a
AR
905 case OPT_MONOTONIC:
906 monotonic = strtoul_or_err(optarg, _("failed to parse monotonic offset"));
907 force_monotonic = 1;
908 break;
909 case OPT_BOOTTIME:
910 boottime = strtoul_or_err(optarg, _("failed to parse boottime offset"));
911 force_boottime = 1;
912 break;
2c308875
KZ
913
914 case 'h':
915 usage();
916 case 'V':
917 print_version(EXIT_SUCCESS);
4205f1fd 918 default:
677ec86c 919 errtryhelp(EXIT_FAILURE);
4205f1fd
MG
920 }
921 }
922
be7df01a
AR
923 if ((force_monotonic || force_boottime) && !(unshare_flags & CLONE_NEWTIME))
924 errx(EXIT_FAILURE, _("options --monotonic and --boottime require "
925 "unsharing of a time namespace (-t)"));
926
ec711d72
KZ
927 /* clear any inherited settings */
928 signal(SIGCHLD, SIG_DFL);
929
c84f2590 930 if (npersists && (unshare_flags & CLONE_NEWNS))
783bb52a 931 pid_bind = bind_ns_files_from_child(&fd_bind);
c84f2590 932
ff5dc96e
SA
933 if (usermap || groupmap)
934 pid_idmap = map_ids_from_child(&fd_idmap, mapuser, usermap,
935 mapgroup, groupmap);
936
2eefe517 937 if (-1 == unshare(unshare_flags))
4205f1fd
MG
938 err(EXIT_FAILURE, _("unshare failed"));
939
ff5dc96e
SA
940 /* Tell child we've called unshare() */
941 if (usermap || groupmap)
942 sync_with_child(pid_idmap, fd_idmap);
943
0d5260b6 944 if (force_boottime)
945 settime(boottime, CLOCK_BOOTTIME);
946
947 if (force_monotonic)
948 settime(monotonic, CLOCK_MONOTONIC);
949
950 if (forkit) {
f2f98017
EC
951 if (sigemptyset(&sigset) != 0 ||
952 sigaddset(&sigset, SIGINT) != 0 ||
953 sigaddset(&sigset, SIGTERM) != 0 ||
954 sigprocmask(SIG_BLOCK, &sigset, &oldsigset) != 0)
955 err(EXIT_FAILURE, _("sigprocmask block failed"));
3ba6736f 956
0d5260b6 957 /* force child forking before mountspace binding
958 * so pid_for_children is populated */
959 pid = fork();
960
961 switch(pid) {
962 case -1:
963 err(EXIT_FAILURE, _("fork failed"));
964 case 0: /* child */
f2f98017
EC
965 if (sigprocmask(SIG_SETMASK, &oldsigset, NULL))
966 err(EXIT_FAILURE,
967 _("sigprocmask restore failed"));
783bb52a
SA
968 if (npersists && (unshare_flags & CLONE_NEWNS))
969 close(fd_bind);
0d5260b6 970 break;
971 default: /* parent */
972 break;
973 }
974 }
975
976 if (npersists && (pid || !forkit)) {
977 /* run in parent */
783bb52a
SA
978 if (pid_bind && (unshare_flags & CLONE_NEWNS))
979 sync_with_child(pid_bind, fd_bind);
980 else
c84f2590
KZ
981 /* simple way, just bind */
982 bind_ns_files(getpid());
983 }
984
0d5260b6 985 if (pid) {
986 if (waitpid(pid, &status, 0) == -1)
987 err(EXIT_FAILURE, _("waitpid failed"));
3ba6736f 988
0d5260b6 989 if (WIFEXITED(status))
990 return WEXITSTATUS(status);
f2f98017
EC
991 if (WIFSIGNALED(status)) {
992
993 /* Ensure the signal that terminated the child will
994 * also terminate the parent. */
995
996 int termsig = WTERMSIG(status);
997
998 if (signal(termsig, SIG_DFL) == SIG_ERR ||
999 sigemptyset(&sigset) != 0 ||
1000 sigaddset(&sigset, termsig) != 0 ||
1001 sigprocmask(SIG_UNBLOCK, &sigset, NULL) != 0)
1002 err(EXIT_FAILURE,
1003 _("sigprocmask unblock failed"));
1004
1005 kill(getpid(), termsig);
1006 }
0d5260b6 1007 err(EXIT_FAILURE, _("child exit failed"));
5088ec33
MF
1008 }
1009
525a0ab2
KZ
1010 if (kill_child_signo != 0 && prctl(PR_SET_PDEATHSIG, kill_child_signo) < 0)
1011 err(EXIT_FAILURE, "prctl failed");
0490a6ca 1012
ff5dc96e 1013 if (mapuser != (uid_t) -1 && !usermap)
6e837b5a
MHB
1014 map_id(_PATH_PROC_UIDMAP, mapuser, real_euid);
1015
4175f29e
JP
1016 /* Since Linux 3.19 unprivileged writing of /proc/self/gid_map
1017 * has been disabled unless /proc/self/setgroups is written
1018 * first to permanently disable the ability to call setgroups
1019 * in that user namespace. */
ff5dc96e 1020 if (mapgroup != (gid_t) -1 && !groupmap) {
fbceefde
KZ
1021 if (setgrpcmd == SETGROUPS_ALLOW)
1022 errx(EXIT_FAILURE, _("options --setgroups=allow and "
6e837b5a 1023 "--map-group are mutually exclusive"));
fbceefde 1024 setgroups_control(SETGROUPS_DENY);
6e837b5a
MHB
1025 map_id(_PATH_PROC_GIDMAP, mapgroup, real_egid);
1026 }
fbceefde 1027
6e837b5a
MHB
1028 if (setgrpcmd != SETGROUPS_NONE)
1029 setgroups_control(setgrpcmd);
4da21e37 1030
f0f22e9c
KZ
1031 if ((unshare_flags & CLONE_NEWNS) && propagation)
1032 set_propagation(propagation);
1033
bf8834d4
LV
1034 if (newroot) {
1035 if (chroot(newroot) != 0)
1036 err(EXIT_FAILURE,
1037 _("cannot change root directory to '%s'"), newroot);
1038 newdir = newdir ?: "/";
1039 }
1040 if (newdir && chdir(newdir))
1041 err(EXIT_FAILURE, _("cannot chdir to '%s'"), newdir);
1042
1043 if (procmnt) {
ef7eccad 1044 /* When not changing root and using the default propagation flags
1045 then the recursive propagation change of root will
1046 automatically change that of an existing proc mount. */
1047 if (!newroot && propagation != (MS_PRIVATE|MS_REC)) {
1048 int rc = mount("none", procmnt, NULL, MS_PRIVATE|MS_REC, NULL);
1049
1050 /* Custom procmnt means that proc is very likely not mounted, causing EINVAL.
1051 Ignoring the error in this specific instance is considered safe. */
1052 if(rc != 0 && errno != EINVAL)
1053 err(EXIT_FAILURE, _("cannot change %s filesystem propagation"), procmnt);
1054 }
1055
bf8834d4 1056 if (mount("proc", procmnt, "proc", MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL) != 0)
6728ca10 1057 err(EXIT_FAILURE, _("mount %s failed"), procmnt);
bf8834d4 1058 }
6728ca10 1059
f0af42b5
LV
1060 if (force_gid) {
1061 if (setgroups(0, NULL) != 0) /* drop supplementary groups */
1062 err(EXIT_FAILURE, _("setgroups failed"));
1063 if (setgid(gid) < 0) /* change GID */
1064 err(EXIT_FAILURE, _("setgid failed"));
1065 }
1066 if (force_uid && setuid(uid) < 0) /* change UID */
1067 err(EXIT_FAILURE, _("setuid failed"));
1068
cef4decf
JP
1069 /* We use capabilities system calls to propagate the permitted
1070 * capabilities into the ambient set because we have already
1071 * forked so are in async-signal-safe context. */
1072 if (keepcaps && (unshare_flags & CLONE_NEWUSER)) {
1073 struct __user_cap_header_struct header = {
1074 .version = _LINUX_CAPABILITY_VERSION_3,
1075 .pid = 0,
1076 };
1077
9eba8476 1078 struct __user_cap_data_struct payload[_LINUX_CAPABILITY_U32S_3] = {{ 0 }};
232fcae8 1079 uint64_t effective, cap;
cef4decf 1080
ac0391cc 1081 if (capget(&header, payload) < 0)
cef4decf 1082 err(EXIT_FAILURE, _("capget failed"));
cef4decf
JP
1083
1084 /* In order to make capabilities ambient, we first need to ensure
1085 * that they are all inheritable. */
1086 payload[0].inheritable = payload[0].permitted;
1087 payload[1].inheritable = payload[1].permitted;
1088
ac0391cc 1089 if (capset(&header, payload) < 0)
cef4decf 1090 err(EXIT_FAILURE, _("capset failed"));
cef4decf 1091
ac0391cc 1092 effective = ((uint64_t)payload[1].effective << 32) | (uint64_t)payload[0].effective;
cef4decf 1093
232fcae8 1094 for (cap = 0; cap < (sizeof(effective) * 8); cap++) {
cef4decf
JP
1095 /* This is the same check as cap_valid(), but using
1096 * the runtime value for the last valid cap. */
232fcae8 1097 if (cap > (uint64_t) cap_last_cap())
cef4decf 1098 continue;
cef4decf 1099
ac0391cc
KZ
1100 if ((effective & (1 << cap))
1101 && prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE, cap, 0, 0) < 0)
cef4decf 1102 err(EXIT_FAILURE, _("prctl(PR_CAP_AMBIENT) failed"));
cef4decf
JP
1103 }
1104 }
1105
57580694
ZJS
1106 if (optind < argc) {
1107 execvp(argv[optind], argv + optind);
fd777151 1108 errexec(argv[optind]);
57580694
ZJS
1109 }
1110 exec_shell();
4205f1fd 1111}