]> git.ipfire.org Git - thirdparty/util-linux.git/blob - sys-utils/unshare.c
unshare: Support multiple ID ranges for user and group maps
[thirdparty/util-linux.git] / sys-utils / unshare.c
1 /*
2 * unshare(1) - command-line interface for unshare(2)
3 *
4 * Copyright (C) 2009 Mikhail Gusarov <dottedmag@dottedmag.net>
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License as published by the
8 * Free Software Foundation; either version 2, or (at your option) any
9 * later version.
10 *
11 * This program is distributed in the hope that it will be useful, but
12 * WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License along
17 * with this program; if not, write to the Free Software Foundation, Inc.,
18 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
19 */
20
21 #include <errno.h>
22 #include <getopt.h>
23 #include <poll.h>
24 #include <sched.h>
25 #include <stdio.h>
26 #include <stdlib.h>
27 #include <unistd.h>
28 #include <sys/eventfd.h>
29 #include <sys/wait.h>
30 #include <sys/mount.h>
31 #include <sys/types.h>
32 #include <sys/stat.h>
33 #include <sys/prctl.h>
34 #include <grp.h>
35
36 /* we only need some defines missing in sys/mount.h, no libmount linkage */
37 #include <libmount.h>
38
39 #include "nls.h"
40 #include "c.h"
41 #include "caputils.h"
42 #include "closestream.h"
43 #include "namespace.h"
44 #include "pidfd-utils.h"
45 #include "exec_shell.h"
46 #include "xalloc.h"
47 #include "pathnames.h"
48 #include "all-io.h"
49 #include "signames.h"
50 #include "strutils.h"
51 #include "pwdutils.h"
52
53 /* synchronize parent and child by pipe */
54 #define PIPE_SYNC_BYTE 0x06
55
56 /* 'private' is kernel default */
57 #define UNSHARE_PROPAGATION_DEFAULT (MS_REC | MS_PRIVATE)
58
/*
 * Namespaces this tool can make persistent.
 *
 * Every entry couples a CLONE_NEW* flag with the name of the matching file
 * below /proc/<pid>/ and, once set_ns_target() has recorded one, the path
 * that file is bind-mounted on.  An entry whose name is NULL ends the table.
 */
static struct namespace_file {
	int type;		/* CLONE_NEW* flag */
	const char *name;	/* file below /proc/<pid>/: ns/<type> */
	const char *target;	/* user specified target for the bind mount */
} namespace_files[] = {
	{ .name = "ns/user",              .type = CLONE_NEWUSER   },
	{ .name = "ns/cgroup",            .type = CLONE_NEWCGROUP },
	{ .name = "ns/ipc",               .type = CLONE_NEWIPC    },
	{ .name = "ns/uts",               .type = CLONE_NEWUTS    },
	{ .name = "ns/net",               .type = CLONE_NEWNET    },
	{ .name = "ns/pid_for_children",  .type = CLONE_NEWPID    },
	{ .name = "ns/mnt",               .type = CLONE_NEWNS     },
	{ .name = "ns/time_for_children", .type = CLONE_NEWTIME   },
	{ .name = NULL }
};

/* Bumped each time a bind-mount target is recorded in namespace_files[] */
static int npersists;
77
/* Actions for the setgroups control file, plus a "nothing requested" marker */
enum {
	SETGROUPS_NONE  = -1,	/* no action selected */
	SETGROUPS_DENY  = 0,
	SETGROUPS_ALLOW = 1,
};

/* Keyword for each action, indexed by its SETGROUPS_* value */
static const char *setgroups_strings[] = {
	[SETGROUPS_ALLOW] = "allow",
	[SETGROUPS_DENY]  = "deny",
};
89
90 static int setgroups_str2id(const char *str)
91 {
92 size_t i;
93
94 for (i = 0; i < ARRAY_SIZE(setgroups_strings); i++)
95 if (strcmp(str, setgroups_strings[i]) == 0)
96 return i;
97
98 errx(EXIT_FAILURE, _("unsupported --setgroups argument '%s'"), str);
99 }
100
101 static void setgroups_control(int action)
102 {
103 const char *file = _PATH_PROC_SETGROUPS;
104 const char *cmd;
105 int fd;
106
107 if (action < 0 || (size_t) action >= ARRAY_SIZE(setgroups_strings))
108 return;
109 cmd = setgroups_strings[action];
110
111 fd = open(file, O_WRONLY);
112 if (fd < 0) {
113 if (errno == ENOENT)
114 return;
115 err(EXIT_FAILURE, _("cannot open %s"), file);
116 }
117
118 if (write_all(fd, cmd, strlen(cmd)))
119 err(EXIT_FAILURE, _("write failed %s"), file);
120 close(fd);
121 }
122
/*
 * Write a single-ID mapping, formatted as "<from> <to> 1", to @file.
 *
 * main() calls this with the uid/gid map file, the ID wanted inside the
 * namespace as @from and the caller's effective ID as @to.  Failing to open
 * or write the file is fatal.
 */
static void map_id(const char *file, uint32_t from, uint32_t to)
{
	char *line;
	int fd = open(file, O_WRONLY);

	if (fd < 0)
		err(EXIT_FAILURE, _("cannot open %s"), file);

	xasprintf(&line, "%u %u 1", from, to);
	if (write_all(fd, line, strlen(line)) != 0)
		err(EXIT_FAILURE, _("write failed %s"), file);

	free(line);
	close(fd);
}
138
/*
 * Convert a --propagation keyword into mount(2) flags.
 *
 * "unchanged" yields 0, the other keywords yield MS_REC combined with the
 * matching propagation flag.  An unknown keyword terminates the program.
 */
static unsigned long parse_propagation(const char *str)
{
	static const struct prop_opts {
		const char *name;
		unsigned long flag;
	} opts[] = {
		{ "slave",     MS_REC | MS_SLAVE },
		{ "private",   MS_REC | MS_PRIVATE },
		{ "shared",    MS_REC | MS_SHARED },
		{ "unchanged", 0 }
	};
	const struct prop_opts *opt;

	for (opt = opts; opt < opts + ARRAY_SIZE(opts); opt++)
		if (!strcmp(opt->name, str))
			return opt->flag;

	errx(EXIT_FAILURE, _("unsupported propagation mode: %s"), str);
}
159
/*
 * Apply the propagation @flags to the root filesystem.
 *
 * A value of 0 (what parse_propagation() returns for "unchanged") leaves
 * the propagation alone; a failing mount(2) call is fatal.
 */
static void set_propagation(unsigned long flags)
{
	if (flags != 0 && mount("none", "/", NULL, flags, NULL) != 0)
		err(EXIT_FAILURE, _("cannot change root filesystem propagation"));
}
168
169
170 static int set_ns_target(int type, const char *path)
171 {
172 struct namespace_file *ns;
173
174 for (ns = namespace_files; ns->name; ns++) {
175 if (ns->type != type)
176 continue;
177 ns->target = path;
178 npersists++;
179 return 0;
180 }
181
182 return -EINVAL;
183 }
184
185 static int bind_ns_files(pid_t pid)
186 {
187 struct namespace_file *ns;
188 char src[PATH_MAX];
189
190 for (ns = namespace_files; ns->name; ns++) {
191 if (!ns->target)
192 continue;
193
194 snprintf(src, sizeof(src), "/proc/%u/%s", (unsigned) pid, ns->name);
195
196 if (mount(src, ns->target, NULL, MS_BIND, NULL) != 0)
197 err(EXIT_FAILURE, _("mount %s on %s failed"), src, ns->target);
198 }
199
200 return 0;
201 }
202
203 static ino_t get_mnt_ino(pid_t pid)
204 {
205 struct stat st;
206 char path[PATH_MAX];
207
208 snprintf(path, sizeof(path), "/proc/%u/ns/mnt", (unsigned) pid);
209
210 if (stat(path, &st) != 0)
211 err(EXIT_FAILURE, _("stat of %s failed"), path);
212 return st.st_ino;
213 }
214
215 static void settime(int64_t offset, clockid_t clk_id)
216 {
217 char buf[sizeof(stringify_value(ULONG_MAX)) * 3];
218 int fd, len;
219
220 len = snprintf(buf, sizeof(buf), "%d %" PRId64 " 0", clk_id, offset);
221
222 fd = open("/proc/self/timens_offsets", O_WRONLY);
223 if (fd < 0)
224 err(EXIT_FAILURE, _("failed to open /proc/self/timens_offsets"));
225
226 if (write(fd, buf, len) != len)
227 err(EXIT_FAILURE, _("failed to write to /proc/self/timens_offsets"));
228
229 close(fd);
230 }
231
/**
 * waitchild() - Wait for a process to exit successfully
 * @pid: PID of the process to wait for
 *
 * Block until @pid terminates, restarting waitpid() whenever it is
 * interrupted by a signal. If the process exits with a non-zero return code,
 * then exit() with the same status. If the process is killed by a signal it
 * cannot have completed its work either, so that is reported as a failure
 * too rather than being silently treated as success.
 *
 * Only returns when @pid has exited with status %EXIT_SUCCESS.
 */
static void waitchild(int pid)
{
	int rc, status;

	do {
		rc = waitpid(pid, &status, 0);
	} while (rc < 0 && errno == EINTR);

	if (rc < 0)
		err(EXIT_FAILURE, _("waitpid failed"));

	if (WIFEXITED(status) && WEXITSTATUS(status) != EXIT_SUCCESS)
		exit(WEXITSTATUS(status));

	/* abnormal termination is never a success */
	if (WIFSIGNALED(status))
		errx(EXIT_FAILURE, _("child killed by signal %d"),
		     WTERMSIG(status));
}
255
256 /**
257 * sync_with_child() - Tell our child we're ready and wait for it to exit
258 * @pid: The pid of our child
259 * @fd: A file descriptor created with eventfd()
260 *
261 * This tells a child created with fork_and_wait() that we are ready for it to
262 * continue. Once we have done that, wait for our child to exit.
263 */
264 static void sync_with_child(pid_t pid, int fd)
265 {
266 uint64_t ch = PIPE_SYNC_BYTE;
267
268 write_all(fd, &ch, sizeof(ch));
269 close(fd);
270
271 waitchild(pid);
272 }
273
274 /**
275 * fork_and_wait() - Fork and wait to be sync'd with
276 * @fd - A file descriptor created with eventfd() which should be passed to
277 * sync_with_child()
278 *
279 * This creates an eventfd and forks. The parent process returns immediately,
280 * but the child waits for a %PIPE_SYNC_BYTE on the eventfd before returning.
281 * This allows the parent to perform some tasks before the child starts its
282 * work. The parent should call sync_with_child() once it is ready for the
283 * child to continue.
284 *
285 * Return: The pid from fork()
286 */
287 static pid_t fork_and_wait(int *fd)
288 {
289 pid_t pid;
290 uint64_t ch;
291
292 *fd = eventfd(0, 0);
293 if (*fd < 0)
294 err(EXIT_FAILURE, _("eventfd failed"));
295
296 pid = fork();
297 if (pid < 0)
298 err(EXIT_FAILURE, _("fork failed"));
299
300 if (!pid) {
301 /* wait for the our parent to tell us to continue */
302 if (read_all(*fd, (char *)&ch, sizeof(ch)) != sizeof(ch) ||
303 ch != PIPE_SYNC_BYTE)
304 err(EXIT_FAILURE, _("failed to read eventfd"));
305 close(*fd);
306 }
307
308 return pid;
309 }
310
/*
 * Fork a helper that bind-mounts the caller's namespace files.
 *
 * The caller's mount namespace inode is sampled before forking.  The helper
 * created by fork_and_wait() stays blocked until the caller runs
 * sync_with_child() on *@fd; it then exits with EXIT_FAILURE if the caller's
 * mount namespace inode is still the same, and otherwise runs
 * bind_ns_files() for the caller's pid and exits with EXIT_SUCCESS.
 *
 * Returns the helper's pid to the caller; never returns in the helper.
 */
static pid_t bind_ns_files_from_child(int *fd)
{
	pid_t parent = getpid();
	ino_t before = get_mnt_ino(parent);
	pid_t child = fork_and_wait(fd);

	if (child != 0)
		return child;

	/* helper, running after the parent's go-ahead */
	if (get_mnt_ino(parent) == before)
		exit(EXIT_FAILURE);

	bind_ns_files(parent);
	exit(EXIT_SUCCESS);
}
325
/*
 * Resolve @s to a uid.
 *
 * @s is first looked up as a user name with xgetpwnam(); when that finds
 * nothing it is parsed as a number by strtoul_or_err(), which is handed @err
 * as its error message.
 */
static uid_t get_user(const char *s, const char *err)
{
	char *buf = NULL;
	struct passwd *pw = xgetpwnam(s, &buf);
	uid_t uid;

	if (!pw)
		return strtoul_or_err(s, err);

	uid = pw->pw_uid;
	free(pw);
	free(buf);

	return uid;
}
343
/*
 * Resolve @s to a gid.
 *
 * @s is first looked up as a group name with xgetgrnam(); when that finds
 * nothing it is parsed as a number by strtoul_or_err(), which is handed @err
 * as its error message.
 */
static gid_t get_group(const char *s, const char *err)
{
	char *buf = NULL;
	struct group *gr = xgetgrnam(s, &buf);
	gid_t gid;

	if (!gr)
		return strtoul_or_err(s, err);

	gid = gr->gr_gid;
	free(gr);
	free(buf);

	return gid;
}
361
/**
 * struct map_range - One contiguous run of IDs to map
 * @outer: First ID of the run outside of the namespace
 * @inner: First ID of the run inside of the namespace
 * @count: Number of IDs in the run (same on both sides)
 * @next: Next run in the singly linked chain, or %NULL at the end
 *
 * Chains of these describe the uid/gid ranges handed to new[gu]idmap.
 */
struct map_range {
	unsigned int outer;		/* first ID outside */
	unsigned int inner;		/* first ID inside */
	unsigned int count;		/* length of the run */
	struct map_range *next;		/* rest of the chain */
};
377
378 static void insert_map_range(struct map_range **chain, struct map_range map)
379 {
380 struct map_range *tail = *chain;
381 *chain = xmalloc(sizeof(**chain));
382 memcpy(*chain, &map, sizeof(**chain));
383 (*chain)->next = tail;
384 }
385
386 /**
387 * get_map_range() - Parse a mapping range from a string
388 * @s: A string of the format inner:outer:count or outer,inner,count
389 *
390 * Parse a string of the form inner:outer:count or outer,inner,count into
391 * a new mapping range.
392 *
393 * Return: A struct map_range
394 */
395 static struct map_range get_map_range(const char *s)
396 {
397 int end;
398 struct map_range ret;
399
400 ret.next = NULL;
401
402 if (sscanf(s, "%u:%u:%u%n", &ret.inner, &ret.outer, &ret.count,
403 &end) >= 3 && !s[end])
404 return ret; /* inner:outer:count */
405
406 if (sscanf(s, "%u,%u,%u%n", &ret.outer, &ret.inner, &ret.count,
407 &end) >= 3 && !s[end])
408 return ret; /* outer,inner,count */
409
410 errx(EXIT_FAILURE, _("invalid mapping '%s'"), s);
411 }
412
413 /**
414 * read_subid_range() - Look up a user's sub[gu]id range
415 * @filename: The file to look up the range from. This should be either
416 * ``/etc/subuid`` or ``/etc/subgid``.
417 * @uid: The uid of the user whose range we should look up.
418 *
419 * This finds the first subid range matching @uid in @filename.
420 */
421 static struct map_range read_subid_range(char *filename, uid_t uid)
422 {
423 char *line = NULL, *pwbuf;
424 FILE *idmap;
425 size_t n = 0;
426 struct passwd *pw;
427 struct map_range map;
428
429 map.inner = -1;
430 map.next = NULL;
431
432 pw = xgetpwuid(uid, &pwbuf);
433 if (!pw)
434 errx(EXIT_FAILURE, _("you (user %d) don't exist."), uid);
435
436 idmap = fopen(filename, "r");
437 if (!idmap)
438 err(EXIT_FAILURE, _("could not open '%s'"), filename);
439
440 /*
441 * Each line in sub[ug]idmap looks like
442 * username:subuid:count
443 * OR
444 * uid:subuid:count
445 */
446 while (getline(&line, &n, idmap) != -1) {
447 char *rest, *s;
448
449 rest = strchr(line, ':');
450 if (!rest)
451 continue;
452 *rest = '\0';
453
454 if (strcmp(line, pw->pw_name) &&
455 strtoul(line, NULL, 10) != pw->pw_uid)
456 continue;
457
458 s = rest + 1;
459 rest = strchr(s, ':');
460 if (!rest)
461 continue;
462 *rest = '\0';
463 map.outer = strtoul_or_err(s, _("failed to parse subid map"));
464
465 s = rest + 1;
466 rest = strchr(s, '\n');
467 if (rest)
468 *rest = '\0';
469 map.count = strtoul_or_err(s, _("failed to parse subid map"));
470
471 fclose(idmap);
472 free(pw);
473 free(pwbuf);
474
475 return map;
476 }
477
478 errx(EXIT_FAILURE, _("no line matching user \"%s\" in %s"),
479 pw->pw_name, filename);
480 }
481
482 /**
483 * add_single_map_range() - Add a single-ID map into a list without overlap
484 * @chain: A linked list of ID range mappings
485 * @outer: ID outside the namespace for a single map.
486 * @inner: ID inside the namespace for a single map, or -1 for no map.
487 *
488 * Prepend a mapping to @chain for the single ID @outer to the single ID
489 * @inner. The tricky bit is that we cannot let existing mappings overlap it.
490 * We accomplish this by removing a "hole" from each existing range @map, if
491 * @outer or @inner overlap it. This may result in one less than @map->count
492 * IDs being mapped from @map. The unmapped IDs are always the topmost IDs
493 * of the mapping (either in the parent or the child namespace).
494 *
495 * Most of the time, this function will be called with a single mapping range
496 * @map, @map->outer as some large ID, @map->inner as 0, and @map->count as a
497 * large number (at least 1000, but less than @map->outer). Typically, there
498 * will be no conflict with @outer. However, @inner may split the mapping for
499 * e.g. --map-current-user.
500 */
501
502 static void add_single_map_range(struct map_range **chain, unsigned int outer,
503 unsigned int inner)
504 {
505 struct map_range *map = *chain;
506
507 if (inner + 1 == 0)
508 outer = (unsigned int) -1;
509 *chain = NULL;
510
511 while (map) {
512 struct map_range lo, mid, hi, *next = map->next;
513 unsigned int inner_offset, outer_offset;
514
515 /*
516 * Start inner IDs from zero for an auto mapping; otherwise, if
517 * the single mapping exists and overlaps the range, remove an ID
518 */
519 if (map->inner + 1 == 0)
520 map->inner = 0;
521 else if (inner + 1 != 0 &&
522 ((outer >= map->outer && outer <= map->outer + map->count) ||
523 (inner >= map->inner && inner <= map->inner + map->count)))
524 map->count--;
525
526 /* Determine where the splits between lo, mid, and hi will be */
527 outer_offset = min(outer > map->outer ? outer - map->outer : 0,
528 map->count);
529 inner_offset = min(inner > map->inner ? inner - map->inner : 0,
530 map->count);
531
532 /*
533 * In the worst case, we need three mappings:
534 * From the bottom of map to either inner or outer
535 */
536 lo.outer = map->outer;
537 lo.inner = map->inner;
538 lo.count = min(inner_offset, outer_offset);
539
540 /* From the lower of inner or outer to the higher */
541 mid.outer = lo.outer + lo.count;
542 mid.outer += mid.outer == outer;
543 mid.inner = lo.inner + lo.count;
544 mid.inner += mid.inner == inner;
545 mid.count = abs_diff(outer_offset, inner_offset);
546
547 /* And from the higher of inner or outer to the end of the map */
548 hi.outer = mid.outer + mid.count;
549 hi.outer += hi.outer == outer;
550 hi.inner = mid.inner + mid.count;
551 hi.inner += hi.inner == inner;
552 hi.count = map->count - lo.count - mid.count;
553
554 /* Insert non-empty mappings into the output chain */
555 if (hi.count)
556 insert_map_range(chain, hi);
557 if (mid.count)
558 insert_map_range(chain, mid);
559 if (lo.count)
560 insert_map_range(chain, lo);
561
562 free(map);
563 map = next;
564 }
565
566 if (inner + 1 != 0) {
567 /* Insert single ID mapping as the first entry in the chain */
568 insert_map_range(chain, (struct map_range) {
569 .inner = inner,
570 .outer = outer,
571 .count = 1
572 });
573 }
574 }
575
576 /**
577 * map_ids() - Create a new uid/gid map
578 * @idmapper: Either newuidmap or newgidmap
579 * @ppid: Pid to set the map for
580 * @chain: A linked list of ID range mappings
581 *
582 * This creates a new uid/gid map for @ppid using @idmapper to set the
583 * mapping for each of the ranges in @chain.
584 *
585 * This function always exec()s or errors out and does not return.
586 */
587 static void __attribute__((__noreturn__))
588 map_ids(const char *idmapper, int ppid, struct map_range *chain)
589 {
590 unsigned int i = 0, length = 3;
591 char **argv;
592
593 for (struct map_range *map = chain; map; map = map->next)
594 length += 3;
595 argv = xcalloc(length, sizeof(*argv));
596 argv[i++] = xstrdup(idmapper);
597 xasprintf(&argv[i++], "%u", ppid);
598
599 for (struct map_range *map = chain; map; map = map->next) {
600 xasprintf(&argv[i++], "%u", map->inner);
601 xasprintf(&argv[i++], "%u", map->outer);
602 xasprintf(&argv[i++], "%u", map->count);
603 }
604
605 argv[i] = NULL;
606 execvp(idmapper, argv);
607 errexec(idmapper);
608 }
609
/**
 * map_ids_from_child() - Set up a new uid/gid map
 * @fd: The eventfd to wait on
 * @mapuser: The user to map the current user to (or -1)
 * @usermap: The range of UIDs to map (or %NULL)
 * @mapgroup: The group to map the current group to (or -1)
 * @groupmap: The range of GIDs to map (or %NULL)
 *
 * fork_and_wait() for our parent to call sync_with_child() on @fd. Upon
 * receiving the go-ahead, merge the single-ID mappings into the ranges and
 * use newuidmap and newgidmap to set the uid/gid map for our parent's PID.
 * When both maps are wanted, newuidmap runs in an extra child that is waited
 * for before newgidmap is executed.
 *
 * Return: The pid of the child. The child itself never returns.
 */
static pid_t map_ids_from_child(int *fd, uid_t mapuser,
				struct map_range *usermap, gid_t mapgroup,
				struct map_range *groupmap)
{
	pid_t parent = getpid();
	pid_t helper = 0;
	pid_t child = fork_and_wait(fd);

	if (child)
		return child;

	if (usermap)
		add_single_map_range(&usermap, geteuid(), mapuser);
	if (groupmap)
		add_single_map_range(&groupmap, getegid(), mapgroup);

	/* A second process is only needed when both tools have to run */
	if (usermap && groupmap) {
		helper = fork();
		if (helper < 0)
			err(EXIT_FAILURE, _("fork failed"));
		if (helper)
			waitchild(helper);
	}

	/* helper is 0 both in the extra child and when no fork happened */
	if (!helper && usermap)
		map_ids("newuidmap", parent, usermap);
	if (groupmap)
		map_ids("newgidmap", parent, groupmap);
	exit(EXIT_SUCCESS);
}
655
656 static void __attribute__((__noreturn__)) usage(void)
657 {
658 FILE *out = stdout;
659
660 fputs(USAGE_HEADER, out);
661 fprintf(out, _(" %s [options] [<program> [<argument>...]]\n"),
662 program_invocation_short_name);
663
664 fputs(USAGE_SEPARATOR, out);
665 fputs(_("Run a program with some namespaces unshared from the parent.\n"), out);
666
667 fputs(USAGE_OPTIONS, out);
668 fputs(_(" -m, --mount[=<file>] unshare mounts namespace\n"), out);
669 fputs(_(" -u, --uts[=<file>] unshare UTS namespace (hostname etc)\n"), out);
670 fputs(_(" -i, --ipc[=<file>] unshare System V IPC namespace\n"), out);
671 fputs(_(" -n, --net[=<file>] unshare network namespace\n"), out);
672 fputs(_(" -p, --pid[=<file>] unshare pid namespace\n"), out);
673 fputs(_(" -U, --user[=<file>] unshare user namespace\n"), out);
674 fputs(_(" -C, --cgroup[=<file>] unshare cgroup namespace\n"), out);
675 fputs(_(" -T, --time[=<file>] unshare time namespace\n"), out);
676 fputs(USAGE_SEPARATOR, out);
677 fputs(_(" -f, --fork fork before launching <program>\n"), out);
678 fputs(_(" --map-user=<uid>|<name> map current user to uid (implies --user)\n"), out);
679 fputs(_(" --map-group=<gid>|<name> map current group to gid (implies --user)\n"), out);
680 fputs(_(" -r, --map-root-user map current user to root (implies --user)\n"), out);
681 fputs(_(" -c, --map-current-user map current user to itself (implies --user)\n"), out);
682 fputs(_(" --map-auto map users and groups automatically (implies --user)\n"), out);
683 fputs(_(" --map-users=<inneruid>:<outeruid>:<count>\n"
684 " map count users from outeruid to inneruid (implies --user)\n"), out);
685 fputs(_(" --map-groups=<innergid>:<outergid>:<count>\n"
686 " map count groups from outergid to innergid (implies --user)\n"), out);
687 fputs(USAGE_SEPARATOR, out);
688 fputs(_(" --kill-child[=<signame>] when dying, kill the forked child (implies --fork)\n"
689 " defaults to SIGKILL\n"), out);
690 fputs(_(" --mount-proc[=<dir>] mount proc filesystem first (implies --mount)\n"), out);
691 fputs(_(" --propagation slave|shared|private|unchanged\n"
692 " modify mount propagation in mount namespace\n"), out);
693 fputs(_(" --setgroups allow|deny control the setgroups syscall in user namespaces\n"), out);
694 fputs(_(" --keep-caps retain capabilities granted in user namespaces\n"), out);
695 fputs(USAGE_SEPARATOR, out);
696 fputs(_(" -R, --root=<dir> run the command with root directory set to <dir>\n"), out);
697 fputs(_(" -w, --wd=<dir> change working directory to <dir>\n"), out);
698 fputs(_(" -S, --setuid <uid> set uid in entered namespace\n"), out);
699 fputs(_(" -G, --setgid <gid> set gid in entered namespace\n"), out);
700 fputs(_(" --monotonic <offset> set clock monotonic offset (seconds) in time namespaces\n"), out);
701 fputs(_(" --boottime <offset> set clock boottime offset (seconds) in time namespaces\n"), out);
702
703 fputs(USAGE_SEPARATOR, out);
704 fprintf(out, USAGE_HELP_OPTIONS(27));
705 fprintf(out, USAGE_MAN_TAIL("unshare(1)"));
706
707 exit(EXIT_SUCCESS);
708 }
709
/*
 * Entry point.
 *
 * Parses the command line, forks the optional helper processes, calls
 * unshare(2) with the collected CLONE_NEW* flags and then, in this order,
 * applies the time offsets, forks if requested, makes namespaces persistent,
 * writes the single-ID maps and the setgroups policy, changes mount
 * propagation, root and working directory, mounts proc, switches gid/uid,
 * raises ambient capabilities and finally executes the requested program,
 * or calls exec_shell() when no program was given.
 *
 * With --fork the parent only waits for the child and mirrors its exit
 * status or terminating signal; everything after that point runs in the
 * child.
 */
int main(int argc, char *argv[])
{
	/* Codes for options without a short form; they start above CHAR_MAX
	 * so they cannot collide with a short option character */
	enum {
		OPT_MOUNTPROC = CHAR_MAX + 1,
		OPT_PROPAGATION,
		OPT_SETGROUPS,
		OPT_KILLCHILD,
		OPT_KEEPCAPS,
		OPT_MONOTONIC,
		OPT_BOOTTIME,
		OPT_MAPUSER,
		OPT_MAPUSERS,
		OPT_MAPGROUP,
		OPT_MAPGROUPS,
		OPT_MAPAUTO,
	};
	static const struct option longopts[] = {
		{ "help", no_argument, NULL, 'h' },
		{ "version", no_argument, NULL, 'V' },

		{ "mount", optional_argument, NULL, 'm' },
		{ "uts", optional_argument, NULL, 'u' },
		{ "ipc", optional_argument, NULL, 'i' },
		{ "net", optional_argument, NULL, 'n' },
		{ "pid", optional_argument, NULL, 'p' },
		{ "user", optional_argument, NULL, 'U' },
		{ "cgroup", optional_argument, NULL, 'C' },
		{ "time", optional_argument, NULL, 'T' },

		{ "fork", no_argument, NULL, 'f' },
		{ "kill-child", optional_argument, NULL, OPT_KILLCHILD },
		{ "mount-proc", optional_argument, NULL, OPT_MOUNTPROC },
		{ "map-user", required_argument, NULL, OPT_MAPUSER },
		{ "map-users", required_argument, NULL, OPT_MAPUSERS },
		{ "map-group", required_argument, NULL, OPT_MAPGROUP },
		{ "map-groups", required_argument, NULL, OPT_MAPGROUPS },
		{ "map-root-user", no_argument, NULL, 'r' },
		{ "map-current-user", no_argument, NULL, 'c' },
		{ "map-auto", no_argument, NULL, OPT_MAPAUTO },
		{ "propagation", required_argument, NULL, OPT_PROPAGATION },
		{ "setgroups", required_argument, NULL, OPT_SETGROUPS },
		{ "keep-caps", no_argument, NULL, OPT_KEEPCAPS },
		{ "setuid", required_argument, NULL, 'S' },
		{ "setgid", required_argument, NULL, 'G' },
		{ "root", required_argument, NULL, 'R' },
		{ "wd", required_argument, NULL, 'w' },
		{ "monotonic", required_argument, NULL, OPT_MONOTONIC },
		{ "boottime", required_argument, NULL, OPT_BOOTTIME },
		{ NULL, 0, NULL, 0 }
	};

	int setgrpcmd = SETGROUPS_NONE;
	int unshare_flags = 0;
	int c, forkit = 0;
	/* -1 means "no single-ID mapping requested" */
	uid_t mapuser = -1;
	gid_t mapgroup = -1;
	/* chains of ranges collected from --map-users/--map-groups/--map-auto */
	struct map_range *usermap = NULL;
	struct map_range *groupmap = NULL;
	int kill_child_signo = 0; /* 0 means --kill-child was not used */
	const char *procmnt = NULL;
	const char *newroot = NULL;
	const char *newdir = NULL;
	/* helper processes from bind_ns_files_from_child()/map_ids_from_child() */
	pid_t pid_bind = 0, pid_idmap = 0;
	/* result of the --fork fork(); stays 0 when not forking and in the child */
	pid_t pid = 0;
#ifdef UL_HAVE_PIDFD
	int fd_parent_pid = -1;
#endif
	/* eventfds used to release the helper processes */
	int fd_idmap, fd_bind = -1;
	sigset_t sigset, oldsigset;
	int status;
	unsigned long propagation = UNSHARE_PROPAGATION_DEFAULT;
	int force_uid = 0, force_gid = 0;
	/* the effective IDs are sampled before any namespace is entered */
	uid_t uid = 0, real_euid = geteuid();
	gid_t gid = 0, real_egid = getegid();
	int keepcaps = 0;
	int64_t monotonic = 0;
	int64_t boottime = 0;
	int force_monotonic = 0;
	int force_boottime = 0;

	setlocale(LC_ALL, "");
	bindtextdomain(PACKAGE, LOCALEDIR);
	textdomain(PACKAGE);
	close_stdout_atexit();

	/* The leading '+' stops option parsing at the first non-option
	 * argument, so options of <program> are left alone */
	while ((c = getopt_long(argc, argv, "+fhVmuinpCTUrR:w:S:G:c", longopts, NULL)) != -1) {
		switch (c) {
		case 'f':
			forkit = 1;
			break;
		/* Namespace options: remember the flag and, if a file was
		 * given, the bind-mount target that makes it persistent */
		case 'm':
			unshare_flags |= CLONE_NEWNS;
			if (optarg)
				set_ns_target(CLONE_NEWNS, optarg);
			break;
		case 'u':
			unshare_flags |= CLONE_NEWUTS;
			if (optarg)
				set_ns_target(CLONE_NEWUTS, optarg);
			break;
		case 'i':
			unshare_flags |= CLONE_NEWIPC;
			if (optarg)
				set_ns_target(CLONE_NEWIPC, optarg);
			break;
		case 'n':
			unshare_flags |= CLONE_NEWNET;
			if (optarg)
				set_ns_target(CLONE_NEWNET, optarg);
			break;
		case 'p':
			unshare_flags |= CLONE_NEWPID;
			if (optarg)
				set_ns_target(CLONE_NEWPID, optarg);
			break;
		case 'U':
			unshare_flags |= CLONE_NEWUSER;
			if (optarg)
				set_ns_target(CLONE_NEWUSER, optarg);
			break;
		case 'C':
			unshare_flags |= CLONE_NEWCGROUP;
			if (optarg)
				set_ns_target(CLONE_NEWCGROUP, optarg);
			break;
		case 'T':
			unshare_flags |= CLONE_NEWTIME;
			if (optarg)
				set_ns_target(CLONE_NEWTIME, optarg);
			break;
		case OPT_MOUNTPROC:
			unshare_flags |= CLONE_NEWNS;
			procmnt = optarg ? optarg : "/proc";
			break;
		/* All mapping options imply a new user namespace */
		case OPT_MAPUSER:
			unshare_flags |= CLONE_NEWUSER;
			mapuser = get_user(optarg, _("failed to parse uid"));
			break;
		case OPT_MAPGROUP:
			unshare_flags |= CLONE_NEWUSER;
			mapgroup = get_group(optarg, _("failed to parse gid"));
			break;
		case 'r':
			unshare_flags |= CLONE_NEWUSER;
			mapuser = 0;
			mapgroup = 0;
			break;
		case 'c':
			unshare_flags |= CLONE_NEWUSER;
			mapuser = real_euid;
			mapgroup = real_egid;
			break;
		case OPT_MAPUSERS:
			unshare_flags |= CLONE_NEWUSER;
			if (!strcmp(optarg, "auto"))
				insert_map_range(&usermap,
					read_subid_range(_PATH_SUBUID, real_euid));
			else
				insert_map_range(&usermap, get_map_range(optarg));
			break;
		case OPT_MAPGROUPS:
			unshare_flags |= CLONE_NEWUSER;
			/* the subgid file is searched with the effective uid too */
			if (!strcmp(optarg, "auto"))
				insert_map_range(&groupmap,
					read_subid_range(_PATH_SUBGID, real_euid));
			else
				insert_map_range(&groupmap, get_map_range(optarg));
			break;
		case OPT_MAPAUTO:
			unshare_flags |= CLONE_NEWUSER;
			insert_map_range(&usermap, read_subid_range(_PATH_SUBUID, real_euid));
			insert_map_range(&groupmap, read_subid_range(_PATH_SUBGID, real_euid));
			break;
		case OPT_SETGROUPS:
			setgrpcmd = setgroups_str2id(optarg);
			break;
		case OPT_PROPAGATION:
			propagation = parse_propagation(optarg);
			break;
		case OPT_KILLCHILD:
			forkit = 1;
			if (optarg) {
				if ((kill_child_signo = signame_to_signum(optarg)) < 0)
					errx(EXIT_FAILURE, _("unknown signal: %s"),
						optarg);
			} else {
				kill_child_signo = SIGKILL;
			}
			break;
		case OPT_KEEPCAPS:
			keepcaps = 1;
			cap_last_cap(); /* Force last cap to be cached before we fork. */
			break;
		case 'S':
			uid = strtoul_or_err(optarg, _("failed to parse uid"));
			force_uid = 1;
			break;
		case 'G':
			gid = strtoul_or_err(optarg, _("failed to parse gid"));
			force_gid = 1;
			break;
		case 'R':
			newroot = optarg;
			break;
		case 'w':
			newdir = optarg;
			break;
		case OPT_MONOTONIC:
			monotonic = strtos64_or_err(optarg, _("failed to parse monotonic offset"));
			force_monotonic = 1;
			break;
		case OPT_BOOTTIME:
			boottime = strtos64_or_err(optarg, _("failed to parse boottime offset"));
			force_boottime = 1;
			break;

		/* usage() is declared noreturn, so there is no fall through */
		case 'h':
			usage();
		case 'V':
			print_version(EXIT_SUCCESS);
		default:
			errtryhelp(EXIT_FAILURE);
		}
	}

	if ((force_monotonic || force_boottime) && !(unshare_flags & CLONE_NEWTIME))
		errx(EXIT_FAILURE, _("options --monotonic and --boottime require "
			"unsharing of a time namespace (-T)"));

	/* clear any inherited settings */
	signal(SIGCHLD, SIG_DFL);

	/* Both helpers are forked before unshare() and stay blocked on
	 * their eventfd until sync_with_child() releases them */
	if (npersists && (unshare_flags & CLONE_NEWNS))
		pid_bind = bind_ns_files_from_child(&fd_bind);

	if (usermap || groupmap)
		pid_idmap = map_ids_from_child(&fd_idmap, mapuser, usermap,
					       mapgroup, groupmap);

	if (-1 == unshare(unshare_flags))
		err(EXIT_FAILURE, _("unshare failed"));

	/* Tell child we've called unshare() */
	if (usermap || groupmap)
		sync_with_child(pid_idmap, fd_idmap);

	if (force_boottime)
		settime(boottime, CLOCK_BOOTTIME);

	if (force_monotonic)
		settime(monotonic, CLOCK_MONOTONIC);

	if (forkit) {
		/* Block SIGINT and SIGTERM around the fork; the child puts
		 * the previous mask back, the parent keeps them blocked */
		if (sigemptyset(&sigset) != 0 ||
			sigaddset(&sigset, SIGINT) != 0 ||
			sigaddset(&sigset, SIGTERM) != 0 ||
			sigprocmask(SIG_BLOCK, &sigset, &oldsigset) != 0)
			err(EXIT_FAILURE, _("sigprocmask block failed"));
#ifdef UL_HAVE_PIDFD
		if (kill_child_signo != 0) {
			/* make a connection to the original process (parent) */
			fd_parent_pid = pidfd_open(getpid(), 0);
			if (0 > fd_parent_pid)
				err(EXIT_FAILURE, _("pidfd_open failed"));
		}
#endif
		/* force child forking before mountspace binding so
		 * pid_for_children is populated */
		pid = fork();

		switch(pid) {
		case -1:
			err(EXIT_FAILURE, _("fork failed"));
		case 0: /* child */
			if (sigprocmask(SIG_SETMASK, &oldsigset, NULL))
				err(EXIT_FAILURE,
					_("sigprocmask restore failed"));
			/* the bind helper's eventfd is only used by the parent */
			if (npersists && (unshare_flags & CLONE_NEWNS))
				close(fd_bind);
			break;
		default: /* parent */
			break;
		}
	}

	if (npersists && (pid || !forkit)) {
		/* run in parent */
		if (pid_bind && (unshare_flags & CLONE_NEWNS))
			sync_with_child(pid_bind, fd_bind);
		else
			/* simple way, just bind */
			bind_ns_files(getpid());
	}

	/* Parent of a --fork: wait for the child and mirror how it ended */
	if (pid) {
		if (waitpid(pid, &status, 0) == -1)
			err(EXIT_FAILURE, _("waitpid failed"));

		if (WIFEXITED(status))
			return WEXITSTATUS(status);
		if (WIFSIGNALED(status)) {

			/* Ensure the signal that terminated the child will
			 * also terminate the parent. */

			int termsig = WTERMSIG(status);

			if (signal(termsig, SIG_DFL) == SIG_ERR ||
				sigemptyset(&sigset) != 0 ||
				sigaddset(&sigset, termsig) != 0 ||
				sigprocmask(SIG_UNBLOCK, &sigset, NULL) != 0)
				err(EXIT_FAILURE,
					_("sigprocmask unblock failed"));

			kill(getpid(), termsig);
		}
		/* only reached if the signal did not terminate us */
		err(EXIT_FAILURE, _("child exit failed"));
	}

	/* From here on we are the process that will exec the program */
	if (kill_child_signo != 0) {
		if (prctl(PR_SET_PDEATHSIG, kill_child_signo) < 0)
			err(EXIT_FAILURE, "prctl failed");
#ifdef UL_HAVE_PIDFD
		/* Use poll() to check that there is still the original parent. */
		if (fd_parent_pid != -1) {
			struct pollfd pollfds[1] = {
				{ .fd = fd_parent_pid, .events = POLLIN }
			};
			int nfds = poll(pollfds, 1, 0);

			if (0 > nfds)
				err(EXIT_FAILURE, "poll parent pidfd failed");

			/* If the child was re-parented before prctl(2) was called, the
			 * new parent will likely not be interested in the precise exit
			 * status of the orphan.
			 */
			if (nfds)
				exit(EXIT_FAILURE);

			close(fd_parent_pid);
			fd_parent_pid = -1;
		}
#endif
	}

	/* Single-ID maps are written directly only when no range map was
	 * requested; otherwise map_ids_from_child() has handled them */
	if (mapuser != (uid_t) -1 && !usermap)
		map_id(_PATH_PROC_UIDMAP, mapuser, real_euid);

	/* Since Linux 3.19 unprivileged writing of /proc/self/gid_map
	 * has been disabled unless /proc/self/setgroups is written
	 * first to permanently disable the ability to call setgroups
	 * in that user namespace. */
	if (mapgroup != (gid_t) -1 && !groupmap) {
		if (setgrpcmd == SETGROUPS_ALLOW)
			errx(EXIT_FAILURE, _("options --setgroups=allow and "
				"--map-group are mutually exclusive"));
		setgroups_control(SETGROUPS_DENY);
		map_id(_PATH_PROC_GIDMAP, mapgroup, real_egid);
	}

	if (setgrpcmd != SETGROUPS_NONE)
		setgroups_control(setgrpcmd);

	if ((unshare_flags & CLONE_NEWNS) && propagation)
		set_propagation(propagation);

	if (newroot) {
		if (chroot(newroot) != 0)
			err(EXIT_FAILURE,
				_("cannot change root directory to '%s'"), newroot);
		/* without --wd, start in the new root itself */
		newdir = newdir ?: "/";
	}
	if (newdir && chdir(newdir))
		err(EXIT_FAILURE, _("cannot chdir to '%s'"), newdir);

	if (procmnt) {
		/* When not changing root and using the default propagation flags
		   then the recursive propagation change of root will
		   automatically change that of an existing proc mount. */
		if (!newroot && propagation != (MS_PRIVATE|MS_REC)) {
			int rc = mount("none", procmnt, NULL, MS_PRIVATE|MS_REC, NULL);

			/* Custom procmnt means that proc is very likely not mounted, causing EINVAL.
			   Ignoring the error in this specific instance is considered safe. */
			if(rc != 0 && errno != EINVAL)
				err(EXIT_FAILURE, _("cannot change %s filesystem propagation"), procmnt);
		}

		if (mount("proc", procmnt, "proc", MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL) != 0)
			err(EXIT_FAILURE, _("mount %s failed"), procmnt);
	}

	/* The group is changed first; the uid switch comes last */
	if (force_gid) {
		if (setgroups(0, NULL) != 0) /* drop supplementary groups */
			err(EXIT_FAILURE, _("setgroups failed"));
		if (setgid(gid) < 0) /* change GID */
			err(EXIT_FAILURE, _("setgid failed"));
	}
	if (force_uid && setuid(uid) < 0) /* change UID */
		err(EXIT_FAILURE, _("setuid failed"));

	if (keepcaps && (unshare_flags & CLONE_NEWUSER))
		cap_permitted_to_ambient();

	/* Run the given program, or exec_shell() when there is none */
	if (optind < argc) {
		execvp(argv[optind], argv + optind);
		errexec(argv[optind]);
	}
	exec_shell();
}