]> git.ipfire.org Git - thirdparty/util-linux.git/blame - sys-utils/unshare.c
Merge branch 'PR/lscpu-caches-sep' of github.com:karelzak/util-linux-work
[thirdparty/util-linux.git] / sys-utils / unshare.c
CommitLineData
4205f1fd
MG
1/*
2 * unshare(1) - command-line interface for unshare(2)
3 *
4 * Copyright (C) 2009 Mikhail Gusarov <dottedmag@dottedmag.net>
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License as published by the
8 * Free Software Foundation; either version 2, or (at your option) any
9 * later version.
10 *
11 * This program is distributed in the hope that it will be useful, but
12 * WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License along
17 * with this program; if not, write to the Free Software Foundation, Inc.,
7cebf0bb 18 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
4205f1fd
MG
19 */
20
4205f1fd
MG
21#include <errno.h>
22#include <getopt.h>
1f248f73 23#include <poll.h>
4205f1fd
MG
24#include <sched.h>
25#include <stdio.h>
26#include <stdlib.h>
27#include <unistd.h>
ff5dc96e 28#include <sys/eventfd.h>
5088ec33 29#include <sys/wait.h>
6728ca10 30#include <sys/mount.h>
c84f2590
KZ
31#include <sys/types.h>
32#include <sys/stat.h>
8e8f0fa5 33#include <sys/prctl.h>
f0af42b5 34#include <grp.h>
c84f2590 35
d754315c
RM
36/* we only need some defines missing in sys/mount.h, no libmount linkage */
37#include <libmount.h>
38
4205f1fd 39#include "nls.h"
eb76ca98 40#include "c.h"
cef4decf 41#include "caputils.h"
efb8854f 42#include "closestream.h"
c91280a4 43#include "namespace.h"
1f248f73 44#include "pidfd-utils.h"
57580694 45#include "exec_shell.h"
4da21e37
LR
46#include "xalloc.h"
47#include "pathnames.h"
48#include "all-io.h"
8b39a17c 49#include "signames.h"
f0af42b5 50#include "strutils.h"
987550cb 51#include "pwdutils.h"
4da21e37 52
99fcafdf
YK
53/* synchronize parent and child by pipe */
54#define PIPE_SYNC_BYTE 0x06
55
f0f22e9c
KZ
56/* 'private' is kernel default */
57#define UNSHARE_PROPAGATION_DEFAULT (MS_REC | MS_PRIVATE)
58
0490a6ca
KZ
59/* /proc namespace files and mountpoints for binds */
60static struct namespace_file {
61 int type; /* CLONE_NEW* */
62 const char *name; /* ns/<type> */
63 const char *target; /* user specified target for bind mount */
64} namespace_files[] = {
f9e7b66d
SH
65 { .type = CLONE_NEWUSER, .name = "ns/user" },
66 { .type = CLONE_NEWCGROUP,.name = "ns/cgroup" },
67 { .type = CLONE_NEWIPC, .name = "ns/ipc" },
68 { .type = CLONE_NEWUTS, .name = "ns/uts" },
69 { .type = CLONE_NEWNET, .name = "ns/net" },
0d5260b6 70 { .type = CLONE_NEWPID, .name = "ns/pid_for_children" },
f9e7b66d 71 { .type = CLONE_NEWNS, .name = "ns/mnt" },
0d5260b6 72 { .type = CLONE_NEWTIME, .name = "ns/time_for_children" },
0490a6ca
KZ
73 { .name = NULL }
74};
75
76static int npersists; /* number of persistent namespaces */
77
fbceefde
KZ
/* Actions selectable with --setgroups; SETGROUPS_NONE has no keyword. */
enum {
	SETGROUPS_NONE = -1,
	SETGROUPS_DENY = 0,
	SETGROUPS_ALLOW = 1,
};

/* Keyword for each action, indexed by the SETGROUPS_* value. */
static const char *setgroups_strings[] = {
	[SETGROUPS_DENY] = "deny",
	[SETGROUPS_ALLOW] = "allow",
};
89
90static int setgroups_str2id(const char *str)
91{
92 size_t i;
93
94 for (i = 0; i < ARRAY_SIZE(setgroups_strings); i++)
95 if (strcmp(str, setgroups_strings[i]) == 0)
96 return i;
97
98 errx(EXIT_FAILURE, _("unsupported --setgroups argument '%s'"), str);
99}
100
101static void setgroups_control(int action)
0bf15941
EB
102{
103 const char *file = _PATH_PROC_SETGROUPS;
fbceefde 104 const char *cmd;
0bf15941
EB
105 int fd;
106
fbceefde
KZ
107 if (action < 0 || (size_t) action >= ARRAY_SIZE(setgroups_strings))
108 return;
109 cmd = setgroups_strings[action];
110
0bf15941
EB
111 fd = open(file, O_WRONLY);
112 if (fd < 0) {
113 if (errno == ENOENT)
114 return;
7ff635bf 115 err(EXIT_FAILURE, _("cannot open %s"), file);
0bf15941
EB
116 }
117
fbceefde 118 if (write_all(fd, cmd, strlen(cmd)))
0bf15941
EB
119 err(EXIT_FAILURE, _("write failed %s"), file);
120 close(fd);
121}
122
4da21e37
LR
/*
 * Write the single-ID mapping "<from> <to> 1" into the map file @file.
 * Failing to open or write the file is fatal.
 */
static void map_id(const char *file, uint32_t from, uint32_t to)
{
	char *mapping;
	int fd = open(file, O_WRONLY);

	if (fd < 0)
		err(EXIT_FAILURE, _("cannot open %s"), file);

	xasprintf(&mapping, "%u %u 1", from, to);
	if (write_all(fd, mapping, strlen(mapping)) != 0)
		err(EXIT_FAILURE, _("write failed %s"), file);

	close(fd);
	free(mapping);
}
4205f1fd 138
f0f22e9c
KZ
/*
 * Convert a --propagation keyword into mount(2) propagation flags.
 * "unchanged" yields 0; an unknown keyword terminates the program.
 */
static unsigned long parse_propagation(const char *str)
{
	static const struct propagation_mode {
		const char *name;
		unsigned long flag;
	} modes[] = {
		{ "slave",	MS_REC | MS_SLAVE },
		{ "private",	MS_REC | MS_PRIVATE },
		{ "shared",	MS_REC | MS_SHARED },
		{ "unchanged",	0 }
	};
	const struct propagation_mode *mode;

	for (mode = modes; mode < modes + ARRAY_SIZE(modes); mode++)
		if (!strcmp(mode->name, str))
			return mode->flag;

	errx(EXIT_FAILURE, _("unsupported propagation mode: %s"), str);
}
159
/*
 * Apply the propagation @flags to "/". A value of 0 leaves the propagation
 * untouched; a failing mount(2) call is fatal.
 */
static void set_propagation(unsigned long flags)
{
	if (flags != 0 && mount("none", "/", NULL, flags, NULL) != 0)
		err(EXIT_FAILURE, _("cannot change root filesystem propagation"));
}
168
0490a6ca
KZ
169
170static int set_ns_target(int type, const char *path)
171{
172 struct namespace_file *ns;
173
174 for (ns = namespace_files; ns->name; ns++) {
175 if (ns->type != type)
176 continue;
177 ns->target = path;
178 npersists++;
179 return 0;
180 }
181
182 return -EINVAL;
183}
184
185static int bind_ns_files(pid_t pid)
186{
187 struct namespace_file *ns;
188 char src[PATH_MAX];
189
190 for (ns = namespace_files; ns->name; ns++) {
191 if (!ns->target)
192 continue;
193
194 snprintf(src, sizeof(src), "/proc/%u/%s", (unsigned) pid, ns->name);
195
196 if (mount(src, ns->target, NULL, MS_BIND, NULL) != 0)
197 err(EXIT_FAILURE, _("mount %s on %s failed"), src, ns->target);
198 }
199
200 return 0;
201}
202
c84f2590
KZ
203static ino_t get_mnt_ino(pid_t pid)
204{
205 struct stat st;
206 char path[PATH_MAX];
207
208 snprintf(path, sizeof(path), "/proc/%u/ns/mnt", (unsigned) pid);
209
210 if (stat(path, &st) != 0)
1293b0f6 211 err(EXIT_FAILURE, _("stat of %s failed"), path);
c84f2590
KZ
212 return st.st_ino;
213}
214
95e85389 215static void settime(int64_t offset, clockid_t clk_id)
be7df01a
AR
216{
217 char buf[sizeof(stringify_value(ULONG_MAX)) * 3];
218 int fd, len;
219
95e85389 220 len = snprintf(buf, sizeof(buf), "%d %" PRId64 " 0", clk_id, offset);
be7df01a
AR
221
222 fd = open("/proc/self/timens_offsets", O_WRONLY);
223 if (fd < 0)
224 err(EXIT_FAILURE, _("failed to open /proc/self/timens_offsets"));
225
226 if (write(fd, buf, len) != len)
227 err(EXIT_FAILURE, _("failed to write to /proc/self/timens_offsets"));
228
229 close(fd);
230}
231
82ea6298
SA
/**
 * waitchild() - Wait for a process to exit successfully
 * @pid: PID of the process to wait for
 *
 * Wait for a process to terminate, retrying when waitpid() is interrupted by
 * a signal. If the process exits with a non-zero return code, then exit()
 * with the same status. If the process was killed by a signal it did not
 * complete its work either, so this is reported as a failure instead of
 * being silently treated as success.
 */
static void waitchild(int pid)
{
	int status;

	while (waitpid(pid, &status, 0) < 0) {
		if (errno != EINTR)
			err(EXIT_FAILURE, _("waitpid failed"));
	}

	if (WIFEXITED(status) && WEXITSTATUS(status) != EXIT_SUCCESS)
		exit(WEXITSTATUS(status));

	if (WIFSIGNALED(status))
		errx(EXIT_FAILURE, _("child killed by signal %d"),
		     WTERMSIG(status));
}
255
783bb52a
SA
256/**
257 * sync_with_child() - Tell our child we're ready and wait for it to exit
258 * @pid: The pid of our child
259 * @fd: A file descriptor created with eventfd()
260 *
261 * This tells a child created with fork_and_wait() that we are ready for it to
262 * continue. Once we have done that, wait for our child to exit.
263 */
264static void sync_with_child(pid_t pid, int fd)
c84f2590 265{
783bb52a
SA
266 uint64_t ch = PIPE_SYNC_BYTE;
267
268 write_all(fd, &ch, sizeof(ch));
269 close(fd);
c84f2590 270
783bb52a
SA
271 waitchild(pid);
272}
99fcafdf 273
783bb52a
SA
274/**
275 * fork_and_wait() - Fork and wait to be sync'd with
276 * @fd - A file descriptor created with eventfd() which should be passed to
277 * sync_with_child()
278 *
279 * This creates an eventfd and forks. The parent process returns immediately,
280 * but the child waits for a %PIPE_SYNC_BYTE on the eventfd before returning.
281 * This allows the parent to perform some tasks before the child starts its
282 * work. The parent should call sync_with_child() once it is ready for the
283 * child to continue.
284 *
285 * Return: The pid from fork()
286 */
287static pid_t fork_and_wait(int *fd)
288{
289 pid_t pid;
290 uint64_t ch;
c84f2590 291
783bb52a
SA
292 *fd = eventfd(0, 0);
293 if (*fd < 0)
294 err(EXIT_FAILURE, _("eventfd failed"));
295
296 pid = fork();
297 if (pid < 0)
c84f2590 298 err(EXIT_FAILURE, _("fork failed"));
99fcafdf 299
783bb52a
SA
300 if (!pid) {
301 /* wait for the our parent to tell us to continue */
302 if (read_all(*fd, (char *)&ch, sizeof(ch)) != sizeof(ch) ||
303 ch != PIPE_SYNC_BYTE)
304 err(EXIT_FAILURE, _("failed to read eventfd"));
305 close(*fd);
c84f2590 306 }
783bb52a
SA
307
308 return pid;
309}
310
/*
 * Fork a helper that bind-mounts the namespace files of the calling process.
 *
 * The parent gets the helper's pid back immediately and @fd is set to the
 * eventfd to hand to sync_with_child(). Once released, the helper compares
 * the inode of the parent's mount namespace file with the value sampled
 * before the fork: if it is unchanged the helper exits with EXIT_FAILURE,
 * otherwise it calls bind_ns_files() for the parent and exits with
 * EXIT_SUCCESS.
 */
static pid_t bind_ns_files_from_child(int *fd)
{
	pid_t parent = getpid();
	ino_t before = get_mnt_ino(parent);
	pid_t child = fork_and_wait(fd);

	if (child != 0)
		return child;

	/* helper process from here on */
	if (get_mnt_ino(parent) == before)
		exit(EXIT_FAILURE);
	bind_ns_files(parent);
	exit(EXIT_SUCCESS);
}
325
987550cb
MHB
/*
 * Resolve @s to a uid. @s is first looked up as a user name with
 * xgetpwnam(); if that finds nothing it is parsed as a number by
 * strtoul_or_err(), which receives @err as its error message.
 */
static uid_t get_user(const char *s, const char *err)
{
	char *buf = NULL;
	struct passwd *pw = xgetpwnam(s, &buf);
	uid_t uid;

	if (!pw)
		return strtoul_or_err(s, err);

	uid = pw->pw_uid;
	free(pw);
	free(buf);
	return uid;
}
343
/*
 * Resolve @s to a gid. @s is first looked up as a group name with
 * xgetgrnam(); if that finds nothing it is parsed as a number by
 * strtoul_or_err(), which receives @err as its error message.
 */
static gid_t get_group(const char *s, const char *err)
{
	char *buf = NULL;
	struct group *gr = xgetgrnam(s, &buf);
	gid_t gid;

	if (!gr)
		return strtoul_or_err(s, err);

	gid = gr->gr_gid;
	free(gr);
	free(buf);
	return gid;
}
361
ff5dc96e
SA
/**
 * struct map_range - A range of IDs to map
 * @outer: First ID mapped on the outside of the namespace
 * @inner: First ID mapped on the inside of the namespace
 * @count: Length of the inside and outside ranges
 *
 * A range of uids/gids to map using new[gu]idmap.
 */
struct map_range {
	unsigned int outer;	/* first ID outside the namespace */
	unsigned int inner;	/* first ID inside the namespace */
	unsigned int count;	/* number of consecutive IDs */
};
375
376#define UID_BUFSIZ sizeof(stringify_value(ULONG_MAX))
377
ff5dc96e
SA
378/**
379 * get_map_range() - Parse a mapping range from a string
3870b182 380 * @s: A string of the format inner:outer:count or outer,inner,count
ff5dc96e 381 *
3870b182
CW
382 * Parse a string of the form inner:outer:count or outer,inner,count into
383 * a new mapping range.
ff5dc96e
SA
384 *
385 * Return: A new &struct map_range
386 */
387static struct map_range *get_map_range(const char *s)
388{
3870b182 389 int end;
ff5dc96e
SA
390 struct map_range *ret;
391
ff5dc96e 392 ret = xmalloc(sizeof(*ret));
3870b182
CW
393
394 if (sscanf(s, "%u:%u:%u%n", &ret->inner, &ret->outer, &ret->count,
395 &end) >= 3 && !s[end])
396 return ret; /* inner:outer:count */
397
398 if (sscanf(s, "%u,%u,%u%n", &ret->outer, &ret->inner, &ret->count,
399 &end) >= 3 && !s[end])
400 return ret; /* outer,inner,count */
401
402 errx(EXIT_FAILURE, _("invalid mapping '%s'"), s);
ff5dc96e
SA
403}
404
e67b0ba3
SA
405/**
406 * read_subid_range() - Look up a user's sub[gu]id range
407 * @filename: The file to look up the range from. This should be either
408 * ``/etc/subuid`` or ``/etc/subgid``.
409 * @uid: The uid of the user whose range we should look up.
410 *
411 * This finds the first subid range matching @uid in @filename.
412 */
413static struct map_range *read_subid_range(char *filename, uid_t uid)
414{
415 char *line = NULL, *pwbuf;
416 FILE *idmap;
de9dcbdf 417 size_t n = 0;
e67b0ba3
SA
418 struct passwd *pw;
419 struct map_range *map;
420
421 map = xmalloc(sizeof(*map));
07935158 422 map->inner = -1;
e67b0ba3
SA
423
424 pw = xgetpwuid(uid, &pwbuf);
425 if (!pw)
426 errx(EXIT_FAILURE, _("you (user %d) don't exist."), uid);
427
428 idmap = fopen(filename, "r");
429 if (!idmap)
430 err(EXIT_FAILURE, _("could not open '%s'"), filename);
431
432 /*
433 * Each line in sub[ug]idmap looks like
434 * username:subuid:count
435 * OR
436 * uid:subuid:count
437 */
438 while (getline(&line, &n, idmap) != -1) {
439 char *rest, *s;
440
441 rest = strchr(line, ':');
442 if (!rest)
443 continue;
444 *rest = '\0';
445
446 if (strcmp(line, pw->pw_name) &&
447 strtoul(line, NULL, 10) != pw->pw_uid)
448 continue;
449
450 s = rest + 1;
451 rest = strchr(s, ':');
452 if (!rest)
453 continue;
454 *rest = '\0';
455 map->outer = strtoul_or_err(s, _("failed to parse subid map"));
456
457 s = rest + 1;
458 rest = strchr(s, '\n');
459 if (rest)
460 *rest = '\0';
461 map->count = strtoul_or_err(s, _("failed to parse subid map"));
462
463 fclose(idmap);
d504b862
KZ
464 free(pw);
465 free(pwbuf);
466
e67b0ba3
SA
467 return map;
468 }
469
e05ab331 470 errx(EXIT_FAILURE, _("no line matching user \"%s\" in %s"),
e67b0ba3
SA
471 pw->pw_name, filename);
472}
473
ff5dc96e
SA
474/**
475 * map_ids() - Create a new uid/gid map
476 * @idmapper: Either newuidmap or newgidmap
477 * @ppid: Pid to set the map for
478 * @outer: ID outside the namespace for a single map.
479 * @inner: ID inside the namespace for a single map. May be -1 to only use @map.
480 * @map: A range of IDs to map
481 *
482 * This creates a new uid/gid map for @ppid using @idmapper. The ID @outer in
483 * the parent (our) namespace is mapped to the ID @inner in the child (@ppid's)
484 * namespace. In addition, the range of IDs beginning at @map->outer is mapped
485 * to the range of IDs beginning at @map->inner. The tricky bit is that we
486 * cannot let these mappings overlap. We accomplish this by removing a "hole"
487 * from @map, if @outer or @inner overlap it. This may result in one less than
488 * @map->count IDs being mapped from @map. The unmapped IDs are always the
489 * topmost IDs of the mapping (either in the parent or the child namespace).
490 *
491 * Most of the time, this function will be called with @map->outer as some
492 * large ID, @map->inner as 0, and @map->count as a large number (at least
493 * 1000, but less than @map->outer). Typically, there will be no conflict with
494 * @outer. However, @inner may split the mapping for e.g. --map-current-user.
495 *
496 * This function always exec()s or errors out and does not return.
497 */
498static void __attribute__((__noreturn__))
499map_ids(const char *idmapper, int ppid, unsigned int outer, unsigned int inner,
500 struct map_range *map)
501{
502 /* idmapper + pid + 4 * map + NULL */
503 char *argv[15];
504 /* argv - idmapper - "1" - NULL */
505 char args[12][UID_BUFSIZ];
506 int i = 0, j = 0;
507 struct map_range lo, mid, hi;
508 unsigned int inner_offset, outer_offset;
509
510 /* Some helper macros to reduce bookkeeping */
511#define push_str(s) do { \
512 argv[i++] = s; \
513} while (0)
514#define push_ul(x) do { \
515 snprintf(args[j], sizeof(args[j]), "%u", x); \
516 push_str(args[j++]); \
517} while (0)
518
519 push_str(xstrdup(idmapper));
520 push_ul(ppid);
521 if ((int)inner == -1) {
522 /*
07935158
CW
523 * If we don't have a "single" mapping, then we can just use map
524 * directly, starting inner IDs from zero for an auto mapping
ff5dc96e 525 */
07935158 526 push_ul(map->inner + 1 ? map->inner : 0);
ff5dc96e
SA
527 push_ul(map->outer);
528 push_ul(map->count);
529 push_str(NULL);
530
531 execvp(idmapper, argv);
532 errexec(idmapper);
533 }
534
07935158
CW
535 /*
536 * Start inner IDs from zero for an auto mapping; otherwise, if the two
537 * fixed mappings overlap, remove an ID from map
538 */
539 if (map->inner + 1 == 0)
540 map->inner = 0;
541 else if ((outer >= map->outer && outer <= map->outer + map->count) ||
542 (inner >= map->inner && inner <= map->inner + map->count))
ff5dc96e
SA
543 map->count--;
544
545 /* Determine where the splits between lo, mid, and hi will be */
546 outer_offset = min(outer > map->outer ? outer - map->outer : 0,
547 map->count);
548 inner_offset = min(inner > map->inner ? inner - map->inner : 0,
549 map->count);
550
551 /*
552 * In the worst case, we need three mappings:
553 * From the bottom of map to either inner or outer
554 */
555 lo.outer = map->outer;
556 lo.inner = map->inner;
557 lo.count = min(inner_offset, outer_offset);
558
559 /* From the lower of inner or outer to the higher */
560 mid.outer = lo.outer + lo.count;
561 mid.outer += mid.outer == outer;
562 mid.inner = lo.inner + lo.count;
563 mid.inner += mid.inner == inner;
564 mid.count = abs_diff(outer_offset, inner_offset);
565
566 /* And from the higher of inner or outer to the end of the map */
567 hi.outer = mid.outer + mid.count;
568 hi.outer += hi.outer == outer;
569 hi.inner = mid.inner + mid.count;
570 hi.inner += hi.inner == inner;
571 hi.count = map->count - lo.count - mid.count;
572
573 push_ul(inner);
574 push_ul(outer);
575 push_str("1");
576 /* new[gu]idmap doesn't like zero-length mappings, so skip them */
577 if (lo.count) {
578 push_ul(lo.inner);
579 push_ul(lo.outer);
580 push_ul(lo.count);
581 }
582 if (mid.count) {
583 push_ul(mid.inner);
584 push_ul(mid.outer);
585 push_ul(mid.count);
586 }
587 if (hi.count) {
588 push_ul(hi.inner);
589 push_ul(hi.outer);
590 push_ul(hi.count);
591 }
592 push_str(NULL);
593 execvp(idmapper, argv);
594 errexec(idmapper);
595}
596
/**
 * map_ids_from_child() - Set up a new uid/gid map
 * @fd: The eventfd to wait on
 * @mapuser: The user to map the current user to (or -1)
 * @usermap: The range of UIDs to map (or %NULL)
 * @mapgroup: The group to map the current group to (or -1)
 * @groupmap: The range of GIDs to map (or %NULL)
 *
 * fork_and_wait() for our parent to call sync_with_child() on @fd. Upon
 * receiving the go-ahead, use newuidmap and newgidmap to set the uid/gid map
 * for our parent's PID. When both maps are requested, newuidmap runs in an
 * extra child that is waited for before newgidmap is executed.
 *
 * Return: The pid of the child.
 */
static pid_t map_ids_from_child(int *fd, uid_t mapuser,
				struct map_range *usermap, gid_t mapgroup,
				struct map_range *groupmap)
{
	pid_t parent = getpid();
	pid_t helper = fork_and_wait(fd);
	pid_t worker = 0;

	if (helper)
		return helper;

	/* Avoid forking more than we need to */
	if (usermap && groupmap) {
		worker = fork();
		if (worker < 0)
			err(EXIT_FAILURE, _("fork failed"));
		if (worker > 0)
			waitchild(worker);
	}

	/* map_ids() never returns, so each process runs at most one mapper */
	if (worker == 0 && usermap)
		map_ids("newuidmap", parent, geteuid(), mapuser, usermap);
	if (groupmap)
		map_ids("newgidmap", parent, getegid(), mapgroup, groupmap);
	exit(EXIT_SUCCESS);
}
637
fa2cd89a 638static void __attribute__((__noreturn__)) usage(void)
4205f1fd 639{
fa2cd89a 640 FILE *out = stdout;
4205f1fd 641
6a87798a 642 fputs(USAGE_HEADER, out);
b5672517 643 fprintf(out, _(" %s [options] [<program> [<argument>...]]\n"),
298dc4ff 644 program_invocation_short_name);
4205f1fd 645
451dbcfa
BS
646 fputs(USAGE_SEPARATOR, out);
647 fputs(_("Run a program with some namespaces unshared from the parent.\n"), out);
648
6a87798a 649 fputs(USAGE_OPTIONS, out);
0490a6ca
KZ
650 fputs(_(" -m, --mount[=<file>] unshare mounts namespace\n"), out);
651 fputs(_(" -u, --uts[=<file>] unshare UTS namespace (hostname etc)\n"), out);
652 fputs(_(" -i, --ipc[=<file>] unshare System V IPC namespace\n"), out);
653 fputs(_(" -n, --net[=<file>] unshare network namespace\n"), out);
654 fputs(_(" -p, --pid[=<file>] unshare pid namespace\n"), out);
655 fputs(_(" -U, --user[=<file>] unshare user namespace\n"), out);
f9e7b66d 656 fputs(_(" -C, --cgroup[=<file>] unshare cgroup namespace\n"), out);
f218fd97 657 fputs(_(" -T, --time[=<file>] unshare time namespace\n"), out);
da639217 658 fputs(USAGE_SEPARATOR, out);
6728ca10 659 fputs(_(" -f, --fork fork before launching <program>\n"), out);
987550cb
MHB
660 fputs(_(" --map-user=<uid>|<name> map current user to uid (implies --user)\n"), out);
661 fputs(_(" --map-group=<gid>|<name> map current group to gid (implies --user)\n"), out);
4da21e37 662 fputs(_(" -r, --map-root-user map current user to root (implies --user)\n"), out);
4175f29e 663 fputs(_(" -c, --map-current-user map current user to itself (implies --user)\n"), out);
e67b0ba3 664 fputs(_(" --map-auto map users and groups automatically (implies --user)\n"), out);
3870b182 665 fputs(_(" --map-users=<inneruid>:<outeruid>:<count>\n"
ff5dc96e 666 " map count users from outeruid to inneruid (implies --user)\n"), out);
3870b182 667 fputs(_(" --map-groups=<innergid>:<outergid>:<count>\n"
ff5dc96e 668 " map count groups from outergid to innergid (implies --user)\n"), out);
da639217
KZ
669 fputs(USAGE_SEPARATOR, out);
670 fputs(_(" --kill-child[=<signame>] when dying, kill the forked child (implies --fork)\n"
671 " defaults to SIGKILL\n"), out);
672 fputs(_(" --mount-proc[=<dir>] mount proc filesystem first (implies --mount)\n"), out);
673 fputs(_(" --propagation slave|shared|private|unchanged\n"
f0f22e9c 674 " modify mount propagation in mount namespace\n"), out);
da639217 675 fputs(_(" --setgroups allow|deny control the setgroups syscall in user namespaces\n"), out);
cef4decf 676 fputs(_(" --keep-caps retain capabilities granted in user namespaces\n"), out);
bf8834d4 677 fputs(USAGE_SEPARATOR, out);
6671501c
AR
678 fputs(_(" -R, --root=<dir> run the command with root directory set to <dir>\n"), out);
679 fputs(_(" -w, --wd=<dir> change working directory to <dir>\n"), out);
680 fputs(_(" -S, --setuid <uid> set uid in entered namespace\n"), out);
681 fputs(_(" -G, --setgid <gid> set gid in entered namespace\n"), out);
be7df01a
AR
682 fputs(_(" --monotonic <offset> set clock monotonic offset (seconds) in time namespaces\n"), out);
683 fputs(_(" --boottime <offset> set clock boottime offset (seconds) in time namespaces\n"), out);
4205f1fd 684
6a87798a 685 fputs(USAGE_SEPARATOR, out);
f45f3ec3
RM
686 printf(USAGE_HELP_OPTIONS(27));
687 printf(USAGE_MAN_TAIL("unshare(1)"));
6a87798a 688
fa2cd89a 689 exit(EXIT_SUCCESS);
4205f1fd
MG
690}
691
692int main(int argc, char *argv[])
693{
6728ca10 694 enum {
fbceefde 695 OPT_MOUNTPROC = CHAR_MAX + 1,
f0f22e9c 696 OPT_PROPAGATION,
8e8f0fa5 697 OPT_SETGROUPS,
bf8834d4 698 OPT_KILLCHILD,
cef4decf 699 OPT_KEEPCAPS,
be7df01a
AR
700 OPT_MONOTONIC,
701 OPT_BOOTTIME,
6e837b5a 702 OPT_MAPUSER,
ff5dc96e 703 OPT_MAPUSERS,
6e837b5a 704 OPT_MAPGROUP,
ff5dc96e 705 OPT_MAPGROUPS,
e67b0ba3 706 OPT_MAPAUTO,
6728ca10 707 };
6c7d5ae9 708 static const struct option longopts[] = {
87918040
SK
709 { "help", no_argument, NULL, 'h' },
710 { "version", no_argument, NULL, 'V' },
711
712 { "mount", optional_argument, NULL, 'm' },
713 { "uts", optional_argument, NULL, 'u' },
714 { "ipc", optional_argument, NULL, 'i' },
715 { "net", optional_argument, NULL, 'n' },
716 { "pid", optional_argument, NULL, 'p' },
717 { "user", optional_argument, NULL, 'U' },
718 { "cgroup", optional_argument, NULL, 'C' },
f218fd97 719 { "time", optional_argument, NULL, 'T' },
87918040
SK
720
721 { "fork", no_argument, NULL, 'f' },
8b39a17c 722 { "kill-child", optional_argument, NULL, OPT_KILLCHILD },
87918040 723 { "mount-proc", optional_argument, NULL, OPT_MOUNTPROC },
6e837b5a 724 { "map-user", required_argument, NULL, OPT_MAPUSER },
ff5dc96e 725 { "map-users", required_argument, NULL, OPT_MAPUSERS },
6e837b5a 726 { "map-group", required_argument, NULL, OPT_MAPGROUP },
ff5dc96e 727 { "map-groups", required_argument, NULL, OPT_MAPGROUPS },
87918040 728 { "map-root-user", no_argument, NULL, 'r' },
4175f29e 729 { "map-current-user", no_argument, NULL, 'c' },
e67b0ba3 730 { "map-auto", no_argument, NULL, OPT_MAPAUTO },
87918040
SK
731 { "propagation", required_argument, NULL, OPT_PROPAGATION },
732 { "setgroups", required_argument, NULL, OPT_SETGROUPS },
cef4decf 733 { "keep-caps", no_argument, NULL, OPT_KEEPCAPS },
f0af42b5
LV
734 { "setuid", required_argument, NULL, 'S' },
735 { "setgid", required_argument, NULL, 'G' },
bf8834d4
LV
736 { "root", required_argument, NULL, 'R' },
737 { "wd", required_argument, NULL, 'w' },
be7df01a
AR
738 { "monotonic", required_argument, NULL, OPT_MONOTONIC },
739 { "boottime", required_argument, NULL, OPT_BOOTTIME },
87918040 740 { NULL, 0, NULL, 0 }
4205f1fd
MG
741 };
742
fbceefde 743 int setgrpcmd = SETGROUPS_NONE;
4205f1fd 744 int unshare_flags = 0;
6e837b5a
MHB
745 int c, forkit = 0;
746 uid_t mapuser = -1;
747 gid_t mapgroup = -1;
ff5dc96e
SA
748 struct map_range *usermap = NULL;
749 struct map_range *groupmap = NULL;
8b39a17c 750 int kill_child_signo = 0; /* 0 means --kill-child was not used */
6728ca10 751 const char *procmnt = NULL;
bf8834d4
LV
752 const char *newroot = NULL;
753 const char *newdir = NULL;
ff5dc96e 754 pid_t pid_bind = 0, pid_idmap = 0;
1f71f0cf 755 pid_t pid = 0;
153607e2 756#ifdef UL_HAVE_PIDFD
1f248f73 757 int fd_parent_pid = -1;
153607e2 758#endif
ff5dc96e 759 int fd_idmap, fd_bind = -1;
f2f98017 760 sigset_t sigset, oldsigset;
c84f2590 761 int status;
f0f22e9c 762 unsigned long propagation = UNSHARE_PROPAGATION_DEFAULT;
f0af42b5
LV
763 int force_uid = 0, force_gid = 0;
764 uid_t uid = 0, real_euid = geteuid();
765 gid_t gid = 0, real_egid = getegid();
cef4decf 766 int keepcaps = 0;
95e85389
TW
767 int64_t monotonic = 0;
768 int64_t boottime = 0;
be7df01a
AR
769 int force_monotonic = 0;
770 int force_boottime = 0;
4205f1fd 771
999ac5e2 772 setlocale(LC_ALL, "");
4205f1fd
MG
773 bindtextdomain(PACKAGE, LOCALEDIR);
774 textdomain(PACKAGE);
2c308875 775 close_stdout_atexit();
4205f1fd 776
f218fd97 777 while ((c = getopt_long(argc, argv, "+fhVmuinpCTUrR:w:S:G:c", longopts, NULL)) != -1) {
2eefe517 778 switch (c) {
5088ec33
MF
779 case 'f':
780 forkit = 1;
781 break;
4205f1fd 782 case 'm':
ef6acdb8 783 unshare_flags |= CLONE_NEWNS;
0490a6ca
KZ
784 if (optarg)
785 set_ns_target(CLONE_NEWNS, optarg);
4205f1fd
MG
786 break;
787 case 'u':
ef6acdb8 788 unshare_flags |= CLONE_NEWUTS;
0490a6ca
KZ
789 if (optarg)
790 set_ns_target(CLONE_NEWUTS, optarg);
4205f1fd
MG
791 break;
792 case 'i':
ef6acdb8 793 unshare_flags |= CLONE_NEWIPC;
0490a6ca
KZ
794 if (optarg)
795 set_ns_target(CLONE_NEWIPC, optarg);
4205f1fd
MG
796 break;
797 case 'n':
ef6acdb8 798 unshare_flags |= CLONE_NEWNET;
0490a6ca
KZ
799 if (optarg)
800 set_ns_target(CLONE_NEWNET, optarg);
4205f1fd 801 break;
bc7f9b95
EB
802 case 'p':
803 unshare_flags |= CLONE_NEWPID;
0490a6ca
KZ
804 if (optarg)
805 set_ns_target(CLONE_NEWPID, optarg);
bc7f9b95
EB
806 break;
807 case 'U':
808 unshare_flags |= CLONE_NEWUSER;
0490a6ca
KZ
809 if (optarg)
810 set_ns_target(CLONE_NEWUSER, optarg);
bc7f9b95 811 break;
f9e7b66d
SH
812 case 'C':
813 unshare_flags |= CLONE_NEWCGROUP;
814 if (optarg)
815 set_ns_target(CLONE_NEWCGROUP, optarg);
816 break;
f218fd97 817 case 'T':
be7df01a
AR
818 unshare_flags |= CLONE_NEWTIME;
819 if (optarg)
820 set_ns_target(CLONE_NEWTIME, optarg);
821 break;
6728ca10
KZ
822 case OPT_MOUNTPROC:
823 unshare_flags |= CLONE_NEWNS;
824 procmnt = optarg ? optarg : "/proc";
825 break;
6e837b5a
MHB
826 case OPT_MAPUSER:
827 unshare_flags |= CLONE_NEWUSER;
987550cb 828 mapuser = get_user(optarg, _("failed to parse uid"));
6e837b5a
MHB
829 break;
830 case OPT_MAPGROUP:
831 unshare_flags |= CLONE_NEWUSER;
987550cb 832 mapgroup = get_group(optarg, _("failed to parse gid"));
6e837b5a 833 break;
4da21e37 834 case 'r':
4175f29e 835 unshare_flags |= CLONE_NEWUSER;
6e837b5a
MHB
836 mapuser = 0;
837 mapgroup = 0;
4175f29e
JP
838 break;
839 case 'c':
4da21e37 840 unshare_flags |= CLONE_NEWUSER;
6e837b5a
MHB
841 mapuser = real_euid;
842 mapgroup = real_egid;
4da21e37 843 break;
ff5dc96e
SA
844 case OPT_MAPUSERS:
845 unshare_flags |= CLONE_NEWUSER;
e67b0ba3
SA
846 if (!strcmp(optarg, "auto"))
847 usermap = read_subid_range(_PATH_SUBUID, real_euid);
848 else
849 usermap = get_map_range(optarg);
ff5dc96e
SA
850 break;
851 case OPT_MAPGROUPS:
852 unshare_flags |= CLONE_NEWUSER;
e67b0ba3 853 if (!strcmp(optarg, "auto"))
c6fa0ebd 854 groupmap = read_subid_range(_PATH_SUBGID, real_euid);
e67b0ba3
SA
855 else
856 groupmap = get_map_range(optarg);
857 break;
858 case OPT_MAPAUTO:
859 unshare_flags |= CLONE_NEWUSER;
860 usermap = read_subid_range(_PATH_SUBUID, real_euid);
c6fa0ebd 861 groupmap = read_subid_range(_PATH_SUBGID, real_euid);
ff5dc96e 862 break;
fbceefde
KZ
863 case OPT_SETGROUPS:
864 setgrpcmd = setgroups_str2id(optarg);
865 break;
f0f22e9c
KZ
866 case OPT_PROPAGATION:
867 propagation = parse_propagation(optarg);
868 break;
8e8f0fa5 869 case OPT_KILLCHILD:
8e8f0fa5 870 forkit = 1;
8b39a17c
NH
871 if (optarg) {
872 if ((kill_child_signo = signame_to_signum(optarg)) < 0)
873 errx(EXIT_FAILURE, _("unknown signal: %s"),
874 optarg);
875 } else {
876 kill_child_signo = SIGKILL;
877 }
8e8f0fa5 878 break;
cef4decf
JP
879 case OPT_KEEPCAPS:
880 keepcaps = 1;
881 cap_last_cap(); /* Force last cap to be cached before we fork. */
882 break;
f0af42b5
LV
883 case 'S':
884 uid = strtoul_or_err(optarg, _("failed to parse uid"));
885 force_uid = 1;
886 break;
887 case 'G':
888 gid = strtoul_or_err(optarg, _("failed to parse gid"));
889 force_gid = 1;
890 break;
bf8834d4
LV
891 case 'R':
892 newroot = optarg;
893 break;
894 case 'w':
895 newdir = optarg;
896 break;
be7df01a 897 case OPT_MONOTONIC:
95e85389 898 monotonic = strtos64_or_err(optarg, _("failed to parse monotonic offset"));
be7df01a
AR
899 force_monotonic = 1;
900 break;
901 case OPT_BOOTTIME:
95e85389 902 boottime = strtos64_or_err(optarg, _("failed to parse boottime offset"));
be7df01a
AR
903 force_boottime = 1;
904 break;
2c308875
KZ
905
906 case 'h':
907 usage();
908 case 'V':
909 print_version(EXIT_SUCCESS);
4205f1fd 910 default:
677ec86c 911 errtryhelp(EXIT_FAILURE);
4205f1fd
MG
912 }
913 }
914
be7df01a
AR
915 if ((force_monotonic || force_boottime) && !(unshare_flags & CLONE_NEWTIME))
916 errx(EXIT_FAILURE, _("options --monotonic and --boottime require "
2da02003 917 "unsharing of a time namespace (-T)"));
be7df01a 918
ec711d72
KZ
919 /* clear any inherited settings */
920 signal(SIGCHLD, SIG_DFL);
921
c84f2590 922 if (npersists && (unshare_flags & CLONE_NEWNS))
783bb52a 923 pid_bind = bind_ns_files_from_child(&fd_bind);
c84f2590 924
ff5dc96e
SA
925 if (usermap || groupmap)
926 pid_idmap = map_ids_from_child(&fd_idmap, mapuser, usermap,
927 mapgroup, groupmap);
928
2eefe517 929 if (-1 == unshare(unshare_flags))
4205f1fd
MG
930 err(EXIT_FAILURE, _("unshare failed"));
931
ff5dc96e
SA
932 /* Tell child we've called unshare() */
933 if (usermap || groupmap)
934 sync_with_child(pid_idmap, fd_idmap);
935
0d5260b6 936 if (force_boottime)
937 settime(boottime, CLOCK_BOOTTIME);
938
939 if (force_monotonic)
940 settime(monotonic, CLOCK_MONOTONIC);
941
942 if (forkit) {
f2f98017
EC
943 if (sigemptyset(&sigset) != 0 ||
944 sigaddset(&sigset, SIGINT) != 0 ||
945 sigaddset(&sigset, SIGTERM) != 0 ||
946 sigprocmask(SIG_BLOCK, &sigset, &oldsigset) != 0)
947 err(EXIT_FAILURE, _("sigprocmask block failed"));
153607e2
KZ
948#ifdef UL_HAVE_PIDFD
949 if (kill_child_signo != 0) {
950 /* make a connection to the original process (parent) */
951 fd_parent_pid = pidfd_open(getpid(), 0);
952 if (0 > fd_parent_pid)
953 err(EXIT_FAILURE, _("pidfd_open failed"));
954 }
955#endif
956 /* force child forking before mountspace binding so
957 * pid_for_children is populated */
0d5260b6 958 pid = fork();
959
960 switch(pid) {
961 case -1:
962 err(EXIT_FAILURE, _("fork failed"));
963 case 0: /* child */
f2f98017
EC
964 if (sigprocmask(SIG_SETMASK, &oldsigset, NULL))
965 err(EXIT_FAILURE,
966 _("sigprocmask restore failed"));
783bb52a
SA
967 if (npersists && (unshare_flags & CLONE_NEWNS))
968 close(fd_bind);
0d5260b6 969 break;
970 default: /* parent */
971 break;
972 }
973 }
974
975 if (npersists && (pid || !forkit)) {
976 /* run in parent */
783bb52a
SA
977 if (pid_bind && (unshare_flags & CLONE_NEWNS))
978 sync_with_child(pid_bind, fd_bind);
979 else
c84f2590
KZ
980 /* simple way, just bind */
981 bind_ns_files(getpid());
982 }
983
0d5260b6 984 if (pid) {
985 if (waitpid(pid, &status, 0) == -1)
986 err(EXIT_FAILURE, _("waitpid failed"));
3ba6736f 987
0d5260b6 988 if (WIFEXITED(status))
989 return WEXITSTATUS(status);
f2f98017
EC
990 if (WIFSIGNALED(status)) {
991
992 /* Ensure the signal that terminated the child will
993 * also terminate the parent. */
994
995 int termsig = WTERMSIG(status);
996
997 if (signal(termsig, SIG_DFL) == SIG_ERR ||
998 sigemptyset(&sigset) != 0 ||
999 sigaddset(&sigset, termsig) != 0 ||
1000 sigprocmask(SIG_UNBLOCK, &sigset, NULL) != 0)
1001 err(EXIT_FAILURE,
1002 _("sigprocmask unblock failed"));
1003
1004 kill(getpid(), termsig);
1005 }
0d5260b6 1006 err(EXIT_FAILURE, _("child exit failed"));
5088ec33
MF
1007 }
1008
1f248f73
EC
1009 if (kill_child_signo != 0) {
1010 if (prctl(PR_SET_PDEATHSIG, kill_child_signo) < 0)
1011 err(EXIT_FAILURE, "prctl failed");
153607e2
KZ
1012#ifdef UL_HAVE_PIDFD
1013 /* Use poll() to check that there is still the original parent. */
1014 if (fd_parent_pid != -1) {
1015 struct pollfd pollfds[1] = {
1016 { .fd = fd_parent_pid, .events = POLLIN }
1017 };
1018 int nfds = poll(pollfds, 1, 0);
1019
1020 if (0 > nfds)
1021 err(EXIT_FAILURE, "poll parent pidfd failed");
1022
1023 /* If the child was re-parented before prctl(2) was called, the
1024 * new parent will likely not be interested in the precise exit
1025 * status of the orphan.
1026 */
1027 if (nfds)
1028 exit(EXIT_FAILURE);
1029
1030 close(fd_parent_pid);
1031 fd_parent_pid = -1;
1032 }
1033#endif
1f248f73
EC
1034 }
1035
ff5dc96e 1036 if (mapuser != (uid_t) -1 && !usermap)
6e837b5a
MHB
1037 map_id(_PATH_PROC_UIDMAP, mapuser, real_euid);
1038
4175f29e
JP
1039 /* Since Linux 3.19 unprivileged writing of /proc/self/gid_map
1040 * has been disabled unless /proc/self/setgroups is written
1041 * first to permanently disable the ability to call setgroups
1042 * in that user namespace. */
ff5dc96e 1043 if (mapgroup != (gid_t) -1 && !groupmap) {
fbceefde
KZ
1044 if (setgrpcmd == SETGROUPS_ALLOW)
1045 errx(EXIT_FAILURE, _("options --setgroups=allow and "
6e837b5a 1046 "--map-group are mutually exclusive"));
fbceefde 1047 setgroups_control(SETGROUPS_DENY);
6e837b5a
MHB
1048 map_id(_PATH_PROC_GIDMAP, mapgroup, real_egid);
1049 }
fbceefde 1050
6e837b5a
MHB
1051 if (setgrpcmd != SETGROUPS_NONE)
1052 setgroups_control(setgrpcmd);
4da21e37 1053
f0f22e9c
KZ
1054 if ((unshare_flags & CLONE_NEWNS) && propagation)
1055 set_propagation(propagation);
1056
bf8834d4
LV
1057 if (newroot) {
1058 if (chroot(newroot) != 0)
1059 err(EXIT_FAILURE,
1060 _("cannot change root directory to '%s'"), newroot);
1061 newdir = newdir ?: "/";
1062 }
1063 if (newdir && chdir(newdir))
1064 err(EXIT_FAILURE, _("cannot chdir to '%s'"), newdir);
1065
1066 if (procmnt) {
ef7eccad 1067 /* When not changing root and using the default propagation flags
1068 then the recursive propagation change of root will
1069 automatically change that of an existing proc mount. */
1070 if (!newroot && propagation != (MS_PRIVATE|MS_REC)) {
1071 int rc = mount("none", procmnt, NULL, MS_PRIVATE|MS_REC, NULL);
1072
1073 /* Custom procmnt means that proc is very likely not mounted, causing EINVAL.
1074 Ignoring the error in this specific instance is considered safe. */
1075 if(rc != 0 && errno != EINVAL)
1076 err(EXIT_FAILURE, _("cannot change %s filesystem propagation"), procmnt);
1077 }
1078
bf8834d4 1079 if (mount("proc", procmnt, "proc", MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL) != 0)
6728ca10 1080 err(EXIT_FAILURE, _("mount %s failed"), procmnt);
bf8834d4 1081 }
6728ca10 1082
f0af42b5
LV
1083 if (force_gid) {
1084 if (setgroups(0, NULL) != 0) /* drop supplementary groups */
1085 err(EXIT_FAILURE, _("setgroups failed"));
1086 if (setgid(gid) < 0) /* change GID */
1087 err(EXIT_FAILURE, _("setgid failed"));
1088 }
1089 if (force_uid && setuid(uid) < 0) /* change UID */
1090 err(EXIT_FAILURE, _("setuid failed"));
1091
acb72212
DG
1092 if (keepcaps && (unshare_flags & CLONE_NEWUSER))
1093 cap_permitted_to_ambient();
cef4decf 1094
57580694
ZJS
1095 if (optind < argc) {
1096 execvp(argv[optind], argv + optind);
fd777151 1097 errexec(argv[optind]);
57580694
ZJS
1098 }
1099 exec_shell();
4205f1fd 1100}