]> git.ipfire.org Git - thirdparty/man-pages.git/blob - man7/user_namespaces.7
user_namespaces.7: tfix
[thirdparty/man-pages.git] / man7 / user_namespaces.7
1 .\" Copyright (c) 2013, 2014 by Michael Kerrisk <mtk.manpages@gmail.com>
2 .\" and Copyright (c) 2012, 2014 by Eric W. Biederman <ebiederm@xmission.com>
3 .\"
4 .\" %%%LICENSE_START(VERBATIM)
5 .\" Permission is granted to make and distribute verbatim copies of this
6 .\" manual provided the copyright notice and this permission notice are
7 .\" preserved on all copies.
8 .\"
9 .\" Permission is granted to copy and distribute modified versions of this
10 .\" manual under the conditions for verbatim copying, provided that the
11 .\" entire resulting derived work is distributed under the terms of a
12 .\" permission notice identical to this one.
13 .\"
14 .\" Since the Linux kernel and libraries are constantly changing, this
15 .\" manual page may be incorrect or out-of-date. The author(s) assume no
16 .\" responsibility for errors or omissions, or for damages resulting from
17 .\" the use of the information contained herein. The author(s) may not
18 .\" have taken the same level of care in the production of this manual,
19 .\" which is licensed free of charge, as they might when working
20 .\" professionally.
21 .\"
22 .\" Formatted or processed versions of this manual, if unaccompanied by
23 .\" the source, must acknowledge the copyright and authors of this work.
24 .\" %%%LICENSE_END
25 .\"
26 .\"
27 .TH USER_NAMESPACES 7 2017-09-15 "Linux" "Linux Programmer's Manual"
28 .SH NAME
29 user_namespaces \- overview of Linux user namespaces
30 .SH DESCRIPTION
31 For an overview of namespaces, see
32 .BR namespaces (7).
33 .PP
34 User namespaces isolate security-related identifiers and attributes,
35 in particular,
36 user IDs and group IDs (see
37 .BR credentials (7)),
38 the root directory,
39 keys (see
40 .BR keyrings (7)),
41 .\" FIXME: This page says very little about the interaction
42 .\" of user namespaces and keys. Add something on this topic.
43 and capabilities (see
44 .BR capabilities (7)).
45 A process's user and group IDs can be different
46 inside and outside a user namespace.
47 In particular,
48 a process can have a normal unprivileged user ID outside a user namespace
49 while at the same time having a user ID of 0 inside the namespace;
50 in other words,
51 the process has full privileges for operations inside the user namespace,
52 but is unprivileged for operations outside the namespace.
53 .\"
54 .\" ============================================================
55 .\"
56 .SS Nested namespaces, namespace membership
57 User namespaces can be nested;
58 that is, each user namespace\(emexcept the initial ("root")
59 namespace\(emhas a parent user namespace,
60 and can have zero or more child user namespaces.
61 The parent user namespace is the user namespace
62 of the process that creates the user namespace via a call to
63 .BR unshare (2)
64 or
65 .BR clone (2)
66 with the
67 .BR CLONE_NEWUSER
68 flag.
69 .PP
70 The kernel imposes (since version 3.11) a limit of 32 nested levels of
71 .\" commit 8742f229b635bf1c1c84a3dfe5e47c814c20b5c8
72 user namespaces.
73 .\" FIXME Explain the rationale for this limit. (What is the rationale?)
74 Calls to
75 .BR unshare (2)
76 or
77 .BR clone (2)
78 that would cause this limit to be exceeded fail with the error
79 .BR EUSERS .
80 .PP
81 Each process is a member of exactly one user namespace.
82 A process created via
83 .BR fork (2)
84 or
85 .BR clone (2)
86 without the
87 .BR CLONE_NEWUSER
88 flag is a member of the same user namespace as its parent.
89 A single-threaded process can join another user namespace with
90 .BR setns (2)
91 if it has the
92 .BR CAP_SYS_ADMIN
93 capability in that namespace;
94 upon doing so, it gains a full set of capabilities in that namespace.
95 .PP
96 A call to
97 .BR clone (2)
98 or
99 .BR unshare (2)
100 with the
101 .BR CLONE_NEWUSER
102 flag makes the new child process (for
103 .BR clone (2))
104 or the caller (for
105 .BR unshare (2))
106 a member of the new user namespace created by the call.
107 .PP
108 The
109 .BR NS_GET_PARENT
110 .BR ioctl (2)
111 operation can be used to discover the parental relationship
112 between user namespaces; see
113 .BR ioctl_ns (2).
114 .\"
115 .\" ============================================================
116 .\"
117 .SS Capabilities
118 The child process created by
119 .BR clone (2)
120 with the
121 .BR CLONE_NEWUSER
122 flag starts out with a complete set
123 of capabilities in the new user namespace.
124 Likewise, a process that creates a new user namespace using
125 .BR unshare (2)
126 or joins an existing user namespace using
127 .BR setns (2)
128 gains a full set of capabilities in that namespace.
129 On the other hand,
130 that process has no capabilities in the parent (in the case of
131 .BR clone (2))
132 or previous (in the case of
133 .BR unshare (2)
134 and
135 .BR setns (2))
136 user namespace,
137 even if the new namespace is created or joined by the root user
138 (i.e., a process with user ID 0 in the root namespace).
139 .PP
140 Note that a call to
141 .BR execve (2)
142 will cause a process's capabilities to be recalculated in the usual way (see
143 .BR capabilities (7)).
144 Consequently,
145 unless the process has a user ID of 0 within the namespace,
146 or the executable file has a nonempty inheritable capabilities mask,
147 the process will lose all capabilities.
148 See the discussion of user and group ID mappings, below.
149 .PP
150 A call to
151 .BR clone (2),
152 .BR unshare (2),
153 or
154 .BR setns (2)
155 using the
156 .BR CLONE_NEWUSER
157 flag sets the "securebits" flags
158 (see
159 .BR capabilities (7))
160 to their default values (all flags disabled) in the child (for
161 .BR clone (2))
162 or caller (for
163 .BR unshare (2),
164 or
165 .BR setns (2)).
166 Note that because the caller no longer has capabilities
167 in its original user namespace after a call to
168 .BR setns (2),
169 it is not possible for a process to reset its "securebits" flags while
170 retaining its user namespace membership by using a pair of
171 .BR setns (2)
172 calls to move to another user namespace and then return to
173 its original user namespace.
174 .PP
175 The rules for determining whether or not a process has a capability
176 in a particular user namespace are as follows:
177 .IP 1. 3
178 A process has a capability inside a user namespace
179 if it is a member of that namespace and
180 it has the capability in its effective capability set.
181 A process can gain capabilities in its effective capability
182 set in various ways.
183 For example, it may execute a set-user-ID program or an
184 executable with associated file capabilities.
185 In addition,
186 a process may gain capabilities via the effect of
187 .BR clone (2),
188 .BR unshare (2),
189 or
190 .BR setns (2),
191 as already described.
192 .\" In the 3.8 sources, see security/commoncap.c::cap_capable():
193 .IP 2.
194 If a process has a capability in a user namespace,
195 then it has that capability in all child (and further removed descendant)
196 namespaces as well.
197 .IP 3.
198 .\" * The owner of the user namespace in the parent of the
199 .\" * user namespace has all caps.
200 When a user namespace is created, the kernel records the effective
201 user ID of the creating process as being the "owner" of the namespace.
202 .\" (and likewise associates the effective group ID of the creating process
203 .\" with the namespace).
204 A process that resides
205 in the parent of the user namespace
206 .\" See kernel commit 520d9eabce18edfef76a60b7b839d54facafe1f9 for a fix
207 .\" on this point
208 and whose effective user ID matches the owner of the namespace
209 has all capabilities in the namespace.
210 .\" This includes the case where the process executes a set-user-ID
211 .\" program that confers the effective UID of the creator of the namespace.
212 By virtue of the previous rule,
213 this means that the process has all capabilities in all
214 further removed descendant user namespaces as well.
215 The
216 .B NS_GET_OWNER_UID
217 .BR ioctl (2)
218 operation can be used to discover the user ID of the owner of the namespace;
219 see
220 .BR ioctl_ns (2).
221 .\"
222 .\" ============================================================
223 .\"
224 .SS Effect of capabilities within a user namespace
225 Having a capability inside a user namespace
226 permits a process to perform operations (that require privilege)
227 only on resources governed by that namespace.
228 In other words, having a capability in a user namespace permits a process
229 to perform privileged operations on resources that are governed by (nonuser)
230 namespaces associated with the user namespace (see the next subsection).
231 .PP
232 On the other hand, there are many privileged operations that affect
233 resources that are not associated with any namespace type,
234 for example, changing the system time (governed by
235 .BR CAP_SYS_TIME ),
236 loading a kernel module (governed by
237 .BR CAP_SYS_MODULE ),
238 and creating a device (governed by
239 .BR CAP_MKNOD ).
240 Only a process with privileges in the
241 .I initial
242 user namespace can perform such operations.
243 .PP
244 Holding
245 .B CAP_SYS_ADMIN
246 within the user namespace associated with a process's mount namespace
247 allows that process to create bind mounts
248 and mount the following types of filesystems:
249 .\" fs_flags = FS_USERNS_MOUNT in kernel sources
250 .PP
251 .RS 4
252 .PD 0
253 .IP * 2
254 .IR /proc
255 (since Linux 3.8)
256 .IP *
257 .IR /sys
258 (since Linux 3.8)
259 .IP *
260 .IR devpts
261 (since Linux 3.9)
262 .IP *
263 .BR tmpfs (5)
264 (since Linux 3.9)
265 .IP *
266 .IR ramfs
267 (since Linux 3.9)
268 .IP *
269 .IR mqueue
270 (since Linux 3.9)
271 .IP *
272 .IR bpf
273 .\" commit b2197755b2633e164a439682fb05a9b5ea48f706
274 (since Linux 4.4)
275 .PD
276 .RE
277 .PP
278 Holding
279 .B CAP_SYS_ADMIN
280 within the user namespace associated with a process's cgroup namespace
281 allows (since Linux 4.6)
282 that process to mount the cgroup version 2 filesystem and
283 cgroup version 1 named hierarchies
284 (i.e., cgroup filesystems mounted with the
285 .BR """none,name="""
286 option).
287 .PP
288 Holding
289 .B CAP_SYS_ADMIN
290 within the user namespace associated with a process's PID namespace
291 allows (since Linux 3.8)
292 that process to mount
293 .I /proc
294 filesystems.
295 .PP
296 Note however, that mounting block-based filesystems can be done
297 only by a process that holds
298 .BR CAP_SYS_ADMIN
299 in the initial user namespace.
300 .\"
301 .\" ============================================================
302 .\"
303 .SS Interaction of user namespaces and other types of namespaces
304 Starting in Linux 3.8, unprivileged processes can create user namespaces,
305 and the other types of namespaces can be created with just the
306 .B CAP_SYS_ADMIN
307 capability in the caller's user namespace.
308 .PP
309 When a non-user-namespace is created,
310 it is owned by the user namespace in which the creating process
311 was a member at the time of the creation of the namespace.
312 Actions on the non-user-namespace
313 require capabilities in the corresponding user namespace.
314 .PP
315 If
316 .BR CLONE_NEWUSER
317 is specified along with other
318 .B CLONE_NEW*
319 flags in a single
320 .BR clone (2)
321 or
322 .BR unshare (2)
323 call, the user namespace is guaranteed to be created first,
324 giving the child
325 .RB ( clone (2))
326 or caller
327 .RB ( unshare (2))
328 privileges over the remaining namespaces created by the call.
329 Thus, it is possible for an unprivileged caller to specify this combination
330 of flags.
331 .PP
332 When a new namespace (other than a user namespace) is created via
333 .BR clone (2)
334 or
335 .BR unshare (2),
336 the kernel records the user namespace of the creating process against
337 the new namespace.
338 (This association can't be changed.)
339 When a process in the new namespace subsequently performs
340 privileged operations that operate on global
341 resources isolated by the namespace,
342 the permission checks are performed according to the process's capabilities
343 in the user namespace that the kernel associated with the new namespace.
344 For example, suppose that a process attempts to change the hostname
345 .RB ( sethostname (2)),
346 a resource governed by the UTS namespace.
347 In this case,
348 the kernel will determine which user namespace is associated with
349 the process's UTS namespace, and check whether the process has the
350 required capability
351 .RB ( CAP_SYS_ADMIN )
352 in that user namespace.
353 .PP
354 The
355 .BR NS_GET_USERNS
356 .BR ioctl (2)
357 operation can be used to discover the user namespace with which
358 a non-user namespace is associated; see
359 .BR ioctl_ns (2).
360 .\"
361 .\" ============================================================
362 .\"
363 .SS User and group ID mappings: uid_map and gid_map
364 When a user namespace is created,
365 it starts out without a mapping of user IDs (group IDs)
366 to the parent user namespace.
367 The
368 .IR /proc/[pid]/uid_map
369 and
370 .IR /proc/[pid]/gid_map
371 files (available since Linux 3.5)
372 .\" commit 22d917d80e842829d0ca0a561967d728eb1d6303
373 expose the mappings for user and group IDs
374 inside the user namespace for the process
375 .IR pid .
376 These files can be read to view the mappings in a user namespace and
377 written to (once) to define the mappings.
378 .PP
379 The description in the following paragraphs explains the details for
380 .IR uid_map ;
381 .IR gid_map
382 is exactly the same,
383 but each instance of "user ID" is replaced by "group ID".
384 .PP
385 The
386 .I uid_map
387 file exposes the mapping of user IDs from the user namespace
388 of the process
389 .IR pid
390 to the user namespace of the process that opened
391 .IR uid_map
392 (but see a qualification to this point below).
393 In other words, processes that are in different user namespaces
394 will potentially see different values when reading from a particular
395 .I uid_map
396 file, depending on the user ID mappings for the user namespaces
397 of the reading processes.
398 .PP
399 Each line in the
400 .I uid_map
401 file specifies a 1-to-1 mapping of a range of contiguous
402 user IDs between two user namespaces.
403 (When a user namespace is first created, this file is empty.)
404 The specification in each line takes the form of
405 three numbers delimited by white space.
406 The first two numbers specify the starting user ID in
407 each of the two user namespaces.
408 The third number specifies the length of the mapped range.
409 In detail, the fields are interpreted as follows:
410 .IP (1) 4
411 The start of the range of user IDs in
412 the user namespace of the process
413 .IR pid .
414 .IP (2)
415 The start of the range of user
416 IDs to which the user IDs specified by field one map.
417 How field two is interpreted depends on whether the process that opened
418 .I uid_map
419 and the process
420 .IR pid
421 are in the same user namespace, as follows:
422 .RS
423 .IP a) 3
424 If the two processes are in different user namespaces:
425 field two is the start of a range of
426 user IDs in the user namespace of the process that opened
427 .IR uid_map .
428 .IP b)
429 If the two processes are in the same user namespace:
430 field two is the start of the range of
431 user IDs in the parent user namespace of the process
432 .IR pid .
433 This case enables the opener of
434 .I uid_map
435 (the common case here is opening
436 .IR /proc/self/uid_map )
437 to see the mapping of user IDs into the user namespace of the process
438 that created this user namespace.
439 .RE
440 .IP (3)
441 The length of the range of user IDs that is mapped between the two
442 user namespaces.
443 .PP
444 System calls that return user IDs (group IDs)\(emfor example,
445 .BR getuid (2),
446 .BR getgid (2),
447 and the credential fields in the structure returned by
448 .BR stat (2)\(emreturn
449 the user ID (group ID) mapped into the caller's user namespace.
450 .PP
451 When a process accesses a file, its user and group IDs
452 are mapped into the initial user namespace for the purpose of permission
453 checking and assigning IDs when creating a file.
454 When a process retrieves file user and group IDs via
455 .BR stat (2),
456 the IDs are mapped in the opposite direction,
457 to produce values relative to the process user and group ID mappings.
458 .PP
459 The initial user namespace has no parent namespace,
460 but, for consistency, the kernel provides dummy user and group
461 ID mapping files for this namespace.
462 Looking at the
463 .I uid_map
464 file
465 .RI ( gid_map
466 is the same) from a shell in the initial namespace shows:
467 .PP
468 .in +4n
469 .EX
470 $ \fBcat /proc/$$/uid_map\fP
471 0 0 4294967295
472 .EE
473 .in
474 .PP
475 This mapping tells us
476 that the range starting at user ID 0 in this namespace
477 maps to a range starting at 0 in the (nonexistent) parent namespace,
478 and the length of the range is the largest 32-bit unsigned integer.
479 This leaves 4294967295 (the 32-bit signed \-1 value) unmapped.
480 This is deliberate:
481 .IR "(uid_t)\ \-1"
482 is used in several interfaces (e.g.,
483 .BR setreuid (2))
484 as a way to specify "no user ID".
485 Leaving
486 .IR "(uid_t)\ \-1"
487 unmapped and unusable guarantees that there will be no
488 confusion when using these interfaces.
489 .\"
490 .\" ============================================================
491 .\"
492 .SS Defining user and group ID mappings: writing to uid_map and gid_map
493 .PP
494 After the creation of a new user namespace, the
495 .I uid_map
496 file of
497 .I one
498 of the processes in the namespace may be written to
499 .I once
500 to define the mapping of user IDs in the new user namespace.
501 An attempt to write more than once to a
502 .I uid_map
503 file in a user namespace fails with the error
504 .BR EPERM .
505 Similar rules apply for
506 .I gid_map
507 files.
508 .PP
509 The lines written to
510 .IR uid_map
511 .RI ( gid_map )
512 must conform to the following rules:
513 .IP * 3
514 The three fields must be valid numbers,
515 and the last field must be greater than 0.
516 .IP *
517 Lines are terminated by newline characters.
518 .IP *
519 There is a limit on the number of lines in the file.
520 In Linux 4.14 and earlier, this limit was (arbitrarily)
521 .\" 5*12-byte records could fit in a 64B cache line
522 set at 5 lines.
523 Since Linux 4.15,
524 .\" commit 6397fac4915ab3002dc15aae751455da1a852f25
525 the limit is 340 lines.
526 In addition, the number of bytes written to
527 the file must be less than the system page size,
528 and the write must be performed at the start of the file (i.e.,
529 .BR lseek (2)
530 and
531 .BR pwrite (2)
532 can't be used to write to nonzero offsets in the file).
533 .IP *
534 The range of user IDs (group IDs)
535 specified in each line cannot overlap with the ranges
536 in any other lines.
537 In the initial implementation (Linux 3.8), this requirement was
538 satisfied by a simplistic implementation that imposed the further
539 requirement that
540 the values in both field 1 and field 2 of successive lines must be
541 in ascending numerical order,
542 which prevented some otherwise valid maps from being created.
543 Linux 3.9 and later
544 .\" commit 0bd14b4fd72afd5df41e9fd59f356740f22fceba
545 fix this limitation, allowing any valid set of nonoverlapping maps.
546 .IP *
547 At least one line must be written to the file.
548 .PP
549 Writes that violate the above rules fail with the error
550 .BR EINVAL .
551 .PP
552 In order for a process to write to the
553 .I /proc/[pid]/uid_map
554 .RI ( /proc/[pid]/gid_map )
555 file, all of the following requirements must be met:
556 .IP 1. 3
557 The writing process must have the
558 .BR CAP_SETUID
559 .RB ( CAP_SETGID )
560 capability in the user namespace of the process
561 .IR pid .
562 .IP 2.
563 The writing process must either be in the user namespace of the process
564 .I pid
565 or be in the parent user namespace of the process
566 .IR pid .
567 .IP 3.
568 The mapped user IDs (group IDs) must in turn have a mapping
569 in the parent user namespace.
570 .IP 4.
571 One of the following two cases applies:
572 .RS
573 .IP * 3
574 .IR Either
575 the writing process has the
576 .BR CAP_SETUID
577 .RB ( CAP_SETGID )
578 capability in the
579 .I parent
580 user namespace.
581 .RS
582 .IP + 3
583 No further restrictions apply:
584 the process can make mappings to arbitrary user IDs (group IDs)
585 in the parent user namespace.
586 .RE
587 .IP * 3
588 .IR Or
589 otherwise all of the following restrictions apply:
590 .RS
591 .IP + 3
592 The data written to
593 .I uid_map
594 .RI ( gid_map )
595 must consist of a single line that maps
596 the writing process's effective user ID
597 (group ID) in the parent user namespace to a user ID (group ID)
598 in the user namespace.
599 .IP +
600 The writing process must have the same effective user ID as the process
601 that created the user namespace.
602 .IP +
603 In the case of
604 .IR gid_map ,
605 use of the
606 .BR setgroups (2)
607 system call must first be denied by writing
608 .RI \(dq deny \(dq
609 to the
610 .I /proc/[pid]/setgroups
611 file (see below) before writing to
612 .IR gid_map .
613 .RE
614 .RE
615 .PP
616 Writes that violate the above rules fail with the error
617 .BR EPERM .
618 .\"
619 .\" ============================================================
620 .\"
621 .SS Interaction with system calls that change process UIDs or GIDs
622 In a user namespace where the
623 .I uid_map
624 file has not been written, the system calls that change user IDs will fail.
625 Similarly, if the
626 .I gid_map
627 file has not been written, the system calls that change group IDs will fail.
628 After the
629 .I uid_map
630 and
631 .I gid_map
632 files have been written, only the mapped values may be used in
633 system calls that change user and group IDs.
634 .PP
635 For user IDs, the relevant system calls include
636 .BR setuid (2),
637 .BR setfsuid (2),
638 .BR setreuid (2),
639 and
640 .BR setresuid (2).
641 For group IDs, the relevant system calls include
642 .BR setgid (2),
643 .BR setfsgid (2),
644 .BR setregid (2),
645 .BR setresgid (2),
646 and
647 .BR setgroups (2).
648 .PP
649 Writing
650 .RI \(dq deny \(dq
651 to the
652 .I /proc/[pid]/setgroups
653 file before writing to
654 .I /proc/[pid]/gid_map
655 .\" Things changed in Linux 3.19
656 .\" commit 9cc46516ddf497ea16e8d7cb986ae03a0f6b92f8
657 .\" commit 66d2f338ee4c449396b6f99f5e75cd18eb6df272
658 .\" http://lwn.net/Articles/626665/
659 will permanently disable
660 .BR setgroups (2)
661 in a user namespace and allow writing to
662 .I /proc/[pid]/gid_map
663 without having the
664 .BR CAP_SETGID
665 capability in the parent user namespace.
666 .\"
667 .\" ============================================================
668 .\"
669 .SS The /proc/[pid]/setgroups file
670 .\"
671 .\" commit 9cc46516ddf497ea16e8d7cb986ae03a0f6b92f8
672 .\" commit 66d2f338ee4c449396b6f99f5e75cd18eb6df272
673 .\" http://lwn.net/Articles/626665/
674 .\" http://web.nvd.nist.gov/view/vuln/detail?vulnId=CVE-2014-8989
675 .\"
676 The
677 .I /proc/[pid]/setgroups
678 file displays the string
679 .RI \(dq allow \(dq
680 if processes in the user namespace that contains the process
681 .I pid
682 are permitted to employ the
683 .BR setgroups (2)
684 system call; it displays
685 .RI \(dq deny \(dq
686 if
687 .BR setgroups (2)
688 is not permitted in that user namespace.
689 Note that regardless of the value in the
690 .I /proc/[pid]/setgroups
691 file (and regardless of the process's capabilities), calls to
692 .BR setgroups (2)
693 are also not permitted if
694 .IR /proc/[pid]/gid_map
695 has not yet been set.
696 .PP
697 A privileged process (one with the
698 .BR CAP_SYS_ADMIN
699 capability in the namespace) may write either of the strings
700 .RI \(dq allow \(dq
701 or
702 .RI \(dq deny \(dq
703 to this file
704 .I before
705 writing a group ID mapping
706 for this user namespace to the file
707 .IR /proc/[pid]/gid_map .
708 Writing the string
709 .RI \(dq deny \(dq
710 prevents any process in the user namespace from employing
711 .BR setgroups (2).
712 .PP
713 The essence of the restrictions described in the preceding
714 paragraph is that it is permitted to write to
715 .I /proc/[pid]/setgroups
716 only so long as calling
717 .BR setgroups (2)
718 is disallowed because
719 .I /proc/[pid]/gid_map
720 has not been set.
721 This ensures that a process cannot transition from a state where
722 .BR setgroups (2)
723 is allowed to a state where
724 .BR setgroups (2)
725 is denied;
726 a process can transition only from
727 .BR setgroups (2)
728 being disallowed to
729 .BR setgroups (2)
730 being allowed.
731 .PP
732 The default value of this file in the initial user namespace is
733 .RI \(dq allow \(dq.
734 .PP
735 Once
736 .IR /proc/[pid]/gid_map
737 has been written to
738 (which has the effect of enabling
739 .BR setgroups (2)
740 in the user namespace),
741 it is no longer possible to disallow
742 .BR setgroups (2)
743 by writing
744 .RI \(dq deny \(dq
745 to
746 .IR /proc/[pid]/setgroups
747 (the write fails with the error
748 .BR EPERM ).
749 .PP
750 A child user namespace inherits the
751 .IR /proc/[pid]/setgroups
752 setting from its parent.
753 .PP
754 If the
755 .I setgroups
756 file has the value
757 .RI \(dq deny \(dq,
758 then the
759 .BR setgroups (2)
760 system call can't subsequently be reenabled (by writing
761 .RI \(dq allow \(dq
762 to the file) in this user namespace.
763 (Attempts to do so fail with the error
764 .BR EPERM .)
765 This restriction also propagates down to all child user namespaces of
766 this user namespace.
767 .PP
768 The
769 .I /proc/[pid]/setgroups
770 file was added in Linux 3.19,
771 but was backported to many earlier stable kernel series,
772 because it addresses a security issue.
773 The issue concerned files with permissions such as "rwx\-\-\-rwx".
774 Such files give fewer permissions to "group" than they do to "other".
775 This means that dropping groups using
776 .BR setgroups (2)
777 might allow a process file access that it did not formerly have.
778 Before the existence of user namespaces this was not a concern,
779 since only a privileged process (one with the
780 .BR CAP_SETGID
781 capability) could call
782 .BR setgroups (2).
783 However, with the introduction of user namespaces,
784 it became possible for an unprivileged process to create
785 a new namespace in which the user had all privileges.
786 This then allowed formerly unprivileged
787 users to drop groups and thus gain file access
788 that they did not previously have.
789 The
790 .I /proc/[pid]/setgroups
791 file was added to address this security issue,
792 by denying any pathway for an unprivileged process to drop groups with
793 .BR setgroups (2).
794 .\"
795 .\" /proc/PID/setgroups
796 .\" [allow == setgroups() is allowed, "deny" == setgroups() is disallowed]
797 .\" * Can write if have CAP_SYS_ADMIN in NS
798 .\" * Must write BEFORE writing to /proc/PID/gid_map
799 .\"
800 .\" setgroups()
801 .\" * Must already have written to gid_map
802 .\" * /proc/PID/setgroups must be "allow"
803 .\"
804 .\" /proc/PID/gid_map -- writing
805 .\" * Must already have written "deny" to /proc/PID/setgroups
806 .\"
807 .\" ============================================================
808 .\"
809 .SS Unmapped user and group IDs
810 .PP
811 There are various places where an unmapped user ID (group ID)
812 may be exposed to user space.
813 For example, the first process in a new user namespace may call
814 .BR getuid (2)
815 before a user ID mapping has been defined for the namespace.
816 In most such cases, an unmapped user ID is converted
817 .\" from_kuid_munged(), from_kgid_munged()
818 to the overflow user ID (group ID);
819 the default value for the overflow user ID (group ID) is 65534.
820 See the descriptions of
821 .IR /proc/sys/kernel/overflowuid
822 and
823 .IR /proc/sys/kernel/overflowgid
824 in
825 .BR proc (5).
826 .PP
827 The cases where unmapped IDs are mapped in this fashion include
828 system calls that return user IDs
829 .RB ( getuid (2),
830 .BR getgid (2),
831 and similar),
832 credentials passed over a UNIX domain socket,
833 .\" also SO_PEERCRED
834 credentials returned by
835 .BR stat (2),
836 .BR waitid (2),
837 and the System V IPC "ctl"
838 .B IPC_STAT
839 operations,
840 credentials exposed by
841 .IR /proc/[pid]/status
842 and the files in
843 .IR /proc/sysvipc/* ,
844 credentials returned via the
845 .I si_uid
846 field in the
847 .I siginfo_t
848 received with a signal (see
849 .BR sigaction (2)),
850 credentials written to the process accounting file (see
851 .BR acct (5)),
852 and credentials returned with POSIX message queue notifications (see
853 .BR mq_notify (3)).
854 .PP
855 There is one notable case where unmapped user and group IDs are
856 .I not
857 .\" from_kuid(), from_kgid()
858 .\" Also F_GETOWNER_UIDS is an exception
859 converted to the corresponding overflow ID value.
860 When viewing a
861 .I uid_map
862 or
863 .I gid_map
864 file in which there is no mapping for the second field,
865 that field is displayed as 4294967295 (\-1 as an unsigned integer).
866 .\"
867 .\" ============================================================
868 .\"
869 .SS Set-user-ID and set-group-ID programs
870 .PP
871 When a process inside a user namespace executes
872 a set-user-ID (set-group-ID) program,
873 the process's effective user (group) ID inside the namespace is changed
874 to whatever value is mapped for the user (group) ID of the file.
875 However, if either the user
876 .I or
877 the group ID of the file has no mapping inside the namespace,
878 the set-user-ID (set-group-ID) bit is silently ignored:
879 the new program is executed,
880 but the process's effective user (group) ID is left unchanged.
881 (This mirrors the semantics of executing a set-user-ID or set-group-ID
882 program that resides on a filesystem that was mounted with the
883 .BR MS_NOSUID
884 flag, as described in
885 .BR mount (2).)
886 .\"
887 .\" ============================================================
888 .\"
889 .SS Miscellaneous
890 .PP
891 When a process's user and group IDs are passed over a UNIX domain socket
892 to a process in a different user namespace (see the description of
893 .B SCM_CREDENTIALS
894 in
895 .BR unix (7)),
896 they are translated into the corresponding values as per the
897 receiving process's user and group ID mappings.
898 .\"
899 .SH CONFORMING TO
900 Namespaces are a Linux-specific feature.
901 .\"
902 .SH NOTES
903 Over the years, there have been a lot of features that have been added
904 to the Linux kernel that have been made available only to privileged users
905 because of their potential to confuse set-user-ID-root applications.
906 In general, it becomes safe to allow the root user in a user namespace to
907 use those features because it is impossible, while in a user namespace,
908 to gain more privilege than the root user of a user namespace has.
909 .\"
910 .\" ============================================================
911 .\"
912 .SS Availability
913 Use of user namespaces requires a kernel that is configured with the
914 .B CONFIG_USER_NS
915 option.
916 User namespaces require support in a range of subsystems across
917 the kernel.
918 When an unsupported subsystem is configured into the kernel,
919 it is not possible to configure user namespaces support.
920 .PP
921 As at Linux 3.8, most relevant subsystems supported user namespaces,
922 but a number of filesystems did not have the infrastructure needed
923 to map user and group IDs between user namespaces.
924 Linux 3.9 added the required infrastructure support for many of
925 the remaining unsupported filesystems
926 (Plan 9 (9P), Andrew File System (AFS), Ceph, CIFS, CODA, NFS, and OCFS2).
927 Linux 3.12 added support for the last of the unsupported major filesystems,
928 .\" commit d6970d4b726cea6d7a9bc4120814f95c09571fc3
929 XFS.
930 .\"
931 .SH EXAMPLE
932 The program below is designed to allow experimenting with
933 user namespaces, as well as other types of namespaces.
934 It creates namespaces as specified by command-line options and then executes
935 a command inside those namespaces.
936 The comments and
937 .I usage()
938 function inside the program provide a full explanation of the program.
939 The following shell session demonstrates its use.
940 .PP
941 First, we look at the run-time environment:
942 .PP
943 .in +4n
944 .EX
945 $ \fBuname \-rs\fP # Need Linux 3.8 or later
946 Linux 3.8.0
947 $ \fBid \-u\fP # Running as unprivileged user
948 1000
949 $ \fBid \-g\fP
950 1000
951 .EE
952 .in
953 .PP
954 Now start a new shell in new user
955 .RI ( \-U ),
956 mount
957 .RI ( \-m ),
958 and PID
959 .RI ( \-p )
960 namespaces, with user ID
961 .RI ( \-M )
962 and group ID
963 .RI ( \-G )
964 1000 mapped to 0 inside the user namespace:
965 .PP
966 .in +4n
967 .EX
968 $ \fB./userns_child_exec \-p \-m \-U \-M '0 1000 1' \-G '0 1000 1' bash\fP
969 .EE
970 .in
971 .PP
972 The shell has PID 1, because it is the first process in the new
973 PID namespace:
974 .PP
975 .in +4n
976 .EX
977 bash$ \fBecho $$\fP
978 1
979 .EE
980 .in
981 Mounting a new
982 .I /proc
983 filesystem and listing all of the processes visible
984 in the new PID namespace shows that the shell can't see
985 any processes outside the PID namespace:
986 .PP
987 .in +4n
988 .EX
989 bash$ \fBmount \-t proc proc /proc\fP
990 bash$ \fBps ax\fP
991 PID TTY STAT TIME COMMAND
992 1 pts/3 S 0:00 bash
993 22 pts/3 R+ 0:00 ps ax
994 .EE
995 .in
996 .PP
997 Inside the user namespace, the shell has user and group ID 0,
998 and a full set of permitted and effective capabilities:
999 .PP
1000 .in +4n
1001 .EX
1002 bash$ \fBcat /proc/$$/status | egrep '^[UG]id'\fP
1003 Uid: 0 0 0 0
1004 Gid: 0 0 0 0
1005 bash$ \fBcat /proc/$$/status | egrep '^Cap(Prm|Inh|Eff)'\fP
1006 CapInh: 0000000000000000
1007 CapPrm: 0000001fffffffff
1008 CapEff: 0000001fffffffff
1009 .EE
1010 .in
1011 .SS Program source
1012 \&
1013 .EX
1014 /* userns_child_exec.c
1015
1016 Licensed under GNU General Public License v2 or later
1017
1018 Create a child process that executes a shell command in new
1019 namespace(s); allow UID and GID mappings to be specified when
1020 creating a user namespace.
1021 */
1022 #define _GNU_SOURCE
1023 #include <sched.h>
1024 #include <unistd.h>
1025 #include <stdlib.h>
1026 #include <sys/wait.h>
1027 #include <signal.h>
1028 #include <fcntl.h>
1029 #include <stdio.h>
1030 #include <string.h>
1031 #include <limits.h>
1032 #include <errno.h>
1033
1034 /* A simple error\-handling function: print an error message based
1035 on the value in \(aqerrno\(aq and terminate the calling process */
1036
1037 #define errExit(msg) do { perror(msg); exit(EXIT_FAILURE); \\
1038 } while (0)
1039
1040 struct child_args {
1041 char **argv; /* Command to be executed by child, with args */
1042 int pipe_fd[2]; /* Pipe used to synchronize parent and child */
1043 };
1044
1045 static int verbose;
1046
1047 static void
1048 usage(char *pname)
1049 {
1050 fprintf(stderr, "Usage: %s [options] cmd [arg...]\\n\\n", pname);
1051 fprintf(stderr, "Create a child process that executes a shell "
1052 "command in a new user namespace,\\n"
1053 "and possibly also other new namespace(s).\\n\\n");
1054 fprintf(stderr, "Options can be:\\n\\n");
1055 #define fpe(str) fprintf(stderr, " %s", str);
1056 fpe("\-i New IPC namespace\\n");
1057 fpe("\-m New mount namespace\\n");
1058 fpe("\-n New network namespace\\n");
1059 fpe("\-p New PID namespace\\n");
1060 fpe("\-u New UTS namespace\\n");
1061 fpe("\-U New user namespace\\n");
1062 fpe("\-M uid_map Specify UID map for user namespace\\n");
1063 fpe("\-G gid_map Specify GID map for user namespace\\n");
1064 fpe("\-z Map user\(aqs UID and GID to 0 in user namespace\\n");
1065 fpe(" (equivalent to: \-M \(aq0 <uid> 1\(aq \-G \(aq0 <gid> 1\(aq)\\n");
1066 fpe("\-v Display verbose messages\\n");
1067 fpe("\\n");
1068 fpe("If \-z, \-M, or \-G is specified, \-U is required.\\n");
1069 fpe("It is not permitted to specify both \-z and either \-M or \-G.\\n");
1070 fpe("\\n");
1071 fpe("Map strings for \-M and \-G consist of records of the form:\\n");
1072 fpe("\\n");
1073 fpe(" ID\-inside\-ns ID\-outside\-ns len\\n");
1074 fpe("\\n");
1075 fpe("A map string can contain multiple records, separated"
1076 " by commas;\\n");
1077 fpe("the commas are replaced by newlines before writing"
1078 " to map files.\\n");
1079
1080 exit(EXIT_FAILURE);
1081 }
1082
1083 /* Update the mapping file \(aqmap_file\(aq, with the value provided in
1084 \(aqmapping\(aq, a string that defines a UID or GID mapping. A UID or
1085 GID mapping consists of one or more newline\-delimited records
1086 of the form:
1087
1088 ID\-inside\-ns ID\-outside\-ns length
1089
1090 Requiring the user to supply a string that contains newlines is
1091 of course inconvenient for command\-line use. Thus, we permit the
1092 use of commas to delimit records in this string, and replace them
1093 with newlines before writing the string to the file. */
1094
1095 static void
1096 update_map(char *mapping, char *map_file)
1097 {
1098 int fd, j;
1099 size_t map_len; /* Length of \(aqmapping\(aq */
1100
1101 /* Replace commas in mapping string with newlines */
1102
1103 map_len = strlen(mapping);
1104 for (j = 0; j < map_len; j++)
1105 if (mapping[j] == \(aq,\(aq)
1106 mapping[j] = \(aq\\n\(aq;
1107
1108 fd = open(map_file, O_RDWR);
1109 if (fd == \-1) {
1110 fprintf(stderr, "ERROR: open %s: %s\\n", map_file,
1111 strerror(errno));
1112 exit(EXIT_FAILURE);
1113 }
1114
1115 if (write(fd, mapping, map_len) != map_len) {
1116 fprintf(stderr, "ERROR: write %s: %s\\n", map_file,
1117 strerror(errno));
1118 exit(EXIT_FAILURE);
1119 }
1120
1121 close(fd);
1122 }
1123
1124 /* Linux 3.19 made a change in the handling of setgroups(2) and the
1125 \(aqgid_map\(aq file to address a security issue. The issue allowed
1126 *unprivileged* users to employ user namespaces in order to drop groups.
1127 The upshot of the 3.19 changes is that in order to update the
1128 \(aqgid_map\(aq file, use of the setgroups() system call in this
1129 user namespace must first be disabled by writing "deny" to one of
1130 the /proc/PID/setgroups files for this namespace. That is the
1131 purpose of the following function. */
1132
1133 static void
1134 proc_setgroups_write(pid_t child_pid, char *str)
1135 {
1136 char setgroups_path[PATH_MAX];
1137 int fd;
1138
1139 snprintf(setgroups_path, PATH_MAX, "/proc/%ld/setgroups",
1140 (long) child_pid);
1141
1142 fd = open(setgroups_path, O_RDWR);
1143 if (fd == \-1) {
1144
1145 /* We may be on a system that doesn\(aqt support
1146 /proc/PID/setgroups. In that case, the file won\(aqt exist,
1147 and the system won\(aqt impose the restrictions that Linux 3.19
1148 added. That\(aqs fine: we don\(aqt need to do anything in order
1149 to permit \(aqgid_map\(aq to be updated.
1150
1151 However, if the error from open() was something other than
1152 the ENOENT error that is expected for that case, let the
1153 user know. */
1154
1155 if (errno != ENOENT)
1156 fprintf(stderr, "ERROR: open %s: %s\\n", setgroups_path,
1157 strerror(errno));
1158 return;
1159 }
1160
1161 if (write(fd, str, strlen(str)) == \-1)
1162 fprintf(stderr, "ERROR: write %s: %s\\n", setgroups_path,
1163 strerror(errno));
1164
1165 close(fd);
1166 }
1167
1168 static int /* Start function for cloned child */
1169 childFunc(void *arg)
1170 {
1171 struct child_args *args = (struct child_args *) arg;
1172 char ch;
1173
1174 /* Wait until the parent has updated the UID and GID mappings.
1175 See the comment in main(). We wait for end of file on a
1176 pipe that will be closed by the parent process once it has
1177 updated the mappings. */
1178
1179 close(args\->pipe_fd[1]); /* Close our descriptor for the write
1180 end of the pipe so that we see EOF
1181 when parent closes its descriptor */
1182 if (read(args\->pipe_fd[0], &ch, 1) != 0) {
1183 fprintf(stderr,
1184 "Failure in child: read from pipe returned != 0\\n");
1185 exit(EXIT_FAILURE);
1186 }
1187
1188 close(args\->pipe_fd[0]);
1189
1190 /* Execute a shell command */
1191
1192 printf("About to exec %s\\n", args\->argv[0]);
1193 execvp(args\->argv[0], args\->argv);
1194 errExit("execvp");
1195 }
1196
1197 #define STACK_SIZE (1024 * 1024)
1198
1199 static char child_stack[STACK_SIZE]; /* Space for child\(aqs stack */
1200
1201 int
1202 main(int argc, char *argv[])
1203 {
1204 int flags, opt, map_zero;
1205 pid_t child_pid;
1206 struct child_args args;
1207 char *uid_map, *gid_map;
1208 const int MAP_BUF_SIZE = 100;
1209 char map_buf[MAP_BUF_SIZE];
1210 char map_path[PATH_MAX];
1211
1212 /* Parse command\-line options. The initial \(aq+\(aq character in
1213 the final getopt() argument prevents GNU\-style permutation
1214 of command\-line options. That\(aqs useful, since sometimes
1215 the \(aqcommand\(aq to be executed by this program itself
1216 has command\-line options. We don\(aqt want getopt() to treat
1217 those as options to this program. */
1218
1219 flags = 0;
1220 verbose = 0;
1221 gid_map = NULL;
1222 uid_map = NULL;
1223 map_zero = 0;
1224 while ((opt = getopt(argc, argv, "+imnpuUM:G:zv")) != \-1) {
1225 switch (opt) {
1226 case \(aqi\(aq: flags |= CLONE_NEWIPC; break;
1227 case \(aqm\(aq: flags |= CLONE_NEWNS; break;
1228 case \(aqn\(aq: flags |= CLONE_NEWNET; break;
1229 case \(aqp\(aq: flags |= CLONE_NEWPID; break;
1230 case \(aqu\(aq: flags |= CLONE_NEWUTS; break;
1231 case \(aqv\(aq: verbose = 1; break;
1232 case \(aqz\(aq: map_zero = 1; break;
1233 case \(aqM\(aq: uid_map = optarg; break;
1234 case \(aqG\(aq: gid_map = optarg; break;
1235 case \(aqU\(aq: flags |= CLONE_NEWUSER; break;
1236 default: usage(argv[0]);
1237 }
1238 }
1239
1240 /* \-M, \-G, or \-z requires \-U; \-z cannot be combined with \-M or \-G */
1241
1242 if (((uid_map != NULL || gid_map != NULL || map_zero) &&
1243 !(flags & CLONE_NEWUSER)) ||
1244 (map_zero && (uid_map != NULL || gid_map != NULL)))
1245 usage(argv[0]);
1246
1247 args.argv = &argv[optind];
1248
1249 /* We use a pipe to synchronize the parent and child, in order to
1250 ensure that the parent sets the UID and GID maps before the child
1251 calls execve(). This ensures that the child maintains its
1252 capabilities during the execve() in the common case where we
1253 want to map the child\(aqs effective user ID to 0 in the new user
1254 namespace. Without this synchronization, the child would lose
1255 its capabilities if it performed an execve() with nonzero
1256 user IDs (see the capabilities(7) man page for details of the
1257 transformation of a process\(aqs capabilities during execve()). */
1258
1259 if (pipe(args.pipe_fd) == \-1)
1260 errExit("pipe");
1261
1262 /* Create the child in new namespace(s) */
1263
1264 child_pid = clone(childFunc, child_stack + STACK_SIZE,
1265 flags | SIGCHLD, &args);
1266 if (child_pid == \-1)
1267 errExit("clone");
1268
1269 /* Parent falls through to here */
1270
1271 if (verbose)
1272 printf("%s: PID of child created by clone() is %ld\\n",
1273 argv[0], (long) child_pid);
1274
1275 /* Update the UID and GID maps in the child */
1276
1277 if (uid_map != NULL || map_zero) {
1278 snprintf(map_path, PATH_MAX, "/proc/%ld/uid_map",
1279 (long) child_pid);
1280 if (map_zero) {
1281 snprintf(map_buf, MAP_BUF_SIZE, "0 %ld 1", (long) getuid());
1282 uid_map = map_buf;
1283 }
1284 update_map(uid_map, map_path);
1285 }
1286
1287 if (gid_map != NULL || map_zero) {
1288 proc_setgroups_write(child_pid, "deny");
1289
1290 snprintf(map_path, PATH_MAX, "/proc/%ld/gid_map",
1291 (long) child_pid);
1292 if (map_zero) {
1293 snprintf(map_buf, MAP_BUF_SIZE, "0 %ld 1", (long) getgid());
1294 gid_map = map_buf;
1295 }
1296 update_map(gid_map, map_path);
1297 }
1298
1299 /* Close the write end of the pipe, to signal to the child that we
1300 have updated the UID and GID maps */
1301
1302 close(args.pipe_fd[1]);
1303
1304 if (waitpid(child_pid, NULL, 0) == \-1) /* Wait for child */
1305 errExit("waitpid");
1306
1307 if (verbose)
1308 printf("%s: terminating\\n", argv[0]);
1309
1310 exit(EXIT_SUCCESS);
1311 }
1312 .EE
1313 .SH SEE ALSO
1314 .BR newgidmap (1), \" From the shadow package
1315 .BR newuidmap (1), \" From the shadow package
1316 .BR clone (2),
1317 .BR ptrace (2),
1318 .BR setns (2),
1319 .BR unshare (2),
1320 .BR proc (5),
1321 .BR subgid (5), \" From the shadow package
1322 .BR subuid (5), \" From the shadow package
1323 .BR capabilities (7),
1324 .BR cgroup_namespaces (7),
1325 .BR credentials (7),
1326 .BR namespaces (7),
1327 .BR pid_namespaces (7)
1328 .PP
1329 The kernel source file
1330 .IR Documentation/namespaces/resource-control.txt .