]>
Commit | Line | Data |
---|---|---|
b10c74ff EB |
1 | .\" Copyright (c) 2013, 2014 by Michael Kerrisk <mtk.manpages@gmail.com> |
2 | .\" and Copyright (c) 2012, 2014 by Eric W. Biederman <ebiederm@xmission.com> | |
046de6a7 | 3 | .\" |
c228b4b4 | 4 | .\" %%%LICENSE_START(VERBATIM) |
046de6a7 MK |
5 | .\" Permission is granted to make and distribute verbatim copies of this |
6 | .\" manual provided the copyright notice and this permission notice are | |
7 | .\" preserved on all copies. | |
8 | .\" | |
9 | .\" Permission is granted to copy and distribute modified versions of this | |
10 | .\" manual under the conditions for verbatim copying, provided that the | |
11 | .\" entire resulting derived work is distributed under the terms of a | |
12 | .\" permission notice identical to this one. | |
13 | .\" | |
14 | .\" Since the Linux kernel and libraries are constantly changing, this | |
15 | .\" manual page may be incorrect or out-of-date. The author(s) assume no | |
16 | .\" responsibility for errors or omissions, or for damages resulting from | |
17 | .\" the use of the information contained herein. The author(s) may not | |
18 | .\" have taken the same level of care in the production of this manual, | |
19 | .\" which is licensed free of charge, as they might when working | |
20 | .\" professionally. | |
21 | .\" | |
22 | .\" Formatted or processed versions of this manual, if unaccompanied by | |
23 | .\" the source, must acknowledge the copyright and authors of this work. | |
c228b4b4 | 24 | .\" %%%LICENSE_END |
046de6a7 MK |
25 | .\" |
26 | .\" | |
3df541c0 | 27 | .TH USER_NAMESPACES 7 2016-07-17 "Linux" "Linux Programmer's Manual" |
046de6a7 | 28 | .SH NAME |
445d38c9 | 29 | user_namespaces \- overview of Linux user namespaces |
046de6a7 MK |
30 | .SH DESCRIPTION |
31 | For an overview of namespaces, see | |
32 | .BR namespaces (7). | |
33 | ||
99f04bb1 MK |
34 | User namespaces isolate security-related identifiers and attributes, |
35 | in particular, | |
03611be8 | 36 | user IDs and group IDs (see |
dba9ebf2 | 37 | .BR credentials (7)), |
99f04bb1 | 38 | the root directory, |
03611be8 | 39 | keys (see |
046de6a7 | 40 | .BR keyctl (2)), |
6b928030 MK |
41 | .\" FIXME: This page says very little about the interaction |
42 | .\" of user namespaces and keys. Add something on this topic. | |
03611be8 | 43 | and capabilities (see |
d916d9d0 | 44 | .BR capabilities (7)). |
046de6a7 MK |
45 | A process's user and group IDs can be different |
46 | inside and outside a user namespace. | |
47 | In particular, | |
48 | a process can have a normal unprivileged user ID outside a user namespace | |
49 | while at the same time having a user ID of 0 inside the namespace; | |
50 | in other words, | |
51 | the process has full privileges for operations inside the user namespace, | |
52 | but is unprivileged for operations outside the namespace. | |
d916d9d0 MK |
53 | .\" |
54 | .\" ============================================================ | |
55 | .\" | |
56 | .SS Nested namespaces, namespace membership | |
57 | User namespaces can be nested; | |
58 | that is, each user namespace\(emexcept the initial ("root") | |
59 | namespace\(emhas a parent user namespace, | |
60 | and can have zero or more child user namespaces. | |
61 | The parent user namespace is the user namespace | |
62 | of the process that creates the user namespace via a call to | |
63 | .BR unshare (2) | |
64 | or | |
65 | .BR clone (2) | |
66 | with the | |
67 | .BR CLONE_NEWUSER | |
68 | flag. | |
69 | ||
e56b6c42 MK |
70 | The kernel imposes (since version 3.11) a limit of 32 nested levels of |
71 | .\" commit 8742f229b635bf1c1c84a3dfe5e47c814c20b5c8 | |
72 | user namespaces. | |
73 | .\" FIXME Explain the rationale for this limit. (What is the rationale?) | |
74 | Calls to | |
75 | .BR unshare (2) | |
76 | or | |
77 | .BR clone (2) | |
78 | that would cause this limit to be exceeded fail with the error | |
79 | .BR EUSERS . | |
80 | ||
3b44624f | 81 | Each process is a member of exactly one user namespace. |
d916d9d0 MK |
82 | A process created via |
83 | .BR fork (2) | |
84 | or | |
85 | .BR clone (2) | |
86 | without the | |
87 | .BR CLONE_NEWUSER | |
88 | flag is a member of the same user namespace as its parent. | |
7aba437a | 89 | A single-threaded process can join another user namespace with |
d916d9d0 MK |
90 | .BR setns (2) |
91 | if it has the | |
92 | .BR CAP_SYS_ADMIN | |
93 | capability in that namespace; | |
94 | upon doing so, it gains a full set of capabilities in that namespace. | |
95 | ||
96 | A call to | |
97 | .BR clone (2) | |
98 | or | |
99 | .BR unshare (2) | |
100 | with the | |
101 | .BR CLONE_NEWUSER | |
102 | flag makes the new child process (for | |
103 | .BR clone (2)) | |
104 | or the caller (for | |
105 | .BR unshare (2)) | |
106 | a member of the new user namespace created by the call. | |
107 | .\" | |
108 | .\" ============================================================ | |
109 | .\" | |
110 | .SS Capabilities | |
96ec9d12 MK |
111 | The child process created by |
112 | .BR clone (2) | |
113 | with the | |
114 | .BR CLONE_NEWUSER | |
115 | flag starts out with a complete set | |
116 | of capabilities in the new user namespace. | |
d916d9d0 MK |
117 | Likewise, a process that creates a new user namespace using |
118 | .BR unshare (2) | |
119 | or joins an existing user namespace using | |
120 | .BR setns (2) | |
3b44624f | 121 | gains a full set of capabilities in that namespace. |
c0098e76 | 122 | On the other hand, |
d68c5f11 MK |
123 | that process has no capabilities in the parent (in the case of |
124 | .BR clone (2)) | |
125 | or previous (in the case of | |
126 | .BR unshare (2) | |
127 | and | |
128 | .BR setns (2)) | |
129 | user namespace, | |
d916d9d0 MK |
130 | even if the new namespace is created or joined by the root user |
131 | (i.e., a process with user ID 0 in the root namespace). | |
c0098e76 | 132 | |
77f95488 MK |
133 | Note that a call to |
134 | .BR execve (2) | |
6cfec3d8 | 135 | will cause a process's capabilities to be recalculated in the usual way (see |
a3969b76 MK |
136 | .BR capabilities (7)). |
137 | Consequently, | |
138 | unless the process has a user ID of 0 within the namespace, | |
139 | or the executable file has a nonempty inheritable capabilities mask, | |
140 | the process will lose all capabilities. | |
6c21c0f9 | 141 | See the discussion of user and group ID mappings, below. |
77f95488 | 142 | |
f5d401dd | 143 | A call to |
0666f549 MK |
144 | .BR clone (2), |
145 | .BR unshare (2), | |
146 | or | |
147 | .BR setns (2) | |
148 | using the | |
149 | .BR CLONE_NEWUSER | |
150 | flag sets the "securebits" flags | |
151 | (see | |
152 | .BR capabilities (7)) | |
153 | to their default values (all flags disabled) in the child (for | |
154 | .BR clone (2)) | |
155 | or caller (for | |
156 | .BR unshare (2), | |
157 | or | |
158 | .BR setns (2)). | |
d68c5f11 MK |
159 | Note that because the caller no longer has capabilities |
160 | in its original user namespace after a call to | |
0666f549 MK |
161 | .BR setns (2), |
162 | it is not possible for a process to reset its "securebits" flags while | |
163 | retaining its user namespace membership by using a pair of | |
164 | .BR setns (2) | |
165 | calls to move to another user namespace and then return to | |
166 | its original user namespace. | |
167 | ||
d916d9d0 MK |
168 | The rules for determining whether or not a process has a capability |
169 | in a particular user namespace are as follows: | |
170 | .IP 1. 3 | |
171 | A process has a capability inside a user namespace | |
172 | if it is a member of that namespace and | |
173 | it has the capability in its effective capability set. | |
174 | A process can gain capabilities in its effective capability | |
175 | set in various ways. | |
176 | For example, it may execute a set-user-ID program or an | |
177 | executable with associated file capabilities. | |
178 | In addition, | |
179 | a process may gain capabilities via the effect of | |
3b44624f MK |
180 | .BR clone (2), |
181 | .BR unshare (2), | |
046de6a7 | 182 | or |
d916d9d0 MK |
183 | .BR setns (2), |
184 | as already described. | |
185 | .\" In the 3.8 sources, see security/commoncap.c::cap_capable(): | |
186 | .IP 2. | |
187 | If a process has a capability in a user namespace, | |
188 | then it has that capability in all child (and further removed descendant) | |
189 | namespaces as well. | |
190 | .IP 3. | |
191 | .\" * The owner of the user namespace in the parent of the | |
192 | .\" * user namespace has all caps. | |
193 | When a user namespace is created, the kernel records the effective | |
194 | user ID of the creating process as being the "owner" of the namespace. | |
195 | .\" (and likewise associates the effective group ID of the creating process | |
196 | .\" with the namespace). | |
197 | A process that resides | |
198 | in the parent of the user namespace | |
199 | .\" See kernel commit 520d9eabce18edfef76a60b7b839d54facafe1f9 for a fix | |
200 | .\" on this point | |
201 | and whose effective user ID matches the owner of the namespace | |
202 | has all capabilities in the namespace. | |
203 | .\" This includes the case where the process executes a set-user-ID | |
204 | .\" program that confers the effective UID of the creator of the namespace. | |
205 | By virtue of the previous rule, | |
206 | this means that the process has all capabilities in all | |
207 | further removed descendant user namespaces as well. | |
2304b0d7 MK |
208 | .\" |
209 | .\" ============================================================ | |
210 | .\" | |
211 | .SS Effect of capabilities within a user namespace | |
89070c1a MK |
212 | Having a capability inside a user namespace |
213 | permits a process to perform operations (that require privilege) | |
214 | only on resources governed by that namespace. | |
215 | In other words, having a capability in a user namespace permits a process | |
216 | to perform privileged operations on resources that are governed by (nonuser) | |
217 | namespaces associated with the user namespace (see the next subsection). | |
2304b0d7 | 218 | |
89070c1a MK |
219 | On the other hand, there are many privileged operations that affect |
220 | resources that are not associated with any namespace type, | |
221 | for example, changing the system time (governed by | |
222 | .BR CAP_SYS_TIME ), | |
223 | loading a kernel module (governed by | |
224 | .BR CAP_SYS_MODULE ), | |
225 | and creating a device (governed by | |
226 | .BR CAP_MKNOD ). | |
227 | Only a process with privileges in the | |
228 | .I initial | |
229 | user namespace can perform such operations. | |
32efecaa MK |
230 | |
231 | Holding | |
232 | .B CAP_SYS_ADMIN | |
8a9fb19d MK |
233 | within the user namespace associated with a process's mount namespace |
234 | allows that process to create bind mounts | |
235 | and mount the following types of filesystems: | |
32efecaa MK |
236 | .\" fs_flags = FS_USERNS_MOUNT in kernel sources |
237 | ||
238 | .RS 4 | |
239 | .PD 0 | |
240 | .IP * 2 | |
241 | .IR /proc | |
242 | (since Linux 3.8) | |
243 | .IP * | |
244 | .IR /sys | |
245 | (since Linux 3.8) | |
246 | .IP * | |
247 | .IR devpts | |
248 | (since Linux 3.9) | |
249 | .IP * | |
4e07c70f | 250 | .BR tmpfs (5) |
32efecaa MK |
251 | (since Linux 3.9) |
252 | .IP * | |
253 | .IR ramfs | |
254 | (since Linux 3.9) | |
255 | .IP * | |
256 | .IR mqueue | |
257 | (since Linux 3.9) | |
258 | .IP * | |
259 | .IR bpf | |
260 | .\" commit b2197755b2633e164a439682fb05a9b5ea48f706 | |
261 | (since Linux 4.4) | |
262 | .PD | |
263 | .RE | |
264 | .PP | |
7e52299f MK |
265 | Holding |
266 | .B CAP_SYS_ADMIN | |
c7e077ea | 267 | within the user namespace associated with a process's cgroup namespace |
7e52299f | 268 | allows (since Linux 4.6) |
8c74a1ce MK |
269 | that process to mount the cgroup version 2 filesystem and |
270 | cgroup version 1 named hierarchies | |
271 | (i.e., cgroup filesystems mounted with the | |
272 | .BR """none,name=""" | |
273 | option). | |
7e52299f | 274 | |
687d3f4a MK |
275 | Holding |
276 | .B CAP_SYS_ADMIN | |
277 | within the user namespace associated with a process's PID namespace | |
278 | allows (since Linux 3.8) | |
279 | that process to mount | |
280 | .I /proc | |
281 | filesystems. | |
282 | ||
32efecaa MK |
283 | Note, however, that mounting block-based filesystems can be done |
284 | only by a process that holds | |
285 | .BR CAP_SYS_ADMIN | |
286 | in the initial user namespace. | |
d6842bf1 MK |
287 | .\" |
288 | .\" ============================================================ | |
289 | .\" | |
62a5214c | 290 | .SS Interaction of user namespaces and other types of namespaces |
046de6a7 | 291 | Starting in Linux 3.8, unprivileged processes can create user namespaces, |
06999763 | 292 | and the other types of namespaces can be created with just the |
046de6a7 MK |
293 | .B CAP_SYS_ADMIN |
294 | capability in the caller's user namespace. | |
295 | ||
576233f0 MK |
296 | When a non-user-namespace is created, |
297 | it is owned by the user namespace in which the creating process | |
298 | was a member at the time of the creation of the namespace. | |
299 | Actions on the non-user-namespace | |
300 | require capabilities in the corresponding user namespace. | |
301 | ||
046de6a7 MK |
302 | If |
303 | .BR CLONE_NEWUSER | |
304 | is specified along with other | |
305 | .B CLONE_NEW* | |
306 | flags in a single | |
307 | .BR clone (2) | |
308 | or | |
309 | .BR unshare (2) | |
310 | call, the user namespace is guaranteed to be created first, | |
96ec9d12 MK |
311 | giving the child |
312 | .RB ( clone (2)) | |
313 | or caller | |
314 | .RB ( unshare (2)) | |
315 | privileges over the remaining namespaces created by the call. | |
046de6a7 MK |
316 | Thus, it is possible for an unprivileged caller to specify this combination |
317 | of flags. | |
318 | ||
06999763 | 319 | When a new namespace (other than a user namespace) is created via |
046de6a7 MK |
320 | .BR clone (2) |
321 | or | |
322 | .BR unshare (2), | |
323 | the kernel records the user namespace of the creating process against | |
324 | the new namespace. | |
d916d9d0 | 325 | (This association can't be changed.) |
046de6a7 MK |
326 | When a process in the new namespace subsequently performs |
327 | privileged operations that operate on global | |
328 | resources isolated by the namespace, | |
329 | the permission checks are performed according to the process's capabilities | |
330 | in the user namespace that the kernel associated with the new namespace. | |
7ea1c45e MK |
331 | For example, suppose that a process attempts to change the hostname |
332 | .RB ( sethostname (2)), | |
333 | a resource governed by the UTS namespace. | |
334 | In this case, | |
335 | the kernel will determine which user namespace is associated with | |
336 | the process's UTS namespace, and check whether the process has the | |
337 | required capability | |
338 | .RB ( CAP_SYS_ADMIN ) | |
339 | in that user namespace. | |
d6842bf1 MK |
340 | .\" |
341 | .\" ============================================================ | |
342 | .\" | |
b10c74ff | 343 | .SS Restrictions on mount namespaces |
b10c74ff EB |
344 | Note the following points with respect to mount namespaces: |
345 | .IP * 3 | |
346 | A mount namespace has an owner user namespace. | |
347 | A mount namespace whose owner user namespace is different from | |
348 | the owner user namespace of its parent mount namespace is | |
349 | considered a less privileged mount namespace. | |
350 | .IP * | |
351 | When creating a less privileged mount namespace, | |
352 | shared mounts are reduced to slave mounts. | |
353 | This ensures that mappings performed in less | |
354 | privileged mount namespaces will not propagate to more privileged | |
355 | mount namespaces. | |
356 | .IP * | |
357 | .\" FIXME . | |
358 | .\" What does "come as a single unit from more privileged mount" mean? | |
359 | Mounts that come as a single unit from a more privileged mount namespace are | |
360 | locked together and may not be separated in a less privileged mount | |
361 | namespace. | |
890a86d3 EB |
362 | (The |
363 | .BR unshare (2) | |
364 | .B CLONE_NEWNS | |
365 | operation brings across all of the mounts from the original | |
366 | mount namespace as a single unit, | |
09fcbb82 MK |
367 | and recursive mounts that propagate between |
368 | mount namespaces propagate as a single unit.) | |
b10c74ff EB |
369 | .IP * |
370 | The | |
371 | .BR mount (2) | |
372 | flags | |
373 | .BR MS_RDONLY , | |
374 | .BR MS_NOSUID , | |
375 | .BR MS_NOEXEC , | |
376 | and the "atime" flags | |
377 | .RB ( MS_NOATIME , | |
378 | .BR MS_NODIRATIME , | |
1c3c805b | 379 | .BR MS_RELATIME ) |
b10c74ff EB |
380 | settings become locked |
381 | .\" commit 9566d6742852c527bf5af38af5cbb878dad75705 | |
382 | .\" Author: Eric W. Biederman <ebiederm@xmission.com> | |
383 | .\" Date: Mon Jul 28 17:26:07 2014 -0700 | |
f5d401dd | 384 | .\" |
b10c74ff EB |
385 | .\" mnt: Correct permission checks in do_remount |
386 | .\" | |
387 | when propagated from a more privileged to | |
388 | a less privileged mount namespace, | |
389 | and may not be changed in the less privileged mount namespace. | |
390 | .IP * | |
391 | .\" (As of 3.18-rc1 (in Al Viro's 2014-08-30 vfs.git#for-next tree)) | |
392 | A file or directory that is a mount point in one namespace that is not | |
393 | a mount point in another namespace, may be renamed, unlinked, or removed | |
394 | .RB ( rmdir (2)) | |
395 | in the mount namespace in which it is not a mount point | |
396 | (subject to the usual permission checks). | |
397 | .IP | |
398 | Previously, attempting to unlink, rename, or remove a file or directory | |
399 | that was a mount point in another mount namespace would result in the error | |
400 | .BR EBUSY . | |
401 | That behavior had technical problems of enforcement (e.g., for NFS) | |
402 | and permitted denial-of-service attacks against more privileged users | |
403 | (i.e., preventing individual files from being updated | |
404 | by bind mounting on top of them). | |
405 | .\" | |
406 | .\" ============================================================ | |
407 | .\" | |
62a5214c | 408 | .SS User and group ID mappings: uid_map and gid_map |
6eda9441 MK |
409 | When a user namespace is created, |
410 | it starts out without a mapping of user IDs (group IDs) | |
411 | to the parent user namespace. | |
046de6a7 MK |
412 | The |
413 | .IR /proc/[pid]/uid_map | |
414 | and | |
415 | .IR /proc/[pid]/gid_map | |
416 | files (available since Linux 3.5) | |
417 | .\" commit 22d917d80e842829d0ca0a561967d728eb1d6303 | |
418 | expose the mappings for user and group IDs | |
419 | inside the user namespace for the process | |
420 | .IR pid . | |
62a5214c MK |
421 | These files can be read to view the mappings in a user namespace and |
422 | written to (once) to define the mappings. | |
423 | ||
424 | The description in the following paragraphs explains the details for | |
046de6a7 MK |
425 | .IR uid_map ; |
426 | .IR gid_map | |
427 | is exactly the same, | |
428 | but each instance of "user ID" is replaced by "group ID". | |
429 | ||
430 | The | |
431 | .I uid_map | |
432 | file exposes the mapping of user IDs from the user namespace | |
433 | of the process | |
434 | .IR pid | |
435 | to the user namespace of the process that opened | |
436 | .IR uid_map | |
437 | (but see a qualification to this point below). | |
438 | In other words, processes that are in different user namespaces | |
439 | will potentially see different values when reading from a particular | |
440 | .I uid_map | |
441 | file, depending on the user ID mappings for the user namespaces | |
442 | of the reading processes. | |
443 | ||
444 | Each line in the | |
445 | .I uid_map | |
446 | file specifies a 1-to-1 mapping of a range of contiguous | |
447 | user IDs between two user namespaces. | |
448 | (When a user namespace is first created, this file is empty.) | |
449 | The specification in each line takes the form of | |
450 | three numbers delimited by white space. | |
d45d0128 | 451 | The first two numbers specify the starting user ID in |
3b44624f | 452 | each of the two user namespaces. |
046de6a7 MK |
453 | The third number specifies the length of the mapped range. |
454 | In detail, the fields are interpreted as follows: | |
455 | .IP (1) 4 | |
456 | The start of the range of user IDs in | |
457 | the user namespace of the process | |
458 | .IR pid . | |
459 | .IP (2) | |
460 | The start of the range of user | |
461 | IDs to which the user IDs specified by field one map. | |
462 | How field two is interpreted depends on whether the process that opened | |
463 | .I uid_map | |
464 | and the process | |
465 | .IR pid | |
466 | are in the same user namespace, as follows: | |
467 | .RS | |
468 | .IP a) 3 | |
469 | If the two processes are in different user namespaces: | |
470 | field two is the start of a range of | |
471 | user IDs in the user namespace of the process that opened | |
472 | .IR uid_map . | |
473 | .IP b) | |
474 | If the two processes are in the same user namespace: | |
475 | field two is the start of the range of | |
476 | user IDs in the parent user namespace of the process | |
477 | .IR pid . | |
478 | This case enables the opener of | |
479 | .I uid_map | |
480 | (the common case here is opening | |
481 | .IR /proc/self/uid_map ) | |
482 | to see the mapping of user IDs into the user namespace of the process | |
483 | that created this user namespace. | |
484 | .RE | |
485 | .IP (3) | |
486 | The length of the range of user IDs that is mapped between the two | |
487 | user namespaces. | |
6eda9441 MK |
488 | .PP |
489 | System calls that return user IDs (group IDs)\(emfor example, | |
490 | .BR getuid (2), | |
491 | .BR getgid (2), | |
492 | and the credential fields in the structure returned by | |
493 | .BR stat (2)\(emreturn | |
3b44624f | 494 | the user ID (group ID) mapped into the caller's user namespace. |
6eda9441 MK |
495 | |
496 | When a process accesses a file, its user and group IDs | |
497 | are mapped into the initial user namespace for the purpose of permission | |
498 | checking and assigning IDs when creating a file. | |
499 | When a process retrieves file user and group IDs via | |
3b44624f | 500 | .BR stat (2), |
6eda9441 MK |
501 | the IDs are mapped in the opposite direction, |
502 | to produce values relative to the process user and group ID mappings. | |
20e4a147 MK |
503 | |
504 | The initial user namespace has no parent namespace, | |
505 | but, for consistency, the kernel provides dummy user and group | |
506 | ID mapping files for this namespace. | |
507 | Looking at the | |
508 | .I uid_map | |
c9195ded MK |
509 | file |
510 | .RI ( gid_map | |
20e4a147 MK |
511 | is the same) from a shell in the initial namespace shows: |
512 | ||
513 | .in +4n | |
514 | .nf | |
515 | $ \fBcat /proc/$$/uid_map\fP | |
516 | 0 0 4294967295 | |
517 | .fi | |
518 | .in | |
519 | ||
520 | This mapping tells us | |
521 | that the range starting at user ID 0 in this namespace | |
522 | maps to a range starting at 0 in the (nonexistent) parent namespace, | |
523 | and the length of the range is the largest 32-bit unsigned integer. | |
364ce935 | 524 | This leaves 4294967295 (the 32-bit signed \-1 value) unmapped. |
6cfec3d8 | 525 | This is deliberate: |
374215d5 | 526 | .IR "(uid_t)\ \-1" |
6cfec3d8 MK |
527 | is used in several interfaces (e.g., |
528 | .BR setreuid (2)) | |
529 | as a way to specify "no user ID". | |
530 | Leaving | |
374215d5 | 531 | .IR "(uid_t)\ \-1" |
09fcbb82 | 532 | unmapped and unusable guarantees that there will be no |
6cfec3d8 | 533 | confusion when using these interfaces. |
d6842bf1 MK |
534 | .\" |
535 | .\" ============================================================ | |
536 | .\" | |
62a5214c | 537 | .SS Defining user and group ID mappings: writing to uid_map and gid_map |
046de6a7 MK |
538 | .PP |
539 | After the creation of a new user namespace, the | |
540 | .I uid_map | |
541 | file of | |
542 | .I one | |
37909bee | 543 | of the processes in the namespace may be written to |
046de6a7 MK |
544 | .I once |
545 | to define the mapping of user IDs in the new user namespace. | |
1b3d5347 | 546 | An attempt to write more than once to a |
046de6a7 MK |
547 | .I uid_map |
548 | file in a user namespace fails with the error | |
1b3d5347 MK |
549 | .BR EPERM . |
550 | Similar rules apply for | |
551 | .I gid_map | |
552 | files. | |
046de6a7 MK |
553 | |
554 | The lines written to | |
555 | .IR uid_map | |
1b3d5347 | 556 | .RI ( gid_map ) |
046de6a7 MK |
557 | must conform to the following rules: |
558 | .IP * 3 | |
559 | The three fields must be valid numbers, | |
560 | and the last field must be greater than 0. | |
561 | .IP * | |
562 | Lines are terminated by newline characters. | |
563 | .IP * | |
564 | There is an (arbitrary) limit on the number of lines in the file. | |
74412268 | 565 | As at Linux 3.18, the limit is five lines. |
046de6a7 MK |
566 | In addition, the number of bytes written to |
567 | the file must be less than the system page size, | |
568 | .\" FIXME(Eric): the restriction "less than" rather than "less than or equal" | |
569 | .\" seems strangely arbitrary. Furthermore, the comment does not agree | |
11d8ef17 | 570 | .\" with the code in kernel/user_namespace.c. Which is correct? |
046de6a7 MK |
571 | and the write must be performed at the start of the file (i.e., |
572 | .BR lseek (2) | |
573 | and | |
574 | .BR pwrite (2) | |
575 | can't be used to write to nonzero offsets in the file). | |
576 | .IP * | |
1b3d5347 MK |
577 | The range of user IDs (group IDs) |
578 | specified in each line cannot overlap with the ranges | |
046de6a7 | 579 | in any other lines. |
df23ae04 MK |
580 | In the initial implementation (Linux 3.8), this requirement was |
581 | satisfied by a simplistic implementation that imposed the further | |
046de6a7 MK |
582 | requirement that |
583 | the values in both field 1 and field 2 of successive lines must be | |
df23ae04 MK |
584 | in ascending numerical order, |
585 | which prevented some otherwise valid maps from being created. | |
586 | Linux 3.9 and later | |
587 | .\" commit 0bd14b4fd72afd5df41e9fd59f356740f22fceba | |
588 | fix this limitation, allowing any valid set of nonoverlapping maps. | |
046de6a7 MK |
589 | .IP * |
590 | At least one line must be written to the file. | |
591 | .PP | |
592 | Writes that violate the above rules fail with the error | |
593 | .BR EINVAL . | |
594 | ||
595 | In order for a process to write to the | |
596 | .I /proc/[pid]/uid_map | |
597 | .RI ( /proc/[pid]/gid_map ) | |
598 | file, all of the following requirements must be met: | |
599 | .IP 1. 3 | |
600 | The writing process must have the | |
601 | .BR CAP_SETUID | |
602 | .RB ( CAP_SETGID ) | |
603 | capability in the user namespace of the process | |
604 | .IR pid . | |
046de6a7 | 605 | .IP 2. |
31a7d506 | 606 | The writing process must either be in the user namespace of the process |
046de6a7 | 607 | .I pid |
31a7d506 | 608 | or be in the parent user namespace of the process |
046de6a7 MK |
609 | .IR pid . |
610 | .IP 3. | |
1863e451 MK |
611 | The mapped user IDs (group IDs) must in turn have a mapping |
612 | in the parent user namespace. | |
613 | .IP 4. | |
30b33164 | 614 | One of the following two cases applies: |
046de6a7 MK |
615 | .RS |
616 | .IP * 3 | |
30b33164 MK |
617 | .IR Either |
618 | the writing process has the | |
619 | .BR CAP_SETUID | |
620 | .RB ( CAP_SETGID ) | |
6c8571e0 MK |
621 | capability in the |
622 | .I parent | |
623 | user namespace. | |
30b33164 MK |
624 | .RS |
625 | .IP + 3 | |
626 | No further restrictions apply: | |
50b49f0b | 627 | the process can make mappings to arbitrary user IDs (group IDs) |
30b33164 MK |
628 | in the parent user namespace. |
629 | .RE | |
630 | .IP * 3 | |
631 | .IR Or | |
632 | otherwise all of the following restrictions apply: | |
633 | .RS | |
634 | .IP + 3 | |
046de6a7 MK |
635 | The data written to |
636 | .I uid_map | |
637 | .RI ( gid_map ) | |
690c890a MK |
638 | must consist of a single line that maps |
639 | the writing process's effective user ID | |
046de6a7 MK |
640 | (group ID) in the parent user namespace to a user ID (group ID) |
641 | in the user namespace. | |
30b33164 | 642 | .IP + |
0c9abe8b EB |
643 | The writing process must have the same effective user ID as the process |
644 | that created the user namespace. | |
30b33164 | 645 | .IP + |
0c9abe8b | 646 | In the case of |
f2d61dbb | 647 | .IR gid_map , |
30b33164 | 648 | use of the |
f2d61dbb | 649 | .BR setgroups (2) |
30b33164 MK |
650 | system call must first be denied by writing |
651 | .RI \(dq deny \(dq | |
652 | to the | |
653 | .I /proc/[pid]/setgroups | |
654 | file (see below) before writing to | |
655 | .IR gid_map . | |
656 | .RE | |
046de6a7 MK |
657 | .RE |
658 | .PP | |
659 | Writes that violate the above rules fail with the error | |
660 | .BR EPERM . | |
d6842bf1 MK |
661 | .\" |
662 | .\" ============================================================ | |
663 | .\" | |
f2d61dbb MK |
664 | .SS Interaction with system calls that change process UIDs or GIDs |
665 | In a user namespace where the | |
0c9abe8b | 666 | .I uid_map |
f2d61dbb MK |
667 | file has not been written, the system calls that change user IDs will fail. |
668 | Similarly, if the | |
0c9abe8b | 669 | .I gid_map |
f2d61dbb MK |
670 | file has not been written, the system calls that change group IDs will fail. |
671 | After the | |
0c9abe8b EB |
672 | .I uid_map |
673 | and | |
674 | .I gid_map | |
f2d61dbb MK |
675 | files have been written, only the mapped values may be used in |
676 | system calls that change user and group IDs. | |
0c9abe8b | 677 | |
f2d61dbb MK |
678 | For user IDs, the relevant system calls include |
679 | .BR setuid (2), | |
680 | .BR setfsuid (2), | |
681 | .BR setreuid (2), | |
0c9abe8b | 682 | and |
f2d61dbb MK |
683 | .BR setresuid (2). |
684 | For group IDs, the relevant system calls include | |
685 | .BR setgid (2), | |
686 | .BR setfsgid (2), | |
687 | .BR setregid (2), | |
688 | .BR setresgid (2), | |
0c9abe8b | 689 | and |
f2d61dbb | 690 | .BR setgroups (2). |
0c9abe8b EB |
691 | |
692 | Writing | |
f2d61dbb | 693 | .RI \(dq deny \(dq |
0c9abe8b EB |
694 | to the |
695 | .I /proc/[pid]/setgroups | |
696 | file before writing to | |
697 | .I /proc/[pid]/gid_map | |
f2d61dbb MK |
698 | .\" Things changed in Linux 3.19 |
699 | .\" commit 9cc46516ddf497ea16e8d7cb986ae03a0f6b92f8 | |
700 | .\" commit 66d2f338ee4c449396b6f99f5e75cd18eb6df272 | |
701 | .\" http://lwn.net/Articles/626665/ | |
702 | will permanently disable | |
703 | .BR setgroups (2) | |
704 | in a user namespace and allow writing to | |
0c9abe8b | 705 | .I /proc/[pid]/gid_map |
f2d61dbb | 706 | without having the |
0c9abe8b | 707 | .BR CAP_SETGID |
f2d61dbb MK |
708 | capability in the parent user namespace. |
709 | .\" | |
710 | .\" ============================================================ | |
711 | .\" | |
ab28dba9 | 712 | .SS The /proc/[pid]/setgroups file |
458abbe6 | 713 | .\" |
ab28dba9 MK |
714 | .\" commit 9cc46516ddf497ea16e8d7cb986ae03a0f6b92f8 |
715 | .\" commit 66d2f338ee4c449396b6f99f5e75cd18eb6df272 | |
716 | .\" http://lwn.net/Articles/626665/ | |
717 | .\" http://web.nvd.nist.gov/view/vuln/detail?vulnId=CVE-2014-8989 | |
718 | .\" | |
719 | The | |
720 | .I /proc/[pid]/setgroups | |
721 | file displays the string | |
722 | .RI \(dq allow \(dq | |
723 | if processes in the user namespace that contains the process | |
724 | .I pid | |
725 | are permitted to employ the | |
726 | .BR setgroups (2) | |
727 | system call; it displays | |
728 | .RI \(dq deny \(dq | |
729 | if | |
730 | .BR setgroups (2) | |
731 | is not permitted in that user namespace. | |
e2b6e58c MK |
732 | Note that regardless of the value in the |
733 | .I /proc/[pid]/setgroups | |
734 | file (and regardless of the process's capabilities), calls to | |
735 | .BR setgroups (2) | |
736 | are also not permitted if | |
737 | .IR /proc/[pid]/gid_map | |
738 | has not yet been set. | |
ab28dba9 MK |
739 | |
740 | A privileged process (one with the | |
741 | .BR CAP_SYS_ADMIN | |
742 | capability in the namespace) may write either of the strings | |
743 | .RI \(dq allow \(dq | |
744 | or | |
745 | .RI \(dq deny \(dq | |
746 | to this file | |
747 | .I before | |
748 | writing a group ID mapping | |
749 | for this user namespace to the file | |
750 | .IR /proc/[pid]/gid_map . | |
751 | Writing the string | |
752 | .RI \(dq deny \(dq | |
753 | prevents any process in the user namespace from employing | |
754 | .BR setgroups (2). | |
d6add5ef MK |
755 | |
756 | The essence of the restrictions described in the preceding | |
757 | paragraph is that it is permitted to write to | |
ab28dba9 | 758 | .I /proc/[pid]/setgroups |
d6add5ef | 759 | only so long as calling |
ab28dba9 | 760 | .BR setgroups (2) |
d6add5ef | 761 | is disallowed because |
ab28dba9 MK |
762 | .I /proc/[pid]/gid_map |
763 | has not been set. | |
764 | This ensures that a process cannot transition from a state where | |
765 | .BR setgroups (2) | |
766 | is allowed to a state where | |
767 | .BR setgroups (2) | |
768 | is denied; | |
b64fbdca | 769 | a process can transition only from |
ab28dba9 MK |
770 | .BR setgroups (2) |
771 | being disallowed to | |
772 | .BR setgroups (2) | |
773 | being allowed. | |
774 | ||
775 | The default value of this file in the initial user namespace is | |
776 | .RI \(dq allow \(dq. | |
777 | ||
778 | Once | |
779 | .IR /proc/[pid]/gid_map | |
780 | has been written to | |
781 | (which has the effect of enabling | |
782 | .BR setgroups (2) | |
783 | in the user namespace), | |
fe3e2b4e | 784 | it is no longer possible to disallow |
ab28dba9 | 785 | .BR setgroups (2) |
fe3e2b4e | 786 | by writing |
bb6adc58 | 787 | .RI \(dq deny \(dq |
fe3e2b4e MK |
788 | to |
789 | .IR /proc/[pid]/setgroups | |
790 | (the write fails with the error | |
791 | .BR EPERM ). | |
ab28dba9 MK |
792 | |
793 | A child user namespace inherits the | |
34bcced0 | 794 | .IR /proc/[pid]/setgroups |
ab28dba9 MK |
795 | setting from its parent. |
796 | ||
797 | If the | |
798 | .I setgroups | |
799 | file has the value | |
800 | .RI \(dq deny \(dq, | |
801 | then the | |
802 | .BR setgroups (2) | |
803 | system call can't subsequently be reenabled (by writing | |
804 | .RI \(dq allow \(dq | |
805 | to the file) in this user namespace. | |
d6add5ef MK |
806 | (Attempts to do so will fail with the error |
807 | .BR EPERM .) | |
ab28dba9 MK |
808 | This restriction also propagates down to all child user namespaces of |
809 | this user namespace. | |
ecb0ff30 MK |
810 | |
811 | The | |
812 | .I /proc/[pid]/setgroups | |
813 | file was added in Linux 3.19, | |
814 | but was backported to many earlier stable kernel series, | |
815 | because it addresses a security issue. | |
816 | The issue concerned files with permissions such as "rwx\-\-\-rwx". | |
817 | Such files give fewer permissions to "group" than they do to "other". | |
818 | This means that dropping groups using | |
819 | .BR setgroups (2) | |
820 | might allow a process file access that it did not formerly have. | |
821 | Before the existence of user namespaces this was not a concern, | |
822 | since only a privileged process (one with the | |
823 | .BR CAP_SETGID | |
824 | capability) could call | |
825 | .BR setgroups (2). | |
826 | However, with the introduction of user namespaces, | |
827 | it became possible for an unprivileged process to create | |
828 | a new namespace in which the user had all privileges. | |
829 | This then allowed formerly unprivileged | |
830 | users to drop groups and thus gain file access | |
831 | that they did not previously have. | |
832 | The | |
833 | .I /proc/[pid]/setgroups | |
834 | file was added to address this security issue, | |
1fc04edf | 835 | by denying any pathway for an unprivileged process to drop groups with |
ecb0ff30 MK |
836 | .BR setgroups (2). |
837 | .\" | |
838 | .\" /proc/PID/setgroups | |
839 | .\" [allow == setgroups() is allowed, "deny" == setgroups() is disallowed] | |
840 | .\" * Can write if have CAP_SYS_ADMIN in NS | |
841 | .\" * Must write BEFORE writing to /proc/PID/gid_map | |
842 | .\" | |
843 | .\" setgroups() | |
844 | .\" * Must already have written to gid_map | |
845 | .\" * /proc/PID/setgroups must be "allow" | |
846 | .\" | |
847 | .\" /proc/PID/gid_map -- writing | |
848 | .\" * Must already have written "deny" to /proc/PID/setgroups | |
ab28dba9 MK |
849 | .\" |
850 | .\" ============================================================ | |
851 | .\" | |
3e2a37ec MK |
852 | .SS Unmapped user and group IDs |
853 | .PP | |
854 | There are various places where an unmapped user ID (group ID) | |
855 | may be exposed to user space. | |
856 | For example, the first process in a new user namespace may call | |
a4680ab5 | 857 | .BR getuid (2) |
3e2a37ec MK |
858 | before a user ID mapping has been defined for the namespace. |
859 | In most such cases, an unmapped user ID is converted | |
860 | .\" from_kuid_munged(), from_kgid_munged() | |
861 | to the overflow user ID (group ID); | |
862 | the default value for the overflow user ID (group ID) is 65534. | |
863 | See the descriptions of | |
864 | .IR /proc/sys/kernel/overflowuid | |
865 | and | |
866 | .IR /proc/sys/kernel/overflowgid | |
867 | in | |
868 | .BR proc (5). | |
869 | ||
870 | The cases where unmapped IDs are mapped in this fashion include | |
871 | system calls that return user IDs | |
dba9ebf2 | 872 | .RB ( getuid (2), |
3e2a37ec MK |
873 | .BR getgid (2), |
874 | and similar), | |
875 | credentials passed over a UNIX domain socket, | |
876 | .\" also SO_PEERCRED | |
877 | credentials returned by | |
878 | .BR stat (2), | |
879 | .BR waitid (2), | |
880 | and the System V IPC "ctl" | |
881 | .B IPC_STAT | |
882 | operations, | |
883 | credentials exposed by | |
750653a8 | 884 | .IR /proc/[pid]/status |
3e2a37ec MK |
885 | and the files in |
886 | .IR /proc/sysvipc/* , | |
887 | credentials returned via the | |
888 | .I si_uid | |
889 | field in the | |
890 | .I siginfo_t | |
891 | received with a signal (see | |
892 | .BR sigaction (2)), | |
893 | credentials written to the process accounting file (see | |
3b44624f | 894 | .BR acct (5)), |
3e2a37ec MK |
895 | and credentials returned with POSIX message queue notifications (see |
896 | .BR mq_notify (3)). | |
897 | ||
898 | There is one notable case where unmapped user and group IDs are | |
899 | .I not | |
900 | .\" from_kuid(), from_kgid() | |
901 | .\" Also F_GETOWNER_UIDS is an exception | |
902 | converted to the corresponding overflow ID value. | |
903 | When viewing a | |
904 | .I uid_map | |
905 | or | |
906 | .I gid_map | |
907 | file in which there is no mapping for the second field, | |
908 | that field is displayed as 4294967295 (\-1 as an unsigned integer). | |
909 | .\" | |
910 | .\" ============================================================ | |
911 | .\" | |
62a5214c | 912 | .SS Set-user-ID and set-group-ID programs |
046de6a7 MK |
913 | .PP |
914 | When a process inside a user namespace executes | |
915 | a set-user-ID (set-group-ID) program, | |
916 | the process's effective user (group) ID inside the namespace is changed | |
917 | to whatever value is mapped for the user (group) ID of the file. | |
918 | However, if either the user | |
919 | .I or | |
920 | the group ID of the file has no mapping inside the namespace, | |
921 | the set-user-ID (set-group-ID) bit is silently ignored: | |
922 | the new program is executed, | |
923 | but the process's effective user (group) ID is left unchanged. | |
924 | (This mirrors the semantics of executing a set-user-ID or set-group-ID | |
ab3311aa | 925 | program that resides on a filesystem that was mounted with the |
046de6a7 | 926 | .BR MS_NOSUID |
3b44624f | 927 | flag, as described in |
046de6a7 | 928 | .BR mount (2).) |
6eda9441 MK |
929 | .\" |
930 | .\" ============================================================ | |
931 | .\" | |
932 | .SS Miscellaneous | |
933 | .PP | |
934 | When a process's user and group IDs are passed over a UNIX domain socket | |
935 | to a process in a different user namespace (see the description of | |
936 | .B SCM_CREDENTIALS | |
937 | in | |
938 | .BR unix (7)), | |
939 | they are translated into the corresponding values as per the | |
940 | receiving process's user and group ID mappings. | |
63f66893 | 941 | .\" |
046de6a7 MK |
942 | .SH CONFORMING TO |
943 | Namespaces are a Linux-specific feature. | |
63f66893 | 944 | .\" |
62a5214c MK |
945 | .SH NOTES |
946 | Over the years, there have been a lot of features that have been added | |
7ae693d0 | 947 | to the Linux kernel that have been made available only to privileged users |
62a5214c MK |
948 | because of their potential to confuse set-user-ID-root applications. |
949 | In general, it becomes safe to allow the root user in a user namespace to | |
950 | use those features because it is impossible, while in a user namespace, | |
951 | to gain more privilege than the root user of a user namespace has. | |
bc921757 MK |
952 | .\" |
953 | .\" ============================================================ | |
954 | .\" | |
c3f29a89 MK |
955 | .SS Availability |
956 | Use of user namespaces requires a kernel that is configured with the | |
957 | .B CONFIG_USER_NS | |
958 | option. | |
959 | User namespaces require support in a range of subsystems across | |
960 | the kernel. | |
961 | When an unsupported subsystem is configured into the kernel, | |
962 | it is not possible to configure user namespaces support. | |
ed8bd845 MK |
963 | |
964 | As at Linux 3.8, most relevant subsystems supported user namespaces, | |
965 | but a number of filesystems did not have the infrastructure needed | |
966 | to map user and group IDs between user namespaces. | |
967 | Linux 3.9 added the required infrastructure support for many of | |
968 | the remaining unsupported filesystems | |
969 | (Plan 9 (9P), Andrew File System (AFS), Ceph, CIFS, CODA, NFS, and OCFS2). | |
fa7ae0ea | 970 | Linux 3.12 added support for the last of the unsupported major filesystems,
c0d02ab0 MK |
971 | .\" commit d6970d4b726cea6d7a9bc4120814f95c09571fc3 |
972 | XFS. | |
63f66893 | 973 | .\" |
8d36d80c MK |
974 | .SH EXAMPLE |
975 | The program below is designed to allow experimenting with | |
976 | user namespaces, as well as other types of namespaces. | |
977 | It creates namespaces as specified by command-line options and then executes | |
978 | a command inside those namespaces. | |
979 | The comments and | |
980 | .I usage() | |
981 | function inside the program provide a full explanation of the program. | |
3e2a37ec MK |
982 | The following shell session demonstrates its use. |
983 | ||
984 | First, we look at the run-time environment: | |
8d36d80c MK |
985 | |
986 | .in +4n | |
987 | .nf | |
988 | $ \fBuname -rs\fP # Need Linux 3.8 or later | |
989 | Linux 3.8.0 | |
990 | $ \fBid -u\fP # Running as unprivileged user | |
991 | 1000 | |
992 | $ \fBid -g\fP | |
993 | 1000 | |
994 | .fi | |
995 | .in | |
996 | ||
3e2a37ec | 997 | Now start a new shell in new user |
8d36d80c MK |
998 | .RI ( \-U ), |
999 | mount | |
1000 | .RI ( \-m ), | |
1001 | and PID | |
1002 | .RI ( \-p ) | |
1003 | namespaces, with user ID | |
1004 | .RI ( \-M ) | |
3b44624f | 1005 | and group ID |
8d36d80c | 1006 | .RI ( \-G ) |
3b44624f | 1007 | 1000 mapped to 0 inside the user namespace: |
8d36d80c MK |
1008 | |
1009 | .in +4n | |
1010 | .nf | |
1011 | $ \fB./userns_child_exec -p -m -U -M '0 1000 1' -G '0 1000 1' bash\fP | |
1012 | .fi | |
1013 | .in | |
1014 | ||
f5d401dd | 1015 | The shell has PID 1, because it is the first process in the new |
8d36d80c MK |
1016 | PID namespace: |
1017 | ||
1018 | .in +4n | |
1019 | .nf | |
1020 | bash$ \fBecho $$\fP | |
1021 | 1 | |
1022 | .fi | |
1023 | .in | |
1024 | ||
1025 | Inside the user namespace, the shell has user and group ID 0, | |
1026 | and a full set of permitted and effective capabilities: | |
1027 | ||
1028 | .in +4n | |
1029 | .nf | |
1030 | bash$ \fBcat /proc/$$/status | egrep '^[UG]id'\fP | |
1031 | Uid: 0 0 0 0 | |
1032 | Gid: 0 0 0 0 | |
1033 | bash$ \fBcat /proc/$$/status | egrep '^Cap(Prm|Inh|Eff)'\fP | |
1034 | CapInh: 0000000000000000 | |
1035 | CapPrm: 0000001fffffffff | |
1036 | CapEff: 0000001fffffffff | |
1037 | .fi | |
1038 | .in | |
1039 | ||
1040 | Mounting a new | |
1041 | .I /proc | |
ab3311aa | 1042 | filesystem and listing all of the processes visible |
8d36d80c MK |
1043 | in the new PID namespace shows that the shell can't see |
1044 | any processes outside the PID namespace: | |
1045 | ||
1046 | .in +4n | |
1047 | .nf | |
1048 | bash$ \fBmount -t proc proc /proc\fP | |
1049 | bash$ \fBps ax\fP | |
1050 | PID TTY STAT TIME COMMAND | |
1051 | 1 pts/3 S 0:00 bash | |
1052 | 22 pts/3 R+ 0:00 ps ax | |
1053 | .fi | |
1054 | .in | |
1055 | .SS Program source | |
1056 | \& | |
1057 | .nf | |
1058 | /* userns_child_exec.c | |
1059 | ||
1060 | Licensed under GNU General Public License v2 or later | |
1061 | ||
1062 | Create a child process that executes a shell command in new | |
1063 | namespace(s); allow UID and GID mappings to be specified when | |
1064 | creating a user namespace. | |
1065 | */ | |
1066 | #define _GNU_SOURCE | |
1067 | #include <sched.h> | |
1068 | #include <unistd.h> | |
1069 | #include <stdlib.h> | |
1070 | #include <sys/wait.h> | |
1071 | #include <signal.h> | |
1072 | #include <fcntl.h> | |
1073 | #include <stdio.h> | |
1074 | #include <string.h> | |
1075 | #include <limits.h> | |
1076 | #include <errno.h> | |
1077 | ||
1078 | /* A simple error\-handling function: print an error message based | |
1079 | on the value in \(aqerrno\(aq and terminate the calling process */ | |
1080 | ||
1081 | #define errExit(msg) do { perror(msg); exit(EXIT_FAILURE); \\ | |
1082 | } while (0) | |
1083 | ||
1084 | struct child_args { | |
1085 | char **argv; /* Command to be executed by child, with args */ | |
1086 | int pipe_fd[2]; /* Pipe used to synchronize parent and child */ | |
1087 | }; | |
1088 | ||
1089 | static int verbose; | |
1090 | ||
1091 | static void | |
1092 | usage(char *pname) | |
1093 | { | |
1094 | fprintf(stderr, "Usage: %s [options] cmd [arg...]\\n\\n", pname); | |
1095 | fprintf(stderr, "Create a child process that executes a shell " | |
1096 | "command in a new user namespace,\\n" | |
1097 | "and possibly also other new namespace(s).\\n\\n"); | |
1098 | fprintf(stderr, "Options can be:\\n\\n"); | |
1099 | #define fpe(str) fprintf(stderr, " %s", str); | |
1100 | fpe("\-i New IPC namespace\\n"); | |
1101 | fpe("\-m New mount namespace\\n"); | |
1102 | fpe("\-n New network namespace\\n"); | |
1103 | fpe("\-p New PID namespace\\n"); | |
1104 | fpe("\-u New UTS namespace\\n"); | |
1105 | fpe("\-U New user namespace\\n"); | |
1106 | fpe("\-M uid_map Specify UID map for user namespace\\n"); | |
1107 | fpe("\-G gid_map Specify GID map for user namespace\\n"); | |
1108 | fpe("\-z Map user\(aqs UID and GID to 0 in user namespace\\n"); | |
1109 | fpe(" (equivalent to: \-M \(aq0 <uid> 1\(aq \-G \(aq0 <gid> 1\(aq)\\n"); | |
1110 | fpe("\-v Display verbose messages\\n"); | |
1111 | fpe("\\n"); | |
1112 | fpe("If \-z, \-M, or \-G is specified, \-U is required.\\n"); | |
1113 | fpe("It is not permitted to specify both \-z and either \-M or \-G.\\n"); | |
1114 | fpe("\\n"); | |
1115 | fpe("Map strings for \-M and \-G consist of records of the form:\\n"); | |
1116 | fpe("\\n"); | |
1117 | fpe(" ID\-inside\-ns ID\-outside\-ns len\\n"); | |
1118 | fpe("\\n"); | |
1119 | fpe("A map string can contain multiple records, separated" | |
1120 | " by commas;\\n"); | |
1121 | fpe("the commas are replaced by newlines before writing" | |
1122 | " to map files.\\n"); | |
1123 | ||
1124 | exit(EXIT_FAILURE); | |
1125 | } | |
1126 | ||
1127 | /* Update the mapping file \(aqmap_file\(aq, with the value provided in | |
1128 | \(aqmapping\(aq, a string that defines a UID or GID mapping. A UID or | |
1129 | GID mapping consists of one or more newline\-delimited records | |
1130 | of the form: | |
1131 | ||
1132 | ID\-inside\-ns ID\-outside\-ns length | |
1133 | ||
1134 | Requiring the user to supply a string that contains newlines is | |
1135 | of course inconvenient for command\-line use. Thus, we permit the | |
1136 | use of commas to delimit records in this string, and replace them | |
1137 | with newlines before writing the string to the file. */ | |
1138 | ||
1139 | static void | |
1140 | update_map(char *mapping, char *map_file) | |
1141 | { | |
1142 | int fd, j; | |
1143 | size_t map_len; /* Length of \(aqmapping\(aq */ | |
1144 | ||
1145 | /* Replace commas in mapping string with newlines */ | |
1146 | ||
1147 | map_len = strlen(mapping); | |
1148 | for (j = 0; j < map_len; j++) | |
1149 | if (mapping[j] == \(aq,\(aq) | |
1150 | mapping[j] = \(aq\\n\(aq; | |
1151 | ||
1152 | fd = open(map_file, O_RDWR); | |
1153 | if (fd == \-1) { | |
8f99aa89 MK |
1154 | fprintf(stderr, "ERROR: open %s: %s\\n", map_file, |
1155 | strerror(errno)); | |
1156 | exit(EXIT_FAILURE); | |
8d36d80c MK |
1157 | } |
1158 | ||
1159 | if (write(fd, mapping, map_len) != map_len) { | |
8f99aa89 MK |
1160 | fprintf(stderr, "ERROR: write %s: %s\\n", map_file, |
1161 | strerror(errno)); | |
1162 | exit(EXIT_FAILURE); | |
8d36d80c MK |
1163 | } |
1164 | ||
1165 | close(fd); | |
1166 | } | |
1167 | ||
c38a2a04 MK |
1168 | /* Linux 3.19 made a change in the handling of setgroups(2) and the |
1169 | \(aqgid_map\(aq file to address a security issue. The issue allowed | |
1170 | *unprivileged* users to employ user namespaces in order to drop groups. | |
1171 | The upshot of the 3.19 changes is that in order to update the | |
1172 | \(aqgid_map\(aq file, use of the setgroups() system call in this | |
1173 | user namespace must first be disabled by writing "deny" to one of | |
1174 | the /proc/PID/setgroups files for this namespace. That is the | |
1175 | purpose of the following function. */ | |
1176 | ||
1177 | static void | |
1178 | proc_setgroups_write(pid_t child_pid, char *str) | |
1179 | { | |
1180 | char setgroups_path[PATH_MAX]; | |
1181 | int fd; | |
1182 | ||
1183 | snprintf(setgroups_path, PATH_MAX, "/proc/%ld/setgroups", | |
1184 | (long) child_pid); | |
1185 | ||
1186 | fd = open(setgroups_path, O_RDWR); | |
1187 | if (fd == \-1) { | |
1188 | ||
1189 | /* We may be on a system that doesn\(aqt support | |
1190 | /proc/PID/setgroups. In that case, the file won\(aqt exist, | |
1191 | and the system won\(aqt impose the restrictions that Linux 3.19 | |
1192 | added. That\(aqs fine: we don\(aqt need to do anything in order | |
1193 | to permit \(aqgid_map\(aq to be updated. | |
1194 | ||
1195 | However, if the error from open() was something other than | |
1196 | the ENOENT error that is expected for that case, let the | |
1197 | user know. */ | |
1198 | ||
1199 | if (errno != ENOENT) | |
1200 | fprintf(stderr, "ERROR: open %s: %s\\n", setgroups_path, | |
1201 | strerror(errno)); | |
1202 | return; | |
1203 | } | |
1204 | ||
1205 | if (write(fd, str, strlen(str)) == \-1) | |
1206 | fprintf(stderr, "ERROR: write %s: %s\\n", setgroups_path, | |
1207 | strerror(errno)); | |
1208 | ||
1209 | close(fd); | |
1210 | } | |
1211 | ||
8d36d80c MK |
1212 | static int /* Start function for cloned child */ |
1213 | childFunc(void *arg) | |
1214 | { | |
1215 | struct child_args *args = (struct child_args *) arg; | |
1216 | char ch; | |
1217 | ||
1218 | /* Wait until the parent has updated the UID and GID mappings. | |
1219 | See the comment in main(). We wait for end of file on a | |
1220 | pipe that will be closed by the parent process once it has | |
1221 | updated the mappings. */ | |
1222 | ||
1223 | close(args\->pipe_fd[1]); /* Close our descriptor for the write | |
1224 | end of the pipe so that we see EOF | |
1225 | when parent closes its descriptor */ | |
1226 | if (read(args\->pipe_fd[0], &ch, 1) != 0) { | |
1227 | fprintf(stderr, | |
1228 | "Failure in child: read from pipe returned != 0\\n"); | |
1229 | exit(EXIT_FAILURE); | |
1230 | } | |
1231 | ||
1232 | /* Execute a shell command */ | |
1233 | ||
1234 | printf("About to exec %s\\n", args\->argv[0]); | |
1235 | execvp(args\->argv[0], args\->argv); | |
1236 | errExit("execvp"); | |
1237 | } | |
1238 | ||
1239 | #define STACK_SIZE (1024 * 1024) | |
1240 | ||
1241 | static char child_stack[STACK_SIZE]; /* Space for child\(aqs stack */ | |
1242 | ||
1243 | int | |
1244 | main(int argc, char *argv[]) | |
1245 | { | |
1246 | int flags, opt, map_zero; | |
1247 | pid_t child_pid; | |
1248 | struct child_args args; | |
1249 | char *uid_map, *gid_map; | |
1250 | const int MAP_BUF_SIZE = 100; | |
1251 | char map_buf[MAP_BUF_SIZE]; | |
1252 | char map_path[PATH_MAX]; | |
1253 | ||
1254 | /* Parse command\-line options. The initial \(aq+\(aq character in | |
1255 | the final getopt() argument prevents GNU\-style permutation | |
1256 | of command\-line options. That\(aqs useful, since sometimes | |
1257 | the \(aqcommand\(aq to be executed by this program itself | |
1258 | has command\-line options. We don\(aqt want getopt() to treat | |
1259 | those as options to this program. */ | |
1260 | ||
1261 | flags = 0; | |
1262 | verbose = 0; | |
1263 | gid_map = NULL; | |
1264 | uid_map = NULL; | |
1265 | map_zero = 0; | |
1266 | while ((opt = getopt(argc, argv, "+imnpuUM:G:zv")) != \-1) { | |
1267 | switch (opt) { | |
1268 | case \(aqi\(aq: flags |= CLONE_NEWIPC; break; | |
1269 | case \(aqm\(aq: flags |= CLONE_NEWNS; break; | |
1270 | case \(aqn\(aq: flags |= CLONE_NEWNET; break; | |
1271 | case \(aqp\(aq: flags |= CLONE_NEWPID; break; | |
1272 | case \(aqu\(aq: flags |= CLONE_NEWUTS; break; | |
1273 | case \(aqv\(aq: verbose = 1; break; | |
1274 | case \(aqz\(aq: map_zero = 1; break; | |
1275 | case \(aqM\(aq: uid_map = optarg; break; | |
1276 | case \(aqG\(aq: gid_map = optarg; break; | |
1277 | case \(aqU\(aq: flags |= CLONE_NEWUSER; break; | |
1278 | default: usage(argv[0]); | |
1279 | } | |
1280 | } | |
1281 | ||
1282 | /* \-M, \-G, or \-z without \-U is nonsensical, as is \-z with \-M or \-G */ | |
1283 | ||
1284 | if (((uid_map != NULL || gid_map != NULL || map_zero) && | |
1285 | !(flags & CLONE_NEWUSER)) || | |
1286 | (map_zero && (uid_map != NULL || gid_map != NULL))) | |
1287 | usage(argv[0]); | |
1288 | ||
1289 | args.argv = &argv[optind]; | |
1290 | ||
1291 | /* We use a pipe to synchronize the parent and child, in order to | |
1292 | ensure that the parent sets the UID and GID maps before the child | |
1293 | calls execve(). This ensures that the child maintains its | |
1294 | capabilities during the execve() in the common case where we | |
1295 | want to map the child\(aqs effective user ID to 0 in the new user | |
1296 | namespace. Without this synchronization, the child would lose | |
1297 | its capabilities if it performed an execve() with nonzero | |
1298 | user IDs (see the capabilities(7) man page for details of the | |
1299 | transformation of a process\(aqs capabilities during execve()). */ | |
1300 | ||
1301 | if (pipe(args.pipe_fd) == \-1) | |
1302 | errExit("pipe"); | |
1303 | ||
1304 | /* Create the child in new namespace(s) */ | |
1305 | ||
1306 | child_pid = clone(childFunc, child_stack + STACK_SIZE, | |
1307 | flags | SIGCHLD, &args); | |
1308 | if (child_pid == \-1) | |
1309 | errExit("clone"); | |
1310 | ||
1311 | /* Parent falls through to here */ | |
1312 | ||
1313 | if (verbose) | |
1314 | printf("%s: PID of child created by clone() is %ld\\n", | |
1315 | argv[0], (long) child_pid); | |
1316 | ||
1317 | /* Update the UID and GID maps in the child */ | |
1318 | ||
1319 | if (uid_map != NULL || map_zero) { | |
1320 | snprintf(map_path, PATH_MAX, "/proc/%ld/uid_map", | |
1321 | (long) child_pid); | |
1322 | if (map_zero) { | |
1323 | snprintf(map_buf, MAP_BUF_SIZE, "0 %ld 1", (long) getuid()); | |
1324 | uid_map = map_buf; | |
1325 | } | |
1326 | update_map(uid_map, map_path); | |
1327 | } | |
4990f759 | 1328 | |
8d36d80c | 1329 | if (gid_map != NULL || map_zero) { |
c38a2a04 MK |
1330 | proc_setgroups_write(child_pid, "deny"); |
1331 | ||
8d36d80c MK |
1332 | snprintf(map_path, PATH_MAX, "/proc/%ld/gid_map", |
1333 | (long) child_pid); | |
1334 | if (map_zero) { | |
1335 | snprintf(map_buf, MAP_BUF_SIZE, "0 %ld 1", (long) getgid()); | |
1336 | gid_map = map_buf; | |
1337 | } | |
1338 | update_map(gid_map, map_path); | |
1339 | } | |
1340 | ||
1341 | /* Close the write end of the pipe, to signal to the child that we | |
1342 | have updated the UID and GID maps */ | |
1343 | ||
1344 | close(args.pipe_fd[1]); | |
1345 | ||
1346 | if (waitpid(child_pid, NULL, 0) == \-1) /* Wait for child */ | |
1347 | errExit("waitpid"); | |
1348 | ||
1349 | if (verbose) | |
1350 | printf("%s: terminating\\n", argv[0]); | |
1351 | ||
1352 | exit(EXIT_SUCCESS); | |
1353 | } | |
1354 | .fi | |
046de6a7 | 1355 | .SH SEE ALSO |
b6462f75 MK |
1356 | .BR newgidmap (1), \" From the shadow package |
1357 | .BR newuidmap (1), \" From the shadow package | |
046de6a7 | 1358 | .BR clone (2), |
801245a1 | 1359 | .BR ptrace (2), |
046de6a7 MK |
1360 | .BR setns (2), |
1361 | .BR unshare (2), | |
1362 | .BR proc (5), | |
b6462f75 MK |
1363 | .BR subgid (5), \" From the shadow package |
1364 | .BR subuid (5), \" From the shadow package | |
589e43bb | 1365 | .BR capabilities (7), |
3afb0c6a | 1366 | .BR cgroup_namespaces (7),
3525268c MK |
1367 | .BR credentials (7), |
1368 | .BR namespaces (7), | |
8d36d80c | 1369 | .BR pid_namespaces (7) |
c94eb4a6 MK |
1370 | .sp |
1371 | The kernel source file | |
1372 | .IR Documentation/namespaces/resource-control.txt . |