]>
Commit | Line | Data |
---|---|---|
b10c74ff EB |
1 | .\" Copyright (c) 2013, 2014 by Michael Kerrisk <mtk.manpages@gmail.com> |
2 | .\" and Copyright (c) 2012, 2014 by Eric W. Biederman <ebiederm@xmission.com> | |
046de6a7 | 3 | .\" |
5fbde956 | 4 | .\" SPDX-License-Identifier: Linux-man-pages-copyleft |
046de6a7 MK |
5 | .\" |
6 | .\" | |
6e00b7a8 | 7 | .TH USER_NAMESPACES 7 2021-08-27 "Linux" "Linux Programmer's Manual" |
046de6a7 | 8 | .SH NAME |
445d38c9 | 9 | user_namespaces \- overview of Linux user namespaces |
046de6a7 MK |
10 | .SH DESCRIPTION |
11 | For an overview of namespaces, see | |
12 | .BR namespaces (7). | |
a721e8b2 | 13 | .PP |
99f04bb1 MK |
14 | User namespaces isolate security-related identifiers and attributes, |
15 | in particular, | |
03611be8 | 16 | user IDs and group IDs (see |
dba9ebf2 | 17 | .BR credentials (7)), |
99f04bb1 | 18 | the root directory, |
03611be8 | 19 | keys (see |
9d85c789 | 20 | .BR keyrings (7)), |
6b928030 MK |
21 | .\" FIXME: This page says very little about the interaction |
22 | .\" of user namespaces and keys. Add something on this topic. | |
03611be8 | 23 | and capabilities (see |
d916d9d0 | 24 | .BR capabilities (7)). |
046de6a7 MK |
25 | A process's user and group IDs can be different |
26 | inside and outside a user namespace. | |
27 | In particular, | |
28 | a process can have a normal unprivileged user ID outside a user namespace | |
29 | while at the same time having a user ID of 0 inside the namespace; | |
30 | in other words, | |
31 | the process has full privileges for operations inside the user namespace, | |
32 | but is unprivileged for operations outside the namespace. | |
d916d9d0 MK |
33 | .\" |
34 | .\" ============================================================ | |
35 | .\" | |
36 | .SS Nested namespaces, namespace membership | |
37 | User namespaces can be nested; | |
38 | that is, each user namespace\(emexcept the initial ("root") | |
39 | namespace\(emhas a parent user namespace, | |
40 | and can have zero or more child user namespaces. | |
41 | The parent user namespace is the user namespace | |
42 | of the process that creates the user namespace via a call to | |
43 | .BR unshare (2) | |
44 | or | |
45 | .BR clone (2) | |
46 | with the | |
1ae6b2c7 | 47 | .B CLONE_NEWUSER |
d916d9d0 | 48 | flag. |
a721e8b2 | 49 | .PP |
e56b6c42 MK |
50 | The kernel imposes (since Linux 3.11) a limit of 32 nested levels of |
51 | .\" commit 8742f229b635bf1c1c84a3dfe5e47c814c20b5c8 | |
52 | user namespaces. | |
53 | .\" FIXME Explain the rationale for this limit. (What is the rationale?) | |
54 | Calls to | |
55 | .BR unshare (2) | |
56 | or | |
57 | .BR clone (2) | |
58 | that would cause this limit to be exceeded fail with the error | |
59 | .BR EUSERS . | |
a721e8b2 | 60 | .PP |
3b44624f | 61 | Each process is a member of exactly one user namespace. |
d916d9d0 MK |
62 | A process created via |
63 | .BR fork (2) | |
64 | or | |
65 | .BR clone (2) | |
66 | without the | |
1ae6b2c7 | 67 | .B CLONE_NEWUSER |
d916d9d0 | 68 | flag is a member of the same user namespace as its parent. |
7aba437a | 69 | A single-threaded process can join another user namespace with |
d916d9d0 MK |
70 | .BR setns (2) |
71 | if it has the | |
1ae6b2c7 | 72 | .B CAP_SYS_ADMIN |
d916d9d0 MK |
73 | capability in that namespace; |
74 | upon doing so, it gains a full set of capabilities in that namespace. | |
a721e8b2 | 75 | .PP |
d916d9d0 MK |
76 | A call to |
77 | .BR clone (2) | |
78 | or | |
79 | .BR unshare (2) | |
80 | with the | |
1ae6b2c7 | 81 | .B CLONE_NEWUSER |
d916d9d0 MK |
82 | flag makes the new child process (for |
83 | .BR clone (2)) | |
84 | or the caller (for | |
85 | .BR unshare (2)) | |
86 | a member of the new user namespace created by the call. | |
a721e8b2 | 87 | .PP |
7af6863b | 88 | The |
1ae6b2c7 | 89 | .B NS_GET_PARENT |
7af6863b MK |
90 | .BR ioctl (2) |
91 | operation can be used to discover the parental relationship | |
92 | between user namespaces; see | |
09860f31 | 93 | .BR ioctl_ns (2). |
d916d9d0 MK |
94 | .\" |
95 | .\" ============================================================ | |
96 | .\" | |
97 | .SS Capabilities | |
96ec9d12 MK |
98 | The child process created by |
99 | .BR clone (2) | |
100 | with the | |
1ae6b2c7 | 101 | .B CLONE_NEWUSER |
96ec9d12 MK |
102 | flag starts out with a complete set |
103 | of capabilities in the new user namespace. | |
d916d9d0 MK |
104 | Likewise, a process that creates a new user namespace using |
105 | .BR unshare (2) | |
106 | or joins an existing user namespace using | |
107 | .BR setns (2) | |
3b44624f | 108 | gains a full set of capabilities in that namespace. |
c0098e76 | 109 | On the other hand, |
d68c5f11 MK |
110 | that process has no capabilities in the parent (in the case of |
111 | .BR clone (2)) | |
112 | or previous (in the case of | |
113 | .BR unshare (2) | |
114 | and | |
115 | .BR setns (2)) | |
116 | user namespace, | |
d916d9d0 MK |
117 | even if the new namespace is created or joined by the root user |
118 | (i.e., a process with user ID 0 in the root namespace). | |
a721e8b2 | 119 | .PP |
77f95488 MK |
120 | Note that a call to |
121 | .BR execve (2) | |
6cfec3d8 | 122 | will cause a process's capabilities to be recalculated in the usual way (see |
a3969b76 MK |
123 | .BR capabilities (7)). |
124 | Consequently, | |
125 | unless the process has a user ID of 0 within the namespace, | |
126 | or the executable file has a nonempty inheritable capabilities mask, | |
127 | the process will lose all capabilities. | |
6c21c0f9 | 128 | See the discussion of user and group ID mappings, below. |
a721e8b2 | 129 | .PP |
f5d401dd | 130 | A call to |
41f974b9 | 131 | .BR clone (2) |
0666f549 | 132 | or |
41f974b9 | 133 | .BR unshare (2) |
0666f549 | 134 | using the |
1ae6b2c7 | 135 | .B CLONE_NEWUSER |
41f974b9 MK |
136 | flag |
137 | or a call to | |
138 | .BR setns (2) | |
dcf91cdc | 139 | that moves the caller into another user namespace |
41f974b9 | 140 | sets the "securebits" flags |
0666f549 MK |
141 | (see |
142 | .BR capabilities (7)) | |
143 | to their default values (all flags disabled) in the child (for | |
144 | .BR clone (2)) | |
145 | or caller (for | |
4d7a0dbc | 146 | .BR unshare (2) |
0666f549 MK |
147 | or |
148 | .BR setns (2)). | |
d68c5f11 MK |
149 | Note that because the caller no longer has capabilities |
150 | in its original user namespace after a call to | |
0666f549 MK |
151 | .BR setns (2), |
152 | it is not possible for a process to reset its "securebits" flags while | |
153 | retaining its user namespace membership by using a pair of | |
154 | .BR setns (2) | |
155 | calls to move to another user namespace and then return to | |
156 | its original user namespace. | |
a721e8b2 | 157 | .PP |
d916d9d0 MK |
158 | The rules for determining whether or not a process has a capability |
159 | in a particular user namespace are as follows: | |
160 | .IP 1. 3 | |
161 | A process has a capability inside a user namespace | |
162 | if it is a member of that namespace and | |
163 | it has the capability in its effective capability set. | |
164 | A process can gain capabilities in its effective capability | |
165 | set in various ways. | |
166 | For example, it may execute a set-user-ID program or an | |
167 | executable with associated file capabilities. | |
168 | In addition, | |
169 | a process may gain capabilities via the effect of | |
3b44624f MK |
170 | .BR clone (2), |
171 | .BR unshare (2), | |
046de6a7 | 172 | or |
d916d9d0 MK |
173 | .BR setns (2), |
174 | as already described. | |
175 | .\" In the 3.8 sources, see security/commoncap.c::cap_capable(): | |
176 | .IP 2. | |
177 | If a process has a capability in a user namespace, | |
178 | then it has that capability in all child (and further removed descendant) | |
179 | namespaces as well. | |
180 | .IP 3. | |
181 | .\" * The owner of the user namespace in the parent of the | |
182 | .\" * user namespace has all caps. | |
183 | When a user namespace is created, the kernel records the effective | |
184 | user ID of the creating process as being the "owner" of the namespace. | |
185 | .\" (and likewise associates the effective group ID of the creating process | |
186 | .\" with the namespace). | |
187 | A process that resides | |
188 | in the parent of the user namespace | |
189 | .\" See kernel commit 520d9eabce18edfef76a60b7b839d54facafe1f9 for a fix | |
190 | .\" on this point | |
191 | and whose effective user ID matches the owner of the namespace | |
192 | has all capabilities in the namespace. | |
193 | .\" This includes the case where the process executes a set-user-ID | |
194 | .\" program that confers the effective UID of the creator of the namespace. | |
195 | By virtue of the previous rule, | |
196 | this means that the process has all capabilities in all | |
197 | further removed descendant user namespaces as well. | |
8e821c3a MK |
198 | The |
199 | .B NS_GET_OWNER_UID | |
200 | .BR ioctl (2) | |
201 | operation can be used to discover the user ID of the owner of the namespace; | |
202 | see | |
203 | .BR ioctl_ns (2). | |
2304b0d7 MK |
204 | .\" |
205 | .\" ============================================================ | |
206 | .\" | |
207 | .SS Effect of capabilities within a user namespace | |
89070c1a MK |
208 | Having a capability inside a user namespace |
209 | permits a process to perform operations (that require privilege) | |
210 | only on resources governed by that namespace. | |
211 | In other words, having a capability in a user namespace permits a process | |
212 | to perform privileged operations on resources that are governed by (nonuser) | |
29af6f1a MK |
213 | namespaces owned by (associated with) the user namespace |
214 | (see the next subsection). | |
a721e8b2 | 215 | .PP |
89070c1a MK |
216 | On the other hand, there are many privileged operations that affect |
217 | resources that are not associated with any namespace type, | |
94e316bf | 218 | for example, changing the system (i.e., calendar) time (governed by |
89070c1a MK |
219 | .BR CAP_SYS_TIME ), |
220 | loading a kernel module (governed by | |
221 | .BR CAP_SYS_MODULE ), | |
222 | and creating a device (governed by | |
223 | .BR CAP_MKNOD ). | |
224 | Only a process with privileges in the | |
225 | .I initial | |
226 | user namespace can perform such operations. | |
a721e8b2 | 227 | .PP |
32efecaa MK |
228 | Holding |
229 | .B CAP_SYS_ADMIN | |
29af6f1a | 230 | within the user namespace that owns a process's mount namespace |
8a9fb19d MK |
231 | allows that process to create bind mounts |
232 | and mount the following types of filesystems: | |
32efecaa | 233 | .\" fs_flags = FS_USERNS_MOUNT in kernel sources |
a721e8b2 | 234 | .PP |
32efecaa MK |
235 | .RS 4 |
236 | .PD 0 | |
237 | .IP * 2 | |
1ae6b2c7 | 238 | .I /proc |
32efecaa MK |
239 | (since Linux 3.8) |
240 | .IP * | |
1ae6b2c7 | 241 | .I /sys |
32efecaa MK |
242 | (since Linux 3.8) |
243 | .IP * | |
1ae6b2c7 | 244 | .I devpts |
32efecaa MK |
245 | (since Linux 3.9) |
246 | .IP * | |
4e07c70f | 247 | .BR tmpfs (5) |
32efecaa MK |
248 | (since Linux 3.9) |
249 | .IP * | |
1ae6b2c7 | 250 | .I ramfs |
32efecaa MK |
251 | (since Linux 3.9) |
252 | .IP * | |
1ae6b2c7 | 253 | .I mqueue |
32efecaa MK |
254 | (since Linux 3.9) |
255 | .IP * | |
1ae6b2c7 | 256 | .I bpf |
32efecaa MK |
257 | .\" commit b2197755b2633e164a439682fb05a9b5ea48f706 |
258 | (since Linux 4.4) | |
c8e9c60b | 259 | .IP * |
1ae6b2c7 | 260 | .I overlayfs |
c8e9c60b MK |
261 | .\" commit 92dbc9dedccb9759c7f9f2f0ae6242396376988f |
262 | .\" commit 4cb2c00c43b3fe88b32f29df4f76da1b92c33224 | |
263 | (since Linux 5.11) | |
32efecaa MK |
264 | .PD |
265 | .RE | |
266 | .PP | |
7e52299f MK |
267 | Holding |
268 | .B CAP_SYS_ADMIN | |
29af6f1a | 269 | within the user namespace that owns a process's cgroup namespace |
7e52299f | 270 | allows (since Linux 4.6) |
aeeb4800 | 271 | that process to mount the cgroup version 2 filesystem and |
8c74a1ce MK |
272 | cgroup version 1 named hierarchies |
273 | (i.e., cgroup filesystems mounted with the | |
1ae6b2c7 | 274 | .I """none,name=""" |
8c74a1ce | 275 | option). |
a721e8b2 | 276 | .PP |
687d3f4a MK |
277 | Holding |
278 | .B CAP_SYS_ADMIN | |
29af6f1a | 279 | within the user namespace that owns a process's PID namespace |
687d3f4a MK |
280 | allows (since Linux 3.8) |
281 | that process to mount | |
282 | .I /proc | |
283 | filesystems. | |
a721e8b2 | 284 | .PP |
9ae13fff | 285 | Note, however, that mounting block-based filesystems can be done |
32efecaa | 286 | only by a process that holds |
1ae6b2c7 | 287 | .B CAP_SYS_ADMIN |
32efecaa | 288 | in the initial user namespace. |
d6842bf1 MK |
289 | .\" |
290 | .\" ============================================================ | |
291 | .\" | |
62a5214c | 292 | .SS Interaction of user namespaces and other types of namespaces |
046de6a7 | 293 | Starting in Linux 3.8, unprivileged processes can create user namespaces, |
8bd6881e | 294 | and the other types of namespaces can be created with just the |
046de6a7 MK |
295 | .B CAP_SYS_ADMIN |
296 | capability in the caller's user namespace. | |
a721e8b2 | 297 | .PP |
6e8a3b42 | 298 | When a nonuser namespace is created, |
576233f0 MK |
299 | it is owned by the user namespace in which the creating process |
300 | was a member at the time of the creation of the namespace. | |
18b028be MK |
301 | Privileged operations on resources governed by the nonuser namespace |
302 | require that the process has the necessary capabilities | |
303 | in the user namespace that owns the nonuser namespace. | |
a721e8b2 | 304 | .PP |
046de6a7 | 305 | If |
1ae6b2c7 | 306 | .B CLONE_NEWUSER |
046de6a7 MK |
307 | is specified along with other |
308 | .B CLONE_NEW* | |
309 | flags in a single | |
310 | .BR clone (2) | |
311 | or | |
312 | .BR unshare (2) | |
313 | call, the user namespace is guaranteed to be created first, | |
96ec9d12 MK |
314 | giving the child |
315 | .RB ( clone (2)) | |
316 | or caller | |
317 | .RB ( unshare (2)) | |
318 | privileges over the remaining namespaces created by the call. | |
046de6a7 MK |
319 | Thus, it is possible for an unprivileged caller to specify this combination |
320 | of flags. | |
a721e8b2 | 321 | .PP |
06999763 | 322 | When a new namespace (other than a user namespace) is created via |
046de6a7 MK |
323 | .BR clone (2) |
324 | or | |
325 | .BR unshare (2), | |
29af6f1a | 326 | the kernel records the user namespace of the creating process as the owner of |
046de6a7 | 327 | the new namespace. |
d916d9d0 | 328 | (This association can't be changed.) |
046de6a7 MK |
329 | When a process in the new namespace subsequently performs |
330 | privileged operations that operate on global | |
331 | resources isolated by the namespace, | |
332 | the permission checks are performed according to the process's capabilities | |
333 | in the user namespace that the kernel associated with the new namespace. | |
7ea1c45e MK |
334 | For example, suppose that a process attempts to change the hostname |
335 | .RB ( sethostname (2)), | |
336 | a resource governed by the UTS namespace. | |
337 | In this case, | |
29af6f1a | 338 | the kernel will determine which user namespace owns |
7ea1c45e MK |
339 | the process's UTS namespace, and check whether the process has the |
340 | required capability | |
341 | .RB ( CAP_SYS_ADMIN ) | |
342 | in that user namespace. | |
a721e8b2 | 343 | .PP |
41490851 | 344 | The |
1ae6b2c7 | 345 | .B NS_GET_USERNS |
41490851 | 346 | .BR ioctl (2) |
29af6f1a | 347 | operation can be used to discover the user namespace |
b784b9d5 | 348 | that owns a nonuser namespace; see |
09860f31 | 349 | .BR ioctl_ns (2). |
d6842bf1 MK |
350 | .\" |
351 | .\" ============================================================ | |
352 | .\" | |
62a5214c | 353 | .SS User and group ID mappings: uid_map and gid_map |
6eda9441 MK |
354 | When a user namespace is created, |
355 | it starts out without a mapping of user IDs (group IDs) | |
356 | to the parent user namespace. | |
046de6a7 | 357 | The |
1ae6b2c7 | 358 | .IR /proc/ pid /uid_map |
046de6a7 | 359 | and |
1ae6b2c7 | 360 | .IR /proc/ pid /gid_map |
046de6a7 MK |
361 | files (available since Linux 3.5) |
362 | .\" commit 22d917d80e842829d0ca0a561967d728eb1d6303 | |
363 | expose the mappings for user and group IDs | |
364 | inside the user namespace for the process | |
365 | .IR pid . | |
62a5214c MK |
366 | These files can be read to view the mappings in a user namespace and |
367 | written to (once) to define the mappings. | |
a721e8b2 | 368 | .PP |
62a5214c | 369 | The description in the following paragraphs explains the details for |
046de6a7 | 370 | .IR uid_map ; |
1ae6b2c7 | 371 | .I gid_map |
046de6a7 MK |
372 | is exactly the same, |
373 | but each instance of "user ID" is replaced by "group ID". | |
a721e8b2 | 374 | .PP |
046de6a7 MK |
375 | The |
376 | .I uid_map | |
377 | file exposes the mapping of user IDs from the user namespace | |
378 | of the process | |
1ae6b2c7 | 379 | .I pid |
046de6a7 | 380 | to the user namespace of the process that opened |
1ae6b2c7 | 381 | .I uid_map |
046de6a7 MK |
382 | (but see a qualification to this point below). |
383 | In other words, processes that are in different user namespaces | |
384 | will potentially see different values when reading from a particular | |
385 | .I uid_map | |
386 | file, depending on the user ID mappings for the user namespaces | |
387 | of the reading processes. | |
a721e8b2 | 388 | .PP |
046de6a7 MK |
389 | Each line in the |
390 | .I uid_map | |
391 | file specifies a 1-to-1 mapping of a range of contiguous | |
392 | user IDs between two user namespaces. | |
393 | (When a user namespace is first created, this file is empty.) | |
394 | The specification in each line takes the form of | |
395 | three numbers delimited by white space. | |
d45d0128 | 396 | The first two numbers specify the starting user ID in |
3b44624f | 397 | each of the two user namespaces. |
046de6a7 MK |
398 | The third number specifies the length of the mapped range. |
399 | In detail, the fields are interpreted as follows: | |
400 | .IP (1) 4 | |
401 | The start of the range of user IDs in | |
402 | the user namespace of the process | |
403 | .IR pid . | |
404 | .IP (2) | |
405 | The start of the range of user | |
406 | IDs to which the user IDs specified by field one map. | |
407 | How field two is interpreted depends on whether the process that opened | |
408 | .I uid_map | |
409 | and the process | |
1ae6b2c7 | 410 | .I pid |
046de6a7 MK |
411 | are in the same user namespace, as follows: |
412 | .RS | |
413 | .IP a) 3 | |
414 | If the two processes are in different user namespaces: | |
415 | field two is the start of a range of | |
416 | user IDs in the user namespace of the process that opened | |
417 | .IR uid_map . | |
418 | .IP b) | |
419 | If the two processes are in the same user namespace: | |
420 | field two is the start of the range of | |
421 | user IDs in the parent user namespace of the process | |
422 | .IR pid . | |
423 | This case enables the opener of | |
424 | .I uid_map | |
425 | (the common case here is opening | |
426 | .IR /proc/self/uid_map ) | |
427 | to see the mapping of user IDs into the user namespace of the process | |
428 | that created this user namespace. | |
429 | .RE | |
430 | .IP (3) | |
431 | The length of the range of user IDs that is mapped between the two | |
432 | user namespaces. | |
6eda9441 MK |
433 | .PP |
434 | System calls that return user IDs (group IDs)\(emfor example, | |
435 | .BR getuid (2), | |
436 | .BR getgid (2), | |
437 | and the credential fields in the structure returned by | |
438 | .BR stat (2)\(emreturn | |
3b44624f | 439 | the user ID (group ID) mapped into the caller's user namespace. |
a721e8b2 | 440 | .PP |
6eda9441 MK |
441 | When a process accesses a file, its user and group IDs |
442 | are mapped into the initial user namespace for the purpose of permission | |
443 | checking and assigning IDs when creating a file. | |
444 | When a process retrieves file user and group IDs via | |
3b44624f | 445 | .BR stat (2), |
6eda9441 MK |
446 | the IDs are mapped in the opposite direction, |
447 | to produce values relative to the process user and group ID mappings. | |
a721e8b2 | 448 | .PP |
20e4a147 MK |
449 | The initial user namespace has no parent namespace, |
450 | but, for consistency, the kernel provides dummy user and group | |
451 | ID mapping files for this namespace. | |
452 | Looking at the | |
453 | .I uid_map | |
c9195ded MK |
454 | file |
455 | .RI ( gid_map | |
20e4a147 | 456 | is the same) from a shell in the initial namespace shows: |
a721e8b2 | 457 | .PP |
20e4a147 | 458 | .in +4n |
b8302363 | 459 | .EX |
20e4a147 MK |
460 | $ \fBcat /proc/$$/uid_map\fP |
461 | 0 0 4294967295 | |
b8302363 | 462 | .EE |
20e4a147 | 463 | .in |
a721e8b2 | 464 | .PP |
20e4a147 MK |
465 | This mapping tells us |
466 | that the range starting at user ID 0 in this namespace | |
467 | maps to a range starting at 0 in the (nonexistent) parent namespace, | |
468 | and the length of the range is the largest 32-bit unsigned integer. | |
364ce935 | 469 | This leaves 4294967295 (the 32-bit signed \-1 value) unmapped. |
6cfec3d8 | 470 | This is deliberate: |
1ae6b2c7 | 471 | .I (uid_t)\~\-1 |
6cfec3d8 MK |
472 | is used in several interfaces (e.g., |
473 | .BR setreuid (2)) | |
474 | as a way to specify "no user ID". | |
475 | Leaving | |
1ae6b2c7 | 476 | .I (uid_t)\~\-1 |
09fcbb82 | 477 | unmapped and unusable guarantees that there will be no |
6cfec3d8 | 478 | confusion when using these interfaces. |
d6842bf1 MK |
479 | .\" |
480 | .\" ============================================================ | |
481 | .\" | |
62a5214c | 482 | .SS Defining user and group ID mappings: writing to uid_map and gid_map |
046de6a7 MK |
483 | After the creation of a new user namespace, the |
484 | .I uid_map | |
485 | file of | |
486 | .I one | |
37909bee | 487 | of the processes in the namespace may be written to |
046de6a7 MK |
488 | .I once |
489 | to define the mapping of user IDs in the new user namespace. | |
1b3d5347 | 490 | An attempt to write more than once to a |
046de6a7 MK |
491 | .I uid_map |
492 | file in a user namespace fails with the error | |
1b3d5347 MK |
493 | .BR EPERM . |
494 | Similar rules apply for | |
495 | .I gid_map | |
496 | files. | |
a721e8b2 | 497 | .PP |
046de6a7 | 498 | The lines written to |
1ae6b2c7 | 499 | .I uid_map |
1b3d5347 | 500 | .RI ( gid_map ) |
6486faa9 | 501 | must conform to the following validity rules: |
046de6a7 MK |
502 | .IP * 3 |
503 | The three fields must be valid numbers, | |
504 | and the last field must be greater than 0. | |
505 | .IP * | |
506 | Lines are terminated by newline characters. | |
507 | .IP * | |
dc04b652 | 508 | There is a limit on the number of lines in the file. |
2660d010 MK |
509 | In Linux 4.14 and earlier, this limit was (arbitrarily) |
510 | .\" 5*12-byte records could fit in a 64B cache line | |
511 | set at 5 lines. | |
512 | Since Linux 4.15, | |
513 | .\" commit 6397fac4915ab3002dc15aae751455da1a852f25 | |
514 | the limit is 340 lines. | |
046de6a7 MK |
515 | In addition, the number of bytes written to |
516 | the file must be less than the system page size, | |
046de6a7 MK |
517 | and the write must be performed at the start of the file (i.e., |
518 | .BR lseek (2) | |
519 | and | |
520 | .BR pwrite (2) | |
521 | can't be used to write to nonzero offsets in the file). | |
522 | .IP * | |
1b3d5347 MK |
523 | The range of user IDs (group IDs) |
524 | specified in each line cannot overlap with the ranges | |
046de6a7 | 525 | in any other lines. |
df23ae04 MK |
526 | In the initial implementation (Linux 3.8), this requirement was |
527 | satisfied by a simplistic implementation that imposed the further | |
046de6a7 MK |
528 | requirement that |
529 | the values in both field 1 and field 2 of successive lines must be | |
df23ae04 MK |
530 | in ascending numerical order, |
531 | which prevented some otherwise valid maps from being created. | |
532 | Linux 3.9 and later | |
533 | .\" commit 0bd14b4fd72afd5df41e9fd59f356740f22fceba | |
534 | fix this limitation, allowing any valid set of nonoverlapping maps. | |
046de6a7 MK |
535 | .IP * |
536 | At least one line must be written to the file. | |
537 | .PP | |
538 | Writes that violate the above rules fail with the error | |
539 | .BR EINVAL . | |
a721e8b2 | 540 | .PP |
046de6a7 | 541 | In order for a process to write to the |
1ae6b2c7 AC |
542 | .IR /proc/ pid /uid_map |
543 | .RI ( /proc/ pid /gid_map ) | |
6486faa9 | 544 | file, all of the following permission requirements must be met: |
046de6a7 MK |
545 | .IP 1. 3 |
546 | The writing process must have the | |
1ae6b2c7 | 547 | .B CAP_SETUID |
046de6a7 MK |
548 | .RB ( CAP_SETGID ) |
549 | capability in the user namespace of the process | |
550 | .IR pid . | |
046de6a7 | 551 | .IP 2. |
31a7d506 | 552 | The writing process must either be in the user namespace of the process |
046de6a7 | 553 | .I pid |
31a7d506 | 554 | or be in the parent user namespace of the process |
046de6a7 MK |
555 | .IR pid . |
556 | .IP 3. | |
1863e451 MK |
557 | The mapped user IDs (group IDs) must in turn have a mapping |
558 | in the parent user namespace. | |
559 | .IP 4. | |
ab4c4b2f | 560 | If updating |
1ae6b2c7 | 561 | .IR /proc/ pid /uid_map |
ab4c4b2f MK |
562 | to create a mapping that maps UID 0 in the parent namespace, |
563 | then one of the following must be true: | |
564 | .RS | |
565 | .IP * 3 | |
566 | if the writing process is in the parent user namespace, | |
567 | then it must have the | |
1ae6b2c7 | 568 | .B CAP_SETFCAP |
ab4c4b2f MK |
569 | capability in that user namespace; or |
570 | .IP * | |
571 | if the writing process is in the child user namespace, | |
572 | then the process that created the user namespace must have had the | |
1ae6b2c7 | 573 | .B CAP_SETFCAP |
ab4c4b2f MK |
574 | capability when the namespace was created. |
575 | .RE | |
576 | .IP | |
577 | This rule has been in place since | |
29c1f3cf | 578 | .\" commit db2e718a47984b9d71ed890eb2ea36ecf150de18 |
ab4c4b2f MK |
579 | Linux 5.12. |
580 | It eliminates an earlier security bug whereby | |
581 | a UID 0 process that lacks the | |
29c1f3cf | 582 | .B CAP_SETFCAP |
ab4c4b2f MK |
583 | capability, |
584 | which is needed to create a binary with namespaced file capabilities | |
585 | (as described in | |
586 | .BR capabilities (7)), | |
587 | could nevertheless create such a binary, | |
588 | by the following steps: | |
589 | .RS | |
590 | .IP * 3 | |
591 | Create a new user namespace with the identity mapping | |
592 | (i.e., UID 0 in the new user namespace maps to UID 0 in the parent namespace), | |
593 | so that UID 0 in both namespaces is equivalent to the same root user ID. | |
594 | .IP * | |
595 | Since the child process has the | |
596 | .B CAP_SETFCAP | |
597 | capability, it could create a binary with namespaced file capabilities | |
598 | that would then be effective in the parent user namespace | |
599 | (because the root user IDs are the same in the two namespaces). | |
600 | .RE | |
29c1f3cf | 601 | .IP 5. |
30b33164 | 602 | One of the following two cases applies: |
046de6a7 MK |
603 | .RS |
604 | .IP * 3 | |
1ae6b2c7 | 605 | .I Either |
30b33164 | 606 | the writing process has the |
1ae6b2c7 | 607 | .B CAP_SETUID |
30b33164 | 608 | .RB ( CAP_SETGID ) |
6c8571e0 MK |
609 | capability in the |
610 | .I parent | |
611 | user namespace. | |
30b33164 MK |
612 | .RS |
613 | .IP + 3 | |
614 | No further restrictions apply: | |
50b49f0b | 615 | the process can make mappings to arbitrary user IDs (group IDs) |
30b33164 MK |
616 | in the parent user namespace. |
617 | .RE | |
618 | .IP * 3 | |
1ae6b2c7 | 619 | .I Or |
30b33164 MK |
620 | otherwise all of the following restrictions apply: |
621 | .RS | |
622 | .IP + 3 | |
046de6a7 MK |
623 | The data written to |
624 | .I uid_map | |
625 | .RI ( gid_map ) | |
690c890a MK |
626 | must consist of a single line that maps |
627 | the writing process's effective user ID | |
046de6a7 MK |
628 | (group ID) in the parent user namespace to a user ID (group ID) |
629 | in the user namespace. | |
30b33164 | 630 | .IP + |
0c9abe8b EB |
631 | The writing process must have the same effective user ID as the process |
632 | that created the user namespace. | |
30b33164 | 633 | .IP + |
0c9abe8b | 634 | In the case of |
f2d61dbb | 635 | .IR gid_map , |
30b33164 | 636 | use of the |
f2d61dbb | 637 | .BR setgroups (2) |
30b33164 MK |
638 | system call must first be denied by writing |
639 | .RI \(dq deny \(dq | |
640 | to the | |
1ae6b2c7 | 641 | .IR /proc/ pid /setgroups |
30b33164 MK |
642 | file (see below) before writing to |
643 | .IR gid_map . | |
644 | .RE | |
046de6a7 MK |
645 | .RE |
646 | .PP | |
647 | Writes that violate the above rules fail with the error | |
648 | .BR EPERM . | |
d6842bf1 MK |
649 | .\" |
650 | .\" ============================================================ | |
651 | .\" | |
213e259e MK |
652 | .SS Project ID mappings: projid_map |
653 | Similarly to user and group ID mappings, | |
654 | it is possible to create project ID mappings for a user namespace. | |
655 | (Project IDs are used for disk quotas; see | |
656 | .BR setquota (8) | |
657 | and | |
658 | .BR quotactl (2).) | |
659 | .PP | |
660 | Project ID mappings are defined by writing to the | |
1ae6b2c7 | 661 | .IR /proc/ pid /projid_map |
213e259e MK |
662 | file (present since |
663 | .\" commit f76d207a66c3a53defea67e7d36c3eb1b7d6d61d | |
664 | Linux 3.7). | |
665 | .PP | |
666 | The validity rules for writing to the | |
1ae6b2c7 | 667 | .IR /proc/ pid /projid_map |
213e259e MK |
668 | file are as for writing to the |
669 | .I uid_map | |
670 | file; violation of these rules causes | |
671 | .BR write (2) | |
672 | to fail with the error | |
673 | .BR EINVAL . | |
674 | .PP | |
675 | The permission rules for writing to the | |
1ae6b2c7 | 676 | .IR /proc/ pid /projid_map |
213e259e MK |
677 | file are as follows: |
678 | .IP 1. 3 | |
679 | The writing process must either be in the user namespace of the process | |
680 | .I pid | |
681 | or be in the parent user namespace of the process | |
682 | .IR pid . | |
683 | .IP 2. | |
684 | The mapped project IDs must in turn have a mapping | |
685 | in the parent user namespace. | |
686 | .PP | |
687 | Violation of these rules causes | |
688 | .BR write (2) | |
689 | to fail with the error | |
690 | .BR EPERM . | |
691 | .\" | |
692 | .\" ============================================================ | |
693 | .\" | |
f2d61dbb MK |
694 | .SS Interaction with system calls that change process UIDs or GIDs |
695 | In a user namespace where the | |
0c9abe8b | 696 | .I uid_map |
f2d61dbb MK |
697 | file has not been written, the system calls that change user IDs will fail. |
698 | Similarly, if the | |
0c9abe8b | 699 | .I gid_map |
f2d61dbb MK |
700 | file has not been written, the system calls that change group IDs will fail. |
701 | After the | |
0c9abe8b EB |
702 | .I uid_map |
703 | and | |
704 | .I gid_map | |
f2d61dbb MK |
705 | files have been written, only the mapped values may be used in |
706 | system calls that change user and group IDs. | |
a721e8b2 | 707 | .PP |
f2d61dbb MK |
708 | For user IDs, the relevant system calls include |
709 | .BR setuid (2), | |
710 | .BR setfsuid (2), | |
711 | .BR setreuid (2), | |
0c9abe8b | 712 | and |
f2d61dbb MK |
713 | .BR setresuid (2). |
714 | For group IDs, the relevant system calls include | |
715 | .BR setgid (2), | |
716 | .BR setfsgid (2), | |
717 | .BR setregid (2), | |
718 | .BR setresgid (2), | |
0c9abe8b | 719 | and |
f2d61dbb | 720 | .BR setgroups (2). |
a721e8b2 | 721 | .PP |
0c9abe8b | 722 | Writing |
f2d61dbb | 723 | .RI \(dq deny \(dq |
0c9abe8b | 724 | to the |
1ae6b2c7 | 725 | .IR /proc/ pid /setgroups |
0c9abe8b | 726 | file before writing to |
1ae6b2c7 | 727 | .IR /proc/ pid /gid_map |
f2d61dbb MK |
728 | .\" Things changed in Linux 3.19 |
729 | .\" commit 9cc46516ddf497ea16e8d7cb986ae03a0f6b92f8 | |
730 | .\" commit 66d2f338ee4c449396b6f99f5e75cd18eb6df272 | |
731 | .\" http://lwn.net/Articles/626665/ | |
732 | will permanently disable | |
733 | .BR setgroups (2) | |
734 | in a user namespace and allow writing to | |
1ae6b2c7 | 735 | .IR /proc/ pid /gid_map |
f2d61dbb | 736 | without having the |
1ae6b2c7 | 737 | .B CAP_SETGID |
f2d61dbb MK |
738 | capability in the parent user namespace. |
739 | .\" | |
740 | .\" ============================================================ | |
741 | .\" | |
1ae6b2c7 | 742 | .SS The /proc/\fIpid\fP/setgroups file |
458abbe6 | 743 | .\" |
ab28dba9 MK |
744 | .\" commit 9cc46516ddf497ea16e8d7cb986ae03a0f6b92f8 |
745 | .\" commit 66d2f338ee4c449396b6f99f5e75cd18eb6df272 | |
746 | .\" http://lwn.net/Articles/626665/ | |
747 | .\" http://web.nvd.nist.gov/view/vuln/detail?vulnId=CVE-2014-8989 | |
748 | .\" | |
749 | The | |
1ae6b2c7 | 750 | .IR /proc/ pid /setgroups |
ab28dba9 MK |
751 | file displays the string |
752 | .RI \(dq allow \(dq | |
753 | if processes in the user namespace that contains the process | |
754 | .I pid | |
755 | are permitted to employ the | |
756 | .BR setgroups (2) | |
757 | system call; it displays | |
758 | .RI \(dq deny \(dq | |
759 | if | |
760 | .BR setgroups (2) | |
761 | is not permitted in that user namespace. | |
e2b6e58c | 762 | Note that regardless of the value in the |
1ae6b2c7 | 763 | .IR /proc/ pid /setgroups |
e2b6e58c MK |
764 | file (and regardless of the process's capabilities), calls to |
765 | .BR setgroups (2) | |
766 | are also not permitted if | |
1ae6b2c7 | 767 | .IR /proc/ pid /gid_map |
e2b6e58c | 768 | has not yet been set. |
a721e8b2 | 769 | .PP |
ab28dba9 | 770 | A privileged process (one with the |
1ae6b2c7 | 771 | .B CAP_SYS_ADMIN |
ab28dba9 MK |
772 | capability in the namespace) may write either of the strings |
773 | .RI \(dq allow \(dq | |
774 | or | |
775 | .RI \(dq deny \(dq | |
776 | to this file | |
777 | .I before | |
778 | writing a group ID mapping | |
779 | for this user namespace to the file | |
1ae6b2c7 | 780 | .IR /proc/ pid /gid_map . |
ab28dba9 MK |
781 | Writing the string |
782 | .RI \(dq deny \(dq | |
783 | prevents any process in the user namespace from employing | |
784 | .BR setgroups (2). | |
a721e8b2 | 785 | .PP |
d6add5ef MK |
786 | The essence of the restrictions described in the preceding |
787 | paragraph is that it is permitted to write to | |
1ae6b2c7 | 788 | .IR /proc/ pid /setgroups |
d6add5ef | 789 | only so long as calling |
ab28dba9 | 790 | .BR setgroups (2) |
d6add5ef | 791 | is disallowed because |
1ae6b2c7 | 792 | .IR /proc/ pid /gid_map |
ab28dba9 MK |
793 | has not been set. |
794 | This ensures that a process cannot transition from a state where | |
795 | .BR setgroups (2) | |
796 | is allowed to a state where | |
797 | .BR setgroups (2) | |
798 | is denied; | |
b64fbdca | 799 | a process can transition only from |
ab28dba9 MK |
800 | .BR setgroups (2) |
801 | being disallowed to | |
802 | .BR setgroups (2) | |
803 | being allowed. | |
a721e8b2 | 804 | .PP |
ab28dba9 MK |
805 | The default value of this file in the initial user namespace is |
806 | .RI \(dq allow \(dq. | |
a721e8b2 | 807 | .PP |
ab28dba9 | 808 | Once |
1ae6b2c7 | 809 | .IR /proc/ pid /gid_map |
ab28dba9 MK |
810 | has been written to |
811 | (which has the effect of enabling | |
812 | .BR setgroups (2) | |
813 | in the user namespace), | |
fe3e2b4e | 814 | it is no longer possible to disallow |
ab28dba9 | 815 | .BR setgroups (2) |
fe3e2b4e | 816 | by writing |
bb6adc58 | 817 | .RI \(dq deny \(dq |
fe3e2b4e | 818 | to |
1ae6b2c7 | 819 | .IR /proc/ pid /setgroups |
fe3e2b4e MK |
820 | (the write fails with the error |
821 | .BR EPERM ). | |
a721e8b2 | 822 | .PP |
ab28dba9 | 823 | A child user namespace inherits the |
1ae6b2c7 | 824 | .IR /proc/ pid /setgroups |
ab28dba9 | 825 | setting from its parent. |
a721e8b2 | 826 | .PP |
ab28dba9 MK |
827 | If the |
828 | .I setgroups | |
829 | file has the value | |
830 | .RI \(dq deny \(dq, | |
831 | then the | |
832 | .BR setgroups (2) | |
833 | system call can't subsequently be reenabled (by writing | |
834 | .RI \(dq allow \(dq | |
835 | to the file) in this user namespace. | |
a23d8efa | 836 | (Attempts to do so fail with the error |
d6add5ef | 837 | .BR EPERM .) |
ab28dba9 MK |
838 | This restriction also propagates down to all child user namespaces of |
839 | this user namespace. | |
a721e8b2 | 840 | .PP |
ecb0ff30 | 841 | The |
1ae6b2c7 | 842 | .IR /proc/ pid /setgroups |
ecb0ff30 MK |
843 | file was added in Linux 3.19, |
844 | but was backported to many earlier stable kernel series, | |
845 | because it addresses a security issue. | |
846 | The issue concerned files with permissions such as "rwx\-\-\-rwx". | |
847 | Such files give fewer permissions to "group" than they do to "other". | |
848 | This means that dropping groups using | |
849 | .BR setgroups (2) | |
850 | might allow a process file access that it did not formerly have. | |
851 | Before the existence of user namespaces this was not a concern, | |
852 | since only a privileged process (one with the | |
1ae6b2c7 | 853 | .B CAP_SETGID |
ecb0ff30 MK |
854 | capability) could call |
855 | .BR setgroups (2). | |
856 | However, with the introduction of user namespaces, | |
857 | it became possible for an unprivileged process to create | |
858 | a new namespace in which the user had all privileges. | |
859 | This then allowed formerly unprivileged | |
860 | users to drop groups and thus gain file access | |
861 | that they did not previously have. | |
862 | The | |
1ae6b2c7 | 863 | .IR /proc/ pid /setgroups |
ecb0ff30 | 864 | file was added to address this security issue, |
1fc04edf | 865 | by denying any pathway for an unprivileged process to drop groups with |
ecb0ff30 MK |
866 | .BR setgroups (2). |
867 | .\" | |
868 | .\" /proc/PID/setgroups | |
869 | .\" ["allow" == setgroups() is allowed, "deny" == setgroups() is disallowed] | |
870 | .\" * Can write if have CAP_SYS_ADMIN in NS | |
871 | .\" * Must write BEFORE writing to /proc/PID/gid_map | |
872 | .\" | |
873 | .\" setgroups() | |
f2c2c308 | 874 | .\" * Must already have written to gid_map |
ecb0ff30 MK |
875 | .\" * /proc/PID/setgroups must be "allow" |
876 | .\" | |
877 | .\" /proc/PID/gid_map -- writing | |
878 | .\" * Must already have written "deny" to /proc/PID/setgroups | |
ab28dba9 MK |
879 | .\" |
880 | .\" ============================================================ | |
881 | .\" | |
3e2a37ec | 882 | .SS Unmapped user and group IDs |
3e2a37ec MK |
883 | There are various places where an unmapped user ID (group ID) |
884 | may be exposed to user space. | |
885 | For example, the first process in a new user namespace may call | |
a4680ab5 | 886 | .BR getuid (2) |
3e2a37ec MK |
887 | before a user ID mapping has been defined for the namespace. |
888 | In most such cases, an unmapped user ID is converted | |
889 | .\" from_kuid_munged(), from_kgid_munged() | |
890 | to the overflow user ID (group ID); | |
891 | the default value for the overflow user ID (group ID) is 65534. | |
892 | See the descriptions of | |
1ae6b2c7 | 893 | .I /proc/sys/kernel/overflowuid |
3e2a37ec | 894 | and |
1ae6b2c7 | 895 | .I /proc/sys/kernel/overflowgid |
3e2a37ec MK |
896 | in |
897 | .BR proc (5). | |
a721e8b2 | 898 | .PP |
3e2a37ec MK |
899 | The cases where unmapped IDs are mapped in this fashion include |
900 | system calls that return user IDs | |
dba9ebf2 | 901 | .RB ( getuid (2), |
3e2a37ec MK |
902 | .BR getgid (2), |
903 | and similar), | |
904 | credentials passed over a UNIX domain socket, | |
905 | .\" also SO_PEERCRED | |
906 | credentials returned by | |
907 | .BR stat (2), | |
908 | .BR waitid (2), | |
909 | and the System V IPC "ctl" | |
910 | .B IPC_STAT | |
911 | operations, | |
912 | credentials exposed by | |
1ae6b2c7 | 913 | .IR /proc/ pid /status |
3e2a37ec MK |
914 | and the files in |
915 | .IR /proc/sysvipc/* , | |
916 | credentials returned via the | |
917 | .I si_uid | |
918 | field in the | |
919 | .I siginfo_t | |
920 | received with a signal (see | |
921 | .BR sigaction (2)), | |
922 | credentials written to the process accounting file (see | |
3b44624f | 923 | .BR acct (5)), |
3e2a37ec MK |
924 | and credentials returned with POSIX message queue notifications (see |
925 | .BR mq_notify (3)). | |
a721e8b2 | 926 | .PP |
3e2a37ec MK |
927 | There is one notable case where unmapped user and group IDs are |
928 | .I not | |
929 | .\" from_kuid(), from_kgid() | |
930 | .\" Also F_GETOWNER_UIDS is an exception | |
931 | converted to the corresponding overflow ID value. | |
932 | When viewing a | |
933 | .I uid_map | |
934 | or | |
935 | .I gid_map | |
936 | file in which there is no mapping for the second field, | |
54b9d7bf | 937 | that field is displayed as 4294967295 (\-1 as an unsigned integer). |
3e2a37ec MK |
938 | .\" |
939 | .\" ============================================================ | |
940 | .\" | |
ea8ec578 | 941 | .SS Accessing files |
b8cf6c24 | 942 | In order to determine permissions when an unprivileged process accesses a file, |
ea8ec578 MK |
943 | the process credentials (UID, GID) and the file credentials |
944 | are in effect mapped back to what they would be in | |
945 | the initial user namespace and then compared to determine | |
946 | the permissions that the process has on the file. | |
b8cf6c24 MK |
947 | The same is also true of other objects that employ the credentials plus |
948 | permissions mask accessibility model, such as System V IPC objects. | |
ea8ec578 MK |
949 | .\" |
950 | .\" ============================================================ | |
951 | .\" | |
43f4bec1 | 952 | .SS Operation of file-related capabilities |
43f4bec1 MK |
953 | Certain capabilities allow a process to bypass various |
954 | kernel-enforced restrictions when performing operations on | |
955 | files owned by other users or groups. | |
956 | These capabilities are: | |
957 | .BR CAP_CHOWN , | |
958 | .BR CAP_DAC_OVERRIDE , | |
959 | .BR CAP_DAC_READ_SEARCH , | |
960 | .BR CAP_FOWNER , | |
961 | and | |
962 | .BR CAP_FSETID . | |
963 | .PP | |
964 | Within a user namespace, | |
965 | these capabilities allow a process to bypass the rules | |
966 | if the process has the relevant capability over the file, | |
967 | meaning that: | |
968 | .IP * 3 | |
969 | the process has the relevant effective capability in its user namespace; and | |
970 | .IP * | |
971 | the file's user ID and group ID both have valid mappings | |
972 | in the user namespace. | |
973 | .PP | |
974 | The | |
1ae6b2c7 | 975 | .B CAP_FOWNER |
43f4bec1 | 976 | capability is treated somewhat exceptionally: |
43f4bec1 MK |
977 | .\" These are the checks performed by the kernel function |
978 | .\" inode_owner_or_capable(). There is one exception to the exception: | |
979 | .\" overriding the directory sticky permission bit requires that | |
980 | .\" the file has a valid mapping for both its UID and GID. | |
89a0bd82 MK |
981 | it allows a process to bypass the corresponding rules so long as |
982 | at least the file's user ID has a mapping in the user namespace | |
43f4bec1 MK |
983 | (i.e., the file's group ID does not need to have a valid mapping). |
984 | .\" | |
985 | .\" ============================================================ | |
986 | .\" | |
62a5214c | 987 | .SS Set-user-ID and set-group-ID programs |
046de6a7 MK |
988 | When a process inside a user namespace executes |
989 | a set-user-ID (set-group-ID) program, | |
990 | the process's effective user (group) ID inside the namespace is changed | |
991 | to whatever value is mapped for the user (group) ID of the file. | |
992 | However, if either the user | |
993 | .I or | |
994 | the group ID of the file has no mapping inside the namespace, | |
995 | the set-user-ID (set-group-ID) bit is silently ignored: | |
996 | the new program is executed, | |
997 | but the process's effective user (group) ID is left unchanged. | |
998 | (This mirrors the semantics of executing a set-user-ID or set-group-ID | |
ab3311aa | 999 | program that resides on a filesystem that was mounted with the |
1ae6b2c7 | 1000 | .B MS_NOSUID |
3b44624f | 1001 | flag, as described in |
046de6a7 | 1002 | .BR mount (2).) |
6eda9441 MK |
1003 | .\" |
1004 | .\" ============================================================ | |
1005 | .\" | |
1006 | .SS Miscellaneous | |
6eda9441 MK |
1007 | When a process's user and group IDs are passed over a UNIX domain socket |
1008 | to a process in a different user namespace (see the description of | |
1009 | .B SCM_CREDENTIALS | |
1010 | in | |
1011 | .BR unix (7)), | |
1012 | they are translated into the corresponding values as per the | |
1013 | receiving process's user and group ID mappings. | |
63f66893 | 1014 | .\" |
046de6a7 MK |
1015 | .SH CONFORMING TO |
1016 | Namespaces are a Linux-specific feature. | |
63f66893 | 1017 | .\" |
62a5214c MK |
1018 | .SH NOTES |
1019 | Over the years, there have been a lot of features that have been added | |
7ae693d0 | 1020 | to the Linux kernel that have been made available only to privileged users |
62a5214c MK |
1021 | because of their potential to confuse set-user-ID-root applications. |
1022 | In general, it becomes safe to allow the root user in a user namespace to | |
1023 | use those features because it is impossible, while in a user namespace, | |
1024 | to gain more privilege than the root user of a user namespace has. | |
bc921757 MK |
1025 | .\" |
1026 | .\" ============================================================ | |
1027 | .\" | |
f9f75be0 MK |
1028 | .SS Global root |
1029 | The term "global root" is sometimes used as a shorthand for | |
1030 | user ID 0 in the initial user namespace. | |
1031 | .\" | |
1032 | .\" ============================================================ | |
1033 | .\" | |
c3f29a89 MK |
1034 | .SS Availability |
1035 | Use of user namespaces requires a kernel that is configured with the | |
1036 | .B CONFIG_USER_NS | |
1037 | option. | |
1038 | User namespaces require support in a range of subsystems across | |
1039 | the kernel. | |
1040 | When an unsupported subsystem is configured into the kernel, | |
1041 | it is not possible to configure user namespaces support. | |
a721e8b2 | 1042 | .PP |
ed8bd845 MK |
1043 | As at Linux 3.8, most relevant subsystems supported user namespaces, |
1044 | but a number of filesystems did not have the infrastructure needed | |
1045 | to map user and group IDs between user namespaces. | |
1046 | Linux 3.9 added the required infrastructure support for many of | |
1047 | the remaining unsupported filesystems | |
1048 | (Plan 9 (9P), Andrew File System (AFS), Ceph, CIFS, CODA, NFS, and OCFS2). | |
38598749 | 1049 | Linux 3.12 added support for the last of the unsupported major filesystems, |
c0d02ab0 MK |
1050 | .\" commit d6970d4b726cea6d7a9bc4120814f95c09571fc3 |
1051 | XFS. | |
63f66893 | 1052 | .\" |
a14af333 | 1053 | .SH EXAMPLES |
8d36d80c MK |
1054 | The program below is designed to allow experimenting with |
1055 | user namespaces, as well as other types of namespaces. | |
1056 | It creates namespaces as specified by command-line options and then executes | |
1057 | a command inside those namespaces. | |
1058 | The comments and | |
1059 | .I usage() | |
1060 | function inside the program provide a full explanation of the program. | |
3e2a37ec | 1061 | The following shell session demonstrates its use. |
a721e8b2 | 1062 | .PP |
3e2a37ec | 1063 | First, we look at the run-time environment: |
a721e8b2 | 1064 | .PP |
8d36d80c | 1065 | .in +4n |
b8302363 | 1066 | .EX |
791ea4b3 | 1067 | $ \fBuname \-rs\fP # Need Linux 3.8 or later |
8d36d80c | 1068 | Linux 3.8.0 |
791ea4b3 | 1069 | $ \fBid \-u\fP # Running as unprivileged user |
8d36d80c | 1070 | 1000 |
791ea4b3 | 1071 | $ \fBid \-g\fP |
8d36d80c | 1072 | 1000 |
b8302363 | 1073 | .EE |
8d36d80c | 1074 | .in |
a721e8b2 | 1075 | .PP |
3e2a37ec | 1076 | Now start a new shell in new user |
8d36d80c MK |
1077 | .RI ( \-U ), |
1078 | mount | |
1079 | .RI ( \-m ), | |
1080 | and PID | |
1081 | .RI ( \-p ) | |
1082 | namespaces, with user ID | |
1083 | .RI ( \-M ) | |
3b44624f | 1084 | and group ID |
8d36d80c | 1085 | .RI ( \-G ) |
3b44624f | 1086 | 1000 mapped to 0 inside the user namespace: |
a721e8b2 | 1087 | .PP |
8d36d80c | 1088 | .in +4n |
b8302363 | 1089 | .EX |
861d36ba | 1090 | $ \fB./userns_child_exec \-p \-m \-U \-M \(aq0 1000 1\(aq \-G \(aq0 1000 1\(aq bash\fP |
b8302363 | 1091 | .EE |
8d36d80c | 1092 | .in |
a721e8b2 | 1093 | .PP |
f5d401dd | 1094 | The shell has PID 1, because it is the first process in the new |
8d36d80c | 1095 | PID namespace: |
a721e8b2 | 1096 | .PP |
8d36d80c | 1097 | .in +4n |
b8302363 | 1098 | .EX |
8d36d80c MK |
1099 | bash$ \fBecho $$\fP |
1100 | 1 | |
b8302363 | 1101 | .EE |
8d36d80c | 1102 | .in |
c6c28d52 | 1103 | .PP |
8d36d80c MK |
1104 | Mounting a new |
1105 | .I /proc | |
ab3311aa | 1106 | filesystem and listing all of the processes visible |
8d36d80c MK |
1107 | in the new PID namespace shows that the shell can't see |
1108 | any processes outside the PID namespace: | |
a721e8b2 | 1109 | .PP |
8d36d80c | 1110 | .in +4n |
b8302363 | 1111 | .EX |
791ea4b3 | 1112 | bash$ \fBmount \-t proc proc /proc\fP |
8d36d80c MK |
1113 | bash$ \fBps ax\fP |
1114 | PID TTY STAT TIME COMMAND | |
1115 | 1 pts/3 S 0:00 bash | |
1116 | 22 pts/3 R+ 0:00 ps ax | |
b8302363 | 1117 | .EE |
8d36d80c | 1118 | .in |
a721e8b2 | 1119 | .PP |
a2b1485b SB |
1120 | Inside the user namespace, the shell has user and group ID 0, |
1121 | and a full set of permitted and effective capabilities: | |
a721e8b2 | 1122 | .PP |
a2b1485b | 1123 | .in +4n |
b8302363 | 1124 | .EX |
861d36ba | 1125 | bash$ \fBcat /proc/$$/status | egrep \(aq\(ha[UG]id\(aq\fP |
a2b1485b SB |
1126 | Uid: 0 0 0 0 |
1127 | Gid: 0 0 0 0 | |
861d36ba | 1128 | bash$ \fBcat /proc/$$/status | egrep \(aq\(haCap(Prm|Inh|Eff)\(aq\fP |
a2b1485b SB |
1129 | CapInh: 0000000000000000 |
1130 | CapPrm: 0000001fffffffff | |
1131 | CapEff: 0000001fffffffff | |
b8302363 | 1132 | .EE |
a2b1485b | 1133 | .in |
8d36d80c MK |
1134 | .SS Program source |
1135 | \& | |
e7d0bb47 | 1136 | .EX |
8d36d80c MK |
1137 | /* userns_child_exec.c |
1138 | ||
1139 | Licensed under GNU General Public License v2 or later | |
1140 | ||
1141 | Create a child process that executes a shell command in new | |
1142 | namespace(s); allow UID and GID mappings to be specified when | |
1143 | creating a user namespace. | |
1144 | */ | |
1145 | #define _GNU_SOURCE | |
1146 | #include <sched.h> | |
1147 | #include <unistd.h> | |
8eb90116 | 1148 | #include <stdint.h> |
8d36d80c MK |
1149 | #include <stdlib.h> |
1150 | #include <sys/wait.h> | |
1151 | #include <signal.h> | |
1152 | #include <fcntl.h> | |
1153 | #include <stdio.h> | |
1154 | #include <string.h> | |
1155 | #include <limits.h> | |
1156 | #include <errno.h> | |
1157 | ||
1158 | /* A simple error\-handling function: print an error message based | |
c6beb8a1 | 1159 | on the value in \(aqerrno\(aq and terminate the calling process. */ |
8d36d80c | 1160 | |
d1a71985 | 1161 | #define errExit(msg) do { perror(msg); exit(EXIT_FAILURE); \e |
8d36d80c MK |
1162 | } while (0) |
1163 | ||
1164 | struct child_args { | |
1165 | char **argv; /* Command to be executed by child, with args */ | |
1166 | int pipe_fd[2]; /* Pipe used to synchronize parent and child */ | |
1167 | }; | |
1168 | ||
1169 | static int verbose; | |
1170 | ||
1171 | static void | |
1172 | usage(char *pname) | |
1173 | { | |
d1a71985 | 1174 | fprintf(stderr, "Usage: %s [options] cmd [arg...]\en\en", pname); |
8d36d80c | 1175 | fprintf(stderr, "Create a child process that executes a shell " |
d1a71985 MK |
1176 | "command in a new user namespace,\en" |
1177 | "and possibly also other new namespace(s).\en\en"); | |
1178 | fprintf(stderr, "Options can be:\en\en"); | |
8d36d80c | 1179 | #define fpe(str) fprintf(stderr, " %s", str); |
d1a71985 MK |
1180 | fpe("\-i New IPC namespace\en"); |
1181 | fpe("\-m New mount namespace\en"); | |
1182 | fpe("\-n New network namespace\en"); | |
1183 | fpe("\-p New PID namespace\en"); | |
1184 | fpe("\-u New UTS namespace\en"); | |
1185 | fpe("\-U New user namespace\en"); | |
1186 | fpe("\-M uid_map Specify UID map for user namespace\en"); | |
1187 | fpe("\-G gid_map Specify GID map for user namespace\en"); | |
1188 | fpe("\-z Map user\(aqs UID and GID to 0 in user namespace\en"); | |
1189 | fpe(" (equivalent to: \-M \(aq0 <uid> 1\(aq \-G \(aq0 <gid> 1\(aq)\en"); | |
1190 | fpe("\-v Display verbose messages\en"); | |
1191 | fpe("\en"); | |
1192 | fpe("If \-z, \-M, or \-G is specified, \-U is required.\en"); | |
1193 | fpe("It is not permitted to specify both \-z and either \-M or \-G.\en"); | |
1194 | fpe("\en"); | |
1195 | fpe("Map strings for \-M and \-G consist of records of the form:\en"); | |
1196 | fpe("\en"); | |
1197 | fpe(" ID\-inside\-ns ID\-outside\-ns len\en"); | |
1198 | fpe("\en"); | |
8d36d80c | 1199 | fpe("A map string can contain multiple records, separated" |
d1a71985 | 1200 | " by commas;\en"); |
8d36d80c | 1201 | fpe("the commas are replaced by newlines before writing" |
d1a71985 | 1202 | " to map files.\en"); |
8d36d80c MK |
1203 | |
1204 | exit(EXIT_FAILURE); | |
1205 | } | |
1206 | ||
1207 | /* Update the mapping file \(aqmap_file\(aq, with the value provided in | |
1208 | \(aqmapping\(aq, a string that defines a UID or GID mapping. A UID or | |
1209 | GID mapping consists of one or more newline\-delimited records | |
1210 | of the form: | |
1211 | ||
1212 | ID\-inside\-ns ID\-outside\-ns length | |
1213 | ||
1214 | Requiring the user to supply a string that contains newlines is | |
1215 | of course inconvenient for command\-line use. Thus, we permit the | |
1216 | use of commas to delimit records in this string, and replace them | |
1217 | with newlines before writing the string to the file. */ | |
1218 | ||
1219 | static void | |
1220 | update_map(char *mapping, char *map_file) | |
1221 | { | |
88893a77 | 1222 | int fd; |
8d36d80c MK |
1223 | size_t map_len; /* Length of \(aqmapping\(aq */ |
1224 | ||
c6beb8a1 | 1225 | /* Replace commas in mapping string with newlines. */ |
8d36d80c MK |
1226 | |
1227 | map_len = strlen(mapping); | |
88893a77 | 1228 | for (int j = 0; j < map_len; j++) |
8d36d80c | 1229 | if (mapping[j] == \(aq,\(aq) |
d1a71985 | 1230 | mapping[j] = \(aq\en\(aq; |
8d36d80c MK |
1231 | |
1232 | fd = open(map_file, O_RDWR); | |
1233 | if (fd == \-1) { | |
d1a71985 | 1234 | fprintf(stderr, "ERROR: open %s: %s\en", map_file, |
8f99aa89 MK |
1235 | strerror(errno)); |
1236 | exit(EXIT_FAILURE); | |
8d36d80c MK |
1237 | } |
1238 | ||
1239 | if (write(fd, mapping, map_len) != map_len) { | |
d1a71985 | 1240 | fprintf(stderr, "ERROR: write %s: %s\en", map_file, |
8f99aa89 MK |
1241 | strerror(errno)); |
1242 | exit(EXIT_FAILURE); | |
8d36d80c MK |
1243 | } |
1244 | ||
1245 | close(fd); | |
1246 | } | |
1247 | ||
c38a2a04 MK |
1248 | /* Linux 3.19 made a change in the handling of setgroups(2) and the |
1249 | \(aqgid_map\(aq file to address a security issue. The issue allowed | |
1250 | *unprivileged* users to employ user namespaces in order to drop | |
1251 | groups. The upshot of the 3.19 changes is that in order to update the | |
1252 | \(aqgid_map\(aq file, use of the setgroups() system call in this | |
1253 | user namespace must first be disabled by writing "deny" to one of | |
1254 | the /proc/PID/setgroups files for this namespace. That is the | |
1255 | purpose of the following function. */ | |
1256 | ||
1257 | static void | |
1258 | proc_setgroups_write(pid_t child_pid, char *str) | |
1259 | { | |
1260 | char setgroups_path[PATH_MAX]; | |
1261 | int fd; | |
1262 | ||
8eb90116 AC |
1263 | snprintf(setgroups_path, PATH_MAX, "/proc/%jd/setgroups", |
1264 | (intmax_t) child_pid); | |
c38a2a04 MK |
1265 | |
1266 | fd = open(setgroups_path, O_RDWR); | |
1267 | if (fd == \-1) { | |
1268 | ||
1269 | /* We may be on a system that doesn\(aqt support | |
1270 | /proc/PID/setgroups. In that case, the file won\(aqt exist, | |
1271 | and the system won\(aqt impose the restrictions that Linux 3.19 | |
1272 | added. That\(aqs fine: we don\(aqt need to do anything in order | |
1273 | to permit \(aqgid_map\(aq to be updated. | |
1274 | ||
1275 | However, if the error from open() was something other than | |
1276 | the ENOENT error that is expected for that case, let the | |
1277 | user know. */ | |
1278 | ||
1279 | if (errno != ENOENT) | |
d1a71985 | 1280 | fprintf(stderr, "ERROR: open %s: %s\en", setgroups_path, |
c38a2a04 MK |
1281 | strerror(errno)); |
1282 | return; | |
1283 | } | |
1284 | ||
1285 | if (write(fd, str, strlen(str)) == \-1) | |
d1a71985 | 1286 | fprintf(stderr, "ERROR: write %s: %s\en", setgroups_path, |
c38a2a04 MK |
1287 | strerror(errno)); |
1288 | ||
1289 | close(fd); | |
1290 | } | |
1291 | ||
8d36d80c MK |
1292 | static int /* Start function for cloned child */ |
1293 | childFunc(void *arg) | |
1294 | { | |
dc0bba35 | 1295 | struct child_args *args = arg; |
8d36d80c MK |
1296 | char ch; |
1297 | ||
1298 | /* Wait until the parent has updated the UID and GID mappings. | |
1299 | See the comment in main(). We wait for end of file on a | |
1300 | pipe that will be closed by the parent process once it has | |
1301 | updated the mappings. */ | |
1302 | ||
1303 | close(args\->pipe_fd[1]); /* Close our descriptor for the write | |
1304 | end of the pipe so that we see EOF | |
c6beb8a1 | 1305 | when parent closes its descriptor. */ |
8d36d80c MK |
1306 | if (read(args\->pipe_fd[0], &ch, 1) != 0) { |
1307 | fprintf(stderr, | |
d1a71985 | 1308 | "Failure in child: read from pipe returned != 0\en"); |
8d36d80c MK |
1309 | exit(EXIT_FAILURE); |
1310 | } | |
1311 | ||
a2b1485b SB |
1312 | close(args\->pipe_fd[0]); |
1313 | ||
c6beb8a1 | 1314 | /* Execute a shell command. */ |
8d36d80c | 1315 | |
d1a71985 | 1316 | printf("About to exec %s\en", args\->argv[0]); |
8d36d80c MK |
1317 | execvp(args\->argv[0], args\->argv); |
1318 | errExit("execvp"); | |
1319 | } | |
1320 | ||
1321 | #define STACK_SIZE (1024 * 1024) | |
1322 | ||
1323 | static char child_stack[STACK_SIZE]; /* Space for child\(aqs stack */ | |
1324 | ||
1325 | int | |
1326 | main(int argc, char *argv[]) | |
1327 | { | |
1328 | int flags, opt, map_zero; | |
1329 | pid_t child_pid; | |
1330 | struct child_args args; | |
1331 | char *uid_map, *gid_map; | |
1332 | const int MAP_BUF_SIZE = 100; | |
1333 | char map_buf[MAP_BUF_SIZE]; | |
1334 | char map_path[PATH_MAX]; | |
1335 | ||
1336 | /* Parse command\-line options. The initial \(aq+\(aq character in | |
1337 | the final getopt() argument prevents GNU\-style permutation | |
1338 | of command\-line options. That\(aqs useful, since sometimes | |
1339 | the \(aqcommand\(aq to be executed by this program itself | |
1340 | has command\-line options. We don\(aqt want getopt() to treat | |
1341 | those as options to this program. */ | |
1342 | ||
1343 | flags = 0; | |
1344 | verbose = 0; | |
1345 | gid_map = NULL; | |
1346 | uid_map = NULL; | |
1347 | map_zero = 0; | |
1348 | while ((opt = getopt(argc, argv, "+imnpuUM:G:zv")) != \-1) { | |
1349 | switch (opt) { | |
1350 | case \(aqi\(aq: flags |= CLONE_NEWIPC; break; | |
1351 | case \(aqm\(aq: flags |= CLONE_NEWNS; break; | |
1352 | case \(aqn\(aq: flags |= CLONE_NEWNET; break; | |
1353 | case \(aqp\(aq: flags |= CLONE_NEWPID; break; | |
1354 | case \(aqu\(aq: flags |= CLONE_NEWUTS; break; | |
1355 | case \(aqv\(aq: verbose = 1; break; | |
1356 | case \(aqz\(aq: map_zero = 1; break; | |
1357 | case \(aqM\(aq: uid_map = optarg; break; | |
1358 | case \(aqG\(aq: gid_map = optarg; break; | |
1359 | case \(aqU\(aq: flags |= CLONE_NEWUSER; break; | |
1360 | default: usage(argv[0]); | |
1361 | } | |
1362 | } | |
1363 | ||
1364 | /* \-M or \-G without \-U is nonsensical */ | |
1365 | ||
1366 | if (((uid_map != NULL || gid_map != NULL || map_zero) && | |
1367 | !(flags & CLONE_NEWUSER)) || | |
1368 | (map_zero && (uid_map != NULL || gid_map != NULL))) | |
1369 | usage(argv[0]); | |
1370 | ||
1371 | args.argv = &argv[optind]; | |
1372 | ||
1373 | /* We use a pipe to synchronize the parent and child, in order to | |
1374 | ensure that the parent sets the UID and GID maps before the child | |
1375 | calls execve(). This ensures that the child maintains its | |
1376 | capabilities during the execve() in the common case where we | |
1377 | want to map the child\(aqs effective user ID to 0 in the new user | |
1378 | namespace. Without this synchronization, the child would lose | |
1379 | its capabilities if it performed an execve() with nonzero | |
1380 | user IDs (see the capabilities(7) man page for details of the | |
1381 | transformation of a process\(aqs capabilities during execve()). */ | |
1382 | ||
1383 | if (pipe(args.pipe_fd) == \-1) | |
1384 | errExit("pipe"); | |
1385 | ||
c6beb8a1 | 1386 | /* Create the child in new namespace(s). */ |
8d36d80c MK |
1387 | |
1388 | child_pid = clone(childFunc, child_stack + STACK_SIZE, | |
1389 | flags | SIGCHLD, &args); | |
1390 | if (child_pid == \-1) | |
1391 | errExit("clone"); | |
1392 | ||
c6beb8a1 | 1393 | /* Parent falls through to here. */ |
8d36d80c MK |
1394 | |
1395 | if (verbose) | |
8eb90116 AC |
1396 | printf("%s: PID of child created by clone() is %jd\en", |
1397 | argv[0], (intmax_t) child_pid); | |
8d36d80c | 1398 | |
c6beb8a1 | 1399 | /* Update the UID and GID maps in the child. */ |
8d36d80c MK |
1400 | |
1401 | if (uid_map != NULL || map_zero) { | |
8eb90116 AC |
1402 | snprintf(map_path, PATH_MAX, "/proc/%jd/uid_map", |
1403 | (intmax_t) child_pid); | |
8d36d80c | 1404 | if (map_zero) { |
8eb90116 AC |
1405 | snprintf(map_buf, MAP_BUF_SIZE, "0 %jd 1", |
1406 | (intmax_t) getuid()); | |
8d36d80c MK |
1407 | uid_map = map_buf; |
1408 | } | |
1409 | update_map(uid_map, map_path); | |
1410 | } | |
4990f759 | 1411 | |
8d36d80c | 1412 | if (gid_map != NULL || map_zero) { |
c38a2a04 MK |
1413 | proc_setgroups_write(child_pid, "deny"); |
1414 | ||
8eb90116 AC |
1415 | snprintf(map_path, PATH_MAX, "/proc/%jd/gid_map", |
1416 | (intmax_t) child_pid); | |
8d36d80c | 1417 | if (map_zero) { |
8eb90116 AC |
1418 | snprintf(map_buf, MAP_BUF_SIZE, "0 %jd 1", |
1419 | (intmax_t) getgid()); | |
8d36d80c MK |
1420 | gid_map = map_buf; |
1421 | } | |
1422 | update_map(gid_map, map_path); | |
1423 | } | |
1424 | ||
1425 | /* Close the write end of the pipe, to signal to the child that we | |
c6beb8a1 | 1426 | have updated the UID and GID maps. */ |
8d36d80c MK |
1427 | |
1428 | close(args.pipe_fd[1]); | |
1429 | ||
1430 | if (waitpid(child_pid, NULL, 0) == \-1) /* Wait for child */ | |
1431 | errExit("waitpid"); | |
1432 | ||
1433 | if (verbose) | |
d1a71985 | 1434 | printf("%s: terminating\en", argv[0]); |
8d36d80c MK |
1435 | |
1436 | exit(EXIT_SUCCESS); | |
1437 | } | |
e7d0bb47 | 1438 | .EE |
046de6a7 | 1439 | .SH SEE ALSO |
b6462f75 MK |
1440 | .BR newgidmap (1), \" From the shadow package |
1441 | .BR newuidmap (1), \" From the shadow package | |
046de6a7 | 1442 | .BR clone (2), |
801245a1 | 1443 | .BR ptrace (2), |
046de6a7 MK |
1444 | .BR setns (2), |
1445 | .BR unshare (2), | |
1446 | .BR proc (5), | |
b6462f75 MK |
1447 | .BR subgid (5), \" From the shadow package |
1448 | .BR subuid (5), \" From the shadow package | |
589e43bb | 1449 | .BR capabilities (7), |
bba4bbbd | 1450 | .BR cgroup_namespaces (7), |
3525268c MK |
1451 | .BR credentials (7), |
1452 | .BR namespaces (7), | |
8d36d80c | 1453 | .BR pid_namespaces (7) |
6545cc56 | 1454 | .PP |
c94eb4a6 | 1455 | The kernel source file |
57fb49f9 | 1456 | .IR Documentation/admin\-guide/namespaces/resource\-control.rst . |