]>
Commit | Line | Data |
---|---|---|
b10c74ff EB |
1 | .\" Copyright (c) 2013, 2014 by Michael Kerrisk <mtk.manpages@gmail.com> |
2 | .\" and Copyright (c) 2012, 2014 by Eric W. Biederman <ebiederm@xmission.com> | |
046de6a7 | 3 | .\" |
5fbde956 | 4 | .\" SPDX-License-Identifier: Linux-man-pages-copyleft |
046de6a7 MK |
5 | .\" |
6 | .\" | |
4c1c5274 | 7 | .TH user_namespaces 7 (date) "Linux man-pages (unreleased)" |
046de6a7 | 8 | .SH NAME |
445d38c9 | 9 | user_namespaces \- overview of Linux user namespaces |
046de6a7 MK |
10 | .SH DESCRIPTION |
11 | For an overview of namespaces, see | |
12 | .BR namespaces (7). | |
c6d039a3 | 13 | .P |
99f04bb1 MK |
14 | User namespaces isolate security-related identifiers and attributes, |
15 | in particular, | |
03611be8 | 16 | user IDs and group IDs (see |
dba9ebf2 | 17 | .BR credentials (7)), |
99f04bb1 | 18 | the root directory, |
03611be8 | 19 | keys (see |
9d85c789 | 20 | .BR keyrings (7)), |
6b928030 MK |
21 | .\" FIXME: This page says very little about the interaction |
22 | .\" of user namespaces and keys. Add something on this topic. | |
03611be8 | 23 | and capabilities (see |
d916d9d0 | 24 | .BR capabilities (7)). |
046de6a7 MK |
25 | A process's user and group IDs can be different |
26 | inside and outside a user namespace. | |
27 | In particular, | |
28 | a process can have a normal unprivileged user ID outside a user namespace | |
29 | while at the same time having a user ID of 0 inside the namespace; | |
30 | in other words, | |
31 | the process has full privileges for operations inside the user namespace, | |
32 | but is unprivileged for operations outside the namespace. | |
d916d9d0 MK |
33 | .\" |
34 | .\" ============================================================ | |
35 | .\" | |
36 | .SS Nested namespaces, namespace membership | |
37 | User namespaces can be nested; | |
36546c38 AC |
38 | that is, each user namespace\[em]except the initial ("root") |
39 | namespace\[em]has a parent user namespace, | |
d916d9d0 MK |
40 | and can have zero or more child user namespaces. |
41 | The parent user namespace is the user namespace | |
42 | of the process that creates the user namespace via a call to | |
43 | .BR unshare (2) | |
44 | or | |
45 | .BR clone (2) | |
46 | with the | |
1ae6b2c7 | 47 | .B CLONE_NEWUSER |
d916d9d0 | 48 | flag. |
c6d039a3 | 49 | .P |
b324e17d | 50 | The kernel imposes (since Linux 3.11) a limit of 32 nested levels of |
e56b6c42 MK |
51 | .\" commit 8742f229b635bf1c1c84a3dfe5e47c814c20b5c8 |
52 | user namespaces. | |
53 | .\" FIXME Explain the rationale for this limit. (What is the rationale?) | |
54 | Calls to | |
55 | .BR unshare (2) | |
56 | or | |
57 | .BR clone (2) | |
58 | that would cause this limit to be exceeded fail with the error | |
59 | .BR EUSERS . | |
c6d039a3 | 60 | .P |
3b44624f | 61 | Each process is a member of exactly one user namespace. |
d916d9d0 MK |
62 | A process created via |
63 | .BR fork (2) | |
64 | or | |
65 | .BR clone (2) | |
66 | without the | |
1ae6b2c7 | 67 | .B CLONE_NEWUSER |
d916d9d0 | 68 | flag is a member of the same user namespace as its parent. |
7aba437a | 69 | A single-threaded process can join another user namespace with |
d916d9d0 MK |
70 | .BR setns (2) |
71 | if it has the | |
1ae6b2c7 | 72 | .B CAP_SYS_ADMIN |
d916d9d0 MK |
73 | capability in that namespace; |
74 | upon doing so, it gains a full set of capabilities in that namespace. | |
c6d039a3 | 75 | .P |
d916d9d0 MK |
76 | A call to |
77 | .BR clone (2) | |
78 | or | |
79 | .BR unshare (2) | |
80 | with the | |
1ae6b2c7 | 81 | .B CLONE_NEWUSER |
d916d9d0 MK |
82 | flag makes the new child process (for |
83 | .BR clone (2)) | |
84 | or the caller (for | |
85 | .BR unshare (2)) | |
86 | a member of the new user namespace created by the call. | |
c6d039a3 | 87 | .P |
7af6863b | 88 | The |
1ae6b2c7 | 89 | .B NS_GET_PARENT |
7af6863b MK |
90 | .BR ioctl (2) |
91 | operation can be used to discover the parental relationship | |
92 | between user namespaces; see | |
09860f31 | 93 | .BR ioctl_ns (2). |
c6d039a3 | 94 | .P |
6aba08c6 RC |
95 | A task that changes one of its effective IDs |
96 | will have its dumpability reset to the value in | |
97 | .IR /proc/sys/fs/suid_dumpable . | |
98 | This may affect the ownership of proc files of child processes | |
99 | and may thus cause the parent to lack the permissions | |
100 | to write to mapping files of child processes running in a new user namespace. | |
101 | In such cases making the parent process dumpable, using | |
102 | .B PR_SET_DUMPABLE | |
103 | in a call to | |
104 | .BR prctl (2), | |
2d88fc80 AC |
105 | before creating a child process in a new user namespace |
106 | may rectify this problem. | |
6aba08c6 RC |
107 | See |
108 | .BR prctl (2) | |
109 | and | |
110 | .BR proc (5) | |
111 | for details on how ownership is affected. | |
d916d9d0 MK |
112 | .\" |
113 | .\" ============================================================ | |
114 | .\" | |
115 | .SS Capabilities | |
96ec9d12 MK |
116 | The child process created by |
117 | .BR clone (2) | |
118 | with the | |
1ae6b2c7 | 119 | .B CLONE_NEWUSER |
96ec9d12 MK |
120 | flag starts out with a complete set |
121 | of capabilities in the new user namespace. | |
d916d9d0 MK |
122 | Likewise, a process that creates a new user namespace using |
123 | .BR unshare (2) | |
124 | or joins an existing user namespace using | |
125 | .BR setns (2) | |
3b44624f | 126 | gains a full set of capabilities in that namespace. |
c0098e76 | 127 | On the other hand, |
d68c5f11 MK |
128 | that process has no capabilities in the parent (in the case of |
129 | .BR clone (2)) | |
130 | or previous (in the case of | |
131 | .BR unshare (2) | |
132 | and | |
133 | .BR setns (2)) | |
134 | user namespace, | |
d916d9d0 MK |
135 | even if the new namespace is created or joined by the root user |
136 | (i.e., a process with user ID 0 in the root namespace). | |
c6d039a3 | 137 | .P |
77f95488 MK |
138 | Note that a call to |
139 | .BR execve (2) | |
6cfec3d8 | 140 | will cause a process's capabilities to be recalculated in the usual way (see |
a3969b76 MK |
141 | .BR capabilities (7)). |
142 | Consequently, | |
143 | unless the process has a user ID of 0 within the namespace, | |
144 | or the executable file has a nonempty inheritable capabilities mask, | |
145 | the process will lose all capabilities. | |
6c21c0f9 | 146 | See the discussion of user and group ID mappings, below. |
c6d039a3 | 147 | .P |
f5d401dd | 148 | A call to |
41f974b9 | 149 | .BR clone (2) |
0666f549 | 150 | or |
41f974b9 | 151 | .BR unshare (2) |
0666f549 | 152 | using the |
1ae6b2c7 | 153 | .B CLONE_NEWUSER |
41f974b9 MK |
154 | flag |
155 | or a call to | |
156 | .BR setns (2) | |
dcf91cdc | 157 | that moves the caller into another user namespace |
41f974b9 | 158 | sets the "securebits" flags |
0666f549 MK |
159 | (see |
160 | .BR capabilities (7)) | |
161 | to their default values (all flags disabled) in the child (for | |
162 | .BR clone (2)) | |
163 | or caller (for | |
4d7a0dbc | 164 | .BR unshare (2) |
0666f549 MK |
165 | or |
166 | .BR setns (2)). | |
d68c5f11 MK |
167 | Note that because the caller no longer has capabilities |
168 | in its original user namespace after a call to | |
0666f549 MK |
169 | .BR setns (2), |
170 | it is not possible for a process to reset its "securebits" flags while | |
171 | retaining its user namespace membership by using a pair of | |
172 | .BR setns (2) | |
173 | calls to move to another user namespace and then return to | |
174 | its original user namespace. | |
c6d039a3 | 175 | .P |
d916d9d0 MK |
176 | The rules for determining whether or not a process has a capability |
177 | in a particular user namespace are as follows: | |
cdede5cd | 178 | .IP \[bu] 3 |
d916d9d0 MK |
179 | A process has a capability inside a user namespace |
180 | if it is a member of that namespace and | |
181 | it has the capability in its effective capability set. | |
182 | A process can gain capabilities in its effective capability | |
183 | set in various ways. | |
184 | For example, it may execute a set-user-ID program or an | |
185 | executable with associated file capabilities. | |
186 | In addition, | |
187 | a process may gain capabilities via the effect of | |
3b44624f MK |
188 | .BR clone (2), |
189 | .BR unshare (2), | |
046de6a7 | 190 | or |
d916d9d0 MK |
191 | .BR setns (2), |
192 | as already described. | |
193 | .\" In the 3.8 sources, see security/commoncap.c::cap_capable(): | |
cdede5cd | 194 | .IP \[bu] |
d916d9d0 MK |
195 | If a process has a capability in a user namespace, |
196 | then it has that capability in all child (and further removed descendant) | |
197 | namespaces as well. | |
cdede5cd | 198 | .IP \[bu] |
d916d9d0 MK |
199 | .\" * The owner of the user namespace in the parent of the |
200 | .\" * user namespace has all caps. | |
201 | When a user namespace is created, the kernel records the effective | |
202 | user ID of the creating process as being the "owner" of the namespace. | |
203 | .\" (and likewise associates the effective group ID of the creating process | |
204 | .\" with the namespace). | |
205 | A process that resides | |
206 | in the parent of the user namespace | |
207 | .\" See kernel commit 520d9eabce18edfef76a60b7b839d54facafe1f9 for a fix | |
208 | .\" on this point | |
209 | and whose effective user ID matches the owner of the namespace | |
210 | has all capabilities in the namespace. | |
211 | .\" This includes the case where the process executes a set-user-ID | |
212 | .\" program that confers the effective UID of the creator of the namespace. | |
213 | By virtue of the previous rule, | |
214 | this means that the process has all capabilities in all | |
215 | further removed descendant user namespaces as well. | |
8e821c3a MK |
216 | The |
217 | .B NS_GET_OWNER_UID | |
218 | .BR ioctl (2) | |
219 | operation can be used to discover the user ID of the owner of the namespace; | |
220 | see | |
221 | .BR ioctl_ns (2). | |
2304b0d7 MK |
222 | .\" |
223 | .\" ============================================================ | |
224 | .\" | |
225 | .SS Effect of capabilities within a user namespace | |
89070c1a MK |
226 | Having a capability inside a user namespace |
227 | permits a process to perform operations (that require privilege) | |
228 | only on resources governed by that namespace. | |
229 | In other words, having a capability in a user namespace permits a process | |
230 | to perform privileged operations on resources that are governed by (nonuser) | |
29af6f1a MK |
231 | namespaces owned by (associated with) the user namespace |
232 | (see the next subsection). | |
c6d039a3 | 233 | .P |
89070c1a MK |
234 | On the other hand, there are many privileged operations that affect |
235 | resources that are not associated with any namespace type, | |
94e316bf | 236 | for example, changing the system (i.e., calendar) time (governed by |
89070c1a MK |
237 | .BR CAP_SYS_TIME ), |
238 | loading a kernel module (governed by | |
239 | .BR CAP_SYS_MODULE ), | |
240 | and creating a device (governed by | |
241 | .BR CAP_MKNOD ). | |
242 | Only a process with privileges in the | |
243 | .I initial | |
244 | user namespace can perform such operations. | |
c6d039a3 | 245 | .P |
32efecaa MK |
246 | Holding |
247 | .B CAP_SYS_ADMIN | |
29af6f1a | 248 | within the user namespace that owns a process's mount namespace |
8a9fb19d MK |
249 | allows that process to create bind mounts |
250 | and mount the following types of filesystems: | |
32efecaa | 251 | .\" fs_flags = FS_USERNS_MOUNT in kernel sources |
c6d039a3 | 252 | .P |
32efecaa MK |
253 | .RS 4 |
254 | .PD 0 | |
cdede5cd | 255 | .IP \[bu] 3 |
1ae6b2c7 | 256 | .I /proc |
32efecaa | 257 | (since Linux 3.8) |
cdede5cd | 258 | .IP \[bu] |
1ae6b2c7 | 259 | .I /sys |
32efecaa | 260 | (since Linux 3.8) |
cdede5cd | 261 | .IP \[bu] |
1ae6b2c7 | 262 | .I devpts |
32efecaa | 263 | (since Linux 3.9) |
cdede5cd | 264 | .IP \[bu] |
4e07c70f | 265 | .BR tmpfs (5) |
32efecaa | 266 | (since Linux 3.9) |
cdede5cd | 267 | .IP \[bu] |
1ae6b2c7 | 268 | .I ramfs |
32efecaa | 269 | (since Linux 3.9) |
cdede5cd | 270 | .IP \[bu] |
1ae6b2c7 | 271 | .I mqueue |
32efecaa | 272 | (since Linux 3.9) |
cdede5cd | 273 | .IP \[bu] |
1ae6b2c7 | 274 | .I bpf |
32efecaa MK |
275 | .\" commit b2197755b2633e164a439682fb05a9b5ea48f706 |
276 | (since Linux 4.4) | |
cdede5cd | 277 | .IP \[bu] |
1ae6b2c7 | 278 | .I overlayfs |
c8e9c60b MK |
279 | .\" commit 92dbc9dedccb9759c7f9f2f0ae6242396376988f |
280 | .\" commit 4cb2c00c43b3fe88b32f29df4f76da1b92c33224 | |
281 | (since Linux 5.11) | |
32efecaa MK |
282 | .PD |
283 | .RE | |
c6d039a3 | 284 | .P |
7e52299f MK |
285 | Holding |
286 | .B CAP_SYS_ADMIN | |
29af6f1a | 287 | within the user namespace that owns a process's cgroup namespace |
7e52299f | 288 | allows (since Linux 4.6) |
aeeb4800 | 289 | that process to mount the cgroup version 2 filesystem and |
8c74a1ce MK |
290 | cgroup version 1 named hierarchies |
291 | (i.e., cgroup filesystems mounted with the | |
c40c1a54 | 292 | .I \[dq]none,name=\[dq] |
8c74a1ce | 293 | option). |
c6d039a3 | 294 | .P |
687d3f4a MK |
295 | Holding |
296 | .B CAP_SYS_ADMIN | |
29af6f1a | 297 | within the user namespace that owns a process's PID namespace |
687d3f4a MK |
298 | allows (since Linux 3.8) |
299 | that process to mount | |
300 | .I /proc | |
301 | filesystems. | |
c6d039a3 | 302 | .P |
9ae13fff | 303 | Note, however, that mounting block-based filesystems can be done |
32efecaa | 304 | only by a process that holds |
1ae6b2c7 | 305 | .B CAP_SYS_ADMIN |
32efecaa | 306 | in the initial user namespace. |
d6842bf1 MK |
307 | .\" |
308 | .\" ============================================================ | |
309 | .\" | |
62a5214c | 310 | .SS Interaction of user namespaces and other types of namespaces |
046de6a7 | 311 | Starting in Linux 3.8, unprivileged processes can create user namespaces, |
8bd6881e | 312 | and the other types of namespaces can be created with just the |
046de6a7 MK |
313 | .B CAP_SYS_ADMIN |
314 | capability in the caller's user namespace. | |
c6d039a3 | 315 | .P |
6e8a3b42 | 316 | When a nonuser namespace is created, |
576233f0 MK |
317 | it is owned by the user namespace in which the creating process |
318 | was a member at the time of the creation of the namespace. | |
18b028be MK |
319 | Privileged operations on resources governed by the nonuser namespace |
320 | require that the process has the necessary capabilities | |
321 | in the user namespace that owns the nonuser namespace. | |
c6d039a3 | 322 | .P |
046de6a7 | 323 | If |
1ae6b2c7 | 324 | .B CLONE_NEWUSER |
046de6a7 MK |
325 | is specified along with other |
326 | .B CLONE_NEW* | |
327 | flags in a single | |
328 | .BR clone (2) | |
329 | or | |
330 | .BR unshare (2) | |
331 | call, the user namespace is guaranteed to be created first, | |
96ec9d12 MK |
332 | giving the child |
333 | .RB ( clone (2)) | |
334 | or caller | |
335 | .RB ( unshare (2)) | |
336 | privileges over the remaining namespaces created by the call. | |
046de6a7 MK |
337 | Thus, it is possible for an unprivileged caller to specify this combination |
338 | of flags. | |
c6d039a3 | 339 | .P |
06999763 | 340 | When a new namespace (other than a user namespace) is created via |
046de6a7 MK |
341 | .BR clone (2) |
342 | or | |
343 | .BR unshare (2), | |
29af6f1a | 344 | the kernel records the user namespace of the creating process as the owner of |
046de6a7 | 345 | the new namespace. |
d916d9d0 | 346 | (This association can't be changed.) |
046de6a7 MK |
347 | When a process in the new namespace subsequently performs |
348 | privileged operations that operate on global | |
349 | resources isolated by the namespace, | |
350 | the permission checks are performed according to the process's capabilities | |
351 | in the user namespace that the kernel associated with the new namespace. | |
7ea1c45e MK |
352 | For example, suppose that a process attempts to change the hostname |
353 | .RB ( sethostname (2)), | |
354 | a resource governed by the UTS namespace. | |
355 | In this case, | |
29af6f1a | 356 | the kernel will determine which user namespace owns |
7ea1c45e MK |
357 | the process's UTS namespace, and check whether the process has the |
358 | required capability | |
359 | .RB ( CAP_SYS_ADMIN ) | |
360 | in that user namespace. | |
c6d039a3 | 361 | .P |
41490851 | 362 | The |
1ae6b2c7 | 363 | .B NS_GET_USERNS |
41490851 | 364 | .BR ioctl (2) |
29af6f1a | 365 | operation can be used to discover the user namespace |
b784b9d5 | 366 | that owns a nonuser namespace; see |
09860f31 | 367 | .BR ioctl_ns (2). |
d6842bf1 MK |
368 | .\" |
369 | .\" ============================================================ | |
370 | .\" | |
62a5214c | 371 | .SS User and group ID mappings: uid_map and gid_map |
6eda9441 MK |
372 | When a user namespace is created, |
373 | it starts out without a mapping of user IDs (group IDs) | |
374 | to the parent user namespace. | |
046de6a7 | 375 | The |
1ae6b2c7 | 376 | .IR /proc/ pid /uid_map |
046de6a7 | 377 | and |
1ae6b2c7 | 378 | .IR /proc/ pid /gid_map |
046de6a7 MK |
379 | files (available since Linux 3.5) |
380 | .\" commit 22d917d80e842829d0ca0a561967d728eb1d6303 | |
381 | expose the mappings for user and group IDs | |
382 | inside the user namespace for the process | |
383 | .IR pid . | |
62a5214c MK |
384 | These files can be read to view the mappings in a user namespace and |
385 | written to (once) to define the mappings. | |
c6d039a3 | 386 | .P |
62a5214c | 387 | The description in the following paragraphs explains the details for |
046de6a7 | 388 | .IR uid_map ; |
1ae6b2c7 | 389 | .I gid_map |
046de6a7 MK |
390 | is exactly the same, |
391 | but each instance of "user ID" is replaced by "group ID". | |
c6d039a3 | 392 | .P |
046de6a7 MK |
393 | The |
394 | .I uid_map | |
395 | file exposes the mapping of user IDs from the user namespace | |
396 | of the process | |
1ae6b2c7 | 397 | .I pid |
046de6a7 | 398 | to the user namespace of the process that opened |
1ae6b2c7 | 399 | .I uid_map |
046de6a7 MK |
400 | (but see a qualification to this point below). |
401 | In other words, processes that are in different user namespaces | |
402 | will potentially see different values when reading from a particular | |
403 | .I uid_map | |
404 | file, depending on the user ID mappings for the user namespaces | |
405 | of the reading processes. | |
c6d039a3 | 406 | .P |
046de6a7 MK |
407 | Each line in the |
408 | .I uid_map | |
409 | file specifies a 1-to-1 mapping of a range of contiguous | |
410 | user IDs between two user namespaces. | |
411 | (When a user namespace is first created, this file is empty.) | |
412 | The specification in each line takes the form of | |
413 | three numbers delimited by white space. | |
d45d0128 | 414 | The first two numbers specify the starting user ID in |
3b44624f | 415 | each of the two user namespaces. |
046de6a7 MK |
416 | The third number specifies the length of the mapped range. |
417 | In detail, the fields are interpreted as follows: | |
22356d97 | 418 | .IP (1) 5 |
046de6a7 MK |
419 | The start of the range of user IDs in |
420 | the user namespace of the process | |
421 | .IR pid . | |
422 | .IP (2) | |
423 | The start of the range of user | |
424 | IDs to which the user IDs specified by field one map. | |
425 | How field two is interpreted depends on whether the process that opened | |
426 | .I uid_map | |
427 | and the process | |
1ae6b2c7 | 428 | .I pid |
046de6a7 MK |
429 | are in the same user namespace, as follows: |
430 | .RS | |
22356d97 | 431 | .IP (a) 5 |
046de6a7 MK |
432 | If the two processes are in different user namespaces: |
433 | field two is the start of a range of | |
434 | user IDs in the user namespace of the process that opened | |
435 | .IR uid_map . | |
22356d97 | 436 | .IP (b) |
046de6a7 MK |
437 | If the two processes are in the same user namespace: |
438 | field two is the start of the range of | |
439 | user IDs in the parent user namespace of the process | |
440 | .IR pid . | |
441 | This case enables the opener of | |
442 | .I uid_map | |
443 | (the common case here is opening | |
444 | .IR /proc/self/uid_map ) | |
445 | to see the mapping of user IDs into the user namespace of the process | |
446 | that created this user namespace. | |
447 | .RE | |
448 | .IP (3) | |
449 | The length of the range of user IDs that is mapped between the two | |
450 | user namespaces. | |
c6d039a3 | 451 | .P |
36546c38 | 452 | System calls that return user IDs (group IDs)\[em]for example, |
6eda9441 MK |
453 | .BR getuid (2), |
454 | .BR getgid (2), | |
455 | and the credential fields in the structure returned by | |
36546c38 | 456 | .BR stat (2)\[em]return |
3b44624f | 457 | the user ID (group ID) mapped into the caller's user namespace. |
c6d039a3 | 458 | .P |
6eda9441 MK |
459 | When a process accesses a file, its user and group IDs |
460 | are mapped into the initial user namespace for the purpose of permission | |
461 | checking and assigning IDs when creating a file. | |
462 | When a process retrieves file user and group IDs via | |
3b44624f | 463 | .BR stat (2), |
6eda9441 MK |
464 | the IDs are mapped in the opposite direction, |
465 | to produce values relative to the process user and group ID mappings. | |
c6d039a3 | 466 | .P |
20e4a147 MK |
467 | The initial user namespace has no parent namespace, |
468 | but, for consistency, the kernel provides dummy user and group | |
469 | ID mapping files for this namespace. | |
470 | Looking at the | |
471 | .I uid_map | |
c9195ded MK |
472 | file |
473 | .RI ( gid_map | |
20e4a147 | 474 | is the same) from a shell in the initial namespace shows: |
c6d039a3 | 475 | .P |
20e4a147 | 476 | .in +4n |
b8302363 | 477 | .EX |
20e4a147 MK |
478 | $ \fBcat /proc/$$/uid_map\fP |
479 | 0 0 4294967295 | |
b8302363 | 480 | .EE |
20e4a147 | 481 | .in |
c6d039a3 | 482 | .P |
20e4a147 MK |
483 | This mapping tells us |
484 | that the range starting at user ID 0 in this namespace | |
485 | maps to a range starting at 0 in the (nonexistent) parent namespace, | |
486 | and the length of the range is the largest 32-bit unsigned integer. | |
364ce935 | 487 | This leaves 4294967295 (the 32-bit signed \-1 value) unmapped. |
6cfec3d8 | 488 | This is deliberate: |
1ae6b2c7 | 489 | .I (uid_t)\~\-1 |
6cfec3d8 MK |
490 | is used in several interfaces (e.g., |
491 | .BR setreuid (2)) | |
492 | as a way to specify "no user ID". | |
493 | Leaving | |
1ae6b2c7 | 494 | .I (uid_t)\~\-1 |
09fcbb82 | 495 | unmapped and unusable guarantees that there will be no |
6cfec3d8 | 496 | confusion when using these interfaces. |
d6842bf1 MK |
497 | .\" |
498 | .\" ============================================================ | |
499 | .\" | |
62a5214c | 500 | .SS Defining user and group ID mappings: writing to uid_map and gid_map |
046de6a7 MK |
501 | After the creation of a new user namespace, the |
502 | .I uid_map | |
503 | file of | |
504 | .I one | |
37909bee | 505 | of the processes in the namespace may be written to |
046de6a7 MK |
506 | .I once |
507 | to define the mapping of user IDs in the new user namespace. | |
1b3d5347 | 508 | An attempt to write more than once to a |
046de6a7 MK |
509 | .I uid_map |
510 | file in a user namespace fails with the error | |
1b3d5347 MK |
511 | .BR EPERM . |
512 | Similar rules apply for | |
513 | .I gid_map | |
514 | files. | |
c6d039a3 | 515 | .P |
046de6a7 | 516 | The lines written to |
1ae6b2c7 | 517 | .I uid_map |
1b3d5347 | 518 | .RI ( gid_map ) |
6486faa9 | 519 | must conform to the following validity rules: |
cdede5cd | 520 | .IP \[bu] 3 |
046de6a7 MK |
521 | The three fields must be valid numbers, |
522 | and the last field must be greater than 0. | |
cdede5cd | 523 | .IP \[bu] |
046de6a7 | 524 | Lines are terminated by newline characters. |
cdede5cd | 525 | .IP \[bu] |
dc04b652 | 526 | There is a limit on the number of lines in the file. |
2660d010 MK |
527 | In Linux 4.14 and earlier, this limit was (arbitrarily) |
528 | .\" 5*12-byte records could fit in a 64B cache line | |
529 | set at 5 lines. | |
530 | Since Linux 4.15, | |
531 | .\" commit 6397fac4915ab3002dc15aae751455da1a852f25 | |
532 | the limit is 340 lines. | |
046de6a7 MK |
533 | In addition, the number of bytes written to |
534 | the file must be less than the system page size, | |
046de6a7 MK |
535 | and the write must be performed at the start of the file (i.e., |
536 | .BR lseek (2) | |
537 | and | |
538 | .BR pwrite (2) | |
539 | can't be used to write to nonzero offsets in the file). | |
cdede5cd | 540 | .IP \[bu] |
1b3d5347 MK |
541 | The range of user IDs (group IDs) |
542 | specified in each line cannot overlap with the ranges | |
046de6a7 | 543 | in any other lines. |
df23ae04 MK |
544 | In the initial implementation (Linux 3.8), this requirement was |
545 | satisfied by a simplistic implementation that imposed the further | |
046de6a7 MK |
546 | requirement that |
547 | the values in both field 1 and field 2 of successive lines must be | |
df23ae04 MK |
548 | in ascending numerical order, |
549 | which prevented some otherwise valid maps from being created. | |
550 | Linux 3.9 and later | |
551 | .\" commit 0bd14b4fd72afd5df41e9fd59f356740f22fceba | |
552 | fix this limitation, allowing any valid set of nonoverlapping maps. | |
cdede5cd | 553 | .IP \[bu] |
046de6a7 | 554 | At least one line must be written to the file. |
c6d039a3 | 555 | .P |
046de6a7 MK |
556 | Writes that violate the above rules fail with the error |
557 | .BR EINVAL . | |
c6d039a3 | 558 | .P |
046de6a7 | 559 | In order for a process to write to the |
1ae6b2c7 AC |
560 | .IR /proc/ pid /uid_map |
561 | .RI ( /proc/ pid /gid_map ) | |
6486faa9 | 562 | file, all of the following permission requirements must be met: |
cdede5cd | 563 | .IP \[bu] 3 |
046de6a7 | 564 | The writing process must have the |
1ae6b2c7 | 565 | .B CAP_SETUID |
046de6a7 MK |
566 | .RB ( CAP_SETGID ) |
567 | capability in the user namespace of the process | |
568 | .IR pid . | |
cdede5cd | 569 | .IP \[bu] |
31a7d506 | 570 | The writing process must either be in the user namespace of the process |
046de6a7 | 571 | .I pid |
31a7d506 | 572 | or be in the parent user namespace of the process |
046de6a7 | 573 | .IR pid . |
cdede5cd | 574 | .IP \[bu] |
1863e451 MK |
575 | The mapped user IDs (group IDs) must in turn have a mapping |
576 | in the parent user namespace. | |
cdede5cd | 577 | .IP \[bu] |
ab4c4b2f | 578 | If updating |
1ae6b2c7 | 579 | .IR /proc/ pid /uid_map |
ab4c4b2f MK |
580 | to create a mapping that maps UID 0 in the parent namespace, |
581 | then one of the following must be true: | |
582 | .RS | |
22356d97 | 583 | .IP (a) 5 |
ab4c4b2f MK |
584 | if the writing process is in the parent user namespace, |
585 | then it must have the | |
1ae6b2c7 | 586 | .B CAP_SETFCAP |
ab4c4b2f | 587 | capability in that user namespace; or |
22356d97 | 588 | .IP (b) |
ab4c4b2f MK |
589 | if the writing process is in the child user namespace, |
590 | then the process that created the user namespace must have had the | |
1ae6b2c7 | 591 | .B CAP_SETFCAP |
ab4c4b2f MK |
592 | capability when the namespace was created. |
593 | .RE | |
594 | .IP | |
595 | This rule has been in place since | |
29c1f3cf | 596 | .\" commit db2e718a47984b9d71ed890eb2ea36ecf150de18 |
ab4c4b2f MK |
597 | Linux 5.12. |
598 | It eliminates an earlier security bug whereby | |
599 | a UID 0 process that lacks the | |
29c1f3cf | 600 | .B CAP_SETFCAP |
ab4c4b2f MK |
601 | capability, |
602 | which is needed to create a binary with namespaced file capabilities | |
603 | (as described in | |
604 | .BR capabilities (7)), | |
605 | could nevertheless create such a binary, | |
606 | by the following steps: | |
607 | .RS | |
22356d97 | 608 | .IP (1) 5 |
ab4c4b2f MK |
609 | Create a new user namespace with the identity mapping |
610 | (i.e., UID 0 in the new user namespace maps to UID 0 in the parent namespace), | |
611 | so that UID 0 in both namespaces is equivalent to the same root user ID. | |
22356d97 | 612 | .IP (2) |
ab4c4b2f MK |
613 | Since the child process has the |
614 | .B CAP_SETFCAP | |
615 | capability, it could create a binary with namespaced file capabilities | |
616 | that would then be effective in the parent user namespace | |
617 | (because the root user IDs are the same in the two namespaces). | |
618 | .RE | |
cdede5cd | 619 | .IP \[bu] |
30b33164 | 620 | One of the following two cases applies: |
046de6a7 | 621 | .RS |
22356d97 | 622 | .IP (a) 5 |
1ae6b2c7 | 623 | .I Either |
30b33164 | 624 | the writing process has the |
1ae6b2c7 | 625 | .B CAP_SETUID |
30b33164 | 626 | .RB ( CAP_SETGID ) |
6c8571e0 MK |
627 | capability in the |
628 | .I parent | |
629 | user namespace. | |
30b33164 | 630 | .RS |
cdede5cd | 631 | .IP \[bu] 3 |
30b33164 | 632 | No further restrictions apply: |
50b49f0b | 633 | the process can make mappings to arbitrary user IDs (group IDs) |
30b33164 MK |
634 | in the parent user namespace. |
635 | .RE | |
22356d97 | 636 | .IP (b) |
1ae6b2c7 | 637 | .I Or |
30b33164 MK |
638 | otherwise all of the following restrictions apply: |
639 | .RS | |
cdede5cd | 640 | .IP \[bu] 3 |
046de6a7 MK |
641 | The data written to |
642 | .I uid_map | |
643 | .RI ( gid_map ) | |
690c890a MK |
644 | must consist of a single line that maps |
645 | the writing process's effective user ID | |
046de6a7 MK |
646 | (group ID) in the parent user namespace to a user ID (group ID) |
647 | in the user namespace. | |
cdede5cd | 648 | .IP \[bu] |
0c9abe8b EB |
649 | The writing process must have the same effective user ID as the process |
650 | that created the user namespace. | |
cdede5cd | 651 | .IP \[bu] |
0c9abe8b | 652 | In the case of |
f2d61dbb | 653 | .IR gid_map , |
30b33164 | 654 | use of the |
f2d61dbb | 655 | .BR setgroups (2) |
30b33164 | 656 | system call must first be denied by writing |
9c708002 | 657 | .RI \[dq] deny \[dq] |
30b33164 | 658 | to the |
1ae6b2c7 | 659 | .IR /proc/ pid /setgroups |
30b33164 MK |
660 | file (see below) before writing to |
661 | .IR gid_map . | |
662 | .RE | |
046de6a7 | 663 | .RE |
c6d039a3 | 664 | .P |
046de6a7 MK |
665 | Writes that violate the above rules fail with the error |
666 | .BR EPERM . | |
d6842bf1 MK |
667 | .\" |
668 | .\" ============================================================ | |
669 | .\" | |
213e259e MK |
670 | .SS Project ID mappings: projid_map |
671 | Similarly to user and group ID mappings, | |
672 | it is possible to create project ID mappings for a user namespace. | |
673 | (Project IDs are used for disk quotas; see | |
674 | .BR setquota (8) | |
675 | and | |
676 | .BR quotactl (2).) | |
c6d039a3 | 677 | .P |
213e259e | 678 | Project ID mappings are defined by writing to the |
1ae6b2c7 | 679 | .IR /proc/ pid /projid_map |
213e259e MK |
680 | file (present since |
681 | .\" commit f76d207a66c3a53defea67e7d36c3eb1b7d6d61d | |
682 | Linux 3.7). | |
c6d039a3 | 683 | .P |
213e259e | 684 | The validity rules for writing to the |
1ae6b2c7 | 685 | .IR /proc/ pid /projid_map |
213e259e MK |
686 | file are as for writing to the |
687 | .I uid_map | |
688 | file; violation of these rules causes | |
689 | .BR write (2) | |
690 | to fail with the error | |
691 | .BR EINVAL . | |
c6d039a3 | 692 | .P |
213e259e | 693 | The permission rules for writing to the |
1ae6b2c7 | 694 | .IR /proc/ pid /projid_map |
213e259e | 695 | file are as follows: |
cdede5cd | 696 | .IP \[bu] 3 |
213e259e MK |
697 | The writing process must either be in the user namespace of the process |
698 | .I pid | |
699 | or be in the parent user namespace of the process | |
700 | .IR pid . | |
cdede5cd | 701 | .IP \[bu] |
213e259e MK |
702 | The mapped project IDs must in turn have a mapping |
703 | in the parent user namespace. | |
c6d039a3 | 704 | .P |
213e259e MK |
705 | Violation of these rules causes |
706 | .BR write (2) | |
707 | to fail with the error | |
708 | .BR EPERM . | |
709 | .\" | |
710 | .\" ============================================================ | |
711 | .\" | |
f2d61dbb MK |
712 | .SS Interaction with system calls that change process UIDs or GIDs |
713 | In a user namespace where the | |
0c9abe8b | 714 | .I uid_map |
f2d61dbb MK |
715 | file has not been written, the system calls that change user IDs will fail. |
716 | Similarly, if the | |
0c9abe8b | 717 | .I gid_map |
f2d61dbb MK |
718 | file has not been written, the system calls that change group IDs will fail. |
719 | After the | |
0c9abe8b EB |
720 | .I uid_map |
721 | and | |
722 | .I gid_map | |
f2d61dbb MK |
723 | files have been written, only the mapped values may be used in |
724 | system calls that change user and group IDs. | |
c6d039a3 | 725 | .P |
f2d61dbb MK |
726 | For user IDs, the relevant system calls include |
727 | .BR setuid (2), | |
728 | .BR setfsuid (2), | |
729 | .BR setreuid (2), | |
0c9abe8b | 730 | and |
f2d61dbb MK |
731 | .BR setresuid (2). |
732 | For group IDs, the relevant system calls include | |
733 | .BR setgid (2), | |
734 | .BR setfsgid (2), | |
735 | .BR setregid (2), | |
736 | .BR setresgid (2), | |
0c9abe8b | 737 | and |
f2d61dbb | 738 | .BR setgroups (2). |
c6d039a3 | 739 | .P |
0c9abe8b | 740 | Writing |
9c708002 | 741 | .RI \[dq] deny \[dq] |
0c9abe8b | 742 | to the |
d752f865 | 743 | .IR /proc/ pid /setgroups |
0c9abe8b | 744 | file before writing to |
d752f865 | 745 | .IR /proc/ pid /gid_map |
f2d61dbb MK |
746 | .\" Things changed in Linux 3.19 |
747 | .\" commit 9cc46516ddf497ea16e8d7cb986ae03a0f6b92f8 | |
748 | .\" commit 66d2f338ee4c449396b6f99f5e75cd18eb6df272 | |
749 | .\" http://lwn.net/Articles/626665/ | |
750 | will permanently disable | |
751 | .BR setgroups (2) | |
752 | in a user namespace and allow writing to | |
d752f865 | 753 | .IR /proc/ pid /gid_map |
f2d61dbb | 754 | without having the |
1ae6b2c7 | 755 | .B CAP_SETGID |
f2d61dbb MK |
756 | capability in the parent user namespace. |
757 | .\" | |
758 | .\" ============================================================ | |
759 | .\" | |
2d2db85d | 760 | .SS The \fI/proc/\fPpid\fI/setgroups\fP file |
458abbe6 | 761 | .\" |
ab28dba9 MK |
762 | .\" commit 9cc46516ddf497ea16e8d7cb986ae03a0f6b92f8 |
763 | .\" commit 66d2f338ee4c449396b6f99f5e75cd18eb6df272 | |
764 | .\" http://lwn.net/Articles/626665/ | |
765 | .\" http://web.nvd.nist.gov/view/vuln/detail?vulnId=CVE-2014-8989 | |
766 | .\" | |
767 | The | |
1ae6b2c7 | 768 | .IR /proc/ pid /setgroups |
ab28dba9 | 769 | file displays the string |
9c708002 | 770 | .RI \[dq] allow \[dq] |
ab28dba9 MK |
771 | if processes in the user namespace that contains the process |
772 | .I pid | |
773 | are permitted to employ the | |
774 | .BR setgroups (2) | |
775 | system call; it displays | |
9c708002 | 776 | .RI \[dq] deny \[dq] |
ab28dba9 MK |
777 | if |
778 | .BR setgroups (2) | |
779 | is not permitted in that user namespace. | |
e2b6e58c | 780 | Note that regardless of the value in the |
1ae6b2c7 | 781 | .IR /proc/ pid /setgroups |
e2b6e58c MK |
782 | file (and regardless of the process's capabilities), calls to |
783 | .BR setgroups (2) | |
784 | are also not permitted if | |
1ae6b2c7 | 785 | .IR /proc/ pid /gid_map |
e2b6e58c | 786 | has not yet been set. |
c6d039a3 | 787 | .P |
ab28dba9 | 788 | A privileged process (one with the |
1ae6b2c7 | 789 | .B CAP_SYS_ADMIN |
ab28dba9 | 790 | capability in the namespace) may write either of the strings |
9c708002 | 791 | .RI \[dq] allow \[dq] |
ab28dba9 | 792 | or |
9c708002 | 793 | .RI \[dq] deny \[dq] |
ab28dba9 MK |
794 | to this file |
795 | .I before | |
796 | writing a group ID mapping | |
797 | for this user namespace to the file | |
1ae6b2c7 | 798 | .IR /proc/ pid /gid_map . |
ab28dba9 | 799 | Writing the string |
9c708002 | 800 | .RI \[dq] deny \[dq] |
ab28dba9 MK |
801 | prevents any process in the user namespace from employing |
802 | .BR setgroups (2). | |
c6d039a3 | 803 | .P |
d6add5ef MK |
804 | The essence of the restrictions described in the preceding |
805 | paragraph is that it is permitted to write to | |
1ae6b2c7 | 806 | .IR /proc/ pid /setgroups |
d6add5ef | 807 | only so long as calling |
ab28dba9 | 808 | .BR setgroups (2) |
d6add5ef | 809 | is disallowed because |
1ae6b2c7 | 810 | .IR /proc/ pid /gid_map |
ab28dba9 MK |
811 | has not been set. |
812 | This ensures that a process cannot transition from a state where | |
813 | .BR setgroups (2) | |
814 | is allowed to a state where | |
815 | .BR setgroups (2) | |
816 | is denied; | |
b64fbdca | 817 | a process can transition only from |
ab28dba9 MK |
818 | .BR setgroups (2) |
819 | being disallowed to | |
820 | .BR setgroups (2) | |
821 | being allowed. | |
c6d039a3 | 822 | .P |
ab28dba9 | 823 | The default value of this file in the initial user namespace is |
9c708002 | 824 | .RI \[dq] allow \[dq]. |
c6d039a3 | 825 | .P |
ab28dba9 | 826 | Once |
1ae6b2c7 | 827 | .IR /proc/ pid /gid_map |
ab28dba9 MK |
828 | has been written to |
829 | (which has the effect of enabling | |
830 | .BR setgroups (2) | |
831 | in the user namespace), | |
fe3e2b4e | 832 | it is no longer possible to disallow |
ab28dba9 | 833 | .BR setgroups (2) |
fe3e2b4e | 834 | by writing |
9c708002 | 835 | .RI \[dq] deny \[dq] |
fe3e2b4e | 836 | to |
1ae6b2c7 | 837 | .IR /proc/ pid /setgroups |
fe3e2b4e MK |
838 | (the write fails with the error |
839 | .BR EPERM ). | |
c6d039a3 | 840 | .P |
ab28dba9 | 841 | A child user namespace inherits the |
1ae6b2c7 | 842 | .IR /proc/ pid /setgroups |
ab28dba9 | 843 | setting from its parent. |
c6d039a3 | 844 | .P |
ab28dba9 MK |
845 | If the |
846 | .I setgroups | |
847 | file has the value | |
9c708002 | 848 | .RI \[dq] deny \[dq], |
ab28dba9 MK |
849 | then the |
850 | .BR setgroups (2) | |
851 | system call can't subsequently be reenabled (by writing | |
9c708002 | 852 | .RI \[dq] allow \[dq] |
ab28dba9 | 853 | to the file) in this user namespace. |
a23d8efa | 854 | (Attempts to do so fail with the error |
d6add5ef | 855 | .BR EPERM .) |
ab28dba9 MK |
856 | This restriction also propagates down to all child user namespaces of |
857 | this user namespace. | |
c6d039a3 | 858 | .P |
ecb0ff30 | 859 | The |
d752f865 | 860 | .IR /proc/ pid /setgroups |
ecb0ff30 MK |
861 | file was added in Linux 3.19, |
862 | but was backported to many earlier stable kernel series, | |
863 | because it addresses a security issue. | |
864 | The issue concerned files with permissions such as "rwx\-\-\-rwx". | |
865 | Such files give fewer permissions to "group" than they do to "other". | |
866 | This means that dropping groups using | |
867 | .BR setgroups (2) | |
868 | might allow a process file access that it did not formerly have. | |
869 | Before the existence of user namespaces this was not a concern, | |
870 | since only a privileged process (one with the | |
1ae6b2c7 | 871 | .B CAP_SETGID |
ecb0ff30 MK |
872 | capability) could call |
873 | .BR setgroups (2). | |
874 | However, with the introduction of user namespaces, | |
875 | it became possible for an unprivileged process to create | |
876 | a new namespace in which the user had all privileges. | |
877 | This then allowed formerly unprivileged | |
878 | users to drop groups and thus gain file access | |
879 | that they did not previously have. | |
880 | The | |
1ae6b2c7 | 881 | .IR /proc/ pid /setgroups |
ecb0ff30 | 882 | file was added to address this security issue, |
1fc04edf | 883 | by denying any pathway for an unprivileged process to drop groups with |
ecb0ff30 MK |
884 | .BR setgroups (2). |
885 | .\" | |
886 | .\" /proc/PID/setgroups | |
887 | .\" ["allow" == setgroups() is allowed, "deny" == setgroups() is disallowed] |
888 | .\" * Can write if have CAP_SYS_ADMIN in NS | |
889 | .\" * Must write BEFORE writing to /proc/PID/gid_map | |
890 | .\" | |
891 | .\" setgroups() | |
f2c2c308 | 892 | .\" * Must already have written to gid_map |
ecb0ff30 MK |
893 | .\" * /proc/PID/setgroups must be "allow" |
894 | .\" | |
895 | .\" /proc/PID/gid_map -- writing | |
896 | .\" * Must already have written "deny" to /proc/PID/setgroups | |
ab28dba9 MK |
897 | .\" |
898 | .\" ============================================================ | |
899 | .\" | |
3e2a37ec | 900 | .SS Unmapped user and group IDs |
3e2a37ec MK |
901 | There are various places where an unmapped user ID (group ID) |
902 | may be exposed to user space. | |
903 | For example, the first process in a new user namespace may call | |
a4680ab5 | 904 | .BR getuid (2) |
3e2a37ec MK |
905 | before a user ID mapping has been defined for the namespace. |
906 | In most such cases, an unmapped user ID is converted | |
907 | .\" from_kuid_munged(), from_kgid_munged() | |
908 | to the overflow user ID (group ID); | |
909 | the default value for the overflow user ID (group ID) is 65534. | |
910 | See the descriptions of | |
1ae6b2c7 | 911 | .I /proc/sys/kernel/overflowuid |
3e2a37ec | 912 | and |
1ae6b2c7 | 913 | .I /proc/sys/kernel/overflowgid |
3e2a37ec MK |
914 | in |
915 | .BR proc (5). | |
c6d039a3 | 916 | .P |
3e2a37ec MK |
917 | The cases where unmapped IDs are mapped in this fashion include |
918 | system calls that return user IDs | |
dba9ebf2 | 919 | .RB ( getuid (2), |
3e2a37ec MK |
920 | .BR getgid (2), |
921 | and similar), | |
922 | credentials passed over a UNIX domain socket, | |
923 | .\" also SO_PEERCRED | |
924 | credentials returned by | |
925 | .BR stat (2), | |
926 | .BR waitid (2), | |
927 | and the System V IPC "ctl" | |
928 | .B IPC_STAT | |
929 | operations, | |
930 | credentials exposed by | |
1ae6b2c7 | 931 | .IR /proc/ pid /status |
3e2a37ec MK |
932 | and the files in |
933 | .IR /proc/sysvipc/* , | |
934 | credentials returned via the | |
935 | .I si_uid | |
936 | field in the | |
937 | .I siginfo_t | |
938 | received with a signal (see | |
939 | .BR sigaction (2)), | |
940 | credentials written to the process accounting file (see | |
3b44624f | 941 | .BR acct (5)), |
3e2a37ec MK |
942 | and credentials returned with POSIX message queue notifications (see |
943 | .BR mq_notify (3)). | |
c6d039a3 | 944 | .P |
3e2a37ec MK |
945 | There is one notable case where unmapped user and group IDs are |
946 | .I not | |
947 | .\" from_kuid(), from_kgid() | |
948 | .\" Also F_GETOWNER_UIDS is an exception | |
949 | converted to the corresponding overflow ID value. | |
950 | When viewing a | |
951 | .I uid_map | |
952 | or | |
953 | .I gid_map | |
954 | file in which there is no mapping for the second field, | |
54b9d7bf | 955 | that field is displayed as 4294967295 (\-1 as an unsigned integer). |
3e2a37ec MK |
956 | .\" |
957 | .\" ============================================================ | |
958 | .\" | |
ea8ec578 | 959 | .SS Accessing files |
b8cf6c24 | 960 | In order to determine permissions when an unprivileged process accesses a file, |
ea8ec578 MK |
961 | the process credentials (UID, GID) and the file credentials |
962 | are in effect mapped back to what they would be in | |
963 | the initial user namespace and then compared to determine | |
964 | the permissions that the process has on the file. | |
158c714e AC |
965 | The same is also true of other objects that employ the credentials plus |
966 | permissions mask accessibility model, such as System V IPC objects. | |
ea8ec578 MK |
967 | .\" |
968 | .\" ============================================================ | |
969 | .\" | |
43f4bec1 | 970 | .SS Operation of file-related capabilities |
43f4bec1 MK |
971 | Certain capabilities allow a process to bypass various |
972 | kernel-enforced restrictions when performing operations on | |
973 | files owned by other users or groups. | |
974 | These capabilities are: | |
975 | .BR CAP_CHOWN , | |
976 | .BR CAP_DAC_OVERRIDE , | |
977 | .BR CAP_DAC_READ_SEARCH , | |
978 | .BR CAP_FOWNER , | |
979 | and | |
980 | .BR CAP_FSETID . | |
c6d039a3 | 981 | .P |
43f4bec1 MK |
982 | Within a user namespace, |
983 | these capabilities allow a process to bypass the rules | |
984 | if the process has the relevant capability over the file, | |
985 | meaning that: | |
cdede5cd | 986 | .IP \[bu] 3 |
43f4bec1 | 987 | the process has the relevant effective capability in its user namespace; and |
cdede5cd | 988 | .IP \[bu] |
43f4bec1 MK |
989 | the file's user ID and group ID both have valid mappings |
990 | in the user namespace. | |
c6d039a3 | 991 | .P |
43f4bec1 | 992 | The |
1ae6b2c7 | 993 | .B CAP_FOWNER |
43f4bec1 | 994 | capability is treated somewhat exceptionally: |
43f4bec1 MK |
995 | .\" These are the checks performed by the kernel function |
996 | .\" inode_owner_or_capable(). There is one exception to the exception: | |
997 | .\" overriding the directory sticky permission bit requires that | |
998 | .\" the file has a valid mapping for both its UID and GID. | |
89a0bd82 MK |
999 | it allows a process to bypass the corresponding rules so long as |
1000 | at least the file's user ID has a mapping in the user namespace | |
43f4bec1 MK |
1001 | (i.e., the file's group ID does not need to have a valid mapping). |
1002 | .\" | |
1003 | .\" ============================================================ | |
1004 | .\" | |
62a5214c | 1005 | .SS Set-user-ID and set-group-ID programs |
046de6a7 MK |
1006 | When a process inside a user namespace executes |
1007 | a set-user-ID (set-group-ID) program, | |
1008 | the process's effective user (group) ID inside the namespace is changed | |
1009 | to whatever value is mapped for the user (group) ID of the file. | |
1010 | However, if either the user | |
1011 | .I or | |
1012 | the group ID of the file has no mapping inside the namespace, | |
1013 | the set-user-ID (set-group-ID) bit is silently ignored: | |
1014 | the new program is executed, | |
1015 | but the process's effective user (group) ID is left unchanged. | |
1016 | (This mirrors the semantics of executing a set-user-ID or set-group-ID | |
ab3311aa | 1017 | program that resides on a filesystem that was mounted with the |
1ae6b2c7 | 1018 | .B MS_NOSUID |
3b44624f | 1019 | flag, as described in |
046de6a7 | 1020 | .BR mount (2).) |
6eda9441 MK |
1021 | .\" |
1022 | .\" ============================================================ | |
1023 | .\" | |
1024 | .SS Miscellaneous | |
6eda9441 MK |
1025 | When a process's user and group IDs are passed over a UNIX domain socket |
1026 | to a process in a different user namespace (see the description of | |
1027 | .B SCM_CREDENTIALS | |
1028 | in | |
1029 | .BR unix (7)), | |
1030 | they are translated into the corresponding values as per the | |
1031 | receiving process's user and group ID mappings. | |
63f66893 | 1032 | .\" |
3113c7f3 | 1033 | .SH STANDARDS |
4131356c | 1034 | Linux. |
63f66893 | 1035 | .\" |
62a5214c MK |
1036 | .SH NOTES |
1037 | Over the years, there have been a lot of features that have been added | |
7ae693d0 | 1038 | to the Linux kernel that have been made available only to privileged users |
62a5214c MK |
1039 | because of their potential to confuse set-user-ID-root applications. |
1040 | In general, it becomes safe to allow the root user in a user namespace to | |
1041 | use those features because it is impossible, while in a user namespace, | |
1042 | to gain more privilege than the root user of a user namespace has. | |
bc921757 MK |
1043 | .\" |
1044 | .\" ============================================================ | |
1045 | .\" | |
f9f75be0 MK |
1046 | .SS Global root |
1047 | The term "global root" is sometimes used as a shorthand for | |
1048 | user ID 0 in the initial user namespace. | |
1049 | .\" | |
1050 | .\" ============================================================ | |
1051 | .\" | |
c3f29a89 MK |
1052 | .SS Availability |
1053 | Use of user namespaces requires a kernel that is configured with the | |
1054 | .B CONFIG_USER_NS | |
1055 | option. | |
1056 | User namespaces require support in a range of subsystems across | |
1057 | the kernel. | |
1058 | When an unsupported subsystem is configured into the kernel, | |
1059 | it is not possible to configure user namespaces support. | |
c6d039a3 | 1060 | .P |
ed8bd845 MK |
1061 | As at Linux 3.8, most relevant subsystems supported user namespaces, |
1062 | but a number of filesystems did not have the infrastructure needed | |
1063 | to map user and group IDs between user namespaces. | |
1064 | Linux 3.9 added the required infrastructure support for many of | |
1065 | the remaining unsupported filesystems | |
1066 | (Plan 9 (9P), Andrew File System (AFS), Ceph, CIFS, CODA, NFS, and OCFS2). | |
38598749 | 1067 | Linux 3.12 added support for the last of the unsupported major filesystems, |
c0d02ab0 MK |
1068 | .\" commit d6970d4b726cea6d7a9bc4120814f95c09571fc3 |
1069 | XFS. | |
63f66893 | 1070 | .\" |
a14af333 | 1071 | .SH EXAMPLES |
8d36d80c MK |
1072 | The program below is designed to allow experimenting with |
1073 | user namespaces, as well as other types of namespaces. | |
1074 | It creates namespaces as specified by command-line options and then executes | |
1075 | a command inside those namespaces. | |
1076 | The comments and | |
e9a20748 | 1077 | .IR usage () |
8d36d80c | 1078 | function inside the program provide a full explanation of the program. |
3e2a37ec | 1079 | The following shell session demonstrates its use. |
c6d039a3 | 1080 | .P |
3e2a37ec | 1081 | First, we look at the run-time environment: |
c6d039a3 | 1082 | .P |
8d36d80c | 1083 | .in +4n |
b8302363 | 1084 | .EX |
791ea4b3 | 1085 | $ \fBuname \-rs\fP # Need Linux 3.8 or later |
8d36d80c | 1086 | Linux 3.8.0 |
791ea4b3 | 1087 | $ \fBid \-u\fP # Running as unprivileged user |
8d36d80c | 1088 | 1000 |
791ea4b3 | 1089 | $ \fBid \-g\fP |
8d36d80c | 1090 | 1000 |
b8302363 | 1091 | .EE |
8d36d80c | 1092 | .in |
c6d039a3 | 1093 | .P |
3e2a37ec | 1094 | Now start a new shell in new user |
8d36d80c MK |
1095 | .RI ( \-U ), |
1096 | mount | |
1097 | .RI ( \-m ), | |
1098 | and PID | |
1099 | .RI ( \-p ) | |
1100 | namespaces, with user ID | |
1101 | .RI ( \-M ) | |
3b44624f | 1102 | and group ID |
8d36d80c | 1103 | .RI ( \-G ) |
3b44624f | 1104 | 1000 mapped to 0 inside the user namespace: |
c6d039a3 | 1105 | .P |
8d36d80c | 1106 | .in +4n |
b8302363 | 1107 | .EX |
b957f81f | 1108 | $ \fB./userns_child_exec \-p \-m \-U \-M \[aq]0 1000 1\[aq] \-G \[aq]0 1000 1\[aq] bash\fP |
b8302363 | 1109 | .EE |
8d36d80c | 1110 | .in |
c6d039a3 | 1111 | .P |
f5d401dd | 1112 | The shell has PID 1, because it is the first process in the new |
8d36d80c | 1113 | PID namespace: |
c6d039a3 | 1114 | .P |
8d36d80c | 1115 | .in +4n |
b8302363 | 1116 | .EX |
8d36d80c MK |
1117 | bash$ \fBecho $$\fP |
1118 | 1 | |
b8302363 | 1119 | .EE |
8d36d80c | 1120 | .in |
c6d039a3 | 1121 | .P |
8d36d80c MK |
1122 | Mounting a new |
1123 | .I /proc | |
ab3311aa | 1124 | filesystem and listing all of the processes visible |
8d36d80c MK |
1125 | in the new PID namespace shows that the shell can't see |
1126 | any processes outside the PID namespace: | |
c6d039a3 | 1127 | .P |
8d36d80c | 1128 | .in +4n |
b8302363 | 1129 | .EX |
791ea4b3 | 1130 | bash$ \fBmount \-t proc proc /proc\fP |
8d36d80c MK |
1131 | bash$ \fBps ax\fP |
1132 | PID TTY STAT TIME COMMAND | |
1133 | 1 pts/3 S 0:00 bash | |
1134 | 22 pts/3 R+ 0:00 ps ax | |
b8302363 | 1135 | .EE |
8d36d80c | 1136 | .in |
c6d039a3 | 1137 | .P |
a2b1485b SB |
1138 | Inside the user namespace, the shell has user and group ID 0, |
1139 | and a full set of permitted and effective capabilities: | |
c6d039a3 | 1140 | .P |
a2b1485b | 1141 | .in +4n |
b8302363 | 1142 | .EX |
a1e9245d | 1143 | bash$ \fBcat /proc/$$/status | egrep \[aq]\[ha][UG]id\[aq]\fP |
a2b1485b SB |
1144 | Uid: 0 0 0 0 |
1145 | Gid: 0 0 0 0 | |
a1e9245d | 1146 | bash$ \fBcat /proc/$$/status | egrep \[aq]\[ha]Cap(Prm|Inh|Eff)\[aq]\fP |
a2b1485b SB |
1147 | CapInh: 0000000000000000 |
1148 | CapPrm: 0000001fffffffff | |
1149 | CapEff: 0000001fffffffff | |
b8302363 | 1150 | .EE |
a2b1485b | 1151 | .in |
8d36d80c MK |
1152 | .SS Program source |
1153 | \& | |
e7d0bb47 | 1154 | .EX |
8d36d80c | 1155 | /* userns_child_exec.c |
fe5dba13 | 1156 | \& |
8d36d80c | 1157 | Licensed under GNU General Public License v2 or later |
fe5dba13 | 1158 | \& |
8d36d80c MK |
1159 | Create a child process that executes a shell command in new |
1160 | namespace(s); allow UID and GID mappings to be specified when | |
1161 | creating a user namespace. | |
1162 | */ | |
1163 | #define _GNU_SOURCE | |
5a5208c1 | 1164 | #include <err.h> |
8d36d80c MK |
1165 | #include <sched.h> |
1166 | #include <unistd.h> | |
8eb90116 | 1167 | #include <stdint.h> |
8d36d80c MK |
1168 | #include <stdlib.h> |
1169 | #include <sys/wait.h> | |
1170 | #include <signal.h> | |
1171 | #include <fcntl.h> | |
1172 | #include <stdio.h> | |
1173 | #include <string.h> | |
1174 | #include <limits.h> | |
1175 | #include <errno.h> | |
fe5dba13 | 1176 | \& |
8d36d80c MK |
1177 | struct child_args { |
1178 | char **argv; /* Command to be executed by child, with args */ | |
1179 | int pipe_fd[2]; /* Pipe used to synchronize parent and child */ | |
1180 | }; | |
fe5dba13 | 1181 | \& |
8d36d80c | 1182 | static int verbose; |
fe5dba13 | 1183 | \& |
8d36d80c MK |
1184 | static void |
1185 | usage(char *pname) | |
1186 | { | |
d1a71985 | 1187 | fprintf(stderr, "Usage: %s [options] cmd [arg...]\en\en", pname); |
8d36d80c | 1188 | fprintf(stderr, "Create a child process that executes a shell " |
d1a71985 MK |
1189 | "command in a new user namespace,\en" |
1190 | "and possibly also other new namespace(s).\en\en"); | |
1191 | fprintf(stderr, "Options can be:\en\en"); | |
8d36d80c | 1192 | #define fpe(str) fprintf(stderr, " %s", str); |
d1a71985 MK |
1193 | fpe("\-i New IPC namespace\en"); |
1194 | fpe("\-m New mount namespace\en"); | |
1195 | fpe("\-n New network namespace\en"); | |
1196 | fpe("\-p New PID namespace\en"); | |
1197 | fpe("\-u New UTS namespace\en"); | |
1198 | fpe("\-U New user namespace\en"); | |
1199 | fpe("\-M uid_map Specify UID map for user namespace\en"); | |
1200 | fpe("\-G gid_map Specify GID map for user namespace\en"); | |
b957f81f AC |
1201 | fpe("\-z Map user\[aq]s UID and GID to 0 in user namespace\en"); |
1202 | fpe(" (equivalent to: \-M \[aq]0 <uid> 1\[aq] \-G \[aq]0 <gid> 1\[aq])\en"); | |
d1a71985 MK |
1203 | fpe("\-v Display verbose messages\en"); |
1204 | fpe("\en"); | |
1205 | fpe("If \-z, \-M, or \-G is specified, \-U is required.\en"); | |
1206 | fpe("It is not permitted to specify both \-z and either \-M or \-G.\en"); | |
1207 | fpe("\en"); | |
1208 | fpe("Map strings for \-M and \-G consist of records of the form:\en"); | |
1209 | fpe("\en"); | |
1210 | fpe(" ID\-inside\-ns ID\-outside\-ns len\en"); | |
1211 | fpe("\en"); | |
8d36d80c | 1212 | fpe("A map string can contain multiple records, separated" |
d1a71985 | 1213 | " by commas;\en"); |
8d36d80c | 1214 | fpe("the commas are replaced by newlines before writing" |
d1a71985 | 1215 | " to map files.\en"); |
fe5dba13 | 1216 | \& |
8d36d80c MK |
1217 | exit(EXIT_FAILURE); |
1218 | } | |
fe5dba13 | 1219 | \& |
b957f81f AC |
1220 | /* Update the mapping file \[aq]map_file\[aq], with the value provided in |
1221 | \[aq]mapping\[aq], a string that defines a UID or GID mapping. A UID or | |
8d36d80c MK |
1222 | GID mapping consists of one or more newline\-delimited records |
1223 | of the form: | |
fe5dba13 | 1224 | \& |
8d36d80c | 1225 | ID\-inside\-ns ID\-outside\-ns length |
fe5dba13 | 1226 | \& |
8d36d80c MK |
1227 | Requiring the user to supply a string that contains newlines is |
1228 | of course inconvenient for command\-line use. Thus, we permit the | |
1229 | use of commas to delimit records in this string, and replace them | |
1230 | with newlines before writing the string to the file. */ | |
fe5dba13 | 1231 | \& |
8d36d80c MK |
1232 | static void |
1233 | update_map(char *mapping, char *map_file) | |
1234 | { | |
88893a77 | 1235 | int fd; |
b957f81f | 1236 | size_t map_len; /* Length of \[aq]mapping\[aq] */ |
fe5dba13 | 1237 | \& |
c6beb8a1 | 1238 | /* Replace commas in mapping string with newlines. */ |
fe5dba13 | 1239 | \& |
8d36d80c | 1240 | map_len = strlen(mapping); |
b42296e4 | 1241 | for (size_t j = 0; j < map_len; j++) |
b957f81f AC |
1242 | if (mapping[j] == \[aq],\[aq]) |
1243 | mapping[j] = \[aq]\en\[aq]; | |
fe5dba13 | 1244 | \& |
8d36d80c MK |
1245 | fd = open(map_file, O_RDWR); |
1246 | if (fd == \-1) { | |
d1a71985 | 1247 | fprintf(stderr, "ERROR: open %s: %s\en", map_file, |
8f99aa89 MK |
1248 | strerror(errno)); |
1249 | exit(EXIT_FAILURE); | |
8d36d80c | 1250 | } |
fe5dba13 | 1251 | \& |
8d36d80c | 1252 | if (write(fd, mapping, map_len) != map_len) { |
d1a71985 | 1253 | fprintf(stderr, "ERROR: write %s: %s\en", map_file, |
8f99aa89 MK |
1254 | strerror(errno)); |
1255 | exit(EXIT_FAILURE); | |
8d36d80c | 1256 | } |
fe5dba13 | 1257 | \& |
8d36d80c MK |
1258 | close(fd); |
1259 | } | |
fe5dba13 | 1260 | \& |
98c9ca4b AC |
1261 | /* Linux 3.19 made a change in the handling of setgroups(2) and |
1262 | the \[aq]gid_map\[aq] file to address a security issue. The issue | |
1263 | allowed *unprivileged* users to employ user namespaces in | |
1264 | order to drop groups. The upshot of the 3.19 changes is that | |
1265 | in order to update the \[aq]gid_map\[aq] file, use of the setgroups() |
1266 | system call in this user namespace must first be disabled by | |
1267 | writing "deny" to one of the /proc/PID/setgroups files for | |
1268 | this namespace. That is the purpose of the following function. */ | |
fe5dba13 | 1269 | \& |
c38a2a04 MK |
1270 | static void |
1271 | proc_setgroups_write(pid_t child_pid, char *str) | |
1272 | { | |
1273 | char setgroups_path[PATH_MAX]; | |
1274 | int fd; | |
fe5dba13 | 1275 | \& |
8eb90116 AC |
1276 | snprintf(setgroups_path, PATH_MAX, "/proc/%jd/setgroups", |
1277 | (intmax_t) child_pid); | |
fe5dba13 | 1278 | \& |
c38a2a04 MK |
1279 | fd = open(setgroups_path, O_RDWR); |
1280 | if (fd == \-1) { | |
fe5dba13 | 1281 | \& |
b957f81f AC |
1282 | /* We may be on a system that doesn\[aq]t support |
1283 | /proc/PID/setgroups. In that case, the file won\[aq]t exist, | |
1284 | and the system won\[aq]t impose the restrictions that Linux 3.19 | |
1285 | added. That\[aq]s fine: we don\[aq]t need to do anything in order | |
1286 | to permit \[aq]gid_map\[aq] to be updated. | |
fe5dba13 | 1287 | \& |
c38a2a04 MK |
1288 | However, if the error from open() was something other than |
1289 | the ENOENT error that is expected for that case, let the | |
1290 | user know. */ | |
fe5dba13 | 1291 | \& |
c38a2a04 | 1292 | if (errno != ENOENT) |
d1a71985 | 1293 | fprintf(stderr, "ERROR: open %s: %s\en", setgroups_path, |
c38a2a04 MK |
1294 | strerror(errno)); |
1295 | return; | |
1296 | } | |
fe5dba13 | 1297 | \& |
c38a2a04 | 1298 | if (write(fd, str, strlen(str)) == \-1) |
d1a71985 | 1299 | fprintf(stderr, "ERROR: write %s: %s\en", setgroups_path, |
c38a2a04 | 1300 | strerror(errno)); |
fe5dba13 | 1301 | \& |
c38a2a04 MK |
1302 | close(fd); |
1303 | } | |
fe5dba13 | 1304 | \& |
8d36d80c MK |
1305 | static int /* Start function for cloned child */ |
1306 | childFunc(void *arg) | |
1307 | { | |
dc0bba35 | 1308 | struct child_args *args = arg; |
8d36d80c | 1309 | char ch; |
fe5dba13 | 1310 | \& |
8d36d80c MK |
1311 | /* Wait until the parent has updated the UID and GID mappings. |
1312 | See the comment in main(). We wait for end of file on a | |
1313 | pipe that will be closed by the parent process once it has | |
1314 | updated the mappings. */ | |
fe5dba13 | 1315 | \& |
8d36d80c MK |
1316 | close(args\->pipe_fd[1]); /* Close our descriptor for the write |
1317 | end of the pipe so that we see EOF | |
c6beb8a1 | 1318 | when parent closes its descriptor. */ |
8d36d80c MK |
1319 | if (read(args\->pipe_fd[0], &ch, 1) != 0) { |
1320 | fprintf(stderr, | |
d1a71985 | 1321 | "Failure in child: read from pipe returned != 0\en"); |
8d36d80c MK |
1322 | exit(EXIT_FAILURE); |
1323 | } | |
fe5dba13 | 1324 | \& |
a2b1485b | 1325 | close(args\->pipe_fd[0]); |
fe5dba13 | 1326 | \& |
c6beb8a1 | 1327 | /* Execute a shell command. */ |
fe5dba13 | 1328 | \& |
d1a71985 | 1329 | printf("About to exec %s\en", args\->argv[0]); |
8d36d80c | 1330 | execvp(args\->argv[0], args\->argv); |
5a5208c1 | 1331 | err(EXIT_FAILURE, "execvp"); |
8d36d80c | 1332 | } |
fe5dba13 | 1333 | \& |
8d36d80c | 1334 | #define STACK_SIZE (1024 * 1024) |
fe5dba13 | 1335 | \& |
b957f81f | 1336 | static char child_stack[STACK_SIZE]; /* Space for child\[aq]s stack */ |
fe5dba13 | 1337 | \& |
8d36d80c MK |
1338 | int |
1339 | main(int argc, char *argv[]) | |
1340 | { | |
1341 | int flags, opt, map_zero; | |
1342 | pid_t child_pid; | |
1343 | struct child_args args; | |
1344 | char *uid_map, *gid_map; | |
1345 | const int MAP_BUF_SIZE = 100; | |
1346 | char map_buf[MAP_BUF_SIZE]; | |
1347 | char map_path[PATH_MAX]; | |
fe5dba13 | 1348 | \& |
b957f81f | 1349 | /* Parse command\-line options. The initial \[aq]+\[aq] character in |
8d36d80c | 1350 | the final getopt() argument prevents GNU\-style permutation |
b957f81f AC |
1351 | of command\-line options. That\[aq]s useful, since sometimes |
1352 | the \[aq]command\[aq] to be executed by this program itself | |
1353 | has command\-line options. We don\[aq]t want getopt() to treat | |
8d36d80c | 1354 | those as options to this program. */ |
fe5dba13 | 1355 | \& |
8d36d80c MK |
1356 | flags = 0; |
1357 | verbose = 0; | |
1358 | gid_map = NULL; | |
1359 | uid_map = NULL; | |
1360 | map_zero = 0; | |
1361 | while ((opt = getopt(argc, argv, "+imnpuUM:G:zv")) != \-1) { | |
1362 | switch (opt) { | |
b957f81f AC |
1363 | case \[aq]i\[aq]: flags |= CLONE_NEWIPC; break; |
1364 | case \[aq]m\[aq]: flags |= CLONE_NEWNS; break; | |
1365 | case \[aq]n\[aq]: flags |= CLONE_NEWNET; break; | |
1366 | case \[aq]p\[aq]: flags |= CLONE_NEWPID; break; | |
1367 | case \[aq]u\[aq]: flags |= CLONE_NEWUTS; break; | |
1368 | case \[aq]v\[aq]: verbose = 1; break; | |
1369 | case \[aq]z\[aq]: map_zero = 1; break; | |
1370 | case \[aq]M\[aq]: uid_map = optarg; break; | |
1371 | case \[aq]G\[aq]: gid_map = optarg; break; | |
1372 | case \[aq]U\[aq]: flags |= CLONE_NEWUSER; break; | |
8d36d80c MK |
1373 | default: usage(argv[0]); |
1374 | } | |
1375 | } | |
fe5dba13 | 1376 | \& |
8d36d80c | 1377 | /* \-M, \-G, or \-z without \-U is nonsensical, as is \-z with \-M or \-G */ |
fe5dba13 | 1378 | \& |
8d36d80c MK |
1379 | if (((uid_map != NULL || gid_map != NULL || map_zero) && |
1380 | !(flags & CLONE_NEWUSER)) || | |
1381 | (map_zero && (uid_map != NULL || gid_map != NULL))) | |
1382 | usage(argv[0]); | |
fe5dba13 | 1383 | \& |
8d36d80c | 1384 | args.argv = &argv[optind]; |
fe5dba13 | 1385 | \& |
8d36d80c MK |
1386 | /* We use a pipe to synchronize the parent and child, in order to |
1387 | ensure that the parent sets the UID and GID maps before the child | |
1388 | calls execve(). This ensures that the child maintains its | |
1389 | capabilities during the execve() in the common case where we | |
b957f81f | 1390 | want to map the child\[aq]s effective user ID to 0 in the new user |
8d36d80c MK |
1391 | namespace. Without this synchronization, the child would lose |
1392 | its capabilities if it performed an execve() with nonzero | |
1393 | user IDs (see the capabilities(7) man page for details of the | |
b957f81f | 1394 | transformation of a process\[aq]s capabilities during execve()). */ |
fe5dba13 | 1395 | \& |
8d36d80c | 1396 | if (pipe(args.pipe_fd) == \-1) |
5a5208c1 | 1397 | err(EXIT_FAILURE, "pipe"); |
fe5dba13 | 1398 | \& |
c6beb8a1 | 1399 | /* Create the child in new namespace(s). */ |
fe5dba13 | 1400 | \& |
8d36d80c MK |
1401 | child_pid = clone(childFunc, child_stack + STACK_SIZE, |
1402 | flags | SIGCHLD, &args); | |
1403 | if (child_pid == \-1) | |
5a5208c1 | 1404 | err(EXIT_FAILURE, "clone"); |
fe5dba13 | 1405 | \& |
c6beb8a1 | 1406 | /* Parent falls through to here. */ |
fe5dba13 | 1407 | \& |
8d36d80c | 1408 | if (verbose) |
8eb90116 AC |
1409 | printf("%s: PID of child created by clone() is %jd\en", |
1410 | argv[0], (intmax_t) child_pid); | |
fe5dba13 | 1411 | \& |
c6beb8a1 | 1412 | /* Update the UID and GID maps in the child. */ |
fe5dba13 | 1413 | \& |
8d36d80c | 1414 | if (uid_map != NULL || map_zero) { |
8eb90116 AC |
1415 | snprintf(map_path, PATH_MAX, "/proc/%jd/uid_map", |
1416 | (intmax_t) child_pid); | |
8d36d80c | 1417 | if (map_zero) { |
8eb90116 AC |
1418 | snprintf(map_buf, MAP_BUF_SIZE, "0 %jd 1", |
1419 | (intmax_t) getuid()); | |
8d36d80c MK |
1420 | uid_map = map_buf; |
1421 | } | |
1422 | update_map(uid_map, map_path); | |
1423 | } | |
fe5dba13 | 1424 | \& |
8d36d80c | 1425 | if (gid_map != NULL || map_zero) { |
c38a2a04 | 1426 | proc_setgroups_write(child_pid, "deny"); |
fe5dba13 | 1427 | \& |
8eb90116 AC |
1428 | snprintf(map_path, PATH_MAX, "/proc/%jd/gid_map", |
1429 | (intmax_t) child_pid); | |
8d36d80c | 1430 | if (map_zero) { |
8eb90116 AC |
1431 | snprintf(map_buf, MAP_BUF_SIZE, "0 %jd 1",
1432 | (intmax_t) getgid()); | |
8d36d80c MK |
1433 | gid_map = map_buf; |
1434 | } | |
1435 | update_map(gid_map, map_path); | |
1436 | } | |
fe5dba13 | 1437 | \& |
8d36d80c | 1438 | /* Close the write end of the pipe, to signal to the child that we |
c6beb8a1 | 1439 | have updated the UID and GID maps. */ |
fe5dba13 | 1440 | \& |
8d36d80c | 1441 | close(args.pipe_fd[1]); |
fe5dba13 | 1442 | \& |
8d36d80c | 1443 | if (waitpid(child_pid, NULL, 0) == \-1) /* Wait for child */ |
5a5208c1 | 1444 | err(EXIT_FAILURE, "waitpid"); |
fe5dba13 | 1445 | \& |
8d36d80c | 1446 | if (verbose) |
d1a71985 | 1447 | printf("%s: terminating\en", argv[0]); |
fe5dba13 | 1448 | \& |
8d36d80c MK |
1449 | exit(EXIT_SUCCESS); |
1450 | } | |
e7d0bb47 | 1451 | .EE |
046de6a7 | 1452 | .SH SEE ALSO |
b6462f75 MK |
1453 | .BR newgidmap (1), \" From the shadow package |
1454 | .BR newuidmap (1), \" From the shadow package | |
046de6a7 | 1455 | .BR clone (2), |
801245a1 | 1456 | .BR ptrace (2), |
046de6a7 MK |
1457 | .BR setns (2), |
1458 | .BR unshare (2), | |
1459 | .BR proc (5), | |
b6462f75 MK |
1460 | .BR subgid (5), \" From the shadow package |
1461 | .BR subuid (5), \" From the shadow package | |
589e43bb | 1462 | .BR capabilities (7), |
bba4bbbd | 1463 | .BR cgroup_namespaces (7), |
3525268c MK |
1464 | .BR credentials (7), |
1465 | .BR namespaces (7), | |
8d36d80c | 1466 | .BR pid_namespaces (7) |
c6d039a3 | 1467 | .P |
c94eb4a6 | 1468 | The kernel source file |
57fb49f9 | 1469 | .IR Documentation/admin\-guide/namespaces/resource\-control.rst . |