]>
Commit | Line | Data |
---|---|---|
a2dd6388 MK |
1 | .\" Copyright (C) 2019 Michael Kerrisk <mtk.manpages@gmail.com> |
2 | .\" A very few fragments remain from an earlier page written by | |
3 | .\" Werner Almesberger in 2000 | |
2297bf0e | 4 | .\" |
a2dd6388 MK |
5 | .\" %%%LICENSE_START(VERBATIM) |
6 | .\" Permission is granted to make and distribute verbatim copies of this | |
7 | .\" manual provided the copyright notice and this permission notice are | |
8 | .\" preserved on all copies. | |
9 | .\" | |
10 | .\" Permission is granted to copy and distribute modified versions of this | |
11 | .\" manual under the conditions for verbatim copying, provided that the | |
12 | .\" entire resulting derived work is distributed under the terms of a | |
13 | .\" permission notice identical to this one. | |
fea681da | 14 | .\" |
a2dd6388 MK |
15 | .\" Since the Linux kernel and libraries are constantly changing, this |
16 | .\" manual page may be incorrect or out-of-date. The author(s) assume no | |
17 | .\" responsibility for errors or omissions, or for damages resulting from | |
18 | .\" the use of the information contained herein. The author(s) may not | |
19 | .\" have taken the same level of care in the production of this manual, | |
20 | .\" which is licensed free of charge, as they might when working | |
21 | .\" professionally. | |
22 | .\" | |
23 | .\" Formatted or processed versions of this manual, if unaccompanied by | |
24 | .\" the source, must acknowledge the copyright and authors of this work. | |
25 | .\" %%%LICENSE_END | |
fea681da | 26 | .\" |
a5409de9 | 27 | .TH PIVOT_ROOT 2 2019-11-19 "Linux" "Linux Programmer's Manual" |
fea681da | 28 | .SH NAME |
0843016c | 29 | pivot_root \- change the root mount |
fea681da | 30 | .SH SYNOPSIS |
fea681da | 31 | .BI "int pivot_root(const char *" new_root ", const char *" put_old ); |
dbfe9c70 | 32 | .PP |
45c99e3e MK |
33 | .IR Note : |
34 | There is no glibc wrapper for this system call; see NOTES. | |
fea681da | 35 | .SH DESCRIPTION |
60a90ecd | 36 | .BR pivot_root () |
0843016c MK |
37 | changes the root mount in the mount namespace of the calling process. |
38 | More precisely, it moves the root mount to the | |
39 | directory \fIput_old\fP and makes \fInew_root\fP the new root mount. | |
fdc558bd MK |
40 | The calling process must have the |
41 | .B CAP_SYS_ADMIN | |
42 | capability in the user namespace that owns the caller's mount namespace. | |
efeece04 | 43 | .PP |
60a90ecd | 44 | .BR pivot_root () |
81b24320 MK |
45 | changes the root directory and the current working directory |
46 | of each process or thread in the same mount namespace to | |
47 | .I new_root | |
48 | if they point to the old root directory. | |
682e1329 MK |
49 | (See also NOTES.) |
50 | On the other hand, | |
51 | .BR pivot_root () | |
52 | does not change the caller's current working directory | |
53 | (unless it is on the old root directory), | |
54 | and thus it should be followed by a | |
55 | \fBchdir("/")\fP call. | |
efeece04 | 56 | .PP |
41d4557c | 57 | The following restrictions apply: |
fea681da | 58 | .IP \- 3 |
41d4557c MK |
59 | .IR new_root |
60 | and | |
61 | .IR put_old | |
62 | must be directories. | |
0ac6f900 | 63 | .IP \- |
33313a26 MK |
64 | .I new_root |
65 | and | |
66 | .I put_old | |
67 | must not be on the same mount as the current root. | |
0ac6f900 | 68 | .IP \- |
57bab66a | 69 | \fIput_old\fP must be at or underneath \fInew_root\fP; |
87529800 MK |
70 | that is, adding some nonnegative |
71 | number of "\fI/..\fP" suffixes to the pathname pointed to by | |
72 | .I put_old | |
73 | must yield the same directory as \fInew_root\fP. | |
0ac6f900 | 74 | .IP \- |
37704bfc | 75 | .I new_root |
666373fc | 76 | must be a path to a mount point, but can't be |
9f3af6b8 | 77 | .IR """/""" . |
666373fc MK |
78 | A path that is not already a mount point can be converted into one by |
79 | bind mounting the path onto itself. | |
0ac6f900 | 80 | .IP \- |
d4b2104a MK |
81 | The propagation type of the parent mount of |
82 | .IR new_root | |
83 | and the parent mount of the current root directory must not be | |
a39e880f MK |
84 | .BR MS_SHARED ; |
85 | similarly, if | |
86 | .I put_old | |
87 | is an existing mount point, its propagation type must not be | |
1a0b1fd7 | 88 | .BR MS_SHARED . |
9d33e03b MK |
89 | These restrictions ensure that |
90 | .BR pivot_root () | |
91 | never propagates any changes to another mount namespace. | |
eb9078a7 MK |
92 | .IP \- |
93 | The current root directory must be a mount point. | |
47297adb | 94 | .SH RETURN VALUE |
c13182ef MK |
95 | On success, zero is returned. |
96 | On error, \-1 is returned, and | |
fea681da MK |
97 | \fIerrno\fP is set appropriately. |
98 | .SH ERRORS | |
60a90ecd | 99 | .BR pivot_root () |
5f5751d3 | 100 | may fail with any of the same errors as |
60a90ecd | 101 | .BR stat (2). |
5f5751d3 | 102 | Additionally, it may fail with the following errors: |
fea681da MK |
103 | .TP |
104 | .B EBUSY | |
b647c4c9 MK |
105 | .\" Reconfirmed that the following error occurs on Linux 5.0 by |
106 | .\" specifying 'new_root' as "/rootfs" and 'put_old' as | |
107 | .\" "/rootfs/oldrootfs", and *not* bind mounting "/rootfs" on top of | |
108 | .\" itself. Of course, this is an odd situation, since a later check | |
109 | .\" in the kernel code will in any case yield EINVAL if 'new_root' is | |
110 | .\" not a mount point. However, when the system call was first added, | |
bf421740 | 111 | .\" 'new_root' was not required to be a mount point. So, this |
b647c4c9 MK |
112 | .\" error is nowadays probably just the result of crufty accumulation. |
113 | .\" This error can also occur if we bind mount "/" on top of itself | |
114 | .\" and try to specify "/" as the 'new_root' (again, an odd situation). So, | |
115 | .\" the EBUSY check in the kernel does still seem necessary to prevent | |
116 | .\" that case. Furthermore, the "or put_old" piece is probably | |
117 | .\" redundant text (although the check is in the kernel), since, | |
118 | .\" in another check, 'put_old' is required to be under 'new_root'. | |
119 | .I new_root | |
120 | or | |
121 | .I put_old | |
ba4b07c3 | 122 | is on the current root mount. |
b647c4c9 MK |
123 | (This error covers the pathological case where |
124 | .I new_root | |
125 | is | |
126 | .IR """/""" .) | |
fea681da MK |
127 | .TP |
128 | .B EINVAL | |
37704bfc MK |
129 | .I new_root |
130 | is not a mount point. | |
131 | .TP | |
132 | .B EINVAL | |
542175d8 | 133 | \fIput_old\fP is not at or underneath \fInew_root\fP. |
fea681da | 134 | .TP |
dc9b6c92 | 135 | .B EINVAL |
eb9078a7 MK |
136 | The current root directory is not a mount point |
137 | (because of an earlier | |
138 | .BR chroot (2)). | |
139 | .TP | |
140 | .B EINVAL | |
ba4b07c3 | 141 | The current root is on the rootfs (initial ramfs) mount; see NOTES. |
dc9b6c92 | 142 | .TP |
1a0b1fd7 MK |
143 | .B EINVAL |
144 | Either the mount point at | |
145 | .IR new_root , | |
146 | or the parent mount of that mount point, | |
147 | has propagation type | |
148 | .BR MS_SHARED . | |
149 | .TP | |
a39e880f MK |
150 | .B EINVAL |
151 | .I put_old | |
152 | is a mount point and has the propagation type | |
153 | .BR MS_SHARED . | |
154 | .TP | |
fea681da MK |
155 | .B ENOTDIR |
156 | \fInew_root\fP or \fIput_old\fP is not a directory. | |
157 | .TP | |
158 | .B EPERM | |
edd1fa35 | 159 | The calling process does not have the |
fea681da MK |
160 | .B CAP_SYS_ADMIN |
161 | capability. | |
a1d5f77c MK |
162 | .SH VERSIONS |
163 | .BR pivot_root () | |
164 | was introduced in Linux 2.3.41. | |
47297adb | 165 | .SH CONFORMING TO |
a1d5f77c | 166 | .BR pivot_root () |
8382f16d | 167 | is Linux-specific and hence is not portable. |
f5b03186 MK |
168 | .SH NOTES |
169 | Glibc does not provide a wrapper for this system call; call it using | |
170 | .BR syscall (2). | |
82320f42 | 171 | .PP |
14caaed2 MK |
172 | A command-line interface for this system call is provided by |
173 | .BR pivot_root (8). | |
174 | .PP | |
422e36b7 MK |
175 | .BR pivot_root () |
176 | allows the caller to switch to a new root filesystem while at the same time | |
177 | placing the old root mount at a location under | |
178 | .I new_root | |
179 | from where it can subsequently be unmounted. | |
180 | (The fact that it moves all processes that have a root directory | |
b27d444f MK |
181 | or current working directory on the old root directory to the |
182 | new root frees the old root directory of users, | |
33313a26 | 183 | allowing the old root mount to be unmounted more easily.) |
c4bf3333 | 184 | .PP |
87529800 | 185 | One use of |
422e36b7 MK |
186 | .BR pivot_root () |
187 | is during system startup, when the | |
87529800 MK |
188 | system mounts a temporary root filesystem (e.g., an |
189 | .BR initrd (4)), | |
190 | then mounts the real root filesystem, and eventually turns the latter into | |
191 | the root directory of all relevant processes and threads. | |
422e36b7 MK |
192 | A modern use is to set up a root filesystem during |
193 | the creation of a container. | |
194 | .PP | |
fc2f474d MK |
195 | The fact that |
196 | .BR pivot_root () | |
197 | modifies process root and current working directories in the | |
198 | manner noted in DESCRIPTION | |
199 | is necessary in order to prevent kernel threads from keeping the old | |
87529800 | 200 | root mount busy with their root and current working directories, |
fc2f474d MK |
201 | even if they never access |
202 | the filesystem in any way. | |
fc2f474d | 203 | .PP |
97076c5a MK |
204 | The rootfs (initial ramfs) cannot be |
205 | .BR pivot_root ()ed. | |
206 | The recommended method of changing the root filesystem in this case is | |
207 | to delete everything in rootfs, overmount rootfs with the new root, attach | |
208 | .IR stdin / stdout / stderr | |
209 | to the new | |
210 | .IR /dev/console , | |
211 | and exec the new | |
212 | .BR init (1). | |
213 | Helper programs for this process exist; see | |
214 | .BR switch_root (8). | |
3db820fe MK |
215 | .\" |
216 | .SS pivot_root(\(dq.\(dq, \(dq.\(dq) | |
97076c5a | 217 | .PP |
57bab66a MK |
218 | .I new_root |
219 | and | |
220 | .I put_old | |
221 | may be the same directory. | |
222 | In particular, the following sequence allows a pivot-root operation | |
223 | without needing to create and remove a temporary directory: | |
224 | .PP | |
225 | .in +4n | |
226 | .EX | |
227 | chdir(new_root); | |
228 | pivot_root(".", "."); | |
229 | umount2(".", MNT_DETACH); | |
230 | .EE | |
231 | .in | |
232 | .PP | |
233 | This sequence succeeds because the | |
234 | .BR pivot_root () | |
235 | call stacks the old root mount point | |
57bab66a MK |
236 | on top of the new root mount point at |
237 | .IR / . | |
238 | At that point, the calling process's root directory and current | |
239 | working directory refer to the new root mount point | |
240 | .RI ( new_root ). | |
241 | During the subsequent | |
242 | .BR umount () | |
243 | call, resolution of | |
244 | .IR """.""" | |
245 | starts with | |
246 | .I new_root | |
247 | and then moves up the list of mounts stacked at | |
248 | .IR / , | |
8f2a9129 | 249 | with the result that the old root mount point is unmounted. |
01c64c3b MK |
250 | .\" |
251 | .SS Historical notes | |
252 | For many years, this manual page carried the following text: | |
253 | .RS | |
254 | .PP | |
255 | .BR pivot_root () | |
256 | may or may not change the current root and the current | |
257 | working directory of any processes or threads which use the old | |
258 | root directory. | |
259 | The caller of | |
260 | .BR pivot_root () | |
261 | must ensure that processes with root or current working directory | |
262 | at the old root operate correctly in either case. | |
263 | An easy way to ensure this is to change their | |
264 | root and current working directory to \fInew_root\fP before invoking | |
265 | .BR pivot_root (). | |
266 | .RE | |
267 | .PP | |
268 | This text, written before the system call implementation was | |
269 | even finalized in the kernel, was probably intended to warn users | |
270 | at that time that the implementation might change before final release. | |
271 | However, the behavior stated in DESCRIPTION | |
272 | has remained consistent since this system call | |
273 | was first implemented and will not change now. | |
2f2e1a22 | 274 | .SH EXAMPLE |
47b69a37 MK |
275 | .\" FIXME |
276 | .\" Would it be better, because simpler, to use unshare(2) | |
277 | .\" rather than clone(2) in the example below? | |
2f2e1a22 MK |
278 | .PP |
279 | The program below demonstrates the use of | |
280 | .BR pivot_root () | |
281 | inside a mount namespace that is created using | |
282 | .BR clone (2). | |
283 | After pivoting to the root directory named in the program's | |
284 | first command-line argument, the child created by | |
285 | .BR clone (2) | |
286 | then executes the program named in the remaining command-line arguments. | |
287 | .PP | |
288 | We demonstrate the program by creating a directory that will serve as | |
289 | the new root filesystem and placing a copy of the (statically linked) | |
290 | .BR busybox (1) | |
291 | executable in that directory. | |
292 | .PP | |
293 | .in +4n | |
294 | .EX | |
295 | $ \fBmkdir /tmp/rootfs\fP | |
296 | $ \fBls \-id /tmp/rootfs\fP # Show inode number of new root directory | |
297 | 319459 /tmp/rootfs | |
298 | $ \fBcp $(which busybox) /tmp/rootfs\fP | |
299 | $ \fBPS1='bbsh$ ' sudo ./pivot_root_demo /tmp/rootfs /busybox sh\fP | |
300 | bbsh$ \fBPATH=/\fP | |
301 | bbsh$ \fBbusybox ln busybox ln\fP | |
302 | bbsh$ \fBln busybox echo\fP | |
303 | bbsh$ \fBln busybox ls\fP | |
304 | bbsh$ \fBls\fP | |
305 | busybox echo ln ls | |
306 | bbsh$ \fBls \-id /\fP # Compare with inode number above | |
307 | 319459 / | |
308 | bbsh$ \fBecho \(aqhello world\(aq\fP | |
309 | hello world | |
310 | .EE | |
311 | .in | |
312 | .SS Program source | |
313 | \& | |
314 | .PP | |
315 | .EX | |
316 | /* pivot_root_demo.c */ | |
317 | ||
318 | #define _GNU_SOURCE | |
319 | #include <sched.h> | |
320 | #include <stdio.h> | |
321 | #include <stdlib.h> | |
322 | #include <unistd.h> | |
323 | #include <sys/wait.h> | |
324 | #include <sys/syscall.h> | |
325 | #include <sys/mount.h> | |
326 | #include <sys/stat.h> | |
327 | #include <limits.h> | |
1b547316 | 328 | #include <sys/mman.h> |
2f2e1a22 MK |
329 | |
330 | #define errExit(msg) do { perror(msg); exit(EXIT_FAILURE); \e | |
331 | } while (0) | |
332 | ||
333 | static int | |
334 | pivot_root(const char *new_root, const char *put_old) | |
335 | { | |
336 | return syscall(SYS_pivot_root, new_root, put_old); | |
337 | } | |
338 | ||
339 | #define STACK_SIZE (1024 * 1024) | |
340 | ||
341 | static int /* Startup function for cloned child */ | |
342 | child(void *arg) | |
343 | { | |
344 | char **args = arg; | |
345 | char *new_root = args[0]; | |
346 | const char *put_old = "/oldrootfs"; | |
347 | char path[PATH_MAX]; | |
348 | ||
349 | /* Ensure that \(aqnew_root\(aq and its parent mount don\(aqt have | |
350 | shared propagation (which would cause pivot_root() to | |
351 | return an error), and prevent propagation of mount | |
352 | events to the initial mount namespace */ | |
353 | ||
354 | if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL) == \-1) | |
355 | errExit("mount\-MS_PRIVATE"); | |
356 | ||
357 | /* Ensure that \(aqnew_root\(aq is a mount point */ | |
358 | ||
359 | if (mount(new_root, new_root, NULL, MS_BIND, NULL) == \-1) | |
360 | errExit("mount\-MS_BIND"); | |
361 | ||
362 | /* Create directory to which old root will be pivoted */ | |
363 | ||
364 | snprintf(path, sizeof(path), "%s/%s", new_root, put_old); | |
365 | if (mkdir(path, 0777) == \-1) | |
366 | errExit("mkdir"); | |
367 | ||
368 | /* And pivot the root filesystem */ | |
369 | ||
370 | if (pivot_root(new_root, path) == \-1) | |
371 | errExit("pivot_root"); | |
372 | ||
bf421740 | 373 | /* Switch the current working directory to "/" */ |
2f2e1a22 MK |
374 | |
375 | if (chdir("/") == \-1) | |
376 | errExit("chdir"); | |
377 | ||
378 | /* Unmount old root and remove mount point */ | |
379 | ||
380 | if (umount2(put_old, MNT_DETACH) == \-1) | |
381 | perror("umount2"); | |
382 | if (rmdir(put_old) == \-1) | |
383 | perror("rmdir"); | |
384 | ||
385 | /* Execute the command specified in args[1]... */ | |
386 | ||
387 | execv(args[1], &args[1]); | |
388 | errExit("execv"); | |
389 | } | |
390 | ||
391 | int | |
392 | main(int argc, char *argv[]) | |
393 | { | |
394 | /* Create a child process in a new mount namespace */ | |
395 | ||
1b547316 MK |
396 | char *stack = mmap(NULL, STACK_SIZE, PROT_READ | PROT_WRITE, |
397 | MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, \-1, 0); | |
398 | if (stack == MAP_FAILED) | |
399 | errExit("mmap"); | |
2f2e1a22 MK |
400 | |
401 | if (clone(child, stack + STACK_SIZE, | |
402 | CLONE_NEWNS | SIGCHLD, &argv[1]) == \-1) | |
403 | errExit("clone"); | |
404 | ||
405 | /* Parent falls through to here; wait for child */ | |
406 | ||
407 | if (wait(NULL) == \-1) | |
408 | errExit("wait"); | |
409 | ||
410 | exit(EXIT_SUCCESS); | |
411 | } | |
412 | .EE | |
47297adb | 413 | .SH SEE ALSO |
fea681da MK |
414 | .BR chdir (2), |
415 | .BR chroot (2), | |
34a0f19c | 416 | .BR mount (2), |
fea681da MK |
417 | .BR stat (2), |
418 | .BR initrd (4), | |
f42778c4 | 419 | .BR mount_namespaces (7), |
b2bced6d MK |
420 | .BR pivot_root (8), |
421 | .BR switch_root (8) |