]>
Commit | Line | Data |
---|---|---|
bf9b5158 MR |
1 | .\" Copyright (c) 2016, IBM Corporation. |
2 | .\" Written by Mike Rapoport <rppt@linux.vnet.ibm.com> | |
54e031a1 | 3 | .\" and Copyright (C) 2017 Michael Kerrisk <mtk.manpages@gmail.com> |
bf9b5158 MR |
4 | .\" |
5 | .\" %%%LICENSE_START(VERBATIM) | |
6 | .\" Permission is granted to make and distribute verbatim copies of this | |
7 | .\" manual provided the copyright notice and this permission notice are | |
8 | .\" preserved on all copies. | |
9 | .\" | |
10 | .\" Permission is granted to copy and distribute modified versions of this | |
11 | .\" manual under the conditions for verbatim copying, provided that the | |
12 | .\" entire resulting derived work is distributed under the terms of a | |
13 | .\" permission notice identical to this one. | |
14 | .\" | |
15 | .\" Since the Linux kernel and libraries are constantly changing, this | |
16 | .\" manual page may be incorrect or out-of-date. The author(s) assume no | |
17 | .\" responsibility for errors or omissions, or for damages resulting from | |
18 | .\" the use of the information contained herein. The author(s) may not | |
19 | .\" have taken the same level of care in the production of this manual, | |
20 | .\" which is licensed free of charge, as they might when working | |
21 | .\" professionally. | |
22 | .\" | |
23 | .\" Formatted or processed versions of this manual, if unaccompanied by | |
24 | .\" the source, must acknowledge the copyright and authors of this work. | |
25 | .\" %%%LICENSE_END | |
26 | .\" | |
6b621d05 | 27 | .TH USERFAULTFD 2 2020-02-09 "Linux" "Linux Programmer's Manual" |
bf9b5158 | 28 | .SH NAME |
68a9e23c | 29 | userfaultfd \- create a file descriptor for handling page faults in user space |
bf9b5158 MR |
30 | .SH SYNOPSIS |
31 | .nf | |
32 | .B #include <sys/types.h> | |
3f894561 | 33 | .B #include <linux/userfaultfd.h> |
68e4db0a | 34 | .PP |
bf9b5158 MR |
35 | .BI "int userfaultfd(int " flags ); |
36 | .fi | |
37 | .PP | |
38 | .IR Note : | |
39 | There is no glibc wrapper for this system call; see NOTES. | |
40 | .SH DESCRIPTION | |
4aa7f5cf MK |
41 | .BR userfaultfd () |
42 | creates a new userfaultfd object that can be used for delegation of page-fault | |
43 | handling to a user-space application, | |
44 | and returns a file descriptor that refers to the new object. | |
45 | The new userfaultfd object is configured using | |
bf9b5158 | 46 | .BR ioctl (2). |
efeece04 | 47 | .PP |
4aa7f5cf | 48 | Once the userfaultfd object is configured, the application can use |
bf9b5158 MR |
49 | .BR read (2) |
50 | to receive userfaultfd notifications. | |
4aa7f5cf MK |
51 | The reads from userfaultfd may be blocking or non-blocking, |
52 | depending on the value of | |
bf9b5158 MR |
53 | .I flags |
54 | used for the creation of the userfaultfd or subsequent calls to | |
4aa7f5cf | 55 | .BR fcntl (2). |
efeece04 | 56 | .PP |
bf9b5158 MR |
57 | The following values may be bitwise ORed in |
58 | .IR flags | |
59 | to change the behavior of | |
60 | .BR userfaultfd (): | |
61 | .TP | |
62 | .BR O_CLOEXEC | |
4aa7f5cf | 63 | Enable the close-on-exec flag for the new userfaultfd file descriptor. |
bf9b5158 MR |
64 | See the description of the |
65 | .B O_CLOEXEC | |
66 | flag in | |
4aa7f5cf | 67 | .BR open (2). |
bf9b5158 MR |
68 | .TP |
69 | .BR O_NONBLOCK | |
4aa7f5cf | 70 | Enables non-blocking operation for the userfaultfd object. |
bf9b5158 MR |
71 | See the description of the |
72 | .BR O_NONBLOCK | |
73 | flag in | |
74 | .BR open (2). | |
7b8695f2 MK |
75 | .PP |
76 | When the last file descriptor referring to a userfaultfd object is closed, | |
77 | all memory ranges that were registered with the object are unregistered | |
5b1c4a1e | 78 | and unread events are flushed. |
bf9b5158 | 79 | .\" |
54e031a1 MK |
80 | .SS Usage |
81 | The userfaultfd mechanism is designed to allow a thread in a multithreaded | |
82 | program to perform user-space paging for the other threads in the process. | |
83 | When a page fault occurs for one of the regions registered | |
84 | to the userfaultfd object, | |
85 | the faulting thread is put to sleep and | |
86 | an event is generated that can be read via the userfaultfd file descriptor. | |
87 | The fault-handling thread reads events from this file descriptor and services | |
88 | them using the operations described in | |
89 | .BR ioctl_userfaultfd (2). | |
90 | When servicing the page fault events, | |
91 | the fault-handling thread can trigger a wake-up for the sleeping thread. | |
efeece04 | 92 | .PP |
d28aadf6 MR |
93 | It is possible for the faulting threads and the fault-handling threads |
94 | to run in the context of different processes. | |
95 | In this case, these threads may belong to different programs, | |
96 | and the program that executes the faulting threads | |
97 | will not necessarily cooperate with the program that handles the page faults. | |
98 | In such non-cooperative mode, | |
792570de | 99 | the process that monitors userfaultfd and handles page faults |
d28aadf6 MR |
100 | needs to be aware of the changes in the virtual memory layout |
101 | of the faulting process to avoid memory corruption. | |
efeece04 | 102 | .PP |
5b1c4a1e | 103 | Starting from Linux 4.11, |
c2b3668b | 104 | userfaultfd can also notify the fault-handling threads about changes |
5b1c4a1e MR |
105 | in the virtual memory layout of the faulting process. |
106 | In addition, if the faulting process invokes | |
c2b3668b | 107 | .BR fork (2), |
5b1c4a1e MR |
108 | the userfaultfd objects associated with the parent may be duplicated |
109 | into the child process and the userfaultfd monitor will be notified | |
522ab2ff MK |
110 | (via the |
111 | .B UFFD_EVENT_FORK | |
112 | described below) | |
5b1c4a1e MR |
113 | about the file descriptor associated with the userfault objects |
114 | created for the child process, | |
522ab2ff | 115 | which allows the userfaultfd monitor to perform user-space paging |
5b1c4a1e | 116 | for the child process. |
265225c1 | 117 | Unlike page faults which have to be synchronous and require an |
20050169 MR |
118 | explicit or implicit wakeup, |
119 | all other events are delivered asynchronously and | |
120 | the non-cooperative process resumes execution as | |
265225c1 MK |
121 | soon as the userfaultfd manager executes |
122 | .BR read (2). | |
123 | The userfaultfd manager should carefully synchronize calls to | |
124 | .B UFFDIO_COPY | |
125 | with the processing of events. | |
efeece04 | 126 | .PP |
20050169 MR |
127 | The current asynchronous model of the event delivery is optimal for |
128 | single-threaded non-cooperative userfaultfd manager implementations. | |
94a8ebda MK |
129 | .\" Regarding the preceding sentence, Mike Rapoport says: |
130 | .\" The major point here is that current events delivery model could be | |
131 | .\" problematic for multi-threaded monitor. I even suspect that it would be | |
132 | .\" impossible to ensure synchronization between page faults and non-page | |
133 | .\" fault events in multi-threaded monitor. | |
efeece04 | 134 | .PP |
d28aadf6 | 135 | .\" FIXME elaborate about non-cooperating mode, describe its limitations |
792570de | 136 | .\" for kernels before 4.11, features added in 4.11 |
d28aadf6 MR |
137 | .\" and limitations remaining in 4.11 |
138 | .\" Maybe it's worth adding a dedicated sub-section... | |
54e031a1 | 139 | .\" |
bf9b5158 MR |
140 | .SS Userfaultfd operation |
141 | After the userfaultfd object is created with | |
4aa7f5cf MK |
142 | .BR userfaultfd (), |
143 | the application must enable it using the | |
144 | .B UFFDIO_API | |
145 | .BR ioctl (2) | |
146 | operation. | |
147 | This operation allows a handshake between the kernel and user space | |
148 | to determine the API version and supported features. | |
5074d846 MK |
149 | This operation must be performed before any of the other |
150 | .BR ioctl (2) | |
151 | operations described below (or those operations fail with the | |
152 | .BR EINVAL | |
153 | error). | |
efeece04 | 154 | .PP |
4aa7f5cf MK |
155 | After a successful |
156 | .B UFFDIO_API | |
157 | operation, | |
158 | the application then registers memory address ranges using the | |
159 | .B UFFDIO_REGISTER | |
160 | .BR ioctl (2) | |
161 | operation. | |
162 | After successful completion of a | |
163 | .B UFFDIO_REGISTER | |
164 | operation, | |
165 | a page fault occurring in the requested memory range, and satisfying | |
166 | the mode defined at the registration time, will be forwarded by the kernel to | |
167 | the user-space application. | |
168 | The application can then use the | |
169 | .B UFFDIO_COPY | |
bf9b5158 | 170 | or |
99de80c5 | 171 | .B UFFDIO_ZEROPAGE |
4aa7f5cf MK |
172 | .BR ioctl (2) |
173 | operations to resolve the page fault. | |
2c371032 | 174 | .PP |
e81833be | 175 | Starting from Linux 4.14, if the application sets the |
2c371032 | 176 | .B UFFD_FEATURE_SIGBUS |
e81833be | 177 | feature bit using the |
2c371032 PS |
178 | .B UFFDIO_API |
179 | .BR ioctl (2), | |
e81833be MK |
180 | no page-fault notification will be forwarded to user space. |
181 | Instead a | |
2c371032 | 182 | .B SIGBUS |
e81833be MK |
183 | signal is delivered to the faulting process. |
184 | With this feature, | |
185 | userfaultfd can be used for robustness purposes to simply catch | |
2c371032 PS |
186 | any access to areas within the registered address range that do not |
187 | have pages allocated, without having to listen to userfaultfd events. | |
188 | No userfaultfd monitor will be required for dealing with such memory | |
e81833be MK |
189 | accesses. |
190 | For example, this feature can be useful for applications that | |
2c371032 | 191 | want to prevent the kernel from automatically allocating pages and filling |
e81833be | 192 | holes in sparse files when the hole is accessed through a memory mapping. |
2c371032 PS |
193 | .PP |
194 | The | |
195 | .B UFFD_FEATURE_SIGBUS | |
e81833be MK |
196 | feature is implicitly inherited through |
197 | .BR fork (2) | |
198 | if used in combination with | |
2c371032 | 199 | .BR UFFD_FEATURE_EVENT_FORK . |
efeece04 | 200 | .PP |
6bc6d124 | 201 | Details of the various |
4aa7f5cf | 202 | .BR ioctl (2) |
6bc6d124 MK |
203 | operations can be found in |
204 | .BR ioctl_userfaultfd (2). | |
efeece04 | 205 | .PP |
5b1c4a1e MR |
206 | Since Linux 4.11, events other than page-fault may be enabled during |
207 | .B UFFDIO_API | |
208 | operation. | |
efeece04 | 209 | .PP |
c3d5d9df MR |
210 | Before Linux 4.11, |
211 | userfaultfd can be used only with anonymous private memory mappings. | |
c3d5d9df MR |
212 | Since Linux 4.11, |
213 | userfaultfd can be also used with hugetlbfs and shared memory mappings. | |
efeece04 | 214 | .PP |
54e031a1 MK |
215 | .\" |
216 | .SS Reading from the userfaultfd structure | |
54e031a1 MK |
217 | Each |
218 | .BR read (2) | |
219 | from the userfaultfd file descriptor returns one or more | |
220 | .I uffd_msg | |
5b1c4a1e MR |
221 | structures, each of which describes a page-fault event |
222 | or an event required for the non-cooperative userfaultfd usage: | |
efeece04 | 223 | .PP |
54e031a1 | 224 | .in +4n |
b8302363 | 225 | .EX |
54e031a1 | 226 | struct uffd_msg { |
c2b3668b | 227 | __u8 event; /* Type of event */ |
54e031a1 MK |
228 | ... |
229 | union { | |
00f4853c | 230 | struct { |
c2b3668b MK |
231 | __u64 flags; /* Flags describing fault */ |
232 | __u64 address; /* Faulting address */ | |
54e031a1 | 233 | } pagefault; |
c2b3668b MK |
234 | |
235 | struct { /* Since Linux 4.11 */ | |
236 | __u32 ufd; /* Userfault file descriptor | |
237 | of the child process */ | |
238 | } fork; | |
239 | ||
240 | struct { /* Since Linux 4.11 */ | |
241 | __u64 from; /* Old address of remapped area */ | |
242 | __u64 to; /* New address of remapped area */ | |
243 | __u64 len; /* Original mapping length */ | |
244 | } remap; | |
245 | ||
246 | struct { /* Since Linux 4.11 */ | |
247 | __u64 start; /* Start address of removed area */ | |
248 | __u64 end; /* End address of removed area */ | |
249 | } remove; | |
b07243ab | 250 | ... |
54e031a1 MK |
251 | } arg; |
252 | ||
253 | /* Padding fields omitted */ | |
254 | } __packed; | |
b8302363 | 255 | .EE |
e646a1ba | 256 | .in |
efeece04 | 257 | .PP |
54e031a1 MK |
258 | If multiple events are available and the supplied buffer is large enough, |
259 | .BR read (2) | |
260 | returns as many events as will fit in the supplied buffer. | |
261 | If the buffer supplied to | |
262 | .BR read (2) | |
263 | is smaller than the size of the | |
264 | .I uffd_msg | |
265 | structure, the | |
266 | .BR read (2) | |
267 | fails with the error | |
268 | .BR EINVAL . | |
efeece04 | 269 | .PP |
54e031a1 MK |
270 | The fields set in the |
271 | .I uffd_msg | |
272 | structure are as follows: | |
273 | .TP | |
274 | .I event | |
275 | The type of event. | |
5b1c4a1e MR |
276 | Depending on the event type, |
277 | different fields of the | |
278 | .I arg | |
279 | union represent details required for the event processing. | |
280 | The non-page-fault events are generated only when the appropriate feature | |
281 | is enabled during API handshake with | |
282 | .B UFFDIO_API | |
283 | .BR ioctl (2). | |
efeece04 | 284 | .IP |
5b1c4a1e MR |
285 | The following values can appear in the |
286 | .I event | |
287 | field: | |
288 | .RS | |
289 | .TP | |
6c12d34e | 290 | .BR UFFD_EVENT_PAGEFAULT " (since Linux 4.3)" |
5b1c4a1e MR |
291 | A page-fault event. |
292 | The page-fault details are available in the | |
293 | .I pagefault | |
294 | field. | |
54e031a1 | 295 | .TP |
6c12d34e | 296 | .BR UFFD_EVENT_FORK " (since Linux 4.11)" |
5b1c4a1e | 297 | Generated when the faulting process invokes |
522ab2ff MK |
298 | .BR fork (2) |
299 | (or | |
300 | .BR clone (2) | |
301 | without the | |
302 | .BR CLONE_VM | |
303 | flag). | |
5b1c4a1e MR |
304 | The event details are available in the |
305 | .I fork | |
306 | field. | |
23db3790 | 307 | .\" FIXME describe duplication of userfault file descriptor during fork |
5b1c4a1e | 308 | .TP |
6c12d34e | 309 | .BR UFFD_EVENT_REMAP " (since Linux 4.11)" |
5b1c4a1e | 310 | Generated when the faulting process invokes |
c2b3668b | 311 | .BR mremap (2). |
5b1c4a1e MR |
312 | The event details are available in the |
313 | .I remap | |
314 | field. | |
315 | .TP | |
6c12d34e | 316 | .BR UFFD_EVENT_REMOVE " (since Linux 4.11)" |
5b1c4a1e MR |
317 | Generated when the faulting process invokes |
318 | .BR madvise (2) | |
c2b3668b | 319 | with |
5b1c4a1e MR |
320 | .BR MADV_DONTNEED |
321 | or | |
322 | .BR MADV_REMOVE | |
323 | advice. | |
324 | The event details are available in the | |
325 | .I remove | |
326 | field. | |
327 | .TP | |
6c12d34e | 328 | .BR UFFD_EVENT_UNMAP " (since Linux 4.11)" |
5b1c4a1e MR |
329 | Generated when the faulting process unmaps a memory range, |
330 | either explicitly using | |
331 | .BR munmap (2) | |
c2b3668b | 332 | or implicitly during |
5b1c4a1e MR |
333 | .BR mmap (2) |
334 | or | |
c2b3668b | 335 | .BR mremap (2). |
5b1c4a1e MR |
336 | The event details are available in the |
337 | .I remove | |
338 | field. | |
339 | .RE | |
340 | .TP | |
341 | .I pagefault.address | |
54e031a1 MK |
342 | The address that triggered the page fault. |
343 | .TP | |
5b1c4a1e | 344 | .I pagefault.flags |
54e031a1 MK |
345 | A bit mask of flags that describe the event. |
346 | For | |
347 | .BR UFFD_EVENT_PAGEFAULT , | |
348 | the following flag may appear: | |
349 | .RS | |
350 | .TP | |
351 | .B UFFD_PAGEFAULT_FLAG_WRITE | |
352 | If the address is in a range that was registered with the | |
353 | .B UFFDIO_REGISTER_MODE_MISSING | |
354 | flag (see | |
355 | .BR ioctl_userfaultfd (2)) | |
356 | and this flag is set, this is a write fault; | |
357 | otherwise it is a read fault. | |
358 | .\" | |
359 | .\" UFFD_PAGEFAULT_FLAG_WP is not yet supported. | |
360 | .RE | |
5b1c4a1e MR |
361 | .TP |
362 | .I fork.ufd | |
363 | The file descriptor associated with the userfault object | |
522ab2ff MK |
364 | created for the child created by |
365 | .BR fork (2). | |
5b1c4a1e MR |
366 | .TP |
367 | .I remap.from | |
368 | The original address of the memory range that was remapped using | |
369 | .BR mremap (2). | |
370 | .TP | |
371 | .I remap.to | |
372 | The new address of the memory range that was remapped using | |
373 | .BR mremap (2). | |
374 | .TP | |
375 | .I remap.len | |
23db3790 | 376 | The original length of the memory range that was remapped using |
5b1c4a1e MR |
377 | .BR mremap (2). |
378 | .TP | |
379 | .I remove.start | |
380 | The start address of the memory range that was freed using | |
381 | .BR madvise (2) | |
382 | or unmapped. | |
383 | .TP | |
384 | .I remove.end | |
385 | The end address of the memory range that was freed using | |
386 | .BR madvise (2) | |
387 | or unmapped. | |
54e031a1 | 388 | .PP |
058b32ac MK |
389 | A |
390 | .BR read (2) | |
391 | on a userfaultfd file descriptor can fail with the following errors: | |
392 | .TP | |
393 | .B EINVAL | |
394 | The userfaultfd object has not yet been enabled using the | |
395 | .BR UFFDIO_API | |
396 | .BR ioctl (2) | |
397 | operation. | |
398 | .PP | |
1bf00323 MK |
399 | If the |
400 | .B O_NONBLOCK | |
401 | flag is enabled in the associated open file description, | |
402 | the userfaultfd file descriptor can be monitored with | |
54e031a1 MK |
403 | .BR poll (2), |
404 | .BR select (2), | |
405 | and | |
406 | .BR epoll (7). | |
407 | When events are available, the file descriptor indicates as readable. | |
1bf00323 MK |
408 | If the |
409 | .B O_NONBLOCK | |
410 | flag is not enabled, then | |
411 | .BR poll (2) | |
412 | (always) indicates the file as having a | |
413 | .BR POLLERR | |
414 | condition, and | |
415 | .BR select (2) | |
416 | indicates the file descriptor as both readable and writable. | |
417 | .\" FIXME What is the reason for this seemingly odd behavior with respect | |
418 | .\" to the O_NONBLOCK flag? (see userfaultfd_poll() in fs/userfaultfd.c). | |
419 | .\" Something needs to be said about this. | |
bf9b5158 | 420 | .SH RETURN VALUE |
4aa7f5cf MK |
421 | On success, |
422 | .BR userfaultfd () | |
423 | returns a new file descriptor that refers to the userfaultfd object. | |
bf9b5158 MR |
424 | On error, \-1 is returned, and |
425 | .I errno | |
426 | is set appropriately. | |
427 | .SH ERRORS | |
428 | .TP | |
429 | .B EINVAL | |
430 | An unsupported value was specified in | |
431 | .IR flags . | |
432 | .TP | |
433 | .BR EMFILE | |
434 | The per-process limit on the number of open file descriptors has been | |
435 | reached. | |
436 | .TP | |
437 | .B ENFILE | |
438 | The system-wide limit on the total number of open files has been | |
439 | reached. | |
440 | .TP | |
441 | .B ENOMEM | |
442 | Insufficient kernel memory was available. | |
339b899c | 443 | .TP |
c4f13bc7 | 444 | .BR EPERM " (since Linux 5.2)" |
339b899c | 445 | .\" cefdca0a86be517bc390fc4541e3674b8e7803b0 |
c4f13bc7 MK |
446 | The caller is not privileged (does not have the |
447 | .B CAP_SYS_PTRACE | |
2b6923ba | 448 | capability in the initial user namespace), and |
c4f13bc7 MK |
449 | .I /proc/sys/vm/unprivileged_userfaultfd |
450 | has the value 0. | |
0da8f5c1 MK |
451 | .SH VERSIONS |
452 | The | |
453 | .BR userfaultfd () | |
454 | system call first appeared in Linux 4.3. | |
efeece04 | 455 | .PP |
be9021b1 MR |
456 | The support for hugetlbfs and shared memory areas and |
457 | non-page-fault events was added in Linux 4.11. | |
bf9b5158 MR |
458 | .SH CONFORMING TO |
459 | .BR userfaultfd () | |
460 | is Linux-specific and should not be used in programs intended to be | |
461 | portable. | |
462 | .SH NOTES | |
463 | Glibc does not provide a wrapper for this system call; call it using | |
464 | .BR syscall (2). | |
efeece04 | 465 | .PP |
de6943fa MK |
466 | The userfaultfd mechanism can be used as an alternative to |
467 | traditional user-space paging techniques based on the use of the | |
468 | .BR SIGSEGV | |
469 | signal and | |
470 | .BR mmap (2). | |
471 | It can also be used to implement lazy restore | |
472 | for checkpoint/restore mechanisms, | |
473 | as well as post-copy migration to allow (nearly) uninterrupted execution | |
18448166 MR |
474 | when transferring virtual machines and Linux containers |
475 | from one host to another. | |
2a885973 MR |
476 | .SH BUGS |
477 | If the | |
478 | .B UFFD_FEATURE_EVENT_FORK | |
479 | is enabled and a system call from the | |
480 | .BR fork (2) | |
047a0bb8 | 481 | family is interrupted by a signal or fails, a stale userfaultfd descriptor |
2a885973 | 482 | might be created. |
047a0bb8 | 483 | In this case, a spurious |
2a885973 MR |
484 | .B UFFD_EVENT_FORK |
485 | will be delivered to the userfaultfd monitor. | |
b07243ab MK |
486 | .SH EXAMPLE |
487 | The program below demonstrates the use of the userfaultfd mechanism. | |
488 | The program creates two threads, one of which acts as the | |
489 | page-fault handler for the process, for the pages in a demand-page zero | |
490 | region created using | |
491 | .BR mmap (2). | |
efeece04 | 492 | .PP |
b07243ab MK |
493 | The program takes one command-line argument, |
494 | which is the number of pages that will be created in a mapping | |
495 | whose page faults will be handled via userfaultfd. | |
496 | After creating a userfaultfd object, | |
497 | the program then creates an anonymous private mapping of the specified size | |
498 | and registers the address range of that mapping using the | |
499 | .B UFFDIO_REGISTER | |
500 | .BR ioctl (2) | |
501 | operation. | |
00f4853c | 502 | The program then creates a second thread that will perform the |
b07243ab | 503 | task of handling page faults. |
efeece04 | 504 | .PP |
b07243ab MK |
505 | The main thread then walks through the pages of the mapping fetching |
506 | bytes from successive pages. | |
507 | Because the pages have not yet been accessed, | |
508 | the first access of a byte in each page will trigger a page-fault event | |
509 | on the userfaultfd file descriptor. | |
efeece04 | 510 | .PP |
b07243ab MK |
511 | Each of the page-fault events is handled by the second thread, |
512 | which sits in a loop processing input from the userfaultfd file descriptor. | |
513 | In each loop iteration, the second thread first calls | |
514 | .BR poll (2) | |
515 | to check the state of the file descriptor, | |
516 | and then reads an event from the file descriptor. | |
517 | All such events should be | |
518 | .B UFFD_EVENT_PAGEFAULT | |
519 | events, | |
520 | which the thread handles by copying a page of data into | |
521 | the faulting region using the | |
00f4853c | 522 | .B UFFDIO_COPY |
b07243ab MK |
523 | .BR ioctl (2) |
524 | operation. | |
efeece04 | 525 | .PP |
b07243ab | 526 | The following is an example of what we see when running the program: |
efeece04 | 527 | .PP |
b07243ab | 528 | .in +4n |
b8302363 | 529 | .EX |
b07243ab MK |
530 | $ \fB./userfaultfd_demo 3\fP |
531 | Address returned by mmap() = 0x7fd30106c000 | |
532 | ||
533 | fault_handler_thread(): | |
534 | poll() returns: nready = 1; POLLIN = 1; POLLERR = 0 | |
535 | UFFD_EVENT_PAGEFAULT event: flags = 0; address = 7fd30106c00f | |
536 | (uffdio_copy.copy returned 4096) | |
537 | Read address 0x7fd30106c00f in main(): A | |
538 | Read address 0x7fd30106c40f in main(): A | |
539 | Read address 0x7fd30106c80f in main(): A | |
540 | Read address 0x7fd30106cc0f in main(): A | |
541 | ||
542 | fault_handler_thread(): | |
543 | poll() returns: nready = 1; POLLIN = 1; POLLERR = 0 | |
544 | UFFD_EVENT_PAGEFAULT event: flags = 0; address = 7fd30106d00f | |
545 | (uffdio_copy.copy returned 4096) | |
546 | Read address 0x7fd30106d00f in main(): B | |
547 | Read address 0x7fd30106d40f in main(): B | |
548 | Read address 0x7fd30106d80f in main(): B | |
549 | Read address 0x7fd30106dc0f in main(): B | |
550 | ||
551 | fault_handler_thread(): | |
552 | poll() returns: nready = 1; POLLIN = 1; POLLERR = 0 | |
553 | UFFD_EVENT_PAGEFAULT event: flags = 0; address = 7fd30106e00f | |
554 | (uffdio_copy.copy returned 4096) | |
555 | Read address 0x7fd30106e00f in main(): C | |
556 | Read address 0x7fd30106e40f in main(): C | |
557 | Read address 0x7fd30106e80f in main(): C | |
558 | Read address 0x7fd30106ec0f in main(): C | |
b8302363 | 559 | .EE |
e646a1ba | 560 | .in |
b07243ab MK |
561 | .SS Program source |
562 | \& | |
e7d0bb47 | 563 | .EX |
b07243ab | 564 | /* userfaultfd_demo.c |
00f4853c | 565 | |
b07243ab MK |
566 | Licensed under the GNU General Public License version 2 or later. |
567 | */ | |
568 | #define _GNU_SOURCE | |
569 | #include <sys/types.h> | |
570 | #include <stdio.h> | |
571 | #include <linux/userfaultfd.h> | |
572 | #include <pthread.h> | |
573 | #include <errno.h> | |
574 | #include <unistd.h> | |
575 | #include <stdlib.h> | |
576 | #include <fcntl.h> | |
577 | #include <signal.h> | |
578 | #include <poll.h> | |
579 | #include <string.h> | |
580 | #include <sys/mman.h> | |
581 | #include <sys/syscall.h> | |
582 | #include <sys/ioctl.h> | |
583 | #include <poll.h> | |
584 | ||
d1a71985 | 585 | #define errExit(msg) do { perror(msg); exit(EXIT_FAILURE); \e |
b07243ab MK |
586 | } while (0) |
587 | ||
588 | static int page_size; | |
589 | ||
590 | static void * | |
591 | fault_handler_thread(void *arg) | |
592 | { | |
593 | static struct uffd_msg msg; /* Data read from userfaultfd */ | |
594 | static int fault_cnt = 0; /* Number of faults so far handled */ | |
595 | long uffd; /* userfaultfd file descriptor */ | |
596 | static char *page = NULL; | |
597 | struct uffdio_copy uffdio_copy; | |
598 | ssize_t nread; | |
599 | ||
600 | uffd = (long) arg; | |
601 | ||
602 | /* Create a page that will be copied into the faulting region */ | |
603 | ||
604 | if (page == NULL) { | |
605 | page = mmap(NULL, page_size, PROT_READ | PROT_WRITE, | |
606 | MAP_PRIVATE | MAP_ANONYMOUS, \-1, 0); | |
607 | if (page == MAP_FAILED) | |
608 | errExit("mmap"); | |
609 | } | |
610 | ||
611 | /* Loop, handling incoming events on the userfaultfd | |
612 | file descriptor */ | |
613 | ||
614 | for (;;) { | |
615 | ||
616 | /* See what poll() tells us about the userfaultfd */ | |
617 | ||
618 | struct pollfd pollfd; | |
619 | int nready; | |
620 | pollfd.fd = uffd; | |
621 | pollfd.events = POLLIN; | |
622 | nready = poll(&pollfd, 1, \-1); | |
623 | if (nready == \-1) | |
624 | errExit("poll"); | |
625 | ||
d1a71985 | 626 | printf("\enfault_handler_thread():\en"); |
b07243ab | 627 | printf(" poll() returns: nready = %d; " |
d1a71985 | 628 | "POLLIN = %d; POLLERR = %d\en", nready, |
b07243ab MK |
629 | (pollfd.revents & POLLIN) != 0, |
630 | (pollfd.revents & POLLERR) != 0); | |
631 | ||
632 | /* Read an event from the userfaultfd */ | |
633 | ||
634 | nread = read(uffd, &msg, sizeof(msg)); | |
635 | if (nread == 0) { | |
d1a71985 | 636 | printf("EOF on userfaultfd!\en"); |
b07243ab | 637 | exit(EXIT_FAILURE); |
00f4853c | 638 | } |
b07243ab MK |
639 | |
640 | if (nread == \-1) | |
641 | errExit("read"); | |
00f4853c | 642 | |
b07243ab MK |
643 | /* We expect only one kind of event; verify that assumption */ |
644 | ||
645 | if (msg.event != UFFD_EVENT_PAGEFAULT) { | |
d1a71985 | 646 | fprintf(stderr, "Unexpected event on userfaultfd\en"); |
b07243ab MK |
647 | exit(EXIT_FAILURE); |
648 | } | |
649 | ||
650 | /* Display info about the page\-fault event */ | |
651 | ||
652 | printf(" UFFD_EVENT_PAGEFAULT event: "); | |
653 | printf("flags = %llx; ", msg.arg.pagefault.flags); | |
d1a71985 | 654 | printf("address = %llx\en", msg.arg.pagefault.address); |
b07243ab MK |
655 | |
656 | /* Copy the page pointed to by \(aqpage\(aq into the faulting | |
657 | region. Vary the contents that are copied in, so that it | |
658 | is more obvious that each fault is handled separately. */ | |
659 | ||
660 | memset(page, \(aqA\(aq + fault_cnt % 20, page_size); | |
661 | fault_cnt++; | |
662 | ||
663 | uffdio_copy.src = (unsigned long) page; | |
664 | ||
665 | /* We need to handle page faults in units of pages(!). | |
666 | So, round faulting address down to page boundary */ | |
667 | ||
668 | uffdio_copy.dst = (unsigned long) msg.arg.pagefault.address & | |
669 | ~(page_size \- 1); | |
670 | uffdio_copy.len = page_size; | |
671 | uffdio_copy.mode = 0; | |
672 | uffdio_copy.copy = 0; | |
673 | if (ioctl(uffd, UFFDIO_COPY, &uffdio_copy) == \-1) | |
674 | errExit("ioctl\-UFFDIO_COPY"); | |
675 | ||
d1a71985 | 676 | printf(" (uffdio_copy.copy returned %lld)\en", |
b07243ab MK |
677 | uffdio_copy.copy); |
678 | } | |
679 | } | |
680 | ||
681 | int | |
682 | main(int argc, char *argv[]) | |
683 | { | |
684 | long uffd; /* userfaultfd file descriptor */ | |
685 | char *addr; /* Start of region handled by userfaultfd */ | |
686 | unsigned long len; /* Length of region handled by userfaultfd */ | |
687 | pthread_t thr; /* ID of thread that handles page faults */ | |
688 | struct uffdio_api uffdio_api; | |
689 | struct uffdio_register uffdio_register; | |
690 | int s; | |
691 | ||
692 | if (argc != 2) { | |
d1a71985 | 693 | fprintf(stderr, "Usage: %s num\-pages\en", argv[0]); |
b07243ab MK |
694 | exit(EXIT_FAILURE); |
695 | } | |
696 | ||
697 | page_size = sysconf(_SC_PAGE_SIZE); | |
698 | len = strtoul(argv[1], NULL, 0) * page_size; | |
699 | ||
700 | /* Create and enable userfaultfd object */ | |
701 | ||
702 | uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK); | |
703 | if (uffd == \-1) | |
704 | errExit("userfaultfd"); | |
705 | ||
706 | uffdio_api.api = UFFD_API; | |
707 | uffdio_api.features = 0; | |
708 | if (ioctl(uffd, UFFDIO_API, &uffdio_api) == \-1) | |
709 | errExit("ioctl\-UFFDIO_API"); | |
710 | ||
711 | /* Create a private anonymous mapping. The memory will be | |
712 | demand\-zero paged\-\-that is, not yet allocated. When we | |
713 | actually touch the memory, it will be allocated via | |
714 | the userfaultfd. */ | |
715 | ||
716 | addr = mmap(NULL, len, PROT_READ | PROT_WRITE, | |
717 | MAP_PRIVATE | MAP_ANONYMOUS, \-1, 0); | |
718 | if (addr == MAP_FAILED) | |
719 | errExit("mmap"); | |
720 | ||
d1a71985 | 721 | printf("Address returned by mmap() = %p\en", addr); |
b07243ab MK |
722 | |
723 | /* Register the memory range of the mapping we just created for | |
724 | handling by the userfaultfd object. In mode, we request to track | |
725 | missing pages (i.e., pages that have not yet been faulted in). */ | |
726 | ||
727 | uffdio_register.range.start = (unsigned long) addr; | |
728 | uffdio_register.range.len = len; | |
729 | uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING; | |
730 | if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) == \-1) | |
731 | errExit("ioctl\-UFFDIO_REGISTER"); | |
732 | ||
733 | /* Create a thread that will process the userfaultfd events */ | |
734 | ||
735 | s = pthread_create(&thr, NULL, fault_handler_thread, (void *) uffd); | |
736 | if (s != 0) { | |
737 | errno = s; | |
738 | errExit("pthread_create"); | |
739 | } | |
740 | ||
741 | /* Main thread now touches memory in the mapping, touching | |
742 | locations 1024 bytes apart. This will trigger userfaultfd | |
743 | events for all pages in the region. */ | |
744 | ||
745 | int l; | |
746 | l = 0xf; /* Ensure that faulting address is not on a page | |
747 | boundary, in order to test that we correctly | |
748 | handle that case in fault_handler_thread() */ | |
749 | while (l < len) { | |
750 | char c = addr[l]; | |
751 | printf("Read address %p in main(): ", addr + l); | |
d1a71985 | 752 | printf("%c\en", c); |
b07243ab MK |
753 | l += 1024; |
754 | usleep(100000); /* Slow things down a little */ | |
755 | } | |
756 | ||
757 | exit(EXIT_SUCCESS); | |
758 | } | |
e7d0bb47 | 759 | .EE |
bf9b5158 MR |
760 | .SH SEE ALSO |
761 | .BR fcntl (2), | |
4aa7f5cf | 762 | .BR ioctl (2), |
6bc6d124 | 763 | .BR ioctl_userfaultfd (2), |
6d3e02a5 | 764 | .BR madvise (2), |
4aa7f5cf | 765 | .BR mmap (2) |
efeece04 | 766 | .PP |
a2463bae | 767 | .IR Documentation/admin-guide/mm/userfaultfd.rst |
bf9b5158 | 768 | in the Linux kernel source tree |
efeece04 | 769 | .PP |