]>
Commit | Line | Data |
---|---|---|
bf9b5158 MR |
1 | .\" Copyright (c) 2016, IBM Corporation. |
2 | .\" Written by Mike Rapoport <rppt@linux.vnet.ibm.com> | |
54e031a1 | 3 | .\" and Copyright (C) 2017 Michael Kerrisk <mtk.manpages@gmail.com> |
bf9b5158 MR |
4 | .\" |
5 | .\" %%%LICENSE_START(VERBATIM) | |
6 | .\" Permission is granted to make and distribute verbatim copies of this | |
7 | .\" manual provided the copyright notice and this permission notice are | |
8 | .\" preserved on all copies. | |
9 | .\" | |
10 | .\" Permission is granted to copy and distribute modified versions of this | |
11 | .\" manual under the conditions for verbatim copying, provided that the | |
12 | .\" entire resulting derived work is distributed under the terms of a | |
13 | .\" permission notice identical to this one. | |
14 | .\" | |
15 | .\" Since the Linux kernel and libraries are constantly changing, this | |
16 | .\" manual page may be incorrect or out-of-date. The author(s) assume no | |
17 | .\" responsibility for errors or omissions, or for damages resulting from | |
18 | .\" the use of the information contained herein. The author(s) may not | |
19 | .\" have taken the same level of care in the production of this manual, | |
20 | .\" which is licensed free of charge, as they might when working | |
21 | .\" professionally. | |
22 | .\" | |
23 | .\" Formatted or processed versions of this manual, if unaccompanied by | |
24 | .\" the source, must acknowledge the copyright and authors of this work. | |
25 | .\" %%%LICENSE_END | |
26 | .\" | |
4b8c67d9 | 27 | .TH USERFAULTFD 2 2017-09-15 "Linux" "Linux Programmer's Manual" |
bf9b5158 | 28 | .SH NAME |
68a9e23c | 29 | userfaultfd \- create a file descriptor for handling page faults in user space |
bf9b5158 MR |
30 | .SH SYNOPSIS |
31 | .nf | |
32 | .B #include <sys/types.h> | |
3f894561 | 33 | .B #include <linux/userfaultfd.h> |
68e4db0a | 34 | .PP |
bf9b5158 MR |
35 | .BI "int userfaultfd(int " flags ); |
36 | .fi | |
37 | .PP | |
38 | .IR Note : | |
39 | There is no glibc wrapper for this system call; see NOTES. | |
40 | .SH DESCRIPTION | |
4aa7f5cf MK |
41 | .BR userfaultfd () |
42 | creates a new userfaultfd object that can be used for delegation of page-fault | |
43 | handling to a user-space application, | |
44 | and returns a file descriptor that refers to the new object. | |
45 | The new userfaultfd object is configured using | |
bf9b5158 | 46 | .BR ioctl (2). |
efeece04 | 47 | .PP |
4aa7f5cf | 48 | Once the userfaultfd object is configured, the application can use |
bf9b5158 MR |
49 | .BR read (2) |
50 | to receive userfaultfd notifications. | |
4aa7f5cf MK |
51 | The reads from userfaultfd may be blocking or non-blocking, |
52 | depending on the value of | |
bf9b5158 MR |
53 | .I flags |
54 | used for the creation of the userfaultfd or subsequent calls to | |
4aa7f5cf | 55 | .BR fcntl (2). |
efeece04 | 56 | .PP |
bf9b5158 MR |
57 | The following values may be bitwise ORed in |
58 | .IR flags | |
59 | to change the behavior of | |
60 | .BR userfaultfd (): | |
61 | .TP | |
62 | .BR O_CLOEXEC | |
4aa7f5cf | 63 | Enable the close-on-exec flag for the new userfaultfd file descriptor. |
bf9b5158 MR |
64 | See the description of the |
65 | .B O_CLOEXEC | |
66 | flag in | |
4aa7f5cf | 67 | .BR open (2). |
bf9b5158 MR |
68 | .TP |
69 | .BR O_NONBLOCK | |
4aa7f5cf | 70 | Enables non-blocking operation for the userfaultfd object. |
bf9b5158 MR |
71 | See the description of the |
72 | .BR O_NONBLOCK | |
73 | flag in | |
74 | .BR open (2). | |
7b8695f2 MK |
75 | .PP |
76 | When the last file descriptor referring to a userfaultfd object is closed, | |
77 | all memory ranges that were registered with the object are unregistered | |
5b1c4a1e | 78 | and unread events are flushed. |
bf9b5158 | 79 | .\" |
54e031a1 MK |
80 | .SS Usage |
81 | The userfaultfd mechanism is designed to allow a thread in a multithreaded | |
82 | program to perform user-space paging for the other threads in the process. | |
83 | When a page fault occurs for one of the regions registered | |
84 | to the userfaultfd object, | |
85 | the faulting thread is put to sleep and | |
86 | an event is generated that can be read via the userfaultfd file descriptor. | |
87 | The fault-handling thread reads events from this file descriptor and services | |
88 | them using the operations described in | |
89 | .BR ioctl_userfaultfd (2). | |
90 | When servicing the page fault events, | |
91 | the fault-handling thread can trigger a wake-up for the sleeping thread. | |
efeece04 | 92 | .PP |
d28aadf6 MR |
93 | It is possible for the faulting threads and the fault-handling threads |
94 | to run in the context of different processes. | |
95 | In this case, these threads may belong to different programs, | |
96 | and the program that executes the faulting threads | |
97 | will not necessarily cooperate with the program that handles the page faults. | |
98 | In such non-cooperative mode, | |
792570de | 99 | the process that monitors userfaultfd and handles page faults |
d28aadf6 MR |
100 | needs to be aware of the changes in the virtual memory layout |
101 | of the faulting process to avoid memory corruption. | |
efeece04 | 102 | .PP |
5b1c4a1e | 103 | Starting from Linux 4.11, |
c2b3668b | 104 | userfaultfd can also notify the fault-handling threads about changes |
5b1c4a1e MR |
105 | in the virtual memory layout of the faulting process. |
106 | In addition, if the faulting process invokes | |
c2b3668b | 107 | .BR fork (2), |
5b1c4a1e MR |
108 | the userfaultfd objects associated with the parent may be duplicated |
109 | into the child process and the userfaultfd monitor will be notified | |
522ab2ff MK |
110 | (via the |
111 | .B UFFD_EVENT_FORK | |
112 | described below) | |
5b1c4a1e MR |
113 | about the file descriptor associated with the userfaultfd objects |
114 | created for the child process, | |
522ab2ff | 115 | which allows the userfaultfd monitor to perform user-space paging |
5b1c4a1e | 116 | for the child process. |
265225c1 | 117 | Unlike page faults, which have to be synchronous and require an |
20050169 MR |
118 | explicit or implicit wakeup, |
119 | all other events are delivered asynchronously and | |
120 | the non-cooperative process resumes execution as | |
265225c1 MK |
121 | soon as the userfaultfd manager executes |
122 | .BR read (2). | |
123 | The userfaultfd manager should carefully synchronize calls to | |
124 | .B UFFDIO_COPY | |
125 | with the processing of events. | |
efeece04 | 126 | .PP |
20050169 MR |
127 | The current asynchronous model of the event delivery is optimal for |
128 | single-threaded non-cooperative userfaultfd manager implementations. | |
94a8ebda MK |
129 | .\" Regarding the preceding sentence, Mike Rapoport says: |
130 | .\" The major point here is that current events delivery model could be | |
131 | .\" problematic for multi-threaded monitor. I even suspect that it would be | |
132 | .\" impossible to ensure synchronization between page faults and non-page | |
133 | .\" fault events in multi-threaded monitor. | |
efeece04 | 134 | .PP |
d28aadf6 | 135 | .\" FIXME elaborate about non-cooperating mode, describe its limitations |
792570de | 136 | .\" for kernels before 4.11, features added in 4.11 |
d28aadf6 MR |
137 | .\" and limitations remaining in 4.11 |
138 | .\" Maybe it's worth adding a dedicated sub-section... | |
54e031a1 | 139 | .\" |
bf9b5158 MR |
140 | .SS Userfaultfd operation |
141 | After the userfaultfd object is created with | |
4aa7f5cf MK |
142 | .BR userfaultfd (), |
143 | the application must enable it using the | |
144 | .B UFFDIO_API | |
145 | .BR ioctl (2) | |
146 | operation. | |
147 | This operation allows a handshake between the kernel and user space | |
148 | to determine the API version and supported features. | |
5074d846 MK |
149 | This operation must be performed before any of the other |
150 | .BR ioctl (2) | |
151 | operations described below (or those operations fail with the | |
152 | .BR EINVAL | |
153 | error). | |
efeece04 | 154 | .PP |
4aa7f5cf MK |
155 | After a successful |
156 | .B UFFDIO_API | |
157 | operation, | |
158 | the application then registers memory address ranges using the | |
159 | .B UFFDIO_REGISTER | |
160 | .BR ioctl (2) | |
161 | operation. | |
162 | After successful completion of a | |
163 | .B UFFDIO_REGISTER | |
164 | operation, | |
165 | a page fault occurring in the requested memory range, and satisfying | |
166 | the mode defined at the registration time, will be forwarded by the kernel to | |
167 | the user-space application. | |
168 | The application can then use the | |
169 | .B UFFDIO_COPY | |
bf9b5158 | 170 | or |
4aa7f5cf MK |
171 | .B UFFDIO_ZERO |
172 | .BR ioctl (2) | |
173 | operations to resolve the page fault. | |
2c371032 PS |
174 | .PP |
175 | Starting from Linux 4.14, if the application sets | |
176 | .B UFFD_FEATURE_SIGBUS | |
177 | feature bit using | |
178 | .B UFFDIO_API | |
179 | .BR ioctl (2), | |
180 | no page fault notification will be forwarded to | |
181 | user space; instead a | |
182 | .B SIGBUS | |
183 | signal is delivered to the faulting process. With this feature, | |
184 | userfaultfd can be used for robustness purposes to simply catch | |
185 | any access to areas within the registered address range that do not | |
186 | have pages allocated, without having to listen to userfaultfd events. | |
187 | No userfaultfd monitor will be required for dealing with such memory | |
188 | accesses. For example, this feature can be useful for applications that | |
189 | want to prevent the kernel from automatically allocating pages and filling | |
190 | holes in sparse files when the hole is accessed through a mapped address. | |
191 | .PP | |
192 | The | |
193 | .B UFFD_FEATURE_SIGBUS | |
194 | feature is implicitly inherited through \fBfork\fP(2) if used in combination with | |
195 | .BR UFFD_FEATURE_FORK . | |
196 | ||
efeece04 | 197 | .PP |
6bc6d124 | 198 | Details of the various |
4aa7f5cf | 199 | .BR ioctl (2) |
6bc6d124 MK |
200 | operations can be found in |
201 | .BR ioctl_userfaultfd (2). | |
efeece04 | 202 | .PP |
5b1c4a1e MR |
203 | Since Linux 4.11, events other than page-fault may be enabled during |
204 | .B UFFDIO_API | |
205 | operation. | |
efeece04 | 206 | .PP |
c3d5d9df MR |
207 | Before Linux 4.11, |
208 | userfaultfd can be used only with anonymous private memory mappings. | |
c3d5d9df MR |
209 | Since Linux 4.11, |
210 | userfaultfd can also be used with hugetlbfs and shared memory mappings. | |
efeece04 | 211 | .PP |
54e031a1 MK |
212 | .\" |
213 | .SS Reading from the userfaultfd structure | |
54e031a1 MK |
214 | Each |
215 | .BR read (2) | |
216 | from the userfaultfd file descriptor returns one or more | |
217 | .I uffd_msg | |
5b1c4a1e MR |
218 | structures, each of which describes a page-fault event |
219 | or an event required for the non-cooperative userfaultfd usage: | |
efeece04 | 220 | .PP |
54e031a1 | 221 | .in +4n |
b8302363 | 222 | .EX |
54e031a1 | 223 | struct uffd_msg { |
c2b3668b | 224 | __u8 event; /* Type of event */ |
54e031a1 MK |
225 | ... |
226 | union { | |
00f4853c | 227 | struct { |
c2b3668b MK |
228 | __u64 flags; /* Flags describing fault */ |
229 | __u64 address; /* Faulting address */ | |
54e031a1 | 230 | } pagefault; |
c2b3668b MK |
231 | |
232 | struct { /* Since Linux 4.11 */ | |
233 | __u32 ufd; /* Userfault file descriptor | |
234 | of the child process */ | |
235 | } fork; | |
236 | ||
237 | struct { /* Since Linux 4.11 */ | |
238 | __u64 from; /* Old address of remapped area */ | |
239 | __u64 to; /* New address of remapped area */ | |
240 | __u64 len; /* Original mapping length */ | |
241 | } remap; | |
242 | ||
243 | struct { /* Since Linux 4.11 */ | |
244 | __u64 start; /* Start address of removed area */ | |
245 | __u64 end; /* End address of removed area */ | |
246 | } remove; | |
b07243ab | 247 | ... |
54e031a1 MK |
248 | } arg; |
249 | ||
250 | /* Padding fields omitted */ | |
251 | } __packed; | |
b8302363 | 252 | .EE |
e646a1ba | 253 | .in |
efeece04 | 254 | .PP |
54e031a1 MK |
255 | If multiple events are available and the supplied buffer is large enough, |
256 | .BR read (2) | |
257 | returns as many events as will fit in the supplied buffer. | |
258 | If the buffer supplied to | |
259 | .BR read (2) | |
260 | is smaller than the size of the | |
261 | .I uffd_msg | |
262 | structure, the | |
263 | .BR read (2) | |
264 | fails with the error | |
265 | .BR EINVAL . | |
efeece04 | 266 | .PP |
54e031a1 MK |
267 | The fields set in the |
268 | .I uffd_msg | |
269 | structure are as follows: | |
270 | .TP | |
271 | .I event | |
272 | The type of event. | |
5b1c4a1e MR |
273 | Depending on the event type, |
274 | different fields of the | |
275 | .I arg | |
276 | union represent details required for the event processing. | |
277 | The non-page-fault events are generated only when the appropriate feature | |
278 | is enabled during API handshake with | |
279 | .B UFFDIO_API | |
280 | .BR ioctl (2). | |
efeece04 | 281 | .IP |
5b1c4a1e MR |
282 | The following values can appear in the |
283 | .I event | |
284 | field: | |
285 | .RS | |
286 | .TP | |
6c12d34e | 287 | .BR UFFD_EVENT_PAGEFAULT " (since Linux 4.3)" |
5b1c4a1e MR |
288 | A page-fault event. |
289 | The page-fault details are available in the | |
290 | .I pagefault | |
291 | field. | |
54e031a1 | 292 | .TP |
6c12d34e | 293 | .BR UFFD_EVENT_FORK " (since Linux 4.11)" |
5b1c4a1e | 294 | Generated when the faulting process invokes |
522ab2ff MK |
295 | .BR fork (2) |
296 | (or | |
297 | .BR clone (2) | |
298 | without the | |
299 | .BR CLONE_VM | |
300 | flag). | |
5b1c4a1e MR |
301 | The event details are available in the |
302 | .I fork | |
303 | field. | |
23db3790 | 304 | .\" FIXME describe duplication of userfault file descriptor during fork |
5b1c4a1e | 305 | .TP |
6c12d34e | 306 | .BR UFFD_EVENT_REMAP " (since Linux 4.11)" |
5b1c4a1e | 307 | Generated when the faulting process invokes |
c2b3668b | 308 | .BR mremap (2). |
5b1c4a1e MR |
309 | The event details are available in the |
310 | .I remap | |
311 | field. | |
312 | .TP | |
6c12d34e | 313 | .BR UFFD_EVENT_REMOVE " (since Linux 4.11)" |
5b1c4a1e MR |
314 | Generated when the faulting process invokes |
315 | .BR madvise (2) | |
c2b3668b | 316 | with |
5b1c4a1e MR |
317 | .BR MADV_DONTNEED |
318 | or | |
319 | .BR MADV_REMOVE | |
320 | advice. | |
321 | The event details are available in the | |
322 | .I remove | |
323 | field. | |
324 | .TP | |
6c12d34e | 325 | .BR UFFD_EVENT_UNMAP " (since Linux 4.11)" |
5b1c4a1e MR |
326 | Generated when the faulting process unmaps a memory range, |
327 | either explicitly using | |
328 | .BR munmap (2) | |
c2b3668b | 329 | or implicitly during |
5b1c4a1e MR |
330 | .BR mmap (2) |
331 | or | |
c2b3668b | 332 | .BR mremap (2). |
5b1c4a1e MR |
333 | The event details are available in the |
334 | .I remove | |
335 | field. | |
336 | .RE | |
337 | .TP | |
338 | .I pagefault.address | |
54e031a1 MK |
339 | The address that triggered the page fault. |
340 | .TP | |
5b1c4a1e | 341 | .I pagefault.flags |
54e031a1 MK |
342 | A bit mask of flags that describe the event. |
343 | For | |
344 | .BR UFFD_EVENT_PAGEFAULT , | |
345 | the following flag may appear: | |
346 | .RS | |
347 | .TP | |
348 | .B UFFD_PAGEFAULT_FLAG_WRITE | |
349 | If the address is in a range that was registered with the | |
350 | .B UFFDIO_REGISTER_MODE_MISSING | |
351 | flag (see | |
352 | .BR ioctl_userfaultfd (2)) | |
353 | and this flag is set, this is a write fault; | |
354 | otherwise it is a read fault. | |
355 | .\" | |
356 | .\" UFFD_PAGEFAULT_FLAG_WP is not yet supported. | |
357 | .RE | |
5b1c4a1e MR |
358 | .TP |
359 | .I fork.ufd | |
360 | The file descriptor associated with the userfaultfd object | |
522ab2ff MK |
361 | created for the child created by |
362 | .BR fork (2). | |
5b1c4a1e MR |
363 | .TP |
364 | .I remap.from | |
365 | The original address of the memory range that was remapped using | |
366 | .BR mremap (2). | |
367 | .TP | |
368 | .I remap.to | |
369 | The new address of the memory range that was remapped using | |
370 | .BR mremap (2). | |
371 | .TP | |
372 | .I remap.len | |
23db3790 | 373 | The original length of the memory range that was remapped using |
5b1c4a1e MR |
374 | .BR mremap (2). |
375 | .TP | |
376 | .I remove.start | |
377 | The start address of the memory range that was freed using | |
378 | .BR madvise (2) | |
379 | or unmapped. | |
380 | .TP | |
381 | .I remove.end | |
382 | The end address of the memory range that was freed using | |
383 | .BR madvise (2) | |
384 | or unmapped. | |
54e031a1 | 385 | .PP |
058b32ac MK |
386 | A |
387 | .BR read (2) | |
388 | on a userfaultfd file descriptor can fail with the following errors: | |
389 | .TP | |
390 | .B EINVAL | |
391 | The userfaultfd object has not yet been enabled using the | |
392 | .BR UFFDIO_API | |
393 | .BR ioctl (2) | |
394 | operation. | |
395 | .PP | |
1bf00323 MK |
396 | If the |
397 | .B O_NONBLOCK | |
398 | flag is enabled in the associated open file description, | |
399 | the userfaultfd file descriptor can be monitored with | |
54e031a1 MK |
400 | .BR poll (2), |
401 | .BR select (2), | |
402 | and | |
403 | .BR epoll (7). | |
404 | When events are available, the file descriptor is indicated as readable. | |
1bf00323 MK |
405 | If the |
406 | .B O_NONBLOCK | |
407 | flag is not enabled, then | |
408 | .BR poll (2) | |
409 | (always) indicates the file as having a | |
410 | .BR POLLERR | |
411 | condition, and | |
412 | .BR select (2) | |
413 | indicates the file descriptor as both readable and writable. | |
414 | .\" FIXME What is the reason for this seemingly odd behavior with respect | |
415 | .\" to the O_NONBLOCK flag? (see userfaultfd_poll() in fs/userfaultfd.c). | |
416 | .\" Something needs to be said about this. | |
bf9b5158 | 417 | .SH RETURN VALUE |
4aa7f5cf MK |
418 | On success, |
419 | .BR userfaultfd () | |
420 | returns a new file descriptor that refers to the userfaultfd object. | |
bf9b5158 MR |
421 | On error, \-1 is returned, and |
422 | .I errno | |
423 | is set appropriately. | |
424 | .SH ERRORS | |
425 | .TP | |
426 | .B EINVAL | |
427 | An unsupported value was specified in | |
428 | .IR flags . | |
429 | .TP | |
430 | .BR EMFILE | |
431 | The per-process limit on the number of open file descriptors has been | |
432 | reached. | |
433 | .TP | |
434 | .B ENFILE | |
435 | The system-wide limit on the total number of open files has been | |
436 | reached. | |
437 | .TP | |
438 | .B ENOMEM | |
439 | Insufficient kernel memory was available. | |
0da8f5c1 MK |
440 | .SH VERSIONS |
441 | The | |
442 | .BR userfaultfd () | |
443 | system call first appeared in Linux 4.3. | |
efeece04 | 444 | .PP |
be9021b1 MR |
445 | The support for hugetlbfs and shared memory areas and |
446 | non-page-fault events was added in Linux 4.11. | |
bf9b5158 MR |
447 | .SH CONFORMING TO |
448 | .BR userfaultfd () | |
449 | is Linux-specific and should not be used in programs intended to be | |
450 | portable. | |
451 | .SH NOTES | |
452 | Glibc does not provide a wrapper for this system call; call it using | |
453 | .BR syscall (2). | |
efeece04 | 454 | .PP |
de6943fa MK |
455 | The userfaultfd mechanism can be used as an alternative to |
456 | traditional user-space paging techniques based on the use of the | |
457 | .BR SIGSEGV | |
458 | signal and | |
459 | .BR mmap (2). | |
460 | It can also be used to implement lazy restore | |
461 | for checkpoint/restore mechanisms, | |
462 | as well as post-copy migration to allow (nearly) uninterrupted execution | |
18448166 MR |
463 | when transferring virtual machines and Linux containers |
464 | from one host to another. | |
b07243ab MK |
465 | .SH EXAMPLE |
466 | The program below demonstrates the use of the userfaultfd mechanism. | |
467 | The program creates two threads, one of which acts as the | |
468 | page-fault handler for the process, for the pages in a demand-page zero | |
469 | region created using | |
470 | .BR mmap (2). | |
efeece04 | 471 | .PP |
b07243ab MK |
472 | The program takes one command-line argument, |
473 | which is the number of pages that will be created in a mapping | |
474 | whose page faults will be handled via userfaultfd. | |
475 | After creating a userfaultfd object, | |
476 | the program then creates an anonymous private mapping of the specified size | |
477 | and registers the address range of that mapping using the | |
478 | .B UFFDIO_REGISTER | |
479 | .BR ioctl (2) | |
480 | operation. | |
00f4853c | 481 | The program then creates a second thread that will perform the |
b07243ab | 482 | task of handling page faults. |
efeece04 | 483 | .PP |
b07243ab MK |
484 | The main thread then walks through the pages of the mapping fetching |
485 | bytes from successive pages. | |
486 | Because the pages have not yet been accessed, | |
487 | the first access of a byte in each page will trigger a page-fault event | |
488 | on the userfaultfd file descriptor. | |
efeece04 | 489 | .PP |
b07243ab MK |
490 | Each of the page-fault events is handled by the second thread, |
491 | which sits in a loop processing input from the userfaultfd file descriptor. | |
492 | In each loop iteration, the second thread first calls | |
493 | .BR poll (2) | |
494 | to check the state of the file descriptor, | |
495 | and then reads an event from the file descriptor. | |
496 | All such events should be | |
497 | .B UFFD_EVENT_PAGEFAULT | |
498 | events, | |
499 | which the thread handles by copying a page of data into | |
500 | the faulting region using the | |
00f4853c | 501 | .B UFFDIO_COPY |
b07243ab MK |
502 | .BR ioctl (2) |
503 | operation. | |
efeece04 | 504 | .PP |
b07243ab | 505 | The following is an example of what we see when running the program: |
efeece04 | 506 | .PP |
b07243ab | 507 | .in +4n |
b8302363 | 508 | .EX |
b07243ab MK |
509 | $ \fB./userfaultfd_demo 3\fP |
510 | Address returned by mmap() = 0x7fd30106c000 | |
511 | ||
512 | fault_handler_thread(): | |
513 | poll() returns: nready = 1; POLLIN = 1; POLLERR = 0 | |
514 | UFFD_EVENT_PAGEFAULT event: flags = 0; address = 7fd30106c00f | |
515 | (uffdio_copy.copy returned 4096) | |
516 | Read address 0x7fd30106c00f in main(): A | |
517 | Read address 0x7fd30106c40f in main(): A | |
518 | Read address 0x7fd30106c80f in main(): A | |
519 | Read address 0x7fd30106cc0f in main(): A | |
520 | ||
521 | fault_handler_thread(): | |
522 | poll() returns: nready = 1; POLLIN = 1; POLLERR = 0 | |
523 | UFFD_EVENT_PAGEFAULT event: flags = 0; address = 7fd30106d00f | |
524 | (uffdio_copy.copy returned 4096) | |
525 | Read address 0x7fd30106d00f in main(): B | |
526 | Read address 0x7fd30106d40f in main(): B | |
527 | Read address 0x7fd30106d80f in main(): B | |
528 | Read address 0x7fd30106dc0f in main(): B | |
529 | ||
530 | fault_handler_thread(): | |
531 | poll() returns: nready = 1; POLLIN = 1; POLLERR = 0 | |
532 | UFFD_EVENT_PAGEFAULT event: flags = 0; address = 7fd30106e00f | |
533 | (uffdio_copy.copy returned 4096) | |
534 | Read address 0x7fd30106e00f in main(): C | |
535 | Read address 0x7fd30106e40f in main(): C | |
536 | Read address 0x7fd30106e80f in main(): C | |
537 | Read address 0x7fd30106ec0f in main(): C | |
b8302363 | 538 | .EE |
e646a1ba | 539 | .in |
b07243ab MK |
540 | .SS Program source |
541 | \& | |
e7d0bb47 | 542 | .EX |
b07243ab | 543 | /* userfaultfd_demo.c |
00f4853c | 544 | |
b07243ab MK |
545 | Licensed under the GNU General Public License version 2 or later. |
546 | */ | |
547 | #define _GNU_SOURCE | |
548 | #include <sys/types.h> | |
549 | #include <stdio.h> | |
550 | #include <linux/userfaultfd.h> | |
551 | #include <pthread.h> | |
552 | #include <errno.h> | |
553 | #include <unistd.h> | |
554 | #include <stdlib.h> | |
555 | #include <fcntl.h> | |
556 | #include <signal.h> | |
557 | #include <poll.h> | |
558 | #include <string.h> | |
559 | #include <sys/mman.h> | |
560 | #include <sys/syscall.h> | |
561 | #include <sys/ioctl.h> | |
562 | #include <poll.h> | |
563 | ||
564 | #define errExit(msg) do { perror(msg); exit(EXIT_FAILURE); \\ | |
565 | } while (0) | |
566 | ||
567 | static int page_size; | |
568 | ||
569 | static void * | |
570 | fault_handler_thread(void *arg) | |
571 | { | |
572 | static struct uffd_msg msg; /* Data read from userfaultfd */ | |
573 | static int fault_cnt = 0; /* Number of faults so far handled */ | |
574 | long uffd; /* userfaultfd file descriptor */ | |
575 | static char *page = NULL; | |
576 | struct uffdio_copy uffdio_copy; | |
577 | ssize_t nread; | |
578 | ||
579 | uffd = (long) arg; | |
580 | ||
581 | /* Create a page that will be copied into the faulting region */ | |
582 | ||
583 | if (page == NULL) { | |
584 | page = mmap(NULL, page_size, PROT_READ | PROT_WRITE, | |
585 | MAP_PRIVATE | MAP_ANONYMOUS, \-1, 0); | |
586 | if (page == MAP_FAILED) | |
587 | errExit("mmap"); | |
588 | } | |
589 | ||
590 | /* Loop, handling incoming events on the userfaultfd | |
591 | file descriptor */ | |
592 | ||
593 | for (;;) { | |
594 | ||
595 | /* See what poll() tells us about the userfaultfd */ | |
596 | ||
597 | struct pollfd pollfd; | |
598 | int nready; | |
599 | pollfd.fd = uffd; | |
600 | pollfd.events = POLLIN; | |
601 | nready = poll(&pollfd, 1, \-1); | |
602 | if (nready == \-1) | |
603 | errExit("poll"); | |
604 | ||
605 | printf("\\nfault_handler_thread():\\n"); | |
606 | printf(" poll() returns: nready = %d; " | |
607 | "POLLIN = %d; POLLERR = %d\\n", nready, | |
608 | (pollfd.revents & POLLIN) != 0, | |
609 | (pollfd.revents & POLLERR) != 0); | |
610 | ||
611 | /* Read an event from the userfaultfd */ | |
612 | ||
613 | nread = read(uffd, &msg, sizeof(msg)); | |
614 | if (nread == 0) { | |
615 | printf("EOF on userfaultfd!\\n"); | |
616 | exit(EXIT_FAILURE); | |
00f4853c | 617 | } |
b07243ab MK |
618 | |
619 | if (nread == \-1) | |
620 | errExit("read"); | |
00f4853c | 621 | |
b07243ab MK |
622 | /* We expect only one kind of event; verify that assumption */ |
623 | ||
624 | if (msg.event != UFFD_EVENT_PAGEFAULT) { | |
625 | fprintf(stderr, "Unexpected event on userfaultfd\\n"); | |
626 | exit(EXIT_FAILURE); | |
627 | } | |
628 | ||
629 | /* Display info about the page\-fault event */ | |
630 | ||
631 | printf(" UFFD_EVENT_PAGEFAULT event: "); | |
632 | printf("flags = %llx; ", msg.arg.pagefault.flags); | |
633 | printf("address = %llx\\n", msg.arg.pagefault.address); | |
634 | ||
635 | /* Copy the page pointed to by \(aqpage\(aq into the faulting | |
636 | region. Vary the contents that are copied in, so that it | |
637 | is more obvious that each fault is handled separately. */ | |
638 | ||
639 | memset(page, \(aqA\(aq + fault_cnt % 20, page_size); | |
640 | fault_cnt++; | |
641 | ||
642 | uffdio_copy.src = (unsigned long) page; | |
643 | ||
644 | /* We need to handle page faults in units of pages(!). | |
645 | So, round faulting address down to page boundary */ | |
646 | ||
647 | uffdio_copy.dst = (unsigned long) msg.arg.pagefault.address & | |
648 | ~(page_size \- 1); | |
649 | uffdio_copy.len = page_size; | |
650 | uffdio_copy.mode = 0; | |
651 | uffdio_copy.copy = 0; | |
652 | if (ioctl(uffd, UFFDIO_COPY, &uffdio_copy) == \-1) | |
653 | errExit("ioctl\-UFFDIO_COPY"); | |
654 | ||
655 | printf(" (uffdio_copy.copy returned %lld)\\n", | |
656 | uffdio_copy.copy); | |
657 | } | |
658 | } | |
659 | ||
660 | int | |
661 | main(int argc, char *argv[]) | |
662 | { | |
663 | long uffd; /* userfaultfd file descriptor */ | |
664 | char *addr; /* Start of region handled by userfaultfd */ | |
665 | unsigned long len; /* Length of region handled by userfaultfd */ | |
666 | pthread_t thr; /* ID of thread that handles page faults */ | |
667 | struct uffdio_api uffdio_api; | |
668 | struct uffdio_register uffdio_register; | |
669 | int s; | |
670 | ||
671 | if (argc != 2) { | |
672 | fprintf(stderr, "Usage: %s num\-pages\\n", argv[0]); | |
673 | exit(EXIT_FAILURE); | |
674 | } | |
675 | ||
676 | page_size = sysconf(_SC_PAGE_SIZE); | |
677 | len = strtoul(argv[1], NULL, 0) * page_size; | |
678 | ||
679 | /* Create and enable userfaultfd object */ | |
680 | ||
681 | uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK); | |
682 | if (uffd == \-1) | |
683 | errExit("userfaultfd"); | |
684 | ||
685 | uffdio_api.api = UFFD_API; | |
686 | uffdio_api.features = 0; | |
687 | if (ioctl(uffd, UFFDIO_API, &uffdio_api) == \-1) | |
688 | errExit("ioctl\-UFFDIO_API"); | |
689 | ||
690 | /* Create a private anonymous mapping. The memory will be | |
691 | demand\-zero paged\-\-that is, not yet allocated. When we | |
692 | actually touch the memory, it will be allocated via | |
693 | the userfaultfd. */ | |
694 | ||
695 | addr = mmap(NULL, len, PROT_READ | PROT_WRITE, | |
696 | MAP_PRIVATE | MAP_ANONYMOUS, \-1, 0); | |
697 | if (addr == MAP_FAILED) | |
698 | errExit("mmap"); | |
699 | ||
700 | printf("Address returned by mmap() = %p\\n", addr); | |
701 | ||
702 | /* Register the memory range of the mapping we just created for | |
703 | handling by the userfaultfd object. In mode, we request to track | |
704 | missing pages (i.e., pages that have not yet been faulted in). */ | |
705 | ||
706 | uffdio_register.range.start = (unsigned long) addr; | |
707 | uffdio_register.range.len = len; | |
708 | uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING; | |
709 | if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) == \-1) | |
710 | errExit("ioctl\-UFFDIO_REGISTER"); | |
711 | ||
712 | /* Create a thread that will process the userfaultfd events */ | |
713 | ||
714 | s = pthread_create(&thr, NULL, fault_handler_thread, (void *) uffd); | |
715 | if (s != 0) { | |
716 | errno = s; | |
717 | errExit("pthread_create"); | |
718 | } | |
719 | ||
720 | /* Main thread now touches memory in the mapping, touching | |
721 | locations 1024 bytes apart. This will trigger userfaultfd | |
722 | events for all pages in the region. */ | |
723 | ||
724 | int l; | |
725 | l = 0xf; /* Ensure that faulting address is not on a page | |
726 | boundary, in order to test that we correctly | |
727 | handle that case in fault_handler_thread() */ | |
728 | while (l < len) { | |
729 | char c = addr[l]; | |
730 | printf("Read address %p in main(): ", addr + l); | |
731 | printf("%c\\n", c); | |
732 | l += 1024; | |
733 | usleep(100000); /* Slow things down a little */ | |
734 | } | |
735 | ||
736 | exit(EXIT_SUCCESS); | |
737 | } | |
e7d0bb47 | 738 | .EE |
bf9b5158 MR |
739 | .SH SEE ALSO |
740 | .BR fcntl (2), | |
4aa7f5cf | 741 | .BR ioctl (2), |
6bc6d124 | 742 | .BR ioctl_userfaultfd (2), |
6d3e02a5 | 743 | .BR madvise (2), |
4aa7f5cf | 744 | .BR mmap (2) |
efeece04 | 745 | .PP |
bf9b5158 MR |
746 | .IR Documentation/vm/userfaultfd.txt |
747 | in the Linux kernel source tree | |
efeece04 | 748 | .PP |