]> git.ipfire.org Git - thirdparty/man-pages.git/blob - man2/userfaultfd.2
man*/: srcfix (Use .P instead of .PP or .LP)
[thirdparty/man-pages.git] / man2 / userfaultfd.2
1 .\" Copyright (c) 2016, IBM Corporation.
2 .\" Written by Mike Rapoport <rppt@linux.vnet.ibm.com>
3 .\" and Copyright (C) 2017 Michael Kerrisk <mtk.manpages@gmail.com>
4 .\"
5 .\" SPDX-License-Identifier: Linux-man-pages-copyleft
6 .\"
7 .TH userfaultfd 2 (date) "Linux man-pages (unreleased)"
8 .SH NAME
9 userfaultfd \- create a file descriptor for handling page faults in user space
10 .SH LIBRARY
11 Standard C library
12 .RI ( libc ", " \-lc )
13 .SH SYNOPSIS
14 .nf
15 .BR "#include <fcntl.h>" " /* Definition of " O_* " constants */"
16 .BR "#include <sys/syscall.h>" " /* Definition of " SYS_* " constants */"
17 .BR "#include <linux/userfaultfd.h>" " /* Definition of " UFFD_* " constants */"
18 .B #include <unistd.h>
19 .P
20 .BI "int syscall(SYS_userfaultfd, int " flags );
21 .fi
22 .P
23 .IR Note :
24 glibc provides no wrapper for
25 .BR userfaultfd (),
26 necessitating the use of
27 .BR syscall (2).
28 .SH DESCRIPTION
29 .BR userfaultfd ()
30 creates a new userfaultfd object that can be used for delegation of page-fault
31 handling to a user-space application,
32 and returns a file descriptor that refers to the new object.
33 The new userfaultfd object is configured using
34 .BR ioctl (2).
35 .P
36 Once the userfaultfd object is configured, the application can use
37 .BR read (2)
38 to receive userfaultfd notifications.
39 The reads from userfaultfd may be blocking or non-blocking,
40 depending on the value of
41 .I flags
42 used for the creation of the userfaultfd or subsequent calls to
43 .BR fcntl (2).
44 .P
45 The following values may be bitwise ORed in
46 .I flags
47 to change the behavior of
48 .BR userfaultfd ():
49 .TP
50 .B O_CLOEXEC
51 Enable the close-on-exec flag for the new userfaultfd file descriptor.
52 See the description of the
53 .B O_CLOEXEC
54 flag in
55 .BR open (2).
56 .TP
57 .B O_NONBLOCK
58 Enables non-blocking operation for the userfaultfd object.
59 See the description of the
60 .B O_NONBLOCK
61 flag in
62 .BR open (2).
63 .TP
64 .B UFFD_USER_MODE_ONLY
65 This is a userfaultfd-specific flag that was introduced in Linux 5.11.
66 When set, the userfaultfd object will only be able to handle
67 page faults originated from the user space on the registered regions.
68 When a kernel-originated fault is triggered
69 on the registered range with this userfaultfd, a
70 .B SIGBUS
71 signal will be delivered.
72 .P
73 When the last file descriptor referring to a userfaultfd object is closed,
74 all memory ranges that were registered with the object are unregistered
75 and unread events are flushed.
76 .\"
77 .P
78 Userfaultfd supports three modes of registration:
79 .TP
80 .BR UFFDIO_REGISTER_MODE_MISSING " (since Linux 4.10)"
81 When registered with
82 .B UFFDIO_REGISTER_MODE_MISSING
83 mode, user-space will receive a page-fault notification
84 when a missing page is accessed.
85 The faulted thread will be stopped from execution until the page fault is
86 resolved from user-space by either an
87 .B UFFDIO_COPY
88 or an
89 .B UFFDIO_ZEROPAGE
90 ioctl.
91 .TP
92 .BR UFFDIO_REGISTER_MODE_MINOR " (since Linux 5.13)"
93 When registered with
94 .B UFFDIO_REGISTER_MODE_MINOR
95 mode, user-space will receive a page-fault notification
96 when a minor page fault occurs.
97 That is,
98 when a backing page is in the page cache,
99 but page table entries don't yet exist.
100 The faulted thread will be stopped from execution
101 until the page fault is resolved from user-space by an
102 .B UFFDIO_CONTINUE
103 ioctl.
104 .TP
105 .BR UFFDIO_REGISTER_MODE_WP " (since Linux 5.7)"
106 When registered with
107 .B UFFDIO_REGISTER_MODE_WP
108 mode, user-space will receive a page-fault notification
109 when a write-protected page is written.
110 The faulted thread will be stopped from execution
111 until user-space write-unprotects the page using an
112 .B UFFDIO_WRITEPROTECT
113 ioctl.
114 .P
115 Multiple modes can be enabled at the same time for the same memory range.
116 .P
117 Since Linux 4.14, a userfaultfd page-fault notification can selectively embed
118 faulting thread ID information into the notification.
119 One needs to enable this feature explicitly using the
120 .B UFFD_FEATURE_THREAD_ID
121 feature bit when initializing the userfaultfd context.
122 By default, thread ID reporting is disabled.
123 .SS Usage
124 The userfaultfd mechanism is designed to allow a thread in a multithreaded
125 program to perform user-space paging for the other threads in the process.
126 When a page fault occurs for one of the regions registered
127 to the userfaultfd object,
128 the faulting thread is put to sleep and
129 an event is generated that can be read via the userfaultfd file descriptor.
130 The fault-handling thread reads events from this file descriptor and services
131 them using the operations described in
132 .BR ioctl_userfaultfd (2).
133 When servicing the page fault events,
134 the fault-handling thread can trigger a wake-up for the sleeping thread.
135 .P
136 It is possible for the faulting threads and the fault-handling threads
137 to run in the context of different processes.
138 In this case, these threads may belong to different programs,
139 and the program that executes the faulting threads
140 will not necessarily cooperate with the program that handles the page faults.
141 In such non-cooperative mode,
142 the process that monitors userfaultfd and handles page faults
143 needs to be aware of the changes in the virtual memory layout
144 of the faulting process to avoid memory corruption.
145 .P
146 Since Linux 4.11,
147 userfaultfd can also notify the fault-handling threads about changes
148 in the virtual memory layout of the faulting process.
149 In addition, if the faulting process invokes
150 .BR fork (2),
151 the userfaultfd objects associated with the parent may be duplicated
152 into the child process and the userfaultfd monitor will be notified
153 (via the
154 .B UFFD_EVENT_FORK
155 described below)
156 about the file descriptor associated with the userfault objects
157 created for the child process,
158 which allows the userfaultfd monitor to perform user-space paging
159 for the child process.
160 Unlike page faults which have to be synchronous and require an
161 explicit or implicit wakeup,
162 all other events are delivered asynchronously and
163 the non-cooperative process resumes execution as
164 soon as the userfaultfd manager executes
165 .BR read (2).
166 The userfaultfd manager should carefully synchronize calls to
167 .B UFFDIO_COPY
168 with the processing of events.
169 .P
170 The current asynchronous model of the event delivery is optimal for
171 single-threaded non-cooperative userfaultfd manager implementations.
172 .\" Regarding the preceding sentence, Mike Rapoport says:
173 .\" The major point here is that current events delivery model could be
174 .\" problematic for multi-threaded monitor. I even suspect that it would be
175 .\" impossible to ensure synchronization between page faults and non-page
176 .\" fault events in multi-threaded monitor.
177 .\" .P
178 .\" FIXME elaborate about non-cooperating mode, describe its limitations
179 .\" for kernels before Linux 4.11, features added in Linux 4.11
180 .\" and limitations remaining in Linux 4.11
181 .\" Maybe it's worth adding a dedicated sub-section...
182 .\"
183 .P
184 Since Linux 5.7, userfaultfd is able to do
185 synchronous page dirty tracking using the new write-protect register mode.
186 One should check against the feature bit
187 .B UFFD_FEATURE_PAGEFAULT_FLAG_WP
188 before using this feature.
189 Similar to the original userfaultfd missing mode, the write-protect mode will
190 generate a userfaultfd notification when the protected page is written.
191 The user needs to resolve the page fault by unprotecting the faulted page and
192 kicking the faulted thread to continue.
193 For more information,
194 please refer to the "Userfaultfd write-protect mode" section.
195 .\"
196 .SS Userfaultfd operation
197 After the userfaultfd object is created with
198 .BR userfaultfd (),
199 the application must enable it using the
200 .B UFFDIO_API
201 .BR ioctl (2)
202 operation.
203 This operation allows a two-step handshake between the kernel and user space
204 to determine what API version and features the kernel supports,
205 and then to enable those features user space wants.
206 This operation must be performed before any of the other
207 .BR ioctl (2)
208 operations described below (or those operations fail with the
209 .B EINVAL
210 error).
211 .P
212 After a successful
213 .B UFFDIO_API
214 operation,
215 the application then registers memory address ranges using the
216 .B UFFDIO_REGISTER
217 .BR ioctl (2)
218 operation.
219 After successful completion of a
220 .B UFFDIO_REGISTER
221 operation,
222 a page fault occurring in the requested memory range, and satisfying
223 the mode defined at the registration time, will be forwarded by the kernel to
224 the user-space application.
225 The application can then use various (e.g.,
226 .BR UFFDIO_COPY ,
227 .BR UFFDIO_ZEROPAGE ,
228 or
229 .BR UFFDIO_CONTINUE )
230 .BR ioctl (2)
231 operations to resolve the page fault.
232 .P
233 Since Linux 4.14, if the application sets the
234 .B UFFD_FEATURE_SIGBUS
235 feature bit using the
236 .B UFFDIO_API
237 .BR ioctl (2),
238 no page-fault notification will be forwarded to user space.
239 Instead a
240 .B SIGBUS
241 signal is delivered to the faulting process.
242 With this feature,
243 userfaultfd can be used for robustness purposes to simply catch
244 any access to areas within the registered address range that do not
245 have pages allocated, without having to listen to userfaultfd events.
246 No userfaultfd monitor will be required for dealing with such memory
247 accesses.
248 For example, this feature can be useful for applications that
249 want to prevent the kernel from automatically allocating pages and filling
250 holes in sparse files when the hole is accessed through a memory mapping.
251 .P
252 The
253 .B UFFD_FEATURE_SIGBUS
254 feature is implicitly inherited through
255 .BR fork (2)
256 if used in combination with
257 .BR UFFD_FEATURE_EVENT_FORK .
258 .P
259 Details of the various
260 .BR ioctl (2)
261 operations can be found in
262 .BR ioctl_userfaultfd (2).
263 .P
264 Since Linux 4.11, events other than page-fault may be enabled during
265 .B UFFDIO_API
266 operation.
267 .P
268 Before Linux 4.11,
269 userfaultfd can be used only with anonymous private memory mappings.
270 Since Linux 4.11,
271 userfaultfd can be also used with hugetlbfs and shared memory mappings.
272 .\"
273 .SS Userfaultfd write-protect mode (since Linux 5.7)
274 Since Linux 5.7, userfaultfd supports write-protect mode for anonymous memory.
275 The user needs to first check availability of this feature using
276 .B UFFDIO_API
277 ioctl against the feature bit
278 .B UFFD_FEATURE_PAGEFAULT_FLAG_WP
279 before using this feature.
280 .P
281 Since Linux 5.19,
282 the write-protection mode is also supported on
283 shmem and hugetlbfs memory types.
284 It can be detected with the feature bit
285 .BR UFFD_FEATURE_WP_HUGETLBFS_SHMEM .
286 .P
287 To register with userfaultfd write-protect mode, the user needs to initiate the
288 .B UFFDIO_REGISTER
289 ioctl with mode
290 .B UFFDIO_REGISTER_MODE_WP
291 set.
292 Note that it is legal to monitor the same memory range with multiple modes.
293 For example, the user can do
294 .B UFFDIO_REGISTER
295 with the mode set to
296 .BR "UFFDIO_REGISTER_MODE_MISSING | UFFDIO_REGISTER_MODE_WP" .
297 When there is only
298 .B UFFDIO_REGISTER_MODE_WP
299 registered, user-space will
300 .I not
301 receive any notification when a missing page is written.
302 Instead, user-space will receive a write-protect page-fault notification
303 only when an existing but write-protected page got written.
304 .P
305 After the
306 .B UFFDIO_REGISTER
307 ioctl completed with
308 .B UFFDIO_REGISTER_MODE_WP
309 mode set,
310 the user can write-protect any existing memory within the range using the ioctl
311 .B UFFDIO_WRITEPROTECT
312 where
313 .I uffdio_writeprotect.mode
314 should be set to
315 .BR UFFDIO_WRITEPROTECT_MODE_WP .
316 .P
317 When a write-protect event happens,
318 user-space will receive a page-fault notification whose
319 .I uffd_msg.pagefault.flags
320 will have the
321 .B UFFD_PAGEFAULT_FLAG_WP
322 flag set.
323 Note: since only writes can trigger this kind of fault,
324 write-protect notifications will always have the
325 .B UFFD_PAGEFAULT_FLAG_WRITE
326 bit set along with the
327 .B UFFD_PAGEFAULT_FLAG_WP
328 bit.
329 .P
330 To resolve a write-protection page fault, the user should initiate another
331 .B UFFDIO_WRITEPROTECT
332 ioctl, whose
333 .I uffdio_writeprotect.mode
334 should have the flag
335 .B UFFDIO_WRITEPROTECT_MODE_WP
336 cleared upon the faulted page or range.
337 .\"
338 .SS Userfaultfd minor fault mode (since Linux 5.13)
339 Since Linux 5.13,
340 userfaultfd supports minor fault mode.
341 In this mode,
342 fault messages are produced not for major faults
343 (where the page was missing),
344 but rather for minor faults,
345 where a page exists in the page cache,
346 but the page table entries are not yet present.
347 The user needs to first check availability of this feature using the
348 .B UFFDIO_API
349 ioctl with the appropriate feature bits set before using this feature:
350 .B UFFD_FEATURE_MINOR_HUGETLBFS
351 since Linux 5.13,
352 or
353 .B UFFD_FEATURE_MINOR_SHMEM
354 since Linux 5.14.
355 .P
356 To register with userfaultfd minor fault mode,
357 the user needs to initiate the
358 .B UFFDIO_REGISTER
359 ioctl with mode
360 .B UFFDIO_REGISTER_MODE_MINOR
361 set.
362 .P
363 When a minor fault occurs,
364 user-space will receive a page-fault notification
365 whose
366 .I uffd_msg.pagefault.flags
367 will have the
368 .B UFFD_PAGEFAULT_FLAG_MINOR
369 flag set.
370 .P
371 To resolve a minor page fault,
372 the handler should decide whether or not
373 the existing page contents need to be modified first.
374 If so,
375 this should be done in-place via a second,
376 non-userfaultfd-registered mapping
377 to the same backing page
378 (e.g., by mapping the shmem or hugetlbfs file twice).
379 Once the page is considered "up to date",
380 the fault can be resolved by initiating an
381 .B UFFDIO_CONTINUE
382 ioctl,
383 which installs the page table entries and
384 (by default)
385 wakes up the faulting thread(s).
386 .P
387 Minor fault mode supports only hugetlbfs-backed (since Linux 5.13)
388 and shmem-backed (since Linux 5.14) memory.
389 .\"
390 .SS Reading from the userfaultfd structure
391 Each
392 .BR read (2)
393 from the userfaultfd file descriptor returns one or more
394 .I uffd_msg
395 structures, each of which describes a page-fault event
396 or an event required for the non-cooperative userfaultfd usage:
397 .P
398 .in +4n
399 .EX
400 struct uffd_msg {
401 __u8 event; /* Type of event */
402 ...
403 union {
404 struct {
405 __u64 flags; /* Flags describing fault */
406 __u64 address; /* Faulting address */
407 union {
408 __u32 ptid; /* Thread ID of the fault */
409 } feat;
410 } pagefault;
411 \&
412 struct { /* Since Linux 4.11 */
413 __u32 ufd; /* Userfault file descriptor
414 of the child process */
415 } fork;
416 \&
417 struct { /* Since Linux 4.11 */
418 __u64 from; /* Old address of remapped area */
419 __u64 to; /* New address of remapped area */
420 __u64 len; /* Original mapping length */
421 } remap;
422 \&
423 struct { /* Since Linux 4.11 */
424 __u64 start; /* Start address of removed area */
425 __u64 end; /* End address of removed area */
426 } remove;
427 ...
428 } arg;
429 \&
430 /* Padding fields omitted */
431 } __packed;
432 .EE
433 .in
434 .P
435 If multiple events are available and the supplied buffer is large enough,
436 .BR read (2)
437 returns as many events as will fit in the supplied buffer.
438 If the buffer supplied to
439 .BR read (2)
440 is smaller than the size of the
441 .I uffd_msg
442 structure, the
443 .BR read (2)
444 fails with the error
445 .BR EINVAL .
446 .P
447 The fields set in the
448 .I uffd_msg
449 structure are as follows:
450 .TP
451 .I event
452 The type of event.
453 Depending on the event type,
454 different fields of the
455 .I arg
456 union represent details required for the event processing.
457 The non-page-fault events are generated only when the appropriate feature
458 is enabled during API handshake with
459 .B UFFDIO_API
460 .BR ioctl (2).
461 .IP
462 The following values can appear in the
463 .I event
464 field:
465 .RS
466 .TP
467 .BR UFFD_EVENT_PAGEFAULT " (since Linux 4.3)"
468 A page-fault event.
469 The page-fault details are available in the
470 .I pagefault
471 field.
472 .TP
473 .BR UFFD_EVENT_FORK " (since Linux 4.11)"
474 Generated when the faulting process invokes
475 .BR fork (2)
476 (or
477 .BR clone (2)
478 without the
479 .B CLONE_VM
480 flag).
481 The event details are available in the
482 .I fork
483 field.
484 .\" FIXME describe duplication of userfault file descriptor during fork
485 .TP
486 .BR UFFD_EVENT_REMAP " (since Linux 4.11)"
487 Generated when the faulting process invokes
488 .BR mremap (2).
489 The event details are available in the
490 .I remap
491 field.
492 .TP
493 .BR UFFD_EVENT_REMOVE " (since Linux 4.11)"
494 Generated when the faulting process invokes
495 .BR madvise (2)
496 with
497 .B MADV_DONTNEED
498 or
499 .B MADV_REMOVE
500 advice.
501 The event details are available in the
502 .I remove
503 field.
504 .TP
505 .BR UFFD_EVENT_UNMAP " (since Linux 4.11)"
506 Generated when the faulting process unmaps a memory range,
507 either explicitly using
508 .BR munmap (2)
509 or implicitly during
510 .BR mmap (2)
511 or
512 .BR mremap (2).
513 The event details are available in the
514 .I remove
515 field.
516 .RE
517 .TP
518 .I pagefault.address
519 The address that triggered the page fault.
520 .TP
521 .I pagefault.flags
522 A bit mask of flags that describe the event.
523 For
524 .BR UFFD_EVENT_PAGEFAULT ,
525 the following flags may appear:
526 .RS
527 .TP
528 .B UFFD_PAGEFAULT_FLAG_WP
529 If this flag is set, then the fault was a write-protect fault.
530 .TP
531 .B UFFD_PAGEFAULT_FLAG_MINOR
532 If this flag is set, then the fault was a minor fault.
533 .TP
534 .B UFFD_PAGEFAULT_FLAG_WRITE
535 If this flag is set, then the fault was a write fault.
536 .P
537 If neither
538 .B UFFD_PAGEFAULT_FLAG_WP
539 nor
540 .B UFFD_PAGEFAULT_FLAG_MINOR
541 is set, then the fault was a missing fault.
542 .RE
543 .TP
544 .I pagefault.feat.ptid
545 The thread ID that triggered the page fault.
546 .TP
547 .I fork.ufd
548 The file descriptor associated with the userfault object
549 created for the child created by
550 .BR fork (2).
551 .TP
552 .I remap.from
553 The original address of the memory range that was remapped using
554 .BR mremap (2).
555 .TP
556 .I remap.to
557 The new address of the memory range that was remapped using
558 .BR mremap (2).
559 .TP
560 .I remap.len
561 The original length of the memory range that was remapped using
562 .BR mremap (2).
563 .TP
564 .I remove.start
565 The start address of the memory range that was freed using
566 .BR madvise (2)
567 or unmapped.
568 .TP
569 .I remove.end
570 The end address of the memory range that was freed using
571 .BR madvise (2)
572 or unmapped.
573 .P
574 A
575 .BR read (2)
576 on a userfaultfd file descriptor can fail with the following errors:
577 .TP
578 .B EINVAL
579 The userfaultfd object has not yet been enabled using the
580 .B UFFDIO_API
581 .BR ioctl (2)
582 operation.
583 .P
584 If the
585 .B O_NONBLOCK
586 flag is enabled in the associated open file description,
587 the userfaultfd file descriptor can be monitored with
588 .BR poll (2),
589 .BR select (2),
590 and
591 .BR epoll (7).
592 When events are available, the file descriptor is indicated as readable.
593 If the
594 .B O_NONBLOCK
595 flag is not enabled, then
596 .BR poll (2)
597 (always) indicates the file as having a
598 .B POLLERR
599 condition, and
600 .BR select (2)
601 indicates the file descriptor as both readable and writable.
602 .\" FIXME What is the reason for this seemingly odd behavior with respect
603 .\" to the O_NONBLOCK flag? (see userfaultfd_poll() in fs/userfaultfd.c).
604 .\" Something needs to be said about this.
605 .SH RETURN VALUE
606 On success,
607 .BR userfaultfd ()
608 returns a new file descriptor that refers to the userfaultfd object.
609 On error, \-1 is returned, and
610 .I errno
611 is set to indicate the error.
612 .SH ERRORS
613 .TP
614 .B EINVAL
615 An unsupported value was specified in
616 .IR flags .
617 .TP
618 .B EMFILE
619 The per-process limit on the number of open file descriptors has been
620 reached.
621 .TP
622 .B ENFILE
623 The system-wide limit on the total number of open files has been
624 reached.
625 .TP
626 .B ENOMEM
627 Insufficient kernel memory was available.
628 .TP
629 .BR EPERM " (since Linux 5.2)"
630 .\" cefdca0a86be517bc390fc4541e3674b8e7803b0
631 The caller is not privileged (does not have the
632 .B CAP_SYS_PTRACE
633 capability in the initial user namespace), and
634 .I /proc/sys/vm/unprivileged_userfaultfd
635 has the value 0.
636 .SH STANDARDS
637 Linux.
638 .SH HISTORY
639 Linux 4.3.
640 .P
641 Support for hugetlbfs and shared memory areas and
642 non-page-fault events was added in Linux 4.11.
643 .SH NOTES
644 The userfaultfd mechanism can be used as an alternative to
645 traditional user-space paging techniques based on the use of the
646 .B SIGSEGV
647 signal and
648 .BR mmap (2).
649 It can also be used to implement lazy restore
650 for checkpoint/restore mechanisms,
651 as well as post-copy migration to allow (nearly) uninterrupted execution
652 when transferring virtual machines and Linux containers
653 from one host to another.
654 .SH BUGS
655 If the
656 .B UFFD_FEATURE_EVENT_FORK
657 is enabled and a system call from the
658 .BR fork (2)
659 family is interrupted by a signal or failed, a stale userfaultfd descriptor
660 might be created.
661 In this case, a spurious
662 .B UFFD_EVENT_FORK
663 will be delivered to the userfaultfd monitor.
664 .SH EXAMPLES
665 The program below demonstrates the use of the userfaultfd mechanism.
666 The program creates two threads, one of which acts as the
667 page-fault handler for the process, for the pages in a demand-page zero
668 region created using
669 .BR mmap (2).
670 .P
671 The program takes one command-line argument,
672 which is the number of pages that will be created in a mapping
673 whose page faults will be handled via userfaultfd.
674 After creating a userfaultfd object,
675 the program then creates an anonymous private mapping of the specified size
676 and registers the address range of that mapping using the
677 .B UFFDIO_REGISTER
678 .BR ioctl (2)
679 operation.
680 The program then creates a second thread that will perform the
681 task of handling page faults.
682 .P
683 The main thread then walks through the pages of the mapping fetching
684 bytes from successive pages.
685 Because the pages have not yet been accessed,
686 the first access of a byte in each page will trigger a page-fault event
687 on the userfaultfd file descriptor.
688 .P
689 Each of the page-fault events is handled by the second thread,
690 which sits in a loop processing input from the userfaultfd file descriptor.
691 In each loop iteration, the second thread first calls
692 .BR poll (2)
693 to check the state of the file descriptor,
694 and then reads an event from the file descriptor.
695 All such events should be
696 .B UFFD_EVENT_PAGEFAULT
697 events,
698 which the thread handles by copying a page of data into
699 the faulting region using the
700 .B UFFDIO_COPY
701 .BR ioctl (2)
702 operation.
703 .P
704 The following is an example of what we see when running the program:
705 .P
706 .in +4n
707 .EX
708 $ \fB./userfaultfd_demo 3\fP
709 Address returned by mmap() = 0x7fd30106c000
710 \&
711 fault_handler_thread():
712 poll() returns: nready = 1; POLLIN = 1; POLLERR = 0
713 UFFD_EVENT_PAGEFAULT event: flags = 0; address = 7fd30106c00f
714 (uffdio_copy.copy returned 4096)
715 Read address 0x7fd30106c00f in main(): A
716 Read address 0x7fd30106c40f in main(): A
717 Read address 0x7fd30106c80f in main(): A
718 Read address 0x7fd30106cc0f in main(): A
719 \&
720 fault_handler_thread():
721 poll() returns: nready = 1; POLLIN = 1; POLLERR = 0
722 UFFD_EVENT_PAGEFAULT event: flags = 0; address = 7fd30106d00f
723 (uffdio_copy.copy returned 4096)
724 Read address 0x7fd30106d00f in main(): B
725 Read address 0x7fd30106d40f in main(): B
726 Read address 0x7fd30106d80f in main(): B
727 Read address 0x7fd30106dc0f in main(): B
728 \&
729 fault_handler_thread():
730 poll() returns: nready = 1; POLLIN = 1; POLLERR = 0
731 UFFD_EVENT_PAGEFAULT event: flags = 0; address = 7fd30106e00f
732 (uffdio_copy.copy returned 4096)
733 Read address 0x7fd30106e00f in main(): C
734 Read address 0x7fd30106e40f in main(): C
735 Read address 0x7fd30106e80f in main(): C
736 Read address 0x7fd30106ec0f in main(): C
737 .EE
738 .in
739 .SS Program source
740 \&
741 .\" SRC BEGIN (userfaultfd.c)
742 .EX
743 /* userfaultfd_demo.c
744 \&
745 Licensed under the GNU General Public License version 2 or later.
746 */
747 #define _GNU_SOURCE
748 #include <err.h>
749 #include <errno.h>
750 #include <fcntl.h>
751 #include <inttypes.h>
752 #include <linux/userfaultfd.h>
753 #include <poll.h>
754 #include <pthread.h>
755 #include <stdio.h>
756 #include <stdlib.h>
757 #include <string.h>
758 #include <sys/ioctl.h>
759 #include <sys/mman.h>
760 #include <sys/syscall.h>
761 #include <unistd.h>
762 \&
763 static int page_size;
764 \&
/* Thread start routine that services page faults for the process.
\[aq]arg\[aq] carries the userfaultfd file descriptor, passed as an
integer cast to a pointer. The routine loops forever and never
returns; any failure terminates the whole process. */
765 static void *
766 fault_handler_thread(void *arg)
767 {
768 int nready;
769 long uffd; /* userfaultfd file descriptor */
770 ssize_t nread;
771 struct pollfd pollfd;
772 struct uffdio_copy uffdio_copy;
773 \&
774 static int fault_cnt = 0; /* Number of faults so far handled */
775 static char *page = NULL;
776 static struct uffd_msg msg; /* Data read from userfaultfd */
777 \&
778 uffd = (long) arg;
779 \&
780 /* Create a page that will be copied into the faulting region. */
781 \&
782 if (page == NULL) {
783 page = mmap(NULL, page_size, PROT_READ | PROT_WRITE,
784 MAP_PRIVATE | MAP_ANONYMOUS, \-1, 0);
785 if (page == MAP_FAILED)
786 err(EXIT_FAILURE, "mmap");
787 }
788 \&
789 /* Loop, handling incoming events on the userfaultfd
790 file descriptor. */
791 \&
792 for (;;) {
793 \&
794 /* See what poll() tells us about the userfaultfd. */
795 \&
796 pollfd.fd = uffd;
797 pollfd.events = POLLIN;
798 nready = poll(&pollfd, 1, \-1);
799 if (nready == \-1)
800 err(EXIT_FAILURE, "poll");
801 \&
802 printf("\enfault_handler_thread():\en");
803 printf(" poll() returns: nready = %d; "
804 "POLLIN = %d; POLLERR = %d\en", nready,
805 (pollfd.revents & POLLIN) != 0,
806 (pollfd.revents & POLLERR) != 0);
807 \&
808 /* Read an event from the userfaultfd. */
809 \&
810 nread = read(uffd, &msg, sizeof(msg));
811 if (nread == 0) {
812 printf("EOF on userfaultfd!\en");
813 exit(EXIT_FAILURE);
814 }
815 \&
816 if (nread == \-1)
817 err(EXIT_FAILURE, "read");
818 \&
819 /* We expect only one kind of event; verify that assumption. */
820 \&
821 if (msg.event != UFFD_EVENT_PAGEFAULT) {
822 fprintf(stderr, "Unexpected event on userfaultfd\en");
823 exit(EXIT_FAILURE);
824 }
825 \&
826 /* Display info about the page\-fault event. The casts make the
arguments match the PRIx64 conversions on every architecture. */
827 \&
828 printf(" UFFD_EVENT_PAGEFAULT event: ");
829 printf("flags = %"PRIx64"; ", (uint64_t) msg.arg.pagefault.flags);
830 printf("address = %"PRIx64"\en", (uint64_t) msg.arg.pagefault.address);
831 \&
832 /* Copy the page pointed to by \[aq]page\[aq] into the faulting
833 region. Vary the contents that are copied in, so that it
834 is more obvious that each fault is handled separately. */
835 \&
836 memset(page, \[aq]A\[aq] + fault_cnt % 20, page_size);
837 fault_cnt++;
838 \&
839 uffdio_copy.src = (unsigned long) page;
840 \&
841 /* We need to handle page faults in units of pages(!).
842 So, round faulting address down to page boundary. */
843 \&
844 uffdio_copy.dst = (unsigned long) msg.arg.pagefault.address &
845 \[ti](page_size \- 1);
846 uffdio_copy.len = page_size;
847 uffdio_copy.mode = 0;
848 uffdio_copy.copy = 0;
849 if (ioctl(uffd, UFFDIO_COPY, &uffdio_copy) == \-1)
850 err(EXIT_FAILURE, "ioctl\-UFFDIO_COPY");
851 \&
852 printf(" (uffdio_copy.copy returned %"PRId64")\en",
853 (int64_t) uffdio_copy.copy);
854 }
855 }
856 \&
/* Create a userfaultfd object, register an anonymous mapping of
argv[1] pages with it, start the fault\-handling thread, and then
read one byte every 1024 bytes of the mapping so that the first
access to each page triggers a page\-fault event. */
857 int
858 main(int argc, char *argv[])
859 {
860 int s;
861 char c;
862 char *addr; /* Start of region handled by userfaultfd */
863 long uffd; /* userfaultfd file descriptor */
864 size_t len, l; /* Length of region handled by userfaultfd */
865 pthread_t thr; /* ID of thread that handles page faults */
866 struct uffdio_api uffdio_api;
867 struct uffdio_register uffdio_register;
868 \&
869 if (argc != 2) {
870 fprintf(stderr, "Usage: %s num\-pages\en", argv[0]);
871 exit(EXIT_FAILURE);
872 }
873 \&
874 page_size = sysconf(_SC_PAGE_SIZE);
875 len = strtoull(argv[1], NULL, 0) * page_size;
876 \&
877 /* Create and enable userfaultfd object. */
878 \&
879 uffd = syscall(SYS_userfaultfd, O_CLOEXEC | O_NONBLOCK);
880 if (uffd == \-1)
881 err(EXIT_FAILURE, "userfaultfd");
882 \&
883 /* NOTE: Two-step feature handshake is not needed here, since this
884 example doesn't require any specific features.
885
886 Programs that *do* should call UFFDIO_API twice: once with
887 `features = 0` to detect features supported by this kernel, and
888 again with the subset of features the program actually wants to
889 enable. */
890 uffdio_api.api = UFFD_API;
891 uffdio_api.features = 0;
892 if (ioctl(uffd, UFFDIO_API, &uffdio_api) == \-1)
893 err(EXIT_FAILURE, "ioctl\-UFFDIO_API");
894 \&
895 /* Create a private anonymous mapping. The memory will be
896 demand\-zero paged\-\-that is, not yet allocated. When we
897 actually touch the memory, it will be allocated via
898 the userfaultfd. */
899 \&
900 addr = mmap(NULL, len, PROT_READ | PROT_WRITE,
901 MAP_PRIVATE | MAP_ANONYMOUS, \-1, 0);
902 if (addr == MAP_FAILED)
903 err(EXIT_FAILURE, "mmap");
904 \&
905 printf("Address returned by mmap() = %p\en", addr);
906 \&
907 /* Register the memory range of the mapping we just created for
908 handling by the userfaultfd object. In mode, we request to track
909 missing pages (i.e., pages that have not yet been faulted in). */
910 \&
911 uffdio_register.range.start = (unsigned long) addr;
912 uffdio_register.range.len = len;
913 uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
914 if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) == \-1)
915 err(EXIT_FAILURE, "ioctl\-UFFDIO_REGISTER");
916 \&
917 /* Create a thread that will process the userfaultfd events.
pthread_create() returns an error number instead of setting errno,
so the message is formatted explicitly with strerror(). */
918 \&
919 s = pthread_create(&thr, NULL, fault_handler_thread, (void *) uffd);
920 if (s != 0) {
921 errx(EXIT_FAILURE, "pthread_create: %s", strerror(s));
922 }
923 \&
924 /* Main thread now touches memory in the mapping, touching
925 locations 1024 bytes apart. This will trigger userfaultfd
926 events for all pages in the region. */
927 \&
928 l = 0xf; /* Ensure that faulting address is not on a page
929 boundary, in order to test that we correctly
930 handle that case in fault_handler_thread(). */
931 while (l < len) {
932 c = addr[l];
933 printf("Read address %p in %s(): ", addr + l, __func__);
934 printf("%c\en", c);
935 l += 1024;
936 usleep(100000); /* Slow things down a little */
937 }
938 \&
939 exit(EXIT_SUCCESS);
940 }
941 .EE
942 .\" SRC END
943 .SH SEE ALSO
944 .BR fcntl (2),
945 .BR ioctl (2),
946 .BR ioctl_userfaultfd (2),
947 .BR madvise (2),
948 .BR mmap (2)
949 .P
950 .I Documentation/admin\-guide/mm/userfaultfd.rst
951 in the Linux kernel source tree