]> git.ipfire.org Git - thirdparty/man-pages.git/blame - man2/userfaultfd.2
mknod.2: tfix
[thirdparty/man-pages.git] / man2 / userfaultfd.2
CommitLineData
bf9b5158
MR
1.\" Copyright (c) 2016, IBM Corporation.
2.\" Written by Mike Rapoport <rppt@linux.vnet.ibm.com>
54e031a1 3.\" and Copyright (C) 2017 Michael Kerrisk <mtk.manpages@gmail.com>
bf9b5158
MR
4.\"
5.\" %%%LICENSE_START(VERBATIM)
6.\" Permission is granted to make and distribute verbatim copies of this
7.\" manual provided the copyright notice and this permission notice are
8.\" preserved on all copies.
9.\"
10.\" Permission is granted to copy and distribute modified versions of this
11.\" manual under the conditions for verbatim copying, provided that the
12.\" entire resulting derived work is distributed under the terms of a
13.\" permission notice identical to this one.
14.\"
15.\" Since the Linux kernel and libraries are constantly changing, this
16.\" manual page may be incorrect or out-of-date. The author(s) assume no
17.\" responsibility for errors or omissions, or for damages resulting from
18.\" the use of the information contained herein. The author(s) may not
19.\" have taken the same level of care in the production of this manual,
20.\" which is licensed free of charge, as they might when working
21.\" professionally.
22.\"
23.\" Formatted or processed versions of this manual, if unaccompanied by
24.\" the source, must acknowledge the copyright and authors of this work.
25.\" %%%LICENSE_END
26.\"
6b621d05 27.TH USERFAULTFD 2 2020-02-09 "Linux" "Linux Programmer's Manual"
bf9b5158 28.SH NAME
68a9e23c 29userfaultfd \- create a file descriptor for handling page faults in user space
bf9b5158
MR
30.SH SYNOPSIS
31.nf
32.B #include <sys/types.h>
3f894561 33.B #include <linux/userfaultfd.h>
68e4db0a 34.PP
bf9b5158
MR
35.BI "int userfaultfd(int " flags );
36.fi
37.PP
38.IR Note :
39There is no glibc wrapper for this system call; see NOTES.
40.SH DESCRIPTION
4aa7f5cf
MK
41.BR userfaultfd ()
42creates a new userfaultfd object that can be used for delegation of page-fault
43handling to a user-space application,
44and returns a file descriptor that refers to the new object.
45The new userfaultfd object is configured using
bf9b5158 46.BR ioctl (2).
efeece04 47.PP
4aa7f5cf 48Once the userfaultfd object is configured, the application can use
bf9b5158
MR
49.BR read (2)
50to receive userfaultfd notifications.
4aa7f5cf
MK
51The reads from userfaultfd may be blocking or non-blocking,
52depending on the value of
bf9b5158
MR
53.I flags
54used for the creation of the userfaultfd or subsequent calls to
4aa7f5cf 55.BR fcntl (2).
efeece04 56.PP
bf9b5158
MR
57The following values may be bitwise ORed in
58.IR flags
59to change the behavior of
60.BR userfaultfd ():
61.TP
62.BR O_CLOEXEC
4aa7f5cf 63Enable the close-on-exec flag for the new userfaultfd file descriptor.
bf9b5158
MR
64See the description of the
65.B O_CLOEXEC
66flag in
4aa7f5cf 67.BR open (2).
bf9b5158
MR
68.TP
69.BR O_NONBLOCK
4aa7f5cf 70Enables non-blocking operation for the userfaultfd object.
bf9b5158
MR
71See the description of the
72.BR O_NONBLOCK
73flag in
74.BR open (2).
7b8695f2
MK
75.PP
76When the last file descriptor referring to a userfaultfd object is closed,
77all memory ranges that were registered with the object are unregistered
5b1c4a1e 78and unread events are flushed.
bf9b5158 79.\"
54e031a1
MK
80.SS Usage
81The userfaultfd mechanism is designed to allow a thread in a multithreaded
82program to perform user-space paging for the other threads in the process.
83When a page fault occurs for one of the regions registered
84to the userfaultfd object,
85the faulting thread is put to sleep and
86an event is generated that can be read via the userfaultfd file descriptor.
87The fault-handling thread reads events from this file descriptor and services
88them using the operations described in
89.BR ioctl_userfaultfd (2).
90When servicing the page fault events,
91the fault-handling thread can trigger a wake-up for the sleeping thread.
efeece04 92.PP
d28aadf6
MR
93It is possible for the faulting threads and the fault-handling threads
94to run in the context of different processes.
95In this case, these threads may belong to different programs,
96and the program that executes the faulting threads
97will not necessarily cooperate with the program that handles the page faults.
98In such non-cooperative mode,
792570de 99the process that monitors userfaultfd and handles page faults
d28aadf6
MR
100needs to be aware of the changes in the virtual memory layout
101of the faulting process to avoid memory corruption.
efeece04 102.PP
5b1c4a1e 103Starting from Linux 4.11,
c2b3668b 104userfaultfd can also notify the fault-handling threads about changes
5b1c4a1e
MR
105in the virtual memory layout of the faulting process.
106In addition, if the faulting process invokes
c2b3668b 107.BR fork (2),
5b1c4a1e
MR
108the userfaultfd objects associated with the parent may be duplicated
109into the child process and the userfaultfd monitor will be notified
522ab2ff
MK
110(via the
111.B UFFD_EVENT_FORK
112described below)
5b1c4a1e
MR
113about the file descriptor associated with the userfault objects
114created for the child process,
522ab2ff 115which allows the userfaultfd monitor to perform user-space paging
5b1c4a1e 116for the child process.
265225c1 117Unlike page faults which have to be synchronous and require an
20050169
MR
118explicit or implicit wakeup,
119all other events are delivered asynchronously and
120the non-cooperative process resumes execution as
265225c1
MK
121soon as the userfaultfd manager executes
122.BR read (2).
123The userfaultfd manager should carefully synchronize calls to
124.B UFFDIO_COPY
125with the processing of events.
efeece04 126.PP
20050169
MR
127The current asynchronous model of the event delivery is optimal for
128single-threaded non-cooperative userfaultfd manager implementations.
94a8ebda
MK
129.\" Regarding the preceding sentence, Mike Rapoport says:
130.\" The major point here is that current events delivery model could be
131.\" problematic for multi-threaded monitor. I even suspect that it would be
132.\" impossible to ensure synchronization between page faults and non-page
133.\" fault events in multi-threaded monitor.
efeece04 134.PP
d28aadf6 135.\" FIXME elaborate about non-cooperating mode, describe its limitations
792570de 136.\" for kernels before 4.11, features added in 4.11
d28aadf6
MR
137.\" and limitations remaining in 4.11
138.\" Maybe it's worth adding a dedicated sub-section...
54e031a1 139.\"
bf9b5158
MR
140.SS Userfaultfd operation
141After the userfaultfd object is created with
4aa7f5cf
MK
142.BR userfaultfd (),
143the application must enable it using the
144.B UFFDIO_API
145.BR ioctl (2)
146operation.
147This operation allows a handshake between the kernel and user space
148to determine the API version and supported features.
5074d846
MK
149This operation must be performed before any of the other
150.BR ioctl (2)
151operations described below (or those operations fail with the
152.BR EINVAL
153error).
efeece04 154.PP
4aa7f5cf
MK
155After a successful
156.B UFFDIO_API
157operation,
158the application then registers memory address ranges using the
159.B UFFDIO_REGISTER
160.BR ioctl (2)
161operation.
162After successful completion of a
163.B UFFDIO_REGISTER
164operation,
165a page fault occurring in the requested memory range, and satisfying
166the mode defined at the registration time, will be forwarded by the kernel to
167the user-space application.
168The application can then use the
169.B UFFDIO_COPY
bf9b5158 170or
99de80c5 171.B UFFDIO_ZEROPAGE
4aa7f5cf
MK
172.BR ioctl (2)
173operations to resolve the page fault.
2c371032 174.PP
e81833be 175Starting from Linux 4.14, if the application sets the
2c371032 176.B UFFD_FEATURE_SIGBUS
e81833be 177feature bit using the
2c371032
PS
178.B UFFDIO_API
179.BR ioctl (2),
e81833be
MK
180no page-fault notification will be forwarded to user space.
181Instead a
2c371032 182.B SIGBUS
e81833be
MK
183signal is delivered to the faulting process.
184With this feature,
185userfaultfd can be used for robustness purposes to simply catch
2c371032
PS
186any access to areas within the registered address range that do not
187have pages allocated, without having to listen to userfaultfd events.
188No userfaultfd monitor will be required for dealing with such memory
e81833be
MK
189accesses.
190For example, this feature can be useful for applications that
2c371032 191want to prevent the kernel from automatically allocating pages and filling
e81833be 192holes in sparse files when the hole is accessed through a memory mapping.
2c371032
PS
193.PP
194The
195.B UFFD_FEATURE_SIGBUS
e81833be
MK
196feature is implicitly inherited through
197.BR fork (2)
198if used in combination with
2c371032 199.BR UFFD_FEATURE_EVENT_FORK .
efeece04 200.PP
6bc6d124 201Details of the various
4aa7f5cf 202.BR ioctl (2)
6bc6d124
MK
203operations can be found in
204.BR ioctl_userfaultfd (2).
efeece04 205.PP
5b1c4a1e
MR
206Since Linux 4.11, events other than page-fault may be enabled during
207.B UFFDIO_API
208operation.
efeece04 209.PP
c3d5d9df
MR
210Up to Linux 4.11,
211userfaultfd can be used only with anonymous private memory mappings.
c3d5d9df
MR
212Since Linux 4.11,
213userfaultfd can be also used with hugetlbfs and shared memory mappings.
efeece04 214.PP
54e031a1
MK
215.\"
216.SS Reading from the userfaultfd structure
54e031a1
MK
217Each
218.BR read (2)
219from the userfaultfd file descriptor returns one or more
220.I uffd_msg
5b1c4a1e
MR
221structures, each of which describes a page-fault event
222or an event required for the non-cooperative userfaultfd usage:
efeece04 223.PP
54e031a1 224.in +4n
b8302363 225.EX
54e031a1 226struct uffd_msg {
c2b3668b 227 __u8 event; /* Type of event */
54e031a1
MK
228 ...
229 union {
00f4853c 230 struct {
c2b3668b
MK
231 __u64 flags; /* Flags describing fault */
232 __u64 address; /* Faulting address */
54e031a1 233 } pagefault;
c2b3668b
MK
234
235 struct { /* Since Linux 4.11 */
236 __u32 ufd; /* Userfault file descriptor
237 of the child process */
238 } fork;
239
240 struct { /* Since Linux 4.11 */
241 __u64 from; /* Old address of remapped area */
242 __u64 to; /* New address of remapped area */
243 __u64 len; /* Original mapping length */
244 } remap;
245
246 struct { /* Since Linux 4.11 */
247 __u64 start; /* Start address of removed area */
248 __u64 end; /* End address of removed area */
249 } remove;
b07243ab 250 ...
54e031a1
MK
251 } arg;
252
253 /* Padding fields omitted */
254} __packed;
b8302363 255.EE
e646a1ba 256.in
efeece04 257.PP
54e031a1
MK
258If multiple events are available and the supplied buffer is large enough,
259.BR read (2)
260returns as many events as will fit in the supplied buffer.
261If the buffer supplied to
262.BR read (2)
263is smaller than the size of the
264.I uffd_msg
265structure, the
266.BR read (2)
267fails with the error
268.BR EINVAL .
efeece04 269.PP
54e031a1
MK
270The fields set in the
271.I uffd_msg
272structure are as follows:
273.TP
274.I event
275The type of event.
5b1c4a1e
MR
276Depending on the event type,
277different fields of the
278.I arg
279union represent details required for the event processing.
280The non-page-fault events are generated only when appropriate feature
281is enabled during API handshake with
282.B UFFDIO_API
283.BR ioctl (2).
efeece04 284.IP
5b1c4a1e
MR
285The following values can appear in the
286.I event
287field:
288.RS
289.TP
6c12d34e 290.BR UFFD_EVENT_PAGEFAULT " (since Linux 4.3)"
5b1c4a1e
MR
291A page-fault event.
292The page-fault details are available in the
293.I pagefault
294field.
54e031a1 295.TP
6c12d34e 296.BR UFFD_EVENT_FORK " (since Linux 4.11)"
5b1c4a1e 297Generated when the faulting process invokes
522ab2ff
MK
298.BR fork (2)
299(or
300.BR clone (2)
301without the
302.BR CLONE_VM
303flag).
5b1c4a1e
MR
304The event details are available in the
305.I fork
306field.
23db3790 307.\" FIXME describe duplication of userfault file descriptor during fork
5b1c4a1e 308.TP
6c12d34e 309.BR UFFD_EVENT_REMAP " (since Linux 4.11)"
5b1c4a1e 310Generated when the faulting process invokes
c2b3668b 311.BR mremap (2).
5b1c4a1e
MR
312The event details are available in the
313.I remap
314field.
315.TP
6c12d34e 316.BR UFFD_EVENT_REMOVE " (since Linux 4.11)"
5b1c4a1e
MR
317Generated when the faulting process invokes
318.BR madvise (2)
c2b3668b 319with
5b1c4a1e
MR
320.BR MADV_DONTNEED
321or
322.BR MADV_REMOVE
323advice.
324The event details are available in the
325.I remove
326field.
327.TP
6c12d34e 328.BR UFFD_EVENT_UNMAP " (since Linux 4.11)"
5b1c4a1e
MR
329Generated when the faulting process unmaps a memory range,
330either explicitly using
331.BR munmap (2)
c2b3668b 332or implicitly during
5b1c4a1e
MR
333.BR mmap (2)
334or
c2b3668b 335.BR mremap (2).
5b1c4a1e
MR
336The event details are available in the
337.I remove
338field.
339.RE
340.TP
341.I pagefault.address
54e031a1
MK
342The address that triggered the page fault.
343.TP
5b1c4a1e 344.I pagefault.flags
54e031a1
MK
345A bit mask of flags that describe the event.
346For
347.BR UFFD_EVENT_PAGEFAULT ,
348the following flag may appear:
349.RS
350.TP
351.B UFFD_PAGEFAULT_FLAG_WRITE
352If the address is in a range that was registered with the
353.B UFFDIO_REGISTER_MODE_MISSING
354flag (see
355.BR ioctl_userfaultfd (2))
356and this flag is set, this is a write fault;
357otherwise it is a read fault.
358.\"
359.\" UFFD_PAGEFAULT_FLAG_WP is not yet supported.
360.RE
5b1c4a1e
MR
361.TP
362.I fork.ufd
363The file descriptor associated with the userfault object
522ab2ff
MK
364created for the child created by
365.BR fork (2).
5b1c4a1e
MR
366.TP
367.I remap.from
368The original address of the memory range that was remapped using
369.BR mremap (2).
370.TP
371.I remap.to
372The new address of the memory range that was remapped using
373.BR mremap (2).
374.TP
375.I remap.len
23db3790 376The original length of the memory range that was remapped using
5b1c4a1e
MR
377.BR mremap (2).
378.TP
379.I remove.start
380The start address of the memory range that was freed using
381.BR madvise (2)
382or unmapped.
383.TP
384.I remove.end
385The end address of the memory range that was freed using
386.BR madvise (2)
387or unmapped.
54e031a1 388.PP
058b32ac
MK
389A
390.BR read (2)
391on a userfaultfd file descriptor can fail with the following errors:
392.TP
393.B EINVAL
394The userfaultfd object has not yet been enabled using the
395.BR UFFDIO_API
396.BR ioctl (2)
397operation.
398.PP
1bf00323
MK
399If the
400.B O_NONBLOCK
401flag is enabled in the associated open file description,
402the userfaultfd file descriptor can be monitored with
54e031a1
MK
403.BR poll (2),
404.BR select (2),
405and
406.BR epoll (7).
407When events are available, the file descriptor is indicated as readable.
1bf00323
MK
408If the
409.B O_NONBLOCK
410flag is not enabled, then
411.BR poll (2)
412(always) indicates the file as having a
413.BR POLLERR
414condition, and
415.BR select (2)
416indicates the file descriptor as both readable and writable.
417.\" FIXME What is the reason for this seemingly odd behavior with respect
418.\" to the O_NONBLOCK flag? (see userfaultfd_poll() in fs/userfaultfd.c).
419.\" Something needs to be said about this.
bf9b5158 420.SH RETURN VALUE
4aa7f5cf
MK
421On success,
422.BR userfaultfd ()
423returns a new file descriptor that refers to the userfaultfd object.
bf9b5158
MR
424On error, \-1 is returned, and
425.I errno
426is set appropriately.
427.SH ERRORS
428.TP
429.B EINVAL
430An unsupported value was specified in
431.IR flags .
432.TP
433.BR EMFILE
434The per-process limit on the number of open file descriptors has been
435reached.
436.TP
437.B ENFILE
438The system-wide limit on the total number of open files has been
439reached.
440.TP
441.B ENOMEM
442Insufficient kernel memory was available.
339b899c 443.TP
c4f13bc7 444.BR EPERM " (since Linux 5.2)"
339b899c 445.\" cefdca0a86be517bc390fc4541e3674b8e7803b0
c4f13bc7
MK
446The caller is not privileged (does not have the
447.B CAP_SYS_PTRACE
2b6923ba 448capability in the initial user namespace), and
c4f13bc7
MK
449.I /proc/sys/vm/unprivileged_userfaultfd
450has the value 0.
0da8f5c1
MK
451.SH VERSIONS
452The
453.BR userfaultfd ()
454system call first appeared in Linux 4.3.
efeece04 455.PP
be9021b1
MR
456The support for hugetlbfs and shared memory areas and
457non-page-fault events was added in Linux 4.11.
bf9b5158
MR
458.SH CONFORMING TO
459.BR userfaultfd ()
460is Linux-specific and should not be used in programs intended to be
461portable.
462.SH NOTES
463Glibc does not provide a wrapper for this system call; call it using
464.BR syscall (2).
efeece04 465.PP
de6943fa
MK
466The userfaultfd mechanism can be used as an alternative to
467traditional user-space paging techniques based on the use of the
468.BR SIGSEGV
469signal and
470.BR mmap (2).
471It can also be used to implement lazy restore
472for checkpoint/restore mechanisms,
473as well as post-copy migration to allow (nearly) uninterrupted execution
18448166
MR
474when transferring virtual machines and Linux containers
475from one host to another.
2a885973
MR
476.SH BUGS
477If the
478.B UFFD_FEATURE_EVENT_FORK
479is enabled and a system call from the
480.BR fork (2)
047a0bb8 481family is interrupted by a signal or failed, a stale userfaultfd descriptor
2a885973 482might be created.
047a0bb8 483In this case, a spurious
2a885973
MR
484.B UFFD_EVENT_FORK
485will be delivered to the userfaultfd monitor.
b07243ab
MK
486.SH EXAMPLE
487The program below demonstrates the use of the userfaultfd mechanism.
488The program creates two threads, one of which acts as the
489page-fault handler for the process, for the pages in a demand-page zero
490region created using
491.BR mmap (2).
efeece04 492.PP
b07243ab
MK
493The program takes one command-line argument,
494which is the number of pages that will be created in a mapping
495whose page faults will be handled via userfaultfd.
496After creating a userfaultfd object,
497the program then creates an anonymous private mapping of the specified size
498and registers the address range of that mapping using the
499.B UFFDIO_REGISTER
500.BR ioctl (2)
501operation.
00f4853c 502The program then creates a second thread that will perform the
b07243ab 503task of handling page faults.
efeece04 504.PP
b07243ab
MK
505The main thread then walks through the pages of the mapping fetching
506bytes from successive pages.
507Because the pages have not yet been accessed,
508the first access of a byte in each page will trigger a page-fault event
509on the userfaultfd file descriptor.
efeece04 510.PP
b07243ab
MK
511Each of the page-fault events is handled by the second thread,
512which sits in a loop processing input from the userfaultfd file descriptor.
513In each loop iteration, the second thread first calls
514.BR poll (2)
515to check the state of the file descriptor,
516and then reads an event from the file descriptor.
517All such events should be
518.B UFFD_EVENT_PAGEFAULT
519events,
520which the thread handles by copying a page of data into
521the faulting region using the
00f4853c 522.B UFFDIO_COPY
b07243ab
MK
523.BR ioctl (2)
524operation.
efeece04 525.PP
b07243ab 526The following is an example of what we see when running the program:
efeece04 527.PP
b07243ab 528.in +4n
b8302363 529.EX
b07243ab
MK
530$ \fB./userfaultfd_demo 3\fP
531Address returned by mmap() = 0x7fd30106c000
532
533fault_handler_thread():
534 poll() returns: nready = 1; POLLIN = 1; POLLERR = 0
535 UFFD_EVENT_PAGEFAULT event: flags = 0; address = 7fd30106c00f
536 (uffdio_copy.copy returned 4096)
537Read address 0x7fd30106c00f in main(): A
538Read address 0x7fd30106c40f in main(): A
539Read address 0x7fd30106c80f in main(): A
540Read address 0x7fd30106cc0f in main(): A
541
542fault_handler_thread():
543 poll() returns: nready = 1; POLLIN = 1; POLLERR = 0
544 UFFD_EVENT_PAGEFAULT event: flags = 0; address = 7fd30106d00f
545 (uffdio_copy.copy returned 4096)
546Read address 0x7fd30106d00f in main(): B
547Read address 0x7fd30106d40f in main(): B
548Read address 0x7fd30106d80f in main(): B
549Read address 0x7fd30106dc0f in main(): B
550
551fault_handler_thread():
552 poll() returns: nready = 1; POLLIN = 1; POLLERR = 0
553 UFFD_EVENT_PAGEFAULT event: flags = 0; address = 7fd30106e00f
554 (uffdio_copy.copy returned 4096)
555Read address 0x7fd30106e00f in main(): C
556Read address 0x7fd30106e40f in main(): C
557Read address 0x7fd30106e80f in main(): C
558Read address 0x7fd30106ec0f in main(): C
b8302363 559.EE
e646a1ba 560.in
b07243ab
MK
561.SS Program source
562\&
e7d0bb47 563.EX
b07243ab 564/* userfaultfd_demo.c
00f4853c 565
b07243ab
MK
566 Licensed under the GNU General Public License version 2 or later.
567*/
568#define _GNU_SOURCE
569#include <sys/types.h>
570#include <stdio.h>
571#include <linux/userfaultfd.h>
572#include <pthread.h>
573#include <errno.h>
574#include <unistd.h>
575#include <stdlib.h>
576#include <fcntl.h>
577#include <signal.h>
578#include <poll.h>
579#include <string.h>
580#include <sys/mman.h>
581#include <sys/syscall.h>
582#include <sys/ioctl.h>
583#include <poll.h>
584
d1a71985 585#define errExit(msg) do { perror(msg); exit(EXIT_FAILURE); \e
b07243ab
MK
586 } while (0)
587
588static int page_size;
589
590static void *
591fault_handler_thread(void *arg)
592{
593 static struct uffd_msg msg; /* Data read from userfaultfd */
594 static int fault_cnt = 0; /* Number of faults so far handled */
595 long uffd; /* userfaultfd file descriptor */
596 static char *page = NULL;
597 struct uffdio_copy uffdio_copy;
598 ssize_t nread;
599
600 uffd = (long) arg;
601
602 /* Create a page that will be copied into the faulting region */
603
604 if (page == NULL) {
605 page = mmap(NULL, page_size, PROT_READ | PROT_WRITE,
606 MAP_PRIVATE | MAP_ANONYMOUS, \-1, 0);
607 if (page == MAP_FAILED)
608 errExit("mmap");
609 }
610
611 /* Loop, handling incoming events on the userfaultfd
612 file descriptor */
613
614 for (;;) {
615
616 /* See what poll() tells us about the userfaultfd */
617
618 struct pollfd pollfd;
619 int nready;
620 pollfd.fd = uffd;
621 pollfd.events = POLLIN;
622 nready = poll(&pollfd, 1, \-1);
623 if (nready == \-1)
624 errExit("poll");
625
d1a71985 626 printf("\enfault_handler_thread():\en");
b07243ab 627 printf(" poll() returns: nready = %d; "
d1a71985 628 "POLLIN = %d; POLLERR = %d\en", nready,
b07243ab
MK
629 (pollfd.revents & POLLIN) != 0,
630 (pollfd.revents & POLLERR) != 0);
631
632 /* Read an event from the userfaultfd */
633
634 nread = read(uffd, &msg, sizeof(msg));
635 if (nread == 0) {
d1a71985 636 printf("EOF on userfaultfd!\en");
b07243ab 637 exit(EXIT_FAILURE);
00f4853c 638 }
b07243ab
MK
639
640 if (nread == \-1)
641 errExit("read");
00f4853c 642
b07243ab
MK
643 /* We expect only one kind of event; verify that assumption */
644
645 if (msg.event != UFFD_EVENT_PAGEFAULT) {
d1a71985 646 fprintf(stderr, "Unexpected event on userfaultfd\en");
b07243ab
MK
647 exit(EXIT_FAILURE);
648 }
649
650 /* Display info about the page\-fault event */
651
652 printf(" UFFD_EVENT_PAGEFAULT event: ");
653 printf("flags = %llx; ", msg.arg.pagefault.flags);
d1a71985 654 printf("address = %llx\en", msg.arg.pagefault.address);
b07243ab
MK
655
656 /* Copy the page pointed to by \(aqpage\(aq into the faulting
657 region. Vary the contents that are copied in, so that it
658 is more obvious that each fault is handled separately. */
659
660 memset(page, \(aqA\(aq + fault_cnt % 20, page_size);
661 fault_cnt++;
662
663 uffdio_copy.src = (unsigned long) page;
664
665 /* We need to handle page faults in units of pages(!).
666 So, round faulting address down to page boundary */
667
668 uffdio_copy.dst = (unsigned long) msg.arg.pagefault.address &
669 ~(page_size \- 1);
670 uffdio_copy.len = page_size;
671 uffdio_copy.mode = 0;
672 uffdio_copy.copy = 0;
673 if (ioctl(uffd, UFFDIO_COPY, &uffdio_copy) == \-1)
674 errExit("ioctl\-UFFDIO_COPY");
675
d1a71985 676 printf(" (uffdio_copy.copy returned %lld)\en",
b07243ab
MK
677 uffdio_copy.copy);
678 }
679}
680
681int
682main(int argc, char *argv[])
683{
684 long uffd; /* userfaultfd file descriptor */
685 char *addr; /* Start of region handled by userfaultfd */
686 unsigned long len; /* Length of region handled by userfaultfd */
687 pthread_t thr; /* ID of thread that handles page faults */
688 struct uffdio_api uffdio_api;
689 struct uffdio_register uffdio_register;
690 int s;
691
692 if (argc != 2) {
d1a71985 693 fprintf(stderr, "Usage: %s num\-pages\en", argv[0]);
b07243ab
MK
694 exit(EXIT_FAILURE);
695 }
696
697 page_size = sysconf(_SC_PAGE_SIZE);
698 len = strtoul(argv[1], NULL, 0) * page_size;
699
700 /* Create and enable userfaultfd object */
701
702 uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
703 if (uffd == \-1)
704 errExit("userfaultfd");
705
706 uffdio_api.api = UFFD_API;
707 uffdio_api.features = 0;
708 if (ioctl(uffd, UFFDIO_API, &uffdio_api) == \-1)
709 errExit("ioctl\-UFFDIO_API");
710
711 /* Create a private anonymous mapping. The memory will be
712 demand\-zero paged\-\-that is, not yet allocated. When we
713 actually touch the memory, it will be allocated via
714 the userfaultfd. */
715
716 addr = mmap(NULL, len, PROT_READ | PROT_WRITE,
717 MAP_PRIVATE | MAP_ANONYMOUS, \-1, 0);
718 if (addr == MAP_FAILED)
719 errExit("mmap");
720
d1a71985 721 printf("Address returned by mmap() = %p\en", addr);
b07243ab
MK
722
723 /* Register the memory range of the mapping we just created for
724 handling by the userfaultfd object. In mode, we request to track
725 missing pages (i.e., pages that have not yet been faulted in). */
726
727 uffdio_register.range.start = (unsigned long) addr;
728 uffdio_register.range.len = len;
729 uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
730 if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) == \-1)
731 errExit("ioctl\-UFFDIO_REGISTER");
732
733 /* Create a thread that will process the userfaultfd events */
734
735 s = pthread_create(&thr, NULL, fault_handler_thread, (void *) uffd);
736 if (s != 0) {
737 errno = s;
738 errExit("pthread_create");
739 }
740
741 /* Main thread now touches memory in the mapping, touching
742 locations 1024 bytes apart. This will trigger userfaultfd
743 events for all pages in the region. */
744
745 int l;
746 l = 0xf; /* Ensure that faulting address is not on a page
747 boundary, in order to test that we correctly
748 handle that case in fault_handler_thread() */
749 while (l < len) {
750 char c = addr[l];
751 printf("Read address %p in main(): ", addr + l);
d1a71985 752 printf("%c\en", c);
b07243ab
MK
753 l += 1024;
754 usleep(100000); /* Slow things down a little */
755 }
756
757 exit(EXIT_SUCCESS);
758}
e7d0bb47 759.EE
bf9b5158
MR
760.SH SEE ALSO
761.BR fcntl (2),
4aa7f5cf 762.BR ioctl (2),
6bc6d124 763.BR ioctl_userfaultfd (2),
6d3e02a5 764.BR madvise (2),
4aa7f5cf 765.BR mmap (2)
efeece04 766.PP
a2463bae 767.IR Documentation/admin-guide/mm/userfaultfd.rst
bf9b5158 768in the Linux kernel source tree
efeece04 769.PP