]> git.ipfire.org Git - thirdparty/man-pages.git/blame - man2/userfaultfd.2
Many pages: Use correct letter case in page titles (TH)
[thirdparty/man-pages.git] / man2 / userfaultfd.2
CommitLineData
bf9b5158
MR
1.\" Copyright (c) 2016, IBM Corporation.
2.\" Written by Mike Rapoport <rppt@linux.vnet.ibm.com>
54e031a1 3.\" and Copyright (C) 2017 Michael Kerrisk <mtk.manpages@gmail.com>
bf9b5158 4.\"
5fbde956 5.\" SPDX-License-Identifier: Linux-man-pages-copyleft
bf9b5158 6.\"
4c1c5274 7.TH userfaultfd 2 (date) "Linux man-pages (unreleased)"
bf9b5158 8.SH NAME
68a9e23c 9userfaultfd \- create a file descriptor for handling page faults in user space
26aba212
AC
10.SH LIBRARY
11Standard C library
8fc3b2cf 12.RI ( libc ", " \-lc )
bf9b5158
MR
13.SH SYNOPSIS
14.nf
428c8633
PX
15.BR "#include <fcntl.h>" " /* Definition of " O_* " constants */"
16.BR "#include <sys/syscall.h>" " /* Definition of " SYS_* " constants */"
17.BR "#include <linux/userfaultfd.h>" " /* Definition of " UFFD_* " constants */"
e1960e7e 18.B #include <unistd.h>
68e4db0a 19.PP
e1960e7e 20.BI "int syscall(SYS_userfaultfd, int " flags );
bf9b5158
MR
21.fi
22.PP
23.IR Note :
e1960e7e
AC
24glibc provides no wrapper for
25.BR userfaultfd (),
26necessitating the use of
27.BR syscall (2).
bf9b5158 28.SH DESCRIPTION
4aa7f5cf
MK
29.BR userfaultfd ()
30creates a new userfaultfd object that can be used for delegation of page-fault
31handling to a user-space application,
32and returns a file descriptor that refers to the new object.
33The new userfaultfd object is configured using
bf9b5158 34.BR ioctl (2).
efeece04 35.PP
4aa7f5cf 36Once the userfaultfd object is configured, the application can use
bf9b5158
MR
37.BR read (2)
38to receive userfaultfd notifications.
4aa7f5cf
MK
39The reads from userfaultfd may be blocking or non-blocking,
40depending on the value of
bf9b5158
MR
41.I flags
42used for the creation of the userfaultfd or subsequent calls to
4aa7f5cf 43.BR fcntl (2).
efeece04 44.PP
bf9b5158 45The following values may be bitwise ORed in
1ae6b2c7 46.I flags
bf9b5158
MR
47to change the behavior of
48.BR userfaultfd ():
49.TP
1ae6b2c7 50.B O_CLOEXEC
4aa7f5cf 51Enable the close-on-exec flag for the new userfaultfd file descriptor.
bf9b5158
MR
52See the description of the
53.B O_CLOEXEC
54flag in
4aa7f5cf 55.BR open (2).
bf9b5158 56.TP
1ae6b2c7 57.B O_NONBLOCK
4aa7f5cf 58Enables non-blocking operation for the userfaultfd object.
bf9b5158 59See the description of the
1ae6b2c7 60.B O_NONBLOCK
bf9b5158
MR
61flag in
62.BR open (2).
428c8633
PX
63.TP
64.B UFFD_USER_MODE_ONLY
0f8ae01d
AC
65This is a userfaultfd-specific flag that was introduced in Linux 5.11.
66When set, the userfaultfd object will only be able to handle
67page faults originating from user space on the registered regions.
68When a kernel-originated fault is triggered
69on the registered range with this userfaultfd, a
428c8633
PX
70.B SIGBUS
71signal will be delivered.
7b8695f2
MK
72.PP
73When the last file descriptor referring to a userfaultfd object is closed,
74all memory ranges that were registered with the object are unregistered
5b1c4a1e 75and unread events are flushed.
bf9b5158 76.\"
e70f957d 77.PP
140b625a 78Userfaultfd supports three modes of registration:
4b338b38
PX
79.TP
80.BR UFFDIO_REGISTER_MODE_MISSING " (since 4.10)"
81When registered with
82.B UFFDIO_REGISTER_MODE_MISSING
7a3d0845 83mode, user-space will receive a page-fault notification
4b338b38
PX
84when a missing page is accessed.
85The faulted thread will be stopped from execution until the page fault is
7a3d0845 86resolved from user-space by either an
4b338b38
PX
87.B UFFDIO_COPY
88or an
89.B UFFDIO_ZEROPAGE
90ioctl.
91.TP
140b625a
AR
92.BR UFFDIO_REGISTER_MODE_MINOR " (since 5.13)"
93When registered with
94.B UFFDIO_REGISTER_MODE_MINOR
95mode, user-space will receive a page-fault notification
96when a minor page fault occurs.
39a3ee3f
AC
97That is,
98when a backing page is in the page cache,
99but page table entries don't yet exist.
140b625a
AR
100The faulted thread will be stopped from execution
101until the page fault is resolved from user-space by an
102.B UFFDIO_CONTINUE
103ioctl.
104.TP
4b338b38
PX
105.BR UFFDIO_REGISTER_MODE_WP " (since 5.7)"
106When registered with
107.B UFFDIO_REGISTER_MODE_WP
7a3d0845 108mode, user-space will receive a page-fault notification
4b338b38
PX
109when a write-protected page is written.
110The faulted thread will be stopped from execution
7a3d0845 111until user-space write-unprotects the page using an
4b338b38
PX
112.B UFFDIO_WRITEPROTECT
113ioctl.
114.PP
115Multiple modes can be enabled at the same time for the same memory range.
116.PP
7a3d0845
MK
117Since Linux 4.14, a userfaultfd page-fault notification can selectively embed
118faulting thread ID information into the notification.
e70f957d
PX
119One needs to enable this feature explicitly using the
120.B UFFD_FEATURE_THREAD_ID
121feature bit when initializing the userfaultfd context.
122By default, thread ID reporting is disabled.
54e031a1
MK
123.SS Usage
124The userfaultfd mechanism is designed to allow a thread in a multithreaded
125program to perform user-space paging for the other threads in the process.
126When a page fault occurs for one of the regions registered
127to the userfaultfd object,
128the faulting thread is put to sleep and
129an event is generated that can be read via the userfaultfd file descriptor.
130The fault-handling thread reads events from this file descriptor and services
131them using the operations described in
132.BR ioctl_userfaultfd (2).
133When servicing the page fault events,
134the fault-handling thread can trigger a wake-up for the sleeping thread.
efeece04 135.PP
d28aadf6
MR
136It is possible for the faulting threads and the fault-handling threads
137to run in the context of different processes.
138In this case, these threads may belong to different programs,
139and the program that executes the faulting threads
140will not necessarily cooperate with the program that handles the page faults.
141In such non-cooperative mode,
792570de 142the process that monitors userfaultfd and handles page faults
d28aadf6
MR
143needs to be aware of the changes in the virtual memory layout
144of the faulting process to avoid memory corruption.
efeece04 145.PP
4b338b38 146Since Linux 4.11,
c2b3668b 147userfaultfd can also notify the fault-handling threads about changes
5b1c4a1e
MR
148in the virtual memory layout of the faulting process.
149In addition, if the faulting process invokes
c2b3668b 150.BR fork (2),
5b1c4a1e
MR
151the userfaultfd objects associated with the parent may be duplicated
152into the child process and the userfaultfd monitor will be notified
522ab2ff
MK
153(via the
154.B UFFD_EVENT_FORK
155described below)
5b1c4a1e
MR
156about the file descriptor associated with the userfaultfd objects
157created for the child process,
522ab2ff 158which allows the userfaultfd monitor to perform user-space paging
5b1c4a1e 159for the child process.
265225c1 160Unlike page faults which have to be synchronous and require an
20050169
MR
161explicit or implicit wakeup,
162all other events are delivered asynchronously and
163the non-cooperative process resumes execution as
265225c1
MK
164soon as the userfaultfd manager executes
165.BR read (2).
166The userfaultfd manager should carefully synchronize calls to
167.B UFFDIO_COPY
168with the processing of events.
efeece04 169.PP
20050169
MR
170The current asynchronous model of the event delivery is optimal for
171single-threaded non-cooperative userfaultfd manager implementations.
94a8ebda
MK
172.\" Regarding the preceding sentence, Mike Rapoport says:
173.\" The major point here is that current events delivery model could be
174.\" problematic for multi-threaded monitor. I even suspect that it would be
175.\" impossible to ensure synchronization between page faults and non-page
176.\" fault events in multi-threaded monitor.
fec6cf90 177.\" .PP
d28aadf6 178.\" FIXME elaborate about non-cooperating mode, describe its limitations
792570de 179.\" for kernels before 4.11, features added in 4.11
d28aadf6
MR
180.\" and limitations remaining in 4.11
181.\" Maybe it's worth adding a dedicated sub-section...
54e031a1 182.\"
4b338b38
PX
183.PP
184Since Linux 5.7, userfaultfd is able to do
185synchronous page dirty tracking using the new write-protect register mode.
186One should check against the feature bit
187.B UFFD_FEATURE_PAGEFAULT_FLAG_WP
188before using this feature.
189Similar to the original userfaultfd missing mode, the write-protect mode will
7a3d0845 190generate a userfaultfd notification when the protected page is written.
4b338b38 191The user needs to resolve the page fault by unprotecting the faulted page and
7a3d0845
MK
192kicking the faulted thread to continue.
193For more information,
194please refer to the "Userfaultfd write-protect mode" section.
195.\"
bf9b5158
MR
196.SS Userfaultfd operation
197After the userfaultfd object is created with
4aa7f5cf
MK
198.BR userfaultfd (),
199the application must enable it using the
200.B UFFDIO_API
201.BR ioctl (2)
202operation.
203This operation allows a handshake between the kernel and user space
204to determine the API version and supported features.
5074d846
MK
205This operation must be performed before any of the other
206.BR ioctl (2)
207operations described below (or those operations fail with the
1ae6b2c7 208.B EINVAL
5074d846 209error).
efeece04 210.PP
4aa7f5cf
MK
211After a successful
212.B UFFDIO_API
213operation,
214the application then registers memory address ranges using the
215.B UFFDIO_REGISTER
216.BR ioctl (2)
217operation.
218After successful completion of a
219.B UFFDIO_REGISTER
220operation,
221a page fault occurring in the requested memory range, and satisfying
222the mode defined at the registration time, will be forwarded by the kernel to
223the user-space application.
224The application can then use the
140b625a
AR
225.B UFFDIO_COPY ,
226.B UFFDIO_ZEROPAGE ,
bf9b5158 227or
140b625a 228.B UFFDIO_CONTINUE
4aa7f5cf
MK
229.BR ioctl (2)
230operations to resolve the page fault.
2c371032 231.PP
4b338b38 232Since Linux 4.14, if the application sets the
2c371032 233.B UFFD_FEATURE_SIGBUS
e81833be 234feature bit using the
2c371032
PS
235.B UFFDIO_API
236.BR ioctl (2),
e81833be
MK
237no page-fault notification will be forwarded to user space.
238Instead a
2c371032 239.B SIGBUS
e81833be
MK
240signal is delivered to the faulting process.
241With this feature,
242userfaultfd can be used for robustness purposes to simply catch
2c371032
PS
243any access to areas within the registered address range that do not
244have pages allocated, without having to listen to userfaultfd events.
245No userfaultfd monitor will be required for dealing with such memory
e81833be
MK
246accesses.
247For example, this feature can be useful for applications that
2c371032 248want to prevent the kernel from automatically allocating pages and filling
e81833be 249holes in sparse files when the hole is accessed through a memory mapping.
2c371032
PS
250.PP
251The
252.B UFFD_FEATURE_SIGBUS
e81833be
MK
253feature is implicitly inherited through
254.BR fork (2)
255if used in combination with
2c371032 256.BR UFFD_FEATURE_EVENT_FORK .
efeece04 257.PP
6bc6d124 258Details of the various
4aa7f5cf 259.BR ioctl (2)
6bc6d124
MK
260operations can be found in
261.BR ioctl_userfaultfd (2).
efeece04 262.PP
5b1c4a1e
MR
263Since Linux 4.11, events other than page faults may be enabled during the
264.B UFFDIO_API
265operation.
efeece04 266.PP
c3d5d9df
MR
267Before Linux 4.11,
268userfaultfd can be used only with anonymous private memory mappings.
c3d5d9df
MR
269Since Linux 4.11,
270userfaultfd can also be used with hugetlbfs and shared memory mappings.
54e031a1 271.\"
4b338b38 272.SS Userfaultfd write-protect mode (since 5.7)
827e7731 273Since Linux 5.7, userfaultfd supports write-protect mode for anonymous memory.
4b338b38
PX
274The user needs to first check availability of this feature using
275.B UFFDIO_API
276ioctl against the feature bit
277.B UFFD_FEATURE_PAGEFAULT_FLAG_WP
278before using this feature.
279.PP
0f8ae01d
AC
280Since Linux 5.19,
281the write-protection mode is also supported on
282shmem and hugetlbfs memory types.
827e7731
PX
283It can be detected with the feature bit
284.BR UFFD_FEATURE_WP_HUGETLBFS_SHMEM .
285.PP
4b338b38
PX
286To register with userfaultfd write-protect mode, the user needs to initiate the
287.B UFFDIO_REGISTER
288ioctl with mode
289.B UFFDIO_REGISTER_MODE_WP
290set.
9ae36f18 291Note that it is legal to monitor the same memory range with multiple modes.
4b338b38
PX
292For example, the user can do
293.B UFFDIO_REGISTER
294with the mode set to
295.BR "UFFDIO_REGISTER_MODE_MISSING | UFFDIO_REGISTER_MODE_WP" .
296When there is only
297.B UFFDIO_REGISTER_MODE_WP
7a3d0845 298registered, user-space will
4b338b38 299.I not
7a3d0845
MK
300receive any notification when a missing page is written.
301Instead, user-space will receive a write-protect page-fault notification
302only when an existing but write-protected page is written.
4b338b38
PX
303.PP
304After the
305.B UFFDIO_REGISTER
306ioctl completed with
307.B UFFDIO_REGISTER_MODE_WP
308mode set,
309the user can write-protect any existing memory within the range using the ioctl
310.B UFFDIO_WRITEPROTECT
311where
312.I uffdio_writeprotect.mode
313should be set to
314.BR UFFDIO_WRITEPROTECT_MODE_WP .
315.PP
316When a write-protect event happens,
7a3d0845 317user-space will receive a page-fault notification whose
4b338b38
PX
318.I uffd_msg.pagefault.flags
319will have the
320.B UFFD_PAGEFAULT_FLAG_WP
321flag set.
7a3d0845
MK
322Note: since only writes can trigger this kind of fault,
323write-protect notifications will always have the
4b338b38 324.B UFFD_PAGEFAULT_FLAG_WRITE
7a3d0845 325bit set along with the
1ae6b2c7 326.B UFFD_PAGEFAULT_FLAG_WP
7a3d0845 327bit.
4b338b38
PX
328.PP
329To resolve a write-protection page fault, the user should initiate another
330.B UFFDIO_WRITEPROTECT
331ioctl, whose
332.I uffdio_writeprotect.mode
333should have the flag
334.B UFFDIO_WRITEPROTECT_MODE_WP
335cleared upon the faulted page or range.
140b625a
AR
336.\"
337.SS Userfaultfd minor fault mode (since 5.13)
338Since Linux 5.13,
339userfaultfd supports minor fault mode.
340In this mode,
341fault messages are produced not for major faults
342(where the page was missing),
343but rather for minor faults,
344where a page exists in the page cache,
345but the page table entries are not yet present.
346The user needs to first check availability of this feature using the
347.B UFFDIO_API
348ioctl with the appropriate feature bits set before using this feature:
349.B UFFD_FEATURE_MINOR_HUGETLBFS
350since Linux 5.13,
351or
352.B UFFD_FEATURE_MINOR_SHMEM
353since Linux 5.14.
354.PP
355To register with userfaultfd minor fault mode,
356the user needs to initiate the
357.B UFFDIO_REGISTER
358ioctl with mode
359.B UFFDIO_REGISTER_MODE_MINOR
360set.
361.PP
362When a minor fault occurs,
363user-space will receive a page-fault notification
364whose
365.I uffd_msg.pagefault.flags
366will have the
367.B UFFD_PAGEFAULT_FLAG_MINOR
368flag set.
369.PP
370To resolve a minor page fault,
371the handler should decide whether or not
372the existing page contents need to be modified first.
373If so,
374this should be done in-place via a second,
375non-userfaultfd-registered mapping
376to the same backing page
377(e.g., by mapping the shmem or hugetlbfs file twice).
378Once the page is considered "up to date",
379the fault can be resolved by initiating an
380.B UFFDIO_CONTINUE
381ioctl,
382which installs the page table entries and
383(by default)
384wakes up the faulting thread(s).
385.PP
386Minor fault mode supports only hugetlbfs-backed (since Linux 5.13)
387and shmem-backed (since Linux 5.14) memory.
388.\"
54e031a1 389.SS Reading from the userfaultfd structure
54e031a1
MK
390Each
391.BR read (2)
392from the userfaultfd file descriptor returns one or more
393.I uffd_msg
5b1c4a1e
MR
394structures, each of which describes a page-fault event
395or an event required for the non-cooperative userfaultfd usage:
efeece04 396.PP
54e031a1 397.in +4n
b8302363 398.EX
54e031a1 399struct uffd_msg {
115b4e0e 400 __u8 event; /* Type of event */
54e031a1
MK
401 ...
402 union {
00f4853c 403 struct {
115b4e0e
AC
404 __u64 flags; /* Flags describing fault */
405 __u64 address; /* Faulting address */
e70f957d 406 union {
115b4e0e 407 __u32 ptid; /* Thread ID of the fault */
e70f957d 408 } feat;
54e031a1 409 } pagefault;
c2b3668b 410
115b4e0e
AC
411 struct { /* Since Linux 4.11 */
412 __u32 ufd; /* Userfault file descriptor
413 of the child process */
c2b3668b
MK
414 } fork;
415
115b4e0e
AC
416 struct { /* Since Linux 4.11 */
417 __u64 from; /* Old address of remapped area */
418 __u64 to; /* New address of remapped area */
419 __u64 len; /* Original mapping length */
c2b3668b
MK
420 } remap;
421
115b4e0e
AC
422 struct { /* Since Linux 4.11 */
423 __u64 start; /* Start address of removed area */
424 __u64 end; /* End address of removed area */
c2b3668b 425 } remove;
b07243ab 426 ...
54e031a1
MK
427 } arg;
428
429 /* Padding fields omitted */
430} __packed;
b8302363 431.EE
e646a1ba 432.in
efeece04 433.PP
54e031a1
MK
434If multiple events are available and the supplied buffer is large enough,
435.BR read (2)
436returns as many events as will fit in the supplied buffer.
437If the buffer supplied to
438.BR read (2)
439is smaller than the size of the
440.I uffd_msg
441structure, the
442.BR read (2)
443fails with the error
444.BR EINVAL .
efeece04 445.PP
54e031a1
MK
446The fields set in the
447.I uffd_msg
448structure are as follows:
449.TP
450.I event
451The type of event.
5b1c4a1e
MR
452Depending on the event type,
453different fields of the
454.I arg
455union represent details required for the event processing.
456The non-page-fault events are generated only when the appropriate feature
457is enabled during the API handshake with the
458.B UFFDIO_API
459.BR ioctl (2).
efeece04 460.IP
5b1c4a1e
MR
461The following values can appear in the
462.I event
463field:
464.RS
465.TP
6c12d34e 466.BR UFFD_EVENT_PAGEFAULT " (since Linux 4.3)"
5b1c4a1e
MR
467A page-fault event.
468The page-fault details are available in the
469.I pagefault
470field.
54e031a1 471.TP
6c12d34e 472.BR UFFD_EVENT_FORK " (since Linux 4.11)"
5b1c4a1e 473Generated when the faulting process invokes
522ab2ff
MK
474.BR fork (2)
475(or
476.BR clone (2)
477without the
1ae6b2c7 478.B CLONE_VM
522ab2ff 479flag).
5b1c4a1e
MR
480The event details are available in the
481.I fork
482field.
23db3790 483.\" FIXME describe duplication of userfault file descriptor during fork
5b1c4a1e 484.TP
6c12d34e 485.BR UFFD_EVENT_REMAP " (since Linux 4.11)"
5b1c4a1e 486Generated when the faulting process invokes
c2b3668b 487.BR mremap (2).
5b1c4a1e
MR
488The event details are available in the
489.I remap
490field.
491.TP
6c12d34e 492.BR UFFD_EVENT_REMOVE " (since Linux 4.11)"
5b1c4a1e
MR
493Generated when the faulting process invokes
494.BR madvise (2)
c2b3668b 495with
1ae6b2c7 496.B MADV_DONTNEED
5b1c4a1e 497or
1ae6b2c7 498.B MADV_REMOVE
5b1c4a1e
MR
499advice.
500The event details are available in the
501.I remove
502field.
503.TP
6c12d34e 504.BR UFFD_EVENT_UNMAP " (since Linux 4.11)"
5b1c4a1e
MR
505Generated when the faulting process unmaps a memory range,
506either explicitly using
507.BR munmap (2)
c2b3668b 508or implicitly during
5b1c4a1e
MR
509.BR mmap (2)
510or
c2b3668b 511.BR mremap (2).
5b1c4a1e
MR
512The event details are available in the
513.I remove
514field.
515.RE
516.TP
517.I pagefault.address
54e031a1
MK
518The address that triggered the page fault.
519.TP
5b1c4a1e 520.I pagefault.flags
54e031a1
MK
521A bit mask of flags that describe the event.
522For
523.BR UFFD_EVENT_PAGEFAULT ,
524the following flags may appear:
525.RS
526.TP
140b625a
AR
527.B UFFD_PAGEFAULT_FLAG_WP
528If this flag is set, then the fault was a write-protect fault.
529.TP
530.B UFFD_PAGEFAULT_FLAG_MINOR
531If this flag is set, then the fault was a minor fault.
4b338b38 532.TP
140b625a
AR
533.B UFFD_PAGEFAULT_FLAG_WRITE
534If this flag is set, then the fault was a write fault.
535.PP
536If neither
4b338b38 537.B UFFD_PAGEFAULT_FLAG_WP
140b625a
AR
538nor
539.B UFFD_PAGEFAULT_FLAG_MINOR
540are set, then the fault was a missing fault.
54e031a1 541.RE
5b1c4a1e 542.TP
e70f957d
PX
543.I pagefault.feat.ptid
544The thread ID that triggered the page fault.
545.TP
5b1c4a1e
MR
546.I fork.ufd
547The file descriptor associated with the userfault object
522ab2ff
MK
548created for the child created by
549.BR fork (2).
5b1c4a1e
MR
550.TP
551.I remap.from
552The original address of the memory range that was remapped using
553.BR mremap (2).
554.TP
555.I remap.to
556The new address of the memory range that was remapped using
557.BR mremap (2).
558.TP
559.I remap.len
23db3790 560The original length of the memory range that was remapped using
5b1c4a1e
MR
561.BR mremap (2).
562.TP
563.I remove.start
564The start address of the memory range that was freed using
565.BR madvise (2)
566or unmapped.
567.TP
568.I remove.end
569The end address of the memory range that was freed using
570.BR madvise (2)
571or unmapped.
54e031a1 572.PP
058b32ac
MK
573A
574.BR read (2)
575on a userfaultfd file descriptor can fail with the following errors:
576.TP
577.B EINVAL
578The userfaultfd object has not yet been enabled using the
1ae6b2c7 579.B UFFDIO_API
058b32ac
MK
580.BR ioctl (2)
581operation.
582.PP
1bf00323
MK
583If the
584.B O_NONBLOCK
585flag is enabled in the associated open file description,
586the userfaultfd file descriptor can be monitored with
54e031a1
MK
587.BR poll (2),
588.BR select (2),
589and
590.BR epoll (7).
591When events are available, the file descriptor is indicated as readable.
1bf00323
MK
592If the
593.B O_NONBLOCK
594flag is not enabled, then
595.BR poll (2)
596(always) indicates the file as having a
1ae6b2c7 597.B POLLERR
1bf00323
MK
598condition, and
599.BR select (2)
600indicates the file descriptor as both readable and writable.
601.\" FIXME What is the reason for this seemingly odd behavior with respect
602.\" to the O_NONBLOCK flag? (see userfaultfd_poll() in fs/userfaultfd.c).
603.\" Something needs to be said about this.
bf9b5158 604.SH RETURN VALUE
4aa7f5cf
MK
605On success,
606.BR userfaultfd ()
607returns a new file descriptor that refers to the userfaultfd object.
bf9b5158
MR
608On error, \-1 is returned, and
609.I errno
f6a4078b 610is set to indicate the error.
bf9b5158
MR
611.SH ERRORS
612.TP
613.B EINVAL
614An unsupported value was specified in
615.IR flags .
616.TP
1ae6b2c7 617.B EMFILE
bf9b5158
MR
618The per-process limit on the number of open file descriptors has been
619reached.
620.TP
621.B ENFILE
622The system-wide limit on the total number of open files has been
623reached.
624.TP
625.B ENOMEM
626Insufficient kernel memory was available.
339b899c 627.TP
c4f13bc7 628.BR EPERM " (since Linux 5.2)"
339b899c 629.\" cefdca0a86be517bc390fc4541e3674b8e7803b0
c4f13bc7
MK
630The caller is not privileged (does not have the
631.B CAP_SYS_PTRACE
2b6923ba 632capability in the initial user namespace), and
c4f13bc7
MK
633.I /proc/sys/vm/unprivileged_userfaultfd
634has the value 0.
0da8f5c1
MK
635.SH VERSIONS
636The
637.BR userfaultfd ()
638system call first appeared in Linux 4.3.
efeece04 639.PP
be9021b1
MR
640The support for hugetlbfs and shared memory areas and
641non-page-fault events was added in Linux 4.11.
3113c7f3 642.SH STANDARDS
bf9b5158
MR
643.BR userfaultfd ()
644is Linux-specific and should not be used in programs intended to be
645portable.
646.SH NOTES
de6943fa
MK
647The userfaultfd mechanism can be used as an alternative to
648traditional user-space paging techniques based on the use of the
1ae6b2c7 649.B SIGSEGV
de6943fa
MK
650signal and
651.BR mmap (2).
652It can also be used to implement lazy restore
653for checkpoint/restore mechanisms,
654as well as post-copy migration to allow (nearly) uninterrupted execution
18448166
MR
655when transferring virtual machines and Linux containers
656from one host to another.
2a885973
MR
657.SH BUGS
658If the
659.B UFFD_FEATURE_EVENT_FORK
660feature is enabled and a system call from the
661.BR fork (2)
047a0bb8 662family is interrupted by a signal or fails, a stale userfaultfd descriptor
2a885973 663might be created.
047a0bb8 664In this case, a spurious
2a885973
MR
665.B UFFD_EVENT_FORK
666will be delivered to the userfaultfd monitor.
a14af333 667.SH EXAMPLES
b07243ab
MK
668The program below demonstrates the use of the userfaultfd mechanism.
669The program creates two threads, one of which acts as the
670page-fault handler for the process, for the pages in a demand-page zero
671region created using
672.BR mmap (2).
efeece04 673.PP
b07243ab
MK
674The program takes one command-line argument,
675which is the number of pages that will be created in a mapping
676whose page faults will be handled via userfaultfd.
677After creating a userfaultfd object,
678the program then creates an anonymous private mapping of the specified size
679and registers the address range of that mapping using the
680.B UFFDIO_REGISTER
681.BR ioctl (2)
682operation.
00f4853c 683The program then creates a second thread that will perform the
b07243ab 684task of handling page faults.
efeece04 685.PP
b07243ab
MK
686The main thread then walks through the pages of the mapping fetching
687bytes from successive pages.
688Because the pages have not yet been accessed,
689the first access of a byte in each page will trigger a page-fault event
690on the userfaultfd file descriptor.
efeece04 691.PP
b07243ab
MK
692Each of the page-fault events is handled by the second thread,
693which sits in a loop processing input from the userfaultfd file descriptor.
694In each loop iteration, the second thread first calls
695.BR poll (2)
696to check the state of the file descriptor,
697and then reads an event from the file descriptor.
698All such events should be
699.B UFFD_EVENT_PAGEFAULT
700events,
701which the thread handles by copying a page of data into
702the faulting region using the
00f4853c 703.B UFFDIO_COPY
b07243ab
MK
704.BR ioctl (2)
705operation.
efeece04 706.PP
b07243ab 707The following is an example of what we see when running the program:
efeece04 708.PP
b07243ab 709.in +4n
b8302363 710.EX
b07243ab
MK
711$ \fB./userfaultfd_demo 3\fP
712Address returned by mmap() = 0x7fd30106c000
713
714fault_handler_thread():
715 poll() returns: nready = 1; POLLIN = 1; POLLERR = 0
716 UFFD_EVENT_PAGEFAULT event: flags = 0; address = 7fd30106c00f
717 (uffdio_copy.copy returned 4096)
718Read address 0x7fd30106c00f in main(): A
719Read address 0x7fd30106c40f in main(): A
720Read address 0x7fd30106c80f in main(): A
721Read address 0x7fd30106cc0f in main(): A
722
723fault_handler_thread():
724 poll() returns: nready = 1; POLLIN = 1; POLLERR = 0
725 UFFD_EVENT_PAGEFAULT event: flags = 0; address = 7fd30106d00f
726 (uffdio_copy.copy returned 4096)
727Read address 0x7fd30106d00f in main(): B
728Read address 0x7fd30106d40f in main(): B
729Read address 0x7fd30106d80f in main(): B
730Read address 0x7fd30106dc0f in main(): B
731
732fault_handler_thread():
733 poll() returns: nready = 1; POLLIN = 1; POLLERR = 0
734 UFFD_EVENT_PAGEFAULT event: flags = 0; address = 7fd30106e00f
735 (uffdio_copy.copy returned 4096)
736Read address 0x7fd30106e00f in main(): C
737Read address 0x7fd30106e40f in main(): C
738Read address 0x7fd30106e80f in main(): C
739Read address 0x7fd30106ec0f in main(): C
b8302363 740.EE
e646a1ba 741.in
b07243ab
MK
742.SS Program source
743\&
33857069 744.\" SRC BEGIN (userfaultfd.c)
e7d0bb47 745.EX
b07243ab 746/* userfaultfd_demo.c
00f4853c 747
b07243ab
MK
748 Licensed under the GNU General Public License version 2 or later.
749*/
750#define _GNU_SOURCE
5a5208c1 751#include <err.h>
47b94bbd
AC
752#include <errno.h>
753#include <fcntl.h>
f6fbffea 754#include <inttypes.h>
b07243ab 755#include <linux/userfaultfd.h>
47b94bbd 756#include <poll.h>
b07243ab 757#include <pthread.h>
47b94bbd 758#include <stdio.h>
b07243ab 759#include <stdlib.h>
b07243ab 760#include <string.h>
47b94bbd 761#include <sys/ioctl.h>
b07243ab
MK
762#include <sys/mman.h>
763#include <sys/syscall.h>
47b94bbd 764#include <unistd.h>
b07243ab 765
b07243ab
MK
766static int page_size;
767
768static void *
769fault_handler_thread(void *arg)
770{
0b94bd78
AC
771 int nready;
772 long uffd; /* userfaultfd file descriptor */
773 ssize_t nread;
774 struct pollfd pollfd;
775 struct uffdio_copy uffdio_copy;
776
777 static int fault_cnt = 0; /* Number of faults so far handled */
778 static char *page = NULL;
779 static struct uffd_msg msg; /* Data read from userfaultfd */
b07243ab
MK
780
781 uffd = (long) arg;
782
c6beb8a1 783 /* Create a page that will be copied into the faulting region. */
b07243ab
MK
784
785 if (page == NULL) {
786 page = mmap(NULL, page_size, PROT_READ | PROT_WRITE,
787 MAP_PRIVATE | MAP_ANONYMOUS, \-1, 0);
788 if (page == MAP_FAILED)
5a5208c1 789 err(EXIT_FAILURE, "mmap");
b07243ab
MK
790 }
791
792 /* Loop, handling incoming events on the userfaultfd
c6beb8a1 793 file descriptor. */
b07243ab
MK
794
795 for (;;) {
796
c6beb8a1 797 /* See what poll() tells us about the userfaultfd. */
b07243ab 798
b07243ab
MK
799 pollfd.fd = uffd;
800 pollfd.events = POLLIN;
801 nready = poll(&pollfd, 1, \-1);
802 if (nready == \-1)
5a5208c1 803 err(EXIT_FAILURE, "poll");
b07243ab 804
d1a71985 805 printf("\enfault_handler_thread():\en");
b07243ab 806 printf(" poll() returns: nready = %d; "
4687ab0e
AC
807 "POLLIN = %d; POLLERR = %d\en", nready,
808 (pollfd.revents & POLLIN) != 0,
809 (pollfd.revents & POLLERR) != 0);
b07243ab 810
c6beb8a1 811 /* Read an event from the userfaultfd. */
b07243ab
MK
812
813 nread = read(uffd, &msg, sizeof(msg));
814 if (nread == 0) {
d1a71985 815 printf("EOF on userfaultfd!\en");
b07243ab 816 exit(EXIT_FAILURE);
00f4853c 817 }
b07243ab
MK
818
819 if (nread == \-1)
5a5208c1 820 err(EXIT_FAILURE, "read");
00f4853c 821
c6beb8a1 822 /* We expect only one kind of event; verify that assumption. */
b07243ab
MK
823
824 if (msg.event != UFFD_EVENT_PAGEFAULT) {
d1a71985 825 fprintf(stderr, "Unexpected event on userfaultfd\en");
b07243ab
MK
826 exit(EXIT_FAILURE);
827 }
828
c6beb8a1 829 /* Display info about the page\-fault event. */
b07243ab
MK
830
831 printf(" UFFD_EVENT_PAGEFAULT event: ");
f6fbffea
AC
832 printf("flags = %"PRIx64"; ", msg.arg.pagefault.flags);
833 printf("address = %"PRIx64"\en", msg.arg.pagefault.address);
b07243ab
MK
834
835 /* Copy the page pointed to by \(aqpage\(aq into the faulting
836 region. Vary the contents that are copied in, so that it
837 is more obvious that each fault is handled separately. */
838
839 memset(page, \(aqA\(aq + fault_cnt % 20, page_size);
840 fault_cnt++;
841
842 uffdio_copy.src = (unsigned long) page;
843
844 /* We need to handle page faults in units of pages(!).
c6beb8a1 845 So, round faulting address down to page boundary. */
b07243ab
MK
846
847 uffdio_copy.dst = (unsigned long) msg.arg.pagefault.address &
af2d18b2 848 \(ti(page_size \- 1);
b07243ab
MK
849 uffdio_copy.len = page_size;
850 uffdio_copy.mode = 0;
851 uffdio_copy.copy = 0;
852 if (ioctl(uffd, UFFDIO_COPY, &uffdio_copy) == \-1)
5a5208c1 853 err(EXIT_FAILURE, "ioctl\-UFFDIO_COPY");
b07243ab 854
f6fbffea 855 printf(" (uffdio_copy.copy returned %"PRId64")\en",
4687ab0e 856 uffdio_copy.copy);
b07243ab
MK
857 }
858}
859
860int
861main(int argc, char *argv[])
862{
0b94bd78
AC
863 int s;
864 char c;
865 char *addr; /* Start of region handled by userfaultfd */
866 long uffd; /* userfaultfd file descriptor */
867 size_t len, l; /* Length of region handled by userfaultfd */
868 pthread_t thr; /* ID of thread that handles page faults */
869 struct uffdio_api uffdio_api;
870 struct uffdio_register uffdio_register;
b07243ab
MK
871
872 if (argc != 2) {
d1a71985 873 fprintf(stderr, "Usage: %s num\-pages\en", argv[0]);
b07243ab
MK
874 exit(EXIT_FAILURE);
875 }
876
877 page_size = sysconf(_SC_PAGE_SIZE);
68041421 878 len = strtoull(argv[1], NULL, 0) * page_size;
b07243ab 879
c6beb8a1 880 /* Create and enable userfaultfd object. */
b07243ab 881
ddffcbf1 882 uffd = syscall(SYS_userfaultfd, O_CLOEXEC | O_NONBLOCK);
b07243ab 883 if (uffd == \-1)
5a5208c1 884 err(EXIT_FAILURE, "userfaultfd");
b07243ab
MK
885
886 uffdio_api.api = UFFD_API;
887 uffdio_api.features = 0;
888 if (ioctl(uffd, UFFDIO_API, &uffdio_api) == \-1)
5a5208c1 889 err(EXIT_FAILURE, "ioctl\-UFFDIO_API");
b07243ab
MK
890
891 /* Create a private anonymous mapping. The memory will be
892 demand\-zero paged\-\-that is, not yet allocated. When we
893 actually touch the memory, it will be allocated via
894 the userfaultfd. */
895
896 addr = mmap(NULL, len, PROT_READ | PROT_WRITE,
897 MAP_PRIVATE | MAP_ANONYMOUS, \-1, 0);
898 if (addr == MAP_FAILED)
5a5208c1 899 err(EXIT_FAILURE, "mmap");
b07243ab 900
d1a71985 901 printf("Address returned by mmap() = %p\en", addr);
b07243ab
MK
902
903 /* Register the memory range of the mapping we just created for
904 handling by the userfaultfd object. In mode, we request to track
905 missing pages (i.e., pages that have not yet been faulted in). */
906
907 uffdio_register.range.start = (unsigned long) addr;
908 uffdio_register.range.len = len;
909 uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
910 if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) == \-1)
5a5208c1 911 err(EXIT_FAILURE, "ioctl\-UFFDIO_REGISTER");
b07243ab 912
c6beb8a1 913 /* Create a thread that will process the userfaultfd events. */
b07243ab
MK
914
915 s = pthread_create(&thr, NULL, fault_handler_thread, (void *) uffd);
916 if (s != 0) {
5a5208c1 917 errc(EXIT_FAILURE, s, "pthread_create");
b07243ab
MK
918 }
919
920 /* Main thread now touches memory in the mapping, touching
921 locations 1024 bytes apart. This will trigger userfaultfd
922 events for all pages in the region. */
923
b07243ab
MK
924 l = 0xf; /* Ensure that faulting address is not on a page
925 boundary, in order to test that we correctly
c6beb8a1 926 handle that case in fault_handler_thread(). */
b07243ab 927 while (l < len) {
11f81bf1
AC
928 c = addr[l];
929 printf("Read address %p in %s(): ", addr + l, __func__);
d1a71985 930 printf("%c\en", c);
b07243ab
MK
931 l += 1024;
932 usleep(100000); /* Slow things down a little */
933 }
934
935 exit(EXIT_SUCCESS);
936}
e7d0bb47 937.EE
33857069 938.\" SRC END
bf9b5158
MR
939.SH SEE ALSO
940.BR fcntl (2),
4aa7f5cf 941.BR ioctl (2),
6bc6d124 942.BR ioctl_userfaultfd (2),
6d3e02a5 943.BR madvise (2),
4aa7f5cf 944.BR mmap (2)
efeece04 945.PP
1ae6b2c7 946.I Documentation/admin\-guide/mm/userfaultfd.rst
bf9b5158 947in the Linux kernel source tree