]>
Commit | Line | Data |
---|---|---|
bf9b5158 MR |
1 | .\" Copyright (c) 2016, IBM Corporation. |
2 | .\" Written by Mike Rapoport <rppt@linux.vnet.ibm.com> | |
54e031a1 | 3 | .\" and Copyright (C) 2017 Michael Kerrisk <mtk.manpages@gmail.com> |
bf9b5158 | 4 | .\" |
5fbde956 | 5 | .\" SPDX-License-Identifier: Linux-man-pages-copyleft |
bf9b5158 | 6 | .\" |
4c1c5274 | 7 | .TH userfaultfd 2 (date) "Linux man-pages (unreleased)" |
bf9b5158 | 8 | .SH NAME |
68a9e23c | 9 | userfaultfd \- create a file descriptor for handling page faults in user space |
26aba212 AC |
10 | .SH LIBRARY |
11 | Standard C library | |
8fc3b2cf | 12 | .RI ( libc ", " \-lc ) |
bf9b5158 MR |
13 | .SH SYNOPSIS |
14 | .nf | |
428c8633 PX |
15 | .BR "#include <fcntl.h>" " /* Definition of " O_* " constants */" |
16 | .BR "#include <sys/syscall.h>" " /* Definition of " SYS_* " constants */" | |
17 | .BR "#include <linux/userfaultfd.h>" " /* Definition of " UFFD_* " constants */" | |
e1960e7e | 18 | .B #include <unistd.h> |
68e4db0a | 19 | .PP |
e1960e7e | 20 | .BI "int syscall(SYS_userfaultfd, int " flags ); |
bf9b5158 MR |
21 | .fi |
22 | .PP | |
23 | .IR Note : | |
e1960e7e AC |
24 | glibc provides no wrapper for |
25 | .BR userfaultfd (), | |
26 | necessitating the use of | |
27 | .BR syscall (2). | |
bf9b5158 | 28 | .SH DESCRIPTION |
4aa7f5cf MK |
29 | .BR userfaultfd () |
30 | creates a new userfaultfd object that can be used for delegation of page-fault | |
31 | handling to a user-space application, | |
32 | and returns a file descriptor that refers to the new object. | |
33 | The new userfaultfd object is configured using | |
bf9b5158 | 34 | .BR ioctl (2). |
efeece04 | 35 | .PP |
4aa7f5cf | 36 | Once the userfaultfd object is configured, the application can use |
bf9b5158 MR |
37 | .BR read (2) |
38 | to receive userfaultfd notifications. | |
4aa7f5cf MK |
39 | The reads from userfaultfd may be blocking or non-blocking, |
40 | depending on the value of | |
bf9b5158 MR |
41 | .I flags |
42 | used for the creation of the userfaultfd or subsequent calls to | |
4aa7f5cf | 43 | .BR fcntl (2). |
efeece04 | 44 | .PP |
bf9b5158 | 45 | The following values may be bitwise ORed in |
1ae6b2c7 | 46 | .I flags |
bf9b5158 MR |
47 | to change the behavior of |
48 | .BR userfaultfd (): | |
49 | .TP | |
1ae6b2c7 | 50 | .B O_CLOEXEC |
4aa7f5cf | 51 | Enable the close-on-exec flag for the new userfaultfd file descriptor. |
bf9b5158 MR |
52 | See the description of the |
53 | .B O_CLOEXEC | |
54 | flag in | |
4aa7f5cf | 55 | .BR open (2). |
bf9b5158 | 56 | .TP |
1ae6b2c7 | 57 | .B O_NONBLOCK |
4aa7f5cf | 58 | Enables non-blocking operation for the userfaultfd object. |
bf9b5158 | 59 | See the description of the |
1ae6b2c7 | 60 | .B O_NONBLOCK |
bf9b5158 MR |
61 | flag in |
62 | .BR open (2). | |
428c8633 PX |
63 | .TP |
64 | .B UFFD_USER_MODE_ONLY | |
0f8ae01d AC |
65 | This is a userfaultfd-specific flag that was introduced in Linux 5.11.  |
66 | When set, the userfaultfd object will only be able to handle | |
67 | page faults originated from the user space on the registered regions. | |
68 | When a kernel-originated fault is triggered | |
69 | on the registered range with this userfaultfd, a | |
428c8633 PX |
70 | .B SIGBUS |
71 | signal will be delivered. | |
7b8695f2 MK |
72 | .PP |
73 | When the last file descriptor referring to a userfaultfd object is closed, | |
74 | all memory ranges that were registered with the object are unregistered | |
5b1c4a1e | 75 | and unread events are flushed. |
bf9b5158 | 76 | .\" |
e70f957d | 77 | .PP |
140b625a | 78 | Userfaultfd supports three modes of registration: |
4b338b38 PX |
79 | .TP |
80 | .BR UFFDIO_REGISTER_MODE_MISSING " (since 4.10)" | |
81 | When registered with | |
82 | .B UFFDIO_REGISTER_MODE_MISSING | |
7a3d0845 | 83 | mode, user-space will receive a page-fault notification |
4b338b38 PX |
84 | when a missing page is accessed. |
85 | The faulted thread will be stopped from execution until the page fault is | |
7a3d0845 | 86 | resolved from user-space by either an |
4b338b38 PX |
87 | .B UFFDIO_COPY |
88 | or an | |
89 | .B UFFDIO_ZEROPAGE | |
90 | ioctl. | |
91 | .TP | |
140b625a AR |
92 | .BR UFFDIO_REGISTER_MODE_MINOR " (since 5.13)" |
93 | When registered with | |
94 | .B UFFDIO_REGISTER_MODE_MINOR | |
95 | mode, user-space will receive a page-fault notification | |
96 | when a minor page fault occurs. | |
39a3ee3f AC |
97 | That is, |
98 | when a backing page is in the page cache, | |
99 | but page table entries don't yet exist. | |
140b625a AR |
100 | The faulted thread will be stopped from execution |
101 | until the page fault is resolved from user-space by an | |
102 | .B UFFDIO_CONTINUE | |
103 | ioctl. | |
104 | .TP | |
4b338b38 PX |
105 | .BR UFFDIO_REGISTER_MODE_WP " (since 5.7)" |
106 | When registered with | |
107 | .B UFFDIO_REGISTER_MODE_WP | |
7a3d0845 | 108 | mode, user-space will receive a page-fault notification |
4b338b38 PX |
109 | when a write-protected page is written. |
110 | The faulted thread will be stopped from execution | |
7a3d0845 | 111 | until user-space write-unprotects the page using an |
4b338b38 PX |
112 | .B UFFDIO_WRITEPROTECT |
113 | ioctl. | |
114 | .PP | |
115 | Multiple modes can be enabled at the same time for the same memory range. | |
116 | .PP | |
7a3d0845 MK |
117 | Since Linux 4.14, a userfaultfd page-fault notification can selectively embed |
118 | faulting thread ID information into the notification. | |
e70f957d PX |
119 | One needs to enable this feature explicitly using the |
120 | .B UFFD_FEATURE_THREAD_ID | |
121 | feature bit when initializing the userfaultfd context. | |
122 | By default, thread ID reporting is disabled. | |
54e031a1 MK |
123 | .SS Usage |
124 | The userfaultfd mechanism is designed to allow a thread in a multithreaded | |
125 | program to perform user-space paging for the other threads in the process. | |
126 | When a page fault occurs for one of the regions registered | |
127 | to the userfaultfd object, | |
128 | the faulting thread is put to sleep and | |
129 | an event is generated that can be read via the userfaultfd file descriptor. | |
130 | The fault-handling thread reads events from this file descriptor and services | |
131 | them using the operations described in | |
132 | .BR ioctl_userfaultfd (2). | |
133 | When servicing the page fault events, | |
134 | the fault-handling thread can trigger a wake-up for the sleeping thread. | |
efeece04 | 135 | .PP |
d28aadf6 MR |
136 | It is possible for the faulting threads and the fault-handling threads |
137 | to run in the context of different processes. | |
138 | In this case, these threads may belong to different programs, | |
139 | and the program that executes the faulting threads | |
140 | will not necessarily cooperate with the program that handles the page faults. | |
141 | In such non-cooperative mode, | |
792570de | 142 | the process that monitors userfaultfd and handles page faults |
d28aadf6 MR |
143 | needs to be aware of the changes in the virtual memory layout |
144 | of the faulting process to avoid memory corruption. | |
efeece04 | 145 | .PP |
4b338b38 | 146 | Since Linux 4.11, |
c2b3668b | 147 | userfaultfd can also notify the fault-handling threads about changes |
5b1c4a1e MR |
148 | in the virtual memory layout of the faulting process. |
149 | In addition, if the faulting process invokes | |
c2b3668b | 150 | .BR fork (2), |
5b1c4a1e MR |
151 | the userfaultfd objects associated with the parent may be duplicated |
152 | into the child process and the userfaultfd monitor will be notified | |
522ab2ff MK |
153 | (via the |
154 | .B UFFD_EVENT_FORK | |
155 | described below) | |
5b1c4a1e MR |
156 | about the file descriptor associated with the userfault objects |
157 | created for the child process, | |
522ab2ff | 158 | which allows the userfaultfd monitor to perform user-space paging |
5b1c4a1e | 159 | for the child process. |
265225c1 | 160 | Unlike page faults which have to be synchronous and require an |
20050169 MR |
161 | explicit or implicit wakeup, |
162 | all other events are delivered asynchronously and | |
163 | the non-cooperative process resumes execution as | |
265225c1 MK |
164 | soon as the userfaultfd manager executes |
165 | .BR read (2). | |
166 | The userfaultfd manager should carefully synchronize calls to | |
167 | .B UFFDIO_COPY | |
168 | with the processing of events. | |
efeece04 | 169 | .PP |
20050169 MR |
170 | The current asynchronous model of the event delivery is optimal for |
171 | single-threaded non-cooperative userfaultfd manager implementations. | |
94a8ebda MK |
172 | .\" Regarding the preceding sentence, Mike Rapoport says: |
173 | .\" The major point here is that current events delivery model could be | |
174 | .\" problematic for multi-threaded monitor. I even suspect that it would be | |
175 | .\" impossible to ensure synchronization between page faults and non-page | |
176 | .\" fault events in multi-threaded monitor. | |
fec6cf90 | 177 | .\" .PP |
d28aadf6 | 178 | .\" FIXME elaborate about non-cooperating mode, describe its limitations |
792570de | 179 | .\" for kernels before 4.11, features added in 4.11 |
d28aadf6 MR |
180 | .\" and limitations remaining in 4.11 |
181 | .\" Maybe it's worth adding a dedicated sub-section... | |
54e031a1 | 182 | .\" |
4b338b38 PX |
183 | .PP |
184 | Since Linux 5.7, userfaultfd is able to do | |
185 | synchronous page dirty tracking using the new write-protect register mode. | |
186 | One should check against the feature bit | |
187 | .B UFFD_FEATURE_PAGEFAULT_FLAG_WP | |
188 | before using this feature. | |
189 | Similar to the original userfaultfd missing mode, the write-protect mode will | |
7a3d0845 | 190 | generate a userfaultfd notification when the protected page is written. |
4b338b38 | 191 | The user needs to resolve the page fault by unprotecting the faulted page and |
7a3d0845 MK |
192 | kicking the faulted thread to continue. |
193 | For more information, | |
194 | please refer to the "Userfaultfd write-protect mode" section. | |
195 | .\" | |
bf9b5158 MR |
196 | .SS Userfaultfd operation |
197 | After the userfaultfd object is created with | |
4aa7f5cf MK |
198 | .BR userfaultfd (), |
199 | the application must enable it using the | |
200 | .B UFFDIO_API | |
201 | .BR ioctl (2) | |
202 | operation. | |
203 | This operation allows a handshake between the kernel and user space | |
204 | to determine the API version and supported features. | |
5074d846 MK |
205 | This operation must be performed before any of the other |
206 | .BR ioctl (2) | |
207 | operations described below (or those operations fail with the | |
1ae6b2c7 | 208 | .B EINVAL |
5074d846 | 209 | error). |
efeece04 | 210 | .PP |
4aa7f5cf MK |
211 | After a successful |
212 | .B UFFDIO_API | |
213 | operation, | |
214 | the application then registers memory address ranges using the | |
215 | .B UFFDIO_REGISTER | |
216 | .BR ioctl (2) | |
217 | operation. | |
218 | After successful completion of a | |
219 | .B UFFDIO_REGISTER | |
220 | operation, | |
221 | a page fault occurring in the requested memory range, and satisfying | |
222 | the mode defined at the registration time, will be forwarded by the kernel to | |
223 | the user-space application. | |
224 | The application can then use the | |
140b625a AR |
225 | .B UFFDIO_COPY , |
226 | .B UFFDIO_ZEROPAGE , | |
bf9b5158 | 227 | or |
140b625a | 228 | .B UFFDIO_CONTINUE |
4aa7f5cf MK |
229 | .BR ioctl (2) |
230 | operations to resolve the page fault. | |
2c371032 | 231 | .PP |
4b338b38 | 232 | Since Linux 4.14, if the application sets the |
2c371032 | 233 | .B UFFD_FEATURE_SIGBUS |
e81833be | 234 | feature bit using the |
2c371032 PS |
235 | .B UFFDIO_API |
236 | .BR ioctl (2), | |
e81833be MK |
237 | no page-fault notification will be forwarded to user space. |
238 | Instead a | |
2c371032 | 239 | .B SIGBUS |
e81833be MK |
240 | signal is delivered to the faulting process. |
241 | With this feature, | |
242 | userfaultfd can be used for robustness purposes to simply catch | |
2c371032 PS |
243 | any access to areas within the registered address range that do not |
244 | have pages allocated, without having to listen to userfaultfd events. | |
245 | No userfaultfd monitor will be required for dealing with such memory | |
e81833be MK |
246 | accesses. |
247 | For example, this feature can be useful for applications that | |
2c371032 | 248 | want to prevent the kernel from automatically allocating pages and filling |
e81833be | 249 | holes in sparse files when the hole is accessed through a memory mapping. |
2c371032 PS |
250 | .PP |
251 | The | |
252 | .B UFFD_FEATURE_SIGBUS | |
e81833be MK |
253 | feature is implicitly inherited through |
254 | .BR fork (2) | |
255 | if used in combination with | |
2c371032 | 256 | .BR UFFD_FEATURE_EVENT_FORK . |
efeece04 | 257 | .PP |
6bc6d124 | 258 | Details of the various |
4aa7f5cf | 259 | .BR ioctl (2) |
6bc6d124 MK |
260 | operations can be found in |
261 | .BR ioctl_userfaultfd (2). | |
efeece04 | 262 | .PP |
5b1c4a1e MR |
263 | Since Linux 4.11, events other than page-fault may be enabled during the  |
264 | .B UFFDIO_API | |
265 | operation. | |
efeece04 | 266 | .PP |
c3d5d9df MR |
267 | Before Linux 4.11,  |
268 | userfaultfd can be used only with anonymous private memory mappings. | |
c3d5d9df MR |
269 | Since Linux 4.11, |
270 | userfaultfd can be also used with hugetlbfs and shared memory mappings. | |
54e031a1 | 271 | .\" |
4b338b38 | 272 | .SS Userfaultfd write-protect mode (since 5.7) |
827e7731 | 273 | Since Linux 5.7, userfaultfd supports write-protect mode for anonymous memory. |
4b338b38 PX |
274 | The user needs to first check availability of this feature using |
275 | .B UFFDIO_API | |
276 | ioctl against the feature bit | |
277 | .B UFFD_FEATURE_PAGEFAULT_FLAG_WP | |
278 | before using this feature. | |
279 | .PP | |
0f8ae01d AC |
280 | Since Linux 5.19, |
281 | the write-protection mode is also supported on | |
282 | shmem and hugetlbfs memory types. | |
827e7731 PX |
283 | It can be detected with the feature bit |
284 | .BR UFFD_FEATURE_WP_HUGETLBFS_SHMEM . | |
285 | .PP | |
4b338b38 PX |
286 | To register with userfaultfd write-protect mode, the user needs to initiate the |
287 | .B UFFDIO_REGISTER | |
288 | ioctl with mode | |
289 | .B UFFDIO_REGISTER_MODE_WP | |
290 | set. | |
9ae36f18 | 291 | Note that it is legal to monitor the same memory range with multiple modes. |
4b338b38 PX |
292 | For example, the user can do |
293 | .B UFFDIO_REGISTER | |
294 | with the mode set to | |
295 | .BR "UFFDIO_REGISTER_MODE_MISSING | UFFDIO_REGISTER_MODE_WP" . | |
296 | When there is only | |
297 | .B UFFDIO_REGISTER_MODE_WP | |
7a3d0845 | 298 | registered, user-space will |
4b338b38 | 299 | .I not |
7a3d0845 MK |
300 | receive any notification when a missing page is written. |
301 | Instead, user-space will receive a write-protect page-fault notification | |
302 | only when an existing but write-protected page got written. | |
4b338b38 PX |
303 | .PP |
304 | After the | |
305 | .B UFFDIO_REGISTER | |
306 | ioctl completed with | |
307 | .B UFFDIO_REGISTER_MODE_WP | |
308 | mode set, | |
309 | the user can write-protect any existing memory within the range using the ioctl | |
310 | .B UFFDIO_WRITEPROTECT | |
311 | where | |
312 | .I uffdio_writeprotect.mode | |
313 | should be set to | |
314 | .BR UFFDIO_WRITEPROTECT_MODE_WP . | |
315 | .PP | |
316 | When a write-protect event happens, | |
7a3d0845 | 317 | user-space will receive a page-fault notification whose |
4b338b38 PX |
318 | .I uffd_msg.pagefault.flags |
319 | will have the | |
320 | .B UFFD_PAGEFAULT_FLAG_WP | |
321 | flag set. | |
7a3d0845 MK |
322 | Note: since only writes can trigger this kind of fault, |
323 | write-protect notifications will always have the | |
4b338b38 | 324 | .B UFFD_PAGEFAULT_FLAG_WRITE |
7a3d0845 | 325 | bit set along with the |
1ae6b2c7 | 326 | .B UFFD_PAGEFAULT_FLAG_WP |
7a3d0845 | 327 | bit. |
4b338b38 PX |
328 | .PP |
329 | To resolve a write-protection page fault, the user should initiate another | |
330 | .B UFFDIO_WRITEPROTECT | |
331 | ioctl, whose | |
332 | .I uffdio_writeprotect.mode | |
333 | should have the flag | |
334 | .B UFFDIO_WRITEPROTECT_MODE_WP | |
335 | cleared upon the faulted page or range. | |
140b625a AR |
336 | .\" |
337 | .SS Userfaultfd minor fault mode (since 5.13) | |
338 | Since Linux 5.13, | |
339 | userfaultfd supports minor fault mode. | |
340 | In this mode, | |
341 | fault messages are produced not for major faults | |
342 | (where the page was missing), | |
343 | but rather for minor faults, | |
344 | where a page exists in the page cache, | |
345 | but the page table entries are not yet present. | |
346 | The user needs to first check availability of this feature using the | |
347 | .B UFFDIO_API | |
348 | ioctl with the appropriate feature bits set before using this feature: | |
349 | .B UFFD_FEATURE_MINOR_HUGETLBFS | |
350 | since Linux 5.13, | |
351 | or | |
352 | .B UFFD_FEATURE_MINOR_SHMEM | |
353 | since Linux 5.14. | |
354 | .PP | |
355 | To register with userfaultfd minor fault mode, | |
356 | the user needs to initiate the | |
357 | .B UFFDIO_REGISTER | |
358 | ioctl with mode | |
359 | .B UFFDIO_REGISTER_MODE_MINOR | |
360 | set. | |
361 | .PP | |
362 | When a minor fault occurs, | |
363 | user-space will receive a page-fault notification | |
364 | whose | |
365 | .I uffd_msg.pagefault.flags | |
366 | will have the | |
367 | .B UFFD_PAGEFAULT_FLAG_MINOR | |
368 | flag set. | |
369 | .PP | |
370 | To resolve a minor page fault, | |
371 | the handler should decide whether or not | |
372 | the existing page contents need to be modified first. | |
373 | If so, | |
374 | this should be done in-place via a second, | |
375 | non-userfaultfd-registered mapping | |
376 | to the same backing page | |
377 | (e.g., by mapping the shmem or hugetlbfs file twice). | |
378 | Once the page is considered "up to date", | |
379 | the fault can be resolved by initiating an | |
380 | .B UFFDIO_CONTINUE | |
381 | ioctl, | |
382 | which installs the page table entries and | |
383 | (by default) | |
384 | wakes up the faulting thread(s). | |
385 | .PP | |
386 | Minor fault mode supports only hugetlbfs-backed (since Linux 5.13) | |
387 | and shmem-backed (since Linux 5.14) memory. | |
388 | .\" | |
54e031a1 | 389 | .SS Reading from the userfaultfd structure |
54e031a1 MK |
390 | Each |
391 | .BR read (2) | |
392 | from the userfaultfd file descriptor returns one or more | |
393 | .I uffd_msg | |
5b1c4a1e MR |
394 | structures, each of which describes a page-fault event |
395 | or an event required for the non-cooperative userfaultfd usage: | |
efeece04 | 396 | .PP |
54e031a1 | 397 | .in +4n |
b8302363 | 398 | .EX |
54e031a1 | 399 | struct uffd_msg { |
115b4e0e | 400 | __u8 event; /* Type of event */ |
54e031a1 MK |
401 | ... |
402 | union { | |
00f4853c | 403 | struct { |
115b4e0e AC |
404 | __u64 flags; /* Flags describing fault */ |
405 | __u64 address; /* Faulting address */ | |
e70f957d | 406 | union { |
115b4e0e | 407 | __u32 ptid; /* Thread ID of the fault */ |
e70f957d | 408 | } feat; |
54e031a1 | 409 | } pagefault; |
c2b3668b | 410 | |
115b4e0e AC |
411 | struct { /* Since Linux 4.11 */ |
412 | __u32 ufd; /* Userfault file descriptor | |
413 | of the child process */ | |
c2b3668b MK |
414 | } fork; |
415 | ||
115b4e0e AC |
416 | struct { /* Since Linux 4.11 */ |
417 | __u64 from; /* Old address of remapped area */ | |
418 | __u64 to; /* New address of remapped area */ | |
419 | __u64 len; /* Original mapping length */ | |
c2b3668b MK |
420 | } remap; |
421 | ||
115b4e0e AC |
422 | struct { /* Since Linux 4.11 */ |
423 | __u64 start; /* Start address of removed area */ | |
424 | __u64 end; /* End address of removed area */ | |
c2b3668b | 425 | } remove; |
b07243ab | 426 | ... |
54e031a1 MK |
427 | } arg; |
428 | ||
429 | /* Padding fields omitted */ | |
430 | } __packed; | |
b8302363 | 431 | .EE |
e646a1ba | 432 | .in |
efeece04 | 433 | .PP |
54e031a1 MK |
434 | If multiple events are available and the supplied buffer is large enough, |
435 | .BR read (2) | |
436 | returns as many events as will fit in the supplied buffer. | |
437 | If the buffer supplied to | |
438 | .BR read (2) | |
439 | is smaller than the size of the | |
440 | .I uffd_msg | |
441 | structure, the | |
442 | .BR read (2) | |
443 | fails with the error | |
444 | .BR EINVAL . | |
efeece04 | 445 | .PP |
54e031a1 MK |
446 | The fields set in the |
447 | .I uffd_msg | |
448 | structure are as follows: | |
449 | .TP | |
450 | .I event | |
451 | The type of event. | |
5b1c4a1e MR |
452 | Depending on the event type,  |
453 | different fields of the | |
454 | .I arg | |
455 | union represent details required for the event processing. | |
456 | The non-page-fault events are generated only when the appropriate feature | |
457 | is enabled during the API handshake with the | |
458 | .B UFFDIO_API | |
459 | .BR ioctl (2). | |
efeece04 | 460 | .IP |
5b1c4a1e MR |
461 | The following values can appear in the |
462 | .I event | |
463 | field: | |
464 | .RS | |
465 | .TP | |
6c12d34e | 466 | .BR UFFD_EVENT_PAGEFAULT " (since Linux 4.3)" |
5b1c4a1e MR |
467 | A page-fault event. |
468 | The page-fault details are available in the | |
469 | .I pagefault | |
470 | field. | |
54e031a1 | 471 | .TP |
6c12d34e | 472 | .BR UFFD_EVENT_FORK " (since Linux 4.11)" |
5b1c4a1e | 473 | Generated when the faulting process invokes |
522ab2ff MK |
474 | .BR fork (2) |
475 | (or | |
476 | .BR clone (2) | |
477 | without the | |
1ae6b2c7 | 478 | .B CLONE_VM |
522ab2ff | 479 | flag). |
5b1c4a1e MR |
480 | The event details are available in the |
481 | .I fork | |
482 | field. | |
23db3790 | 483 | .\" FIXME describe duplication of userfault file descriptor during fork |
5b1c4a1e | 484 | .TP |
6c12d34e | 485 | .BR UFFD_EVENT_REMAP " (since Linux 4.11)" |
5b1c4a1e | 486 | Generated when the faulting process invokes |
c2b3668b | 487 | .BR mremap (2). |
5b1c4a1e MR |
488 | The event details are available in the |
489 | .I remap | |
490 | field. | |
491 | .TP | |
6c12d34e | 492 | .BR UFFD_EVENT_REMOVE " (since Linux 4.11)" |
5b1c4a1e MR |
493 | Generated when the faulting process invokes |
494 | .BR madvise (2) | |
c2b3668b | 495 | with |
1ae6b2c7 | 496 | .B MADV_DONTNEED |
5b1c4a1e | 497 | or |
1ae6b2c7 | 498 | .B MADV_REMOVE |
5b1c4a1e MR |
499 | advice. |
500 | The event details are available in the | |
501 | .I remove | |
502 | field. | |
503 | .TP | |
6c12d34e | 504 | .BR UFFD_EVENT_UNMAP " (since Linux 4.11)" |
5b1c4a1e MR |
505 | Generated when the faulting process unmaps a memory range, |
506 | either explicitly using | |
507 | .BR munmap (2) | |
c2b3668b | 508 | or implicitly during |
5b1c4a1e MR |
509 | .BR mmap (2) |
510 | or | |
c2b3668b | 511 | .BR mremap (2). |
5b1c4a1e MR |
512 | The event details are available in the |
513 | .I remove | |
514 | field. | |
515 | .RE | |
516 | .TP | |
517 | .I pagefault.address | |
54e031a1 MK |
518 | The address that triggered the page fault. |
519 | .TP | |
5b1c4a1e | 520 | .I pagefault.flags |
54e031a1 MK |
521 | A bit mask of flags that describe the event. |
522 | For | |
523 | .BR UFFD_EVENT_PAGEFAULT , | |
524 | the following flags may appear: | |
525 | .RS | |
526 | .TP | |
140b625a AR |
527 | .B UFFD_PAGEFAULT_FLAG_WP |
528 | If this flag is set, then the fault was a write-protect fault. | |
529 | .TP | |
530 | .B UFFD_PAGEFAULT_FLAG_MINOR | |
531 | If this flag is set, then the fault was a minor fault. | |
4b338b38 | 532 | .TP |
140b625a AR |
533 | .B UFFD_PAGEFAULT_FLAG_WRITE |
534 | If this flag is set, then the fault was a write fault. | |
535 | .PP | |
536 | If neither | |
4b338b38 | 537 | .B UFFD_PAGEFAULT_FLAG_WP |
140b625a AR |
538 | nor |
539 | .B UFFD_PAGEFAULT_FLAG_MINOR | |
540 | are set, then the fault was a missing fault. | |
54e031a1 | 541 | .RE |
5b1c4a1e | 542 | .TP |
e70f957d PX |
543 | .I pagefault.feat.ptid  |
544 | The thread ID that triggered the page fault. | |
545 | .TP | |
5b1c4a1e MR |
546 | .I fork.ufd |
547 | The file descriptor associated with the userfault object | |
522ab2ff MK |
548 | created for the child created by |
549 | .BR fork (2). | |
5b1c4a1e MR |
550 | .TP |
551 | .I remap.from | |
552 | The original address of the memory range that was remapped using | |
553 | .BR mremap (2). | |
554 | .TP | |
555 | .I remap.to | |
556 | The new address of the memory range that was remapped using | |
557 | .BR mremap (2). | |
558 | .TP | |
559 | .I remap.len | |
23db3790 | 560 | The original length of the memory range that was remapped using |
5b1c4a1e MR |
561 | .BR mremap (2). |
562 | .TP | |
563 | .I remove.start | |
564 | The start address of the memory range that was freed using | |
565 | .BR madvise (2) | |
566 | or unmapped. | |
567 | .TP | |
568 | .I remove.end | |
569 | The end address of the memory range that was freed using | |
570 | .BR madvise (2) | |
571 | or unmapped. | |
54e031a1 | 572 | .PP |
058b32ac MK |
573 | A |
574 | .BR read (2) | |
575 | on a userfaultfd file descriptor can fail with the following errors: | |
576 | .TP | |
577 | .B EINVAL | |
578 | The userfaultfd object has not yet been enabled using the | |
1ae6b2c7 | 579 | .B UFFDIO_API |
058b32ac MK |
580 | .BR ioctl (2) |
581 | operation. | |
582 | .PP | |
1bf00323 MK |
583 | If the |
584 | .B O_NONBLOCK | |
585 | flag is enabled in the associated open file description, | |
586 | the userfaultfd file descriptor can be monitored with | |
54e031a1 MK |
587 | .BR poll (2), |
588 | .BR select (2), | |
589 | and | |
590 | .BR epoll (7). | |
591 | When events are available, the file descriptor is indicated as readable. | |
1bf00323 MK |
592 | If the |
593 | .B O_NONBLOCK | |
594 | flag is not enabled, then | |
595 | .BR poll (2) | |
596 | (always) indicates the file as having a | |
1ae6b2c7 | 597 | .B POLLERR |
1bf00323 MK |
598 | condition, and |
599 | .BR select (2) | |
600 | indicates the file descriptor as both readable and writable. | |
601 | .\" FIXME What is the reason for this seemingly odd behavior with respect | |
602 | .\" to the O_NONBLOCK flag? (see userfaultfd_poll() in fs/userfaultfd.c). | |
603 | .\" Something needs to be said about this. | |
bf9b5158 | 604 | .SH RETURN VALUE |
4aa7f5cf MK |
605 | On success, |
606 | .BR userfaultfd () | |
607 | returns a new file descriptor that refers to the userfaultfd object. | |
bf9b5158 MR |
608 | On error, \-1 is returned, and |
609 | .I errno | |
f6a4078b | 610 | is set to indicate the error. |
bf9b5158 MR |
611 | .SH ERRORS |
612 | .TP | |
613 | .B EINVAL | |
614 | An unsupported value was specified in | |
615 | .IR flags . | |
616 | .TP | |
1ae6b2c7 | 617 | .B EMFILE |
bf9b5158 MR |
618 | The per-process limit on the number of open file descriptors has been |
619 | reached. | |
620 | .TP | |
621 | .B ENFILE | |
622 | The system-wide limit on the total number of open files has been | |
623 | reached. | |
624 | .TP | |
625 | .B ENOMEM | |
626 | Insufficient kernel memory was available. | |
339b899c | 627 | .TP |
c4f13bc7 | 628 | .BR EPERM " (since Linux 5.2)" |
339b899c | 629 | .\" cefdca0a86be517bc390fc4541e3674b8e7803b0 |
c4f13bc7 MK |
630 | The caller is not privileged (does not have the |
631 | .B CAP_SYS_PTRACE | |
2b6923ba | 632 | capability in the initial user namespace), and |
c4f13bc7 MK |
633 | .I /proc/sys/vm/unprivileged_userfaultfd |
634 | has the value 0. | |
0da8f5c1 MK |
635 | .SH VERSIONS |
636 | The | |
637 | .BR userfaultfd () | |
638 | system call first appeared in Linux 4.3. | |
efeece04 | 639 | .PP |
be9021b1 MR |
640 | The support for hugetlbfs and shared memory areas and |
641 | non-page-fault events was added in Linux 4.11. | |
3113c7f3 | 642 | .SH STANDARDS |
bf9b5158 MR |
643 | .BR userfaultfd () |
644 | is Linux-specific and should not be used in programs intended to be | |
645 | portable. | |
646 | .SH NOTES | |
de6943fa MK |
647 | The userfaultfd mechanism can be used as an alternative to |
648 | traditional user-space paging techniques based on the use of the | |
1ae6b2c7 | 649 | .B SIGSEGV |
de6943fa MK |
650 | signal and |
651 | .BR mmap (2). | |
652 | It can also be used to implement lazy restore | |
653 | for checkpoint/restore mechanisms, | |
654 | as well as post-copy migration to allow (nearly) uninterrupted execution | |
18448166 MR |
655 | when transferring virtual machines and Linux containers |
656 | from one host to another. | |
2a885973 MR |
657 | .SH BUGS |
658 | If the | |
659 | .B UFFD_FEATURE_EVENT_FORK | |
660 | is enabled and a system call from the | |
661 | .BR fork (2) | |
047a0bb8 | 662 | family is interrupted by a signal or failed, a stale userfaultfd descriptor |
2a885973 | 663 | might be created. |
047a0bb8 | 664 | In this case, a spurious |
2a885973 MR |
665 | .B UFFD_EVENT_FORK |
666 | will be delivered to the userfaultfd monitor. | |
a14af333 | 667 | .SH EXAMPLES |
b07243ab MK |
668 | The program below demonstrates the use of the userfaultfd mechanism. |
669 | The program creates two threads, one of which acts as the | |
670 | page-fault handler for the process, for the pages in a demand-page zero | |
671 | region created using | |
672 | .BR mmap (2). | |
efeece04 | 673 | .PP |
b07243ab MK |
674 | The program takes one command-line argument, |
675 | which is the number of pages that will be created in a mapping | |
676 | whose page faults will be handled via userfaultfd. | |
677 | After creating a userfaultfd object, | |
678 | the program then creates an anonymous private mapping of the specified size | |
679 | and registers the address range of that mapping using the | |
680 | .B UFFDIO_REGISTER | |
681 | .BR ioctl (2) | |
682 | operation. | |
00f4853c | 683 | The program then creates a second thread that will perform the |
b07243ab | 684 | task of handling page faults. |
efeece04 | 685 | .PP |
b07243ab MK |
686 | The main thread then walks through the pages of the mapping fetching |
687 | bytes from successive pages. | |
688 | Because the pages have not yet been accessed, | |
689 | the first access of a byte in each page will trigger a page-fault event | |
690 | on the userfaultfd file descriptor. | |
efeece04 | 691 | .PP |
b07243ab MK |
692 | Each of the page-fault events is handled by the second thread, |
693 | which sits in a loop processing input from the userfaultfd file descriptor. | |
694 | In each loop iteration, the second thread first calls | |
695 | .BR poll (2) | |
696 | to check the state of the file descriptor, | |
697 | and then reads an event from the file descriptor. | |
698 | All such events should be | |
699 | .B UFFD_EVENT_PAGEFAULT | |
700 | events, | |
701 | which the thread handles by copying a page of data into | |
702 | the faulting region using the | |
00f4853c | 703 | .B UFFDIO_COPY |
b07243ab MK |
704 | .BR ioctl (2) |
705 | operation. | |
efeece04 | 706 | .PP |
b07243ab | 707 | The following is an example of what we see when running the program: |
efeece04 | 708 | .PP |
b07243ab | 709 | .in +4n |
b8302363 | 710 | .EX |
b07243ab MK |
711 | $ \fB./userfaultfd_demo 3\fP |
712 | Address returned by mmap() = 0x7fd30106c000 | |
713 | ||
714 | fault_handler_thread(): | |
715 | poll() returns: nready = 1; POLLIN = 1; POLLERR = 0 | |
716 | UFFD_EVENT_PAGEFAULT event: flags = 0; address = 7fd30106c00f | |
717 | (uffdio_copy.copy returned 4096) | |
718 | Read address 0x7fd30106c00f in main(): A | |
719 | Read address 0x7fd30106c40f in main(): A | |
720 | Read address 0x7fd30106c80f in main(): A | |
721 | Read address 0x7fd30106cc0f in main(): A | |
722 | ||
723 | fault_handler_thread(): | |
724 | poll() returns: nready = 1; POLLIN = 1; POLLERR = 0 | |
725 | UFFD_EVENT_PAGEFAULT event: flags = 0; address = 7fd30106d00f | |
726 | (uffdio_copy.copy returned 4096) | |
727 | Read address 0x7fd30106d00f in main(): B | |
728 | Read address 0x7fd30106d40f in main(): B | |
729 | Read address 0x7fd30106d80f in main(): B | |
730 | Read address 0x7fd30106dc0f in main(): B | |
731 | ||
732 | fault_handler_thread(): | |
733 | poll() returns: nready = 1; POLLIN = 1; POLLERR = 0 | |
734 | UFFD_EVENT_PAGEFAULT event: flags = 0; address = 7fd30106e00f | |
735 | (uffdio_copy.copy returned 4096) | |
736 | Read address 0x7fd30106e00f in main(): C | |
737 | Read address 0x7fd30106e40f in main(): C | |
738 | Read address 0x7fd30106e80f in main(): C | |
739 | Read address 0x7fd30106ec0f in main(): C | |
b8302363 | 740 | .EE |
e646a1ba | 741 | .in |
b07243ab MK |
742 | .SS Program source |
743 | \& | |
33857069 | 744 | .\" SRC BEGIN (userfaultfd.c) |
e7d0bb47 | 745 | .EX |
b07243ab | 746 | /* userfaultfd_demo.c |
00f4853c | 747 | |
b07243ab MK |
748 | Licensed under the GNU General Public License version 2 or later. |
749 | */ | |
750 | #define _GNU_SOURCE | |
5a5208c1 | 751 | #include <err.h> |
47b94bbd AC |
752 | #include <errno.h> |
753 | #include <fcntl.h> | |
f6fbffea | 754 | #include <inttypes.h> |
b07243ab | 755 | #include <linux/userfaultfd.h> |
47b94bbd | 756 | #include <poll.h> |
b07243ab | 757 | #include <pthread.h> |
47b94bbd | 758 | #include <stdio.h> |
b07243ab | 759 | #include <stdlib.h> |
b07243ab | 760 | #include <string.h> |
47b94bbd | 761 | #include <sys/ioctl.h> |
b07243ab MK |
762 | #include <sys/mman.h> |
763 | #include <sys/syscall.h> | |
47b94bbd | 764 | #include <unistd.h> |
b07243ab | 765 | |
b07243ab MK |
766 | static int page_size; /* Page size in bytes; set in main() from sysconf(_SC_PAGE_SIZE) and used by both threads */ |
767 | ||
768 | static void * | |
769 | fault_handler_thread(void *arg) | |
770 | { | |
0b94bd78 AC |
771 | int nready; |
772 | long uffd; /* userfaultfd file descriptor, received via arg */ | |
773 | ssize_t nread; | |
774 | struct pollfd pollfd; | |
775 | struct uffdio_copy uffdio_copy; | |
776 | ||
777 | static int fault_cnt = 0; /* Number of faults handled so far */ | |
778 | static char *page = NULL; /* Source page for UFFDIO_COPY; mapped on first use */ | |
779 | static struct uffd_msg msg; /* Data read from userfaultfd */ | |
b07243ab MK |
780 | |
781 | uffd = (long) arg; | |
782 | ||
c6beb8a1 | 783 | /* On first use, map one private anonymous page that serves as the source copied into the faulting region. */ |
b07243ab MK |
784 | |
785 | if (page == NULL) { | |
786 | page = mmap(NULL, page_size, PROT_READ | PROT_WRITE, | |
787 | MAP_PRIVATE | MAP_ANONYMOUS, \-1, 0); | |
788 | if (page == MAP_FAILED) | |
5a5208c1 | 789 | err(EXIT_FAILURE, "mmap"); |
b07243ab MK |
790 | } |
791 | ||
792 | /* Loop forever, handling incoming events on the userfaultfd | |
c6beb8a1 | 793 | file descriptor; the thread leaves the loop only by terminating the process on error. */ |
b07243ab MK |
794 | |
795 | for (;;) { | |
796 | ||
c6beb8a1 | 797 | /* Wait in poll(), with no timeout, for activity on the userfaultfd, and display what poll() reports. */ |
b07243ab | 798 | |
b07243ab MK |
799 | pollfd.fd = uffd; |
800 | pollfd.events = POLLIN; | |
801 | nready = poll(&pollfd, 1, \-1); | |
802 | if (nready == \-1) | |
5a5208c1 | 803 | err(EXIT_FAILURE, "poll"); |
b07243ab | 804 | |
d1a71985 | 805 | printf("\enfault_handler_thread():\en"); |
b07243ab | 806 | printf(" poll() returns: nready = %d; " |
4687ab0e AC |
807 | "POLLIN = %d; POLLERR = %d\en", nready, |
808 | (pollfd.revents & POLLIN) != 0, | |
809 | (pollfd.revents & POLLERR) != 0); | |
b07243ab | 810 | |
c6beb8a1 | 811 | /* Read an event from the userfaultfd. A return of 0 (EOF) or an error is treated as fatal. */ |
b07243ab MK |
812 | |
813 | nread = read(uffd, &msg, sizeof(msg)); | |
814 | if (nread == 0) { | |
d1a71985 | 815 | printf("EOF on userfaultfd!\en"); |
b07243ab | 816 | exit(EXIT_FAILURE); |
00f4853c | 817 | } |
b07243ab MK |
818 | |
819 | if (nread == \-1) | |
5a5208c1 | 820 | err(EXIT_FAILURE, "read"); |
00f4853c | 821 | |
c6beb8a1 | 822 | /* We expect only one kind of event; verify that assumption. */ |
b07243ab MK |
823 | |
824 | if (msg.event != UFFD_EVENT_PAGEFAULT) { | |
d1a71985 | 825 | fprintf(stderr, "Unexpected event on userfaultfd\en"); |
b07243ab MK |
826 | exit(EXIT_FAILURE); |
827 | } | |
828 | ||
c6beb8a1 | 829 | /* Display info about the page\-fault event. */ |
b07243ab MK |
830 | |
831 | printf(" UFFD_EVENT_PAGEFAULT event: "); | |
f6fbffea AC |
832 | printf("flags = %"PRIx64"; ", msg.arg.pagefault.flags); |
833 | printf("address = %"PRIx64"\en", msg.arg.pagefault.address); | |
b07243ab MK |
834 | |
835 | /* Copy the page pointed to by \(aqpage\(aq into the faulting | |
836 | region. Vary the contents that are copied in, so that it | |
837 | is more obvious that each fault is handled separately: the fill byte cycles through 20 values starting at \(aqA\(aq. */ | |
838 | ||
839 | memset(page, \(aqA\(aq + fault_cnt % 20, page_size); | |
840 | fault_cnt++; | |
841 | ||
842 | uffdio_copy.src = (unsigned long) page; | |
843 | ||
844 | /* We need to handle page faults in units of pages(!). | |
c6beb8a1 | 845 | So, round faulting address down to page boundary. The mask below relies on page_size being a power of two. */ |
b07243ab MK |
846 | |
847 | uffdio_copy.dst = (unsigned long) msg.arg.pagefault.address & | |
af2d18b2 | 848 | \(ti(page_size \- 1); |
b07243ab MK |
849 | uffdio_copy.len = page_size; |
850 | uffdio_copy.mode = 0; | |
851 | uffdio_copy.copy = 0; | |
852 | if (ioctl(uffd, UFFDIO_COPY, &uffdio_copy) == \-1) | |
5a5208c1 | 853 | err(EXIT_FAILURE, "ioctl\-UFFDIO_COPY"); |
b07243ab | 854 | |
f6fbffea | 855 | printf(" (uffdio_copy.copy returned %"PRId64")\en", |
4687ab0e | 856 | uffdio_copy.copy); |
b07243ab MK |
857 | } |
858 | } | |
859 | ||
860 | int | |
861 | main(int argc, char *argv[]) | |
862 | { | |
0b94bd78 AC |
863 | int s; |
864 | char c; | |
865 | char *addr; /* Start of region handled by userfaultfd */ | |
866 | long uffd; /* userfaultfd file descriptor */ | |
867 | size_t len, l; /* len: length of region handled by userfaultfd; l: offset within the region of the next byte to read */ | |
868 | pthread_t thr; /* ID of thread that handles page faults */ | |
869 | struct uffdio_api uffdio_api; | |
870 | struct uffdio_register uffdio_register; | |
b07243ab MK |
871 | |
872 | if (argc != 2) { | |
d1a71985 | 873 | fprintf(stderr, "Usage: %s num\-pages\en", argv[0]); |
b07243ab MK |
874 | exit(EXIT_FAILURE); |
875 | } | |
876 | ||
877 | page_size = sysconf(_SC_PAGE_SIZE); | |
68041421 | 878 | len = strtoull(argv[1], NULL, 0) * page_size; /* Region length: the num\-pages argument times the page size; the argument is not validated */ |
b07243ab | 879 | |
c6beb8a1 | 880 | /* Create the userfaultfd object (close\-on\-exec, nonblocking) and enable it with the UFFDIO_API ioctl. */ |
b07243ab | 881 | |
ddffcbf1 | 882 | uffd = syscall(SYS_userfaultfd, O_CLOEXEC | O_NONBLOCK); |
b07243ab | 883 | if (uffd == \-1) |
5a5208c1 | 884 | err(EXIT_FAILURE, "userfaultfd"); |
b07243ab MK |
885 | |
886 | uffdio_api.api = UFFD_API; | |
887 | uffdio_api.features = 0; | |
888 | if (ioctl(uffd, UFFDIO_API, &uffdio_api) == \-1) | |
5a5208c1 | 889 | err(EXIT_FAILURE, "ioctl\-UFFDIO_API"); |
b07243ab MK |
890 | |
891 | /* Create a private anonymous mapping. The memory will be | |
892 | demand\-zero paged\-\-that is, not yet allocated. When we | |
893 | actually touch the memory, it will be allocated via | |
894 | the userfaultfd. */ | |
895 | ||
896 | addr = mmap(NULL, len, PROT_READ | PROT_WRITE, | |
897 | MAP_PRIVATE | MAP_ANONYMOUS, \-1, 0); | |
898 | if (addr == MAP_FAILED) | |
5a5208c1 | 899 | err(EXIT_FAILURE, "mmap"); |
b07243ab | 900 | |
d1a71985 | 901 | printf("Address returned by mmap() = %p\en", addr); |
b07243ab MK |
902 | |
903 | /* Register the memory range of the mapping we just created for | |
904 | handling by the userfaultfd object. In the mode field, we request | |
905 | tracking of missing pages (i.e., pages that have not yet been faulted in). */ | |
906 | ||
907 | uffdio_register.range.start = (unsigned long) addr; | |
908 | uffdio_register.range.len = len; | |
909 | uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING; | |
910 | if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) == \-1) | |
5a5208c1 | 911 | err(EXIT_FAILURE, "ioctl\-UFFDIO_REGISTER"); |
b07243ab | 912 | |
c6beb8a1 | 913 | /* Create a thread that will process the userfaultfd events; the file descriptor is passed as the thread argument. The value returned by pthread_create() is handed to errc() as the error code. NOTE(review): errc() is a BSD extension that appears not to be declared by the glibc <err.h>; confirm that the intended build environment provides it (e.g., via libbsd). */ |
b07243ab MK |
914 | |
915 | s = pthread_create(&thr, NULL, fault_handler_thread, (void *) uffd); | |
916 | if (s != 0) { | |
5a5208c1 | 917 | errc(EXIT_FAILURE, s, "pthread_create"); |
b07243ab MK |
918 | } |
919 | ||
920 | /* Main thread now touches memory in the mapping, touching | |
921 | locations 1024 bytes apart. This will trigger userfaultfd | |
922 | events for all pages in the region. */ | |
923 | ||
b07243ab MK |
924 | l = 0xf; /* Ensure that faulting address is not on a page |
925 | boundary, in order to test that we correctly | |
c6beb8a1 | 926 | handle that case in fault_handler_thread(). */ |
b07243ab | 927 | while (l < len) { |
11f81bf1 AC |
928 | c = addr[l]; |
929 | printf("Read address %p in %s(): ", addr + l, __func__); | |
d1a71985 | 930 | printf("%c\en", c); |
b07243ab MK |
931 | l += 1024; |
932 | usleep(100000); /* Slow things down a little */ | |
933 | } | |
934 | ||
935 | exit(EXIT_SUCCESS); | |
936 | } | |
e7d0bb47 | 937 | .EE |
33857069 | 938 | .\" SRC END |
bf9b5158 MR |
939 | .SH SEE ALSO |
940 | .BR fcntl (2), | |
4aa7f5cf | 941 | .BR ioctl (2), |
6bc6d124 | 942 | .BR ioctl_userfaultfd (2), |
6d3e02a5 | 943 | .BR madvise (2), |
4aa7f5cf | 944 | .BR mmap (2) |
efeece04 | 945 | .PP |
1ae6b2c7 | 946 | .I Documentation/admin\-guide/mm/userfaultfd.rst |
bf9b5158 | 947 | in the Linux kernel source tree |