]>
Commit | Line | Data |
---|---|---|
97b6084b MK |
1 | .\" Copyright (c) 2016, IBM Corporation. |
2 | .\" Written by Mike Rapoport <rppt@linux.vnet.ibm.com> | |
3 | .\" and Copyright (C) 2016 Michael Kerrisk <mtk.manpages@gmail.com> | |
4 | .\" | |
5fbde956 | 5 | .\" SPDX-License-Identifier: Linux-man-pages-copyleft |
97b6084b MK |
6 | .\" |
7 | .\" | |
4c1c5274 | 8 | .TH ioctl_userfaultfd 2 (date) "Linux man-pages (unreleased)" |
97b6084b | 9 | .SH NAME |
7c4dfcac | 10 | ioctl_userfaultfd \- create a file descriptor for handling page faults in user |
97b6084b | 11 | space |
1d7a4349 AC |
12 | .SH LIBRARY |
13 | Standard C library | |
8fc3b2cf | 14 | .RI ( libc ", " \-lc ) |
97b6084b MK |
15 | .SH SYNOPSIS |
16 | .nf | |
cac89bc7 | 17 | .BR "#include <linux/userfaultfd.h>" " /* Definition of " UFFD* " constants */" |
97b6084b | 18 | .B #include <sys/ioctl.h> |
c6d039a3 | 19 | .P |
97b6084b MK |
20 | .BI "int ioctl(int " fd ", int " cmd ", ...);" |
21 | .fi | |
22 | .SH DESCRIPTION | |
23 | Various | |
24 | .BR ioctl (2) | |
25 | operations can be performed on a userfaultfd object (created by a call to | |
26 | .BR userfaultfd (2)) | |
27 | using calls of the form: | |
c6d039a3 | 28 | .P |
a458bc45 MK |
29 | .in +4n |
30 | .EX | |
31 | ioctl(fd, cmd, argp); | |
32 | .EE | |
33 | .in | |
c6d039a3 | 34 | .P |
97b6084b MK |
35 | In the above, |
36 | .I fd | |
37 | is a file descriptor referring to a userfaultfd object, | |
38 | .I cmd | |
39 | is one of the commands listed below, and | |
40 | .I argp | |
41 | is a pointer to a data structure that is specific to | |
42 | .IR cmd . | |
c6d039a3 | 43 | .P |
fe905357 | 44 | The various |
97b6084b | 45 | .BR ioctl (2) |
fe905357 MK |
46 | operations are described below. |
47 | The | |
f7111396 | 48 | .BR UFFDIO_API , |
fe905357 MK |
49 | .BR UFFDIO_REGISTER , |
50 | and | |
1ae6b2c7 | 51 | .B UFFDIO_UNREGISTER |
fe905357 MK |
52 | operations are used to |
53 | .I configure | |
54 | userfaultfd behavior. | |
55 | These operations allow the caller to choose what features will be enabled and | |
97b6084b | 56 | what kinds of events will be delivered to the application. |
fe905357 | 57 | The remaining operations are |
1ae6b2c7 | 58 | .I range |
fe905357 | 59 | operations. |
e8a846ae | 60 | These operations enable the calling application to resolve page-fault |
a18cc179 | 61 | events. |
637512ed MK |
62 | .\" |
63 | .SS UFFDIO_API | |
1b945386 | 64 | (Since Linux 4.3.) |
97b6084b | 65 | Enable operation of the userfaultfd and perform API handshake. |
c6d039a3 | 66 | .P |
97b6084b | 67 | The |
74bd6e93 MK |
68 | .I argp |
69 | argument is a pointer to a | |
1ae6b2c7 | 70 | .I uffdio_api |
74bd6e93 | 71 | structure, defined as: |
c6d039a3 | 72 | .P |
97b6084b | 73 | .in +4n |
b76974c1 | 74 | .EX |
97b6084b | 75 | struct uffdio_api { |
115b4e0e AC |
76 | __u64 api; /* Requested API version (input) */ |
77 | __u64 features; /* Requested features (input/output) */ | |
78 | __u64 ioctls; /* Available ioctl() operations (output) */ | |
97b6084b | 79 | }; |
b76974c1 | 80 | .EE |
97b6084b | 81 | .in |
c6d039a3 | 82 | .P |
97b6084b MK |
83 | The |
84 | .I api | |
85 | field denotes the API version requested by the application. | |
9a3c2194 MK |
86 | The kernel verifies that it can support the requested API version, |
87 | and sets the | |
97b6084b MK |
88 | .I features |
89 | and | |
90 | .I ioctls | |
91 | fields to bit masks representing all the available features and the generic | |
405513d3 | 92 | .BR ioctl (2) |
97b6084b | 93 | operations available. |
c6d039a3 | 94 | .P |
a252b334 AR |
95 | Since Linux 4.11, |
96 | applications should use the | |
97 | .I features | |
98 | field to perform a two-step handshake. | |
99 | First, | |
bf8647d1 | 100 | .B UFFDIO_API |
a252b334 AR |
101 | is called with the |
102 | .I features | |
103 | field set to zero. | |
104 | The kernel responds by setting all supported feature bits. | |
c6d039a3 | 105 | .P |
a252b334 AR |
106 | Applications which do not require any specific features |
107 | can begin using the userfaultfd immediately. | |
108 | Applications which do need specific features | |
109 | should call | |
bf8647d1 | 110 | .B UFFDIO_API |
a252b334 AR |
111 | again with a subset of the reported feature bits set |
112 | to enable those features. | |
c6d039a3 | 113 | .P |
b324e17d | 114 | Before Linux 4.11, the |
30dced33 MR |
115 | .I features |
116 | field must be initialized to zero before the call to | |
4190ca0a | 117 | .BR UFFDIO_API , |
1a053be3 | 118 | and zero (i.e., no feature bits) is placed in the |
30dced33 MR |
119 | .I features |
120 | field by the kernel upon return from | |
121 | .BR ioctl (2). | |
c6d039a3 | 122 | .P |
a252b334 AR |
123 | If the application sets unsupported feature bits, |
124 | the kernel will zero out the returned | |
66487517 MR |
125 | .I uffdio_api |
126 | structure and return | |
127 | .BR EINVAL . | |
c6d039a3 | 128 | .P |
364270d0 | 129 | The following feature bits may be set: |
30dced33 | 130 | .TP |
364270d0 | 131 | .BR UFFD_FEATURE_EVENT_FORK " (since Linux 4.11)" |
66487517 MR |
132 | When this feature is enabled, |
133 | the userfaultfd objects associated with a parent process are duplicated | |
134 | into the child process during | |
135 | .BR fork (2) | |
42630fa9 MK |
136 | and a |
137 | .B UFFD_EVENT_FORK | |
81bb1233 | 138 | event is delivered to the userfaultfd monitor. |
30dced33 | 139 | .TP |
364270d0 | 140 | .BR UFFD_FEATURE_EVENT_REMAP " (since Linux 4.11)" |
66487517 MR |
141 | If this feature is enabled, |
142 | when the faulting process invokes | |
42630fa9 | 143 | .BR mremap (2), |
66487517 | 144 | the userfaultfd monitor will receive an event of type |
42630fa9 | 145 | .BR UFFD_EVENT_REMAP . |
30dced33 | 146 | .TP |
364270d0 | 147 | .BR UFFD_FEATURE_EVENT_REMOVE " (since Linux 4.11)" |
66487517 MR |
148 | If this feature is enabled, |
149 | when the faulting process calls | |
42630fa9 MK |
150 | .BR madvise (2) |
151 | with the | |
152 | .B MADV_DONTNEED | |
66487517 | 153 | or |
42630fa9 MK |
154 | .B MADV_REMOVE |
155 | advice value to free a virtual memory area, | |
66487517 | 156 | the userfaultfd monitor will receive an event of type |
42630fa9 | 157 | .BR UFFD_EVENT_REMOVE . |
30dced33 | 158 | .TP |
364270d0 | 159 | .BR UFFD_FEATURE_EVENT_UNMAP " (since Linux 4.11)" |
66487517 MR |
160 | If this feature is enabled, |
161 | when the faulting process unmaps virtual memory either explicitly with | |
42630fa9 MK |
162 | .BR munmap (2), |
163 | or implicitly during either | |
66487517 MR |
164 | .BR mmap (2) |
165 | or | |
03659d7d | 166 | .BR mremap (2), |
66487517 | 167 | the userfaultfd monitor will receive an event of type |
42630fa9 | 168 | .BR UFFD_EVENT_UNMAP . |
30dced33 | 169 | .TP |
364270d0 | 170 | .BR UFFD_FEATURE_MISSING_HUGETLBFS " (since Linux 4.11)" |
66487517 MR |
171 | If this feature bit is set, |
172 | the kernel supports registering userfaultfd ranges on hugetlbfs | |
173 | virtual memory areas. | |
30dced33 | 174 | .TP |
364270d0 | 175 | .BR UFFD_FEATURE_MISSING_SHMEM " (since Linux 4.11)" |
66487517 | 176 | If this feature bit is set, |
41e9e648 MR |
177 | the kernel supports registering userfaultfd ranges on shared memory areas. |
178 | This includes all kernel shared memory APIs: | |
179 | System V shared memory, | |
5191c688 MK |
180 | .BR tmpfs (5), |
181 | shared mappings of | |
182 | .IR /dev/zero , | |
183 | .BR mmap (2) | |
184 | with the | |
f1008f5f | 185 | .B MAP_SHARED |
41e9e648 MR |
186 | flag set, |
187 | .BR memfd_create (2), | |
5191c688 | 188 | and so on. |
2c371032 | 189 | .TP |
364270d0 | 190 | .BR UFFD_FEATURE_SIGBUS " (since Linux 4.14)" |
1df2779d | 191 | .\" commit 2d6d6f5a09a96cc1fec7ed992b825e05f64cb50e |
1df2779d MK |
192 | If this feature bit is set, no page-fault events |
193 | .RB ( UFFD_EVENT_PAGEFAULT ) | |
194 | will be delivered. | |
195 | Instead, a | |
2c371032 | 196 | .B SIGBUS |
1df2779d MK |
197 | signal will be sent to the faulting process. |
198 | Applications using this | |
2c371032 PS |
199 | feature will not require the use of a userfaultfd monitor for processing |
200 | memory accesses to the regions registered with userfaultfd. | |
fbda69bb PX |
201 | .TP |
202 | .BR UFFD_FEATURE_THREAD_ID " (since Linux 4.14)" | |
203 | If this feature bit is set, | |
204 | .I uffd_msg.pagefault.feat.ptid | |
7a3d0845 | 205 | will be set to the faulted thread ID for each page-fault message. |
140b625a | 206 | .TP |
1f0f451c AR |
207 | .BR UFFD_FEATURE_PAGEFAULT_FLAG_WP " (since Linux 5.10)" |
208 | If this feature bit is set, | |
209 | userfaultfd supports write-protect faults | |
210 | for anonymous memory. | |
211 | (Note that shmem / hugetlbfs support | |
212 | is indicated by a separate feature.) | |
213 | .TP | |
140b625a AR |
214 | .BR UFFD_FEATURE_MINOR_HUGETLBFS " (since Linux 5.13)" |
215 | If this feature bit is set, | |
216 | the kernel supports registering userfaultfd ranges | |
217 | in minor mode on hugetlbfs-backed memory areas. | |
218 | .TP | |
219 | .BR UFFD_FEATURE_MINOR_SHMEM " (since Linux 5.14)" | |
220 | If this feature bit is set, | |
221 | the kernel supports registering userfaultfd ranges | |
222 | in minor mode on shmem-backed memory areas. | |
994a585e NA |
223 | .TP |
224 | .BR UFFD_FEATURE_EXACT_ADDRESS " (since Linux 5.18)" | |
225 | If this feature bit is set, | |
226 | .I uffd_msg.pagefault.address | |
227 | will be set to the exact page-fault address that was reported by the hardware, | |
228 | and will not mask the offset within the page. | |
229 | Note that old Linux versions might indicate the exact address as well, | |
230 | even though the feature bit is not set. | |
1f0f451c AR |
231 | .TP |
232 | .BR UFFD_FEATURE_WP_HUGETLBFS_SHMEM " (since Linux 5.19)" | |
233 | If this feature bit is set, | |
234 | userfaultfd supports write-protect faults | |
235 | for hugetlbfs and shmem / tmpfs memory. | |
236 | .TP | |
237 | .BR UFFD_FEATURE_WP_UNPOPULATED " (since Linux 6.4)" | |
238 | If this feature bit is set, | |
239 | the kernel will handle anonymous memory the same way as file memory, | |
a0d480cd | 240 | by allowing the user to write-protect unpopulated page table entries. |
1f0f451c AR |
241 | .TP |
242 | .BR UFFD_FEATURE_POISON " (since Linux 6.6)" | |
243 | If this feature bit is set, | |
244 | the kernel supports resolving faults with the | |
245 | .B UFFDIO_POISON | |
246 | ioctl. | |
ad3a0ad8 MUA |
247 | .TP |
248 | .BR UFFD_FEATURE_WP_ASYNC " (since Linux 6.7)" | |
249 | If this feature bit is set, | |
250 | write-protection faults will be resolved asynchronously | |
251 | by the kernel. | |
c6d039a3 | 252 | .P |
9a3c2194 MK |
253 | The returned |
254 | .I ioctls | |
255 | field can contain the following bits: | |
256 | .\" FIXME This user-space API seems not fully polished. Why are there | |
53b95540 | 257 | .\" not constants defined for each of the bit-mask values listed below? |
9a3c2194 MK |
258 | .TP |
259 | .B 1 << _UFFDIO_API | |
260 | The | |
261 | .B UFFDIO_API | |
262 | operation is supported. | |
263 | .TP | |
264 | .B 1 << _UFFDIO_REGISTER | |
265 | The | |
266 | .B UFFDIO_REGISTER | |
267 | operation is supported. | |
268 | .TP | |
269 | .B 1 << _UFFDIO_UNREGISTER | |
270 | The | |
271 | .B UFFDIO_UNREGISTER | |
272 | operation is supported. | |
c6d039a3 | 273 | .P |
97b6084b MK |
274 | This |
275 | .BR ioctl (2) | |
276 | operation returns 0 on success. | |
277 | On error, \-1 is returned and | |
278 | .I errno | |
855d489a | 279 | is set to indicate the error. |
5f1acc81 AR |
280 | If an error occurs, |
281 | the kernel may zero the provided | |
282 | .I uffdio_api | |
283 | structure. | |
284 | The caller should treat its contents as unspecified, | |
285 | and reinitialize it before re-attempting another | |
286 | .B UFFDIO_API | |
287 | call. | |
97b6084b | 288 | Possible errors include: |
cd8389ff MK |
289 | .TP |
290 | .B EFAULT | |
291 | .I argp | |
292 | refers to an address that is outside the calling process's | |
293 | accessible address space. | |
97b6084b MK |
294 | .TP |
295 | .B EINVAL | |
9a3c2194 MK |
296 | The API version requested in the |
297 | .I api | |
298 | field is not supported by this kernel, or the | |
299 | .I features | |
66487517 MR |
300 | field passed to the kernel includes feature bits that are not supported |
301 | by the current kernel version. | |
db3d5cc1 AR |
302 | .TP |
303 | .B EINVAL | |
304 | A previous | |
305 | .B UFFDIO_API | |
306 | call already enabled one or more features for this userfaultfd. | |
307 | Calling | |
308 | .B UFFDIO_API | |
309 | twice, | |
310 | the first time with no features set, | |
311 | is explicitly allowed | |
312 | as per the two-step feature detection handshake. | |
313 | .TP | |
314 | .B EPERM | |
315 | The | |
316 | .B UFFD_FEATURE_EVENT_FORK | |
317 | feature was enabled, | |
318 | but the calling process doesn't have the | |
319 | .B CAP_SYS_PTRACE | |
320 | capability. | |
637512ed | 321 | .SS UFFDIO_REGISTER |
1b945386 | 322 | (Since Linux 4.3.) |
97b6084b | 323 | Register a memory address range with the userfaultfd object. |
fc497d11 | 324 | The pages in the range must be \[lq]compatible\[rq]. |
140b625a AR |
325 | Please refer to the list of register modes below |
326 | for the compatible memory backends for each mode. | |
c6d039a3 | 327 | .P |
97b6084b | 328 | The |
74bd6e93 MK |
329 | .I argp |
330 | argument is a pointer to a | |
97b6084b | 331 | .I uffdio_register |
74bd6e93 | 332 | structure, defined as: |
c6d039a3 | 333 | .P |
97b6084b | 334 | .in +4n |
b76974c1 | 335 | .EX |
97b6084b | 336 | struct uffdio_range { |
115b4e0e AC |
337 | __u64 start; /* Start of range */ |
338 | __u64 len; /* Length of range (bytes) */ | |
97b6084b | 339 | }; |
fe5dba13 | 340 | \& |
97b6084b MK |
341 | struct uffdio_register { |
342 | struct uffdio_range range; | |
115b4e0e AC |
343 | __u64 mode; /* Desired mode of operation (input) */ |
344 | __u64 ioctls; /* Available ioctl() operations (output) */ | |
97b6084b | 345 | }; |
b76974c1 | 346 | .EE |
97b6084b | 347 | .in |
c6d039a3 | 348 | .P |
97b6084b MK |
349 | The |
350 | .I range | |
351 | field defines a memory range starting at | |
352 | .I start | |
353 | and continuing for | |
354 | .I len | |
355 | bytes that should be handled by the userfaultfd. | |
c6d039a3 | 356 | .P |
97b6084b MK |
357 | The |
358 | .I mode | |
359 | field defines the mode of operation desired for this memory region. | |
360 | The following values may be bitwise ORed to set the userfaultfd mode for | |
361 | the specified range: | |
97b6084b MK |
362 | .TP |
363 | .B UFFDIO_REGISTER_MODE_MISSING | |
a23c93d8 | 364 | Track page faults on missing pages. |
140b625a AR |
365 | Since Linux 4.3, |
366 | only private anonymous ranges are compatible. | |
367 | Since Linux 4.11, | |
368 | hugetlbfs and shared memory ranges are also compatible. | |
97b6084b MK |
369 | .TP |
370 | .B UFFDIO_REGISTER_MODE_WP | |
371 | Track page faults on write-protected pages. | |
140b625a AR |
372 | Since Linux 5.7, |
373 | only private anonymous ranges are compatible. | |
374 | .TP | |
375 | .B UFFDIO_REGISTER_MODE_MINOR | |
376 | Track minor page faults. | |
377 | Since Linux 5.13, | |
378 | only hugetlbfs ranges are compatible. | |
379 | Since Linux 5.14, | |
fa0fd472 | 380 | compatibility with shmem ranges was added. |
c6d039a3 | 381 | .P |
264a0a30 | 382 | If the operation is successful, the kernel modifies the |
97b6084b | 383 | .I ioctls |
264a0a30 MK |
384 | bit-mask field to indicate which |
385 | .BR ioctl (2) | |
386 | operations are available for the specified range. | |
6edec894 IA |
387 | This returned bit mask can contain the following bits: |
388 | .TP | |
389 | .B 1 << _UFFDIO_COPY | |
390 | The | |
391 | .B UFFDIO_COPY | |
392 | operation is supported. | |
393 | .TP | |
394 | .B 1 << _UFFDIO_WAKE | |
395 | The | |
396 | .B UFFDIO_WAKE | |
397 | operation is supported. | |
398 | .TP | |
399 | .B 1 << _UFFDIO_WRITEPROTECT | |
400 | The | |
401 | .B UFFDIO_WRITEPROTECT | |
af205afd | 402 | operation is supported. |
6edec894 IA |
403 | .TP |
404 | .B 1 << _UFFDIO_ZEROPAGE | |
405 | The | |
406 | .B UFFDIO_ZEROPAGE | |
407 | operation is supported. | |
140b625a AR |
408 | .TP |
409 | .B 1 << _UFFDIO_CONTINUE | |
410 | The | |
411 | .B UFFDIO_CONTINUE | |
412 | operation is supported. | |
9ae03d21 AR |
413 | .TP |
414 | .B 1 << _UFFDIO_POISON | |
415 | The | |
416 | .B UFFDIO_POISON | |
417 | operation is supported. | |
c6d039a3 | 418 | .P |
264a0a30 MK |
419 | This |
420 | .BR ioctl (2) | |
421 | operation returns 0 on success. | |
422 | On error, \-1 is returned and | |
423 | .I errno | |
855d489a | 424 | is set to indicate the error. |
264a0a30 MK |
425 | Possible errors include: |
426 | .\" FIXME Is the following error list correct? | |
427 | .\" | |
428 | .TP | |
429 | .B EBUSY | |
430 | A mapping in the specified range is registered with another | |
431 | userfaultfd object. | |
432 | .TP | |
cd8389ff MK |
433 | .B EFAULT |
434 | .I argp | |
435 | refers to an address that is outside the calling process's | |
436 | accessible address space. | |
437 | .TP | |
264a0a30 MK |
438 | .B EINVAL |
439 | An invalid or unsupported bit was specified in the | |
440 | .I mode | |
441 | field; or the | |
442 | .I mode | |
443 | field was zero. | |
444 | .TP | |
445 | .B EINVAL | |
446 | There is no mapping in the specified address range. | |
447 | .TP | |
448 | .B EINVAL | |
242c2b6b MK |
449 | .I range.start |
450 | or | |
451 | .I range.len | |
452 | is not a multiple of the system page size; or, | |
453 | .I range.len | |
454 | is zero; or these fields are otherwise invalid. | |
455 | .TP | |
456 | .B EINVAL | |
264a0a30 | 457 | There was an incompatible mapping in the specified address range. |
a440cc33 MK |
458 | .\" Mike Rapoport: |
459 | .\" ENOMEM if the process is exiting and the | |
99cf1681 | 460 | .\" mm_struct has gone by the time userfault grabs it. |
637512ed | 461 | .SS UFFDIO_UNREGISTER |
1b945386 | 462 | (Since Linux 4.3.) |
97b6084b | 463 | Unregister a memory address range from userfaultfd. |
fc497d11 AC |
464 | The pages in the range must be \[lq]compatible\[rq] |
465 | (see the description of | |
466 | .BR UFFDIO_REGISTER ). | |
c6d039a3 | 467 | .P |
97b6084b | 468 | The address range to unregister is specified in the |
1ae6b2c7 | 469 | .I uffdio_range |
97b6084b MK |
470 | structure pointed to by |
471 | .IR argp . | |
c6d039a3 | 472 | .P |
97b6084b MK |
473 | This |
474 | .BR ioctl (2) | |
475 | operation returns 0 on success. | |
476 | On error, \-1 is returned and | |
477 | .I errno | |
855d489a | 478 | is set to indicate the error. |
97b6084b | 479 | Possible errors include: |
97b6084b MK |
480 | .TP |
481 | .B EINVAL | |
482 | Either the | |
483 | .I start | |
484 | or the | |
485 | .I len | |
486 | field of the | |
487 | .I uffdio_range | |
242c2b6b MK |
488 | structure was not a multiple of the system page size; or the |
489 | .I len | |
490 | field was zero; or these fields were otherwise invalid. | |
97b6084b MK |
491 | .TP |
492 | .B EINVAL | |
493 | There was an incompatible mapping in the specified address range. | |
494 | .TP | |
495 | .B EINVAL | |
496 | There was no mapping in the specified address range. | |
637512ed MK |
497 | .\" |
498 | .SS UFFDIO_COPY | |
1b945386 | 499 | (Since Linux 4.3.) |
97b6084b MK |
500 | Atomically copy a continuous memory chunk into the userfault registered |
501 | range and optionally wake up the blocked thread. | |
502 | The source and destination addresses and the number of bytes to copy are | |
503 | specified by the | |
aad729a0 AC |
504 | .IR src , |
505 | .IR dst , | |
506 | and | |
507 | .I len | |
74bd6e93 MK |
508 | fields of the |
509 | .I uffdio_copy | |
510 | structure pointed to by | |
511 | .IR argp : | |
c6d039a3 | 512 | .P |
97b6084b | 513 | .in +4n |
b76974c1 | 514 | .EX |
97b6084b | 515 | struct uffdio_copy { |
115b4e0e AC |
516 | __u64 dst; /* Destination of copy */ |
517 | __u64 src; /* Source of copy */ | |
518 | __u64 len; /* Number of bytes to copy */ | |
519 | __u64 mode; /* Flags controlling behavior of copy */ | |
520 | __s64 copy; /* Number of bytes copied, or negated error */ | |
97b6084b | 521 | }; |
b76974c1 | 522 | .EE |
97b6084b | 523 | .in |
c6d039a3 | 524 | .P |
1dca8e3d | 525 | The following values may be bitwise ORed in |
1ae6b2c7 | 526 | .I mode |
97b6084b MK |
527 | to change the behavior of the |
528 | .B UFFDIO_COPY | |
529 | operation: | |
97b6084b MK |
530 | .TP |
531 | .B UFFDIO_COPY_MODE_DONTWAKE | |
e8a846ae | 532 | Do not wake up the thread that waits for page-fault resolution. |
f559fa36 PX |
533 | .TP |
534 | .B UFFDIO_COPY_MODE_WP | |
535 | Copy the page with read-only permission. | |
536 | This allows the user to trap the next write to the page, | |
537 | which will block and generate another write-protect userfault message. | |
7a3d0845 | 538 | This is used only when both |
f559fa36 PX |
539 | .B UFFDIO_REGISTER_MODE_MISSING |
540 | and | |
541 | .B UFFDIO_REGISTER_MODE_WP | |
542 | modes are enabled for the registered range. | |
c6d039a3 | 543 | .P |
97b6084b MK |
544 | The |
545 | .I copy | |
1dca8e3d MK |
546 | field is used by the kernel to return the number of bytes |
547 | that was actually copied, or an error (a negated | |
548 | .IR errno -style | |
549 | value). | |
53b95540 MK |
550 | .\" FIXME Above: Why is the 'copy' field used to return error values? |
551 | .\" This should be explained in the manual page. | |
1dca8e3d MK |
552 | If the value returned in |
553 | .I copy | |
554 | doesn't match the value that was specified in | |
555 | .IR len , | |
556 | the operation fails with the error | |
557 | .BR EAGAIN . | |
558 | The | |
559 | .I copy | |
560 | field is output-only; | |
97b6084b MK |
561 | it is not read by the |
562 | .B UFFDIO_COPY | |
563 | operation. | |
c6d039a3 | 564 | .P |
1dca8e3d MK |
565 | This |
566 | .BR ioctl (2) | |
567 | operation returns 0 on success. | |
568 | In this case, the entire area was copied. | |
569 | On error, \-1 is returned and | |
570 | .I errno | |
855d489a | 571 | is set to indicate the error. |
1dca8e3d MK |
572 | Possible errors include: |
573 | .TP | |
574 | .B EAGAIN | |
575 | The number of bytes copied (i.e., the value returned in the | |
576 | .I copy | |
577 | field) | |
578 | does not equal the value that was specified in the | |
579 | .I len | |
580 | field. | |
581 | .TP | |
582 | .B EINVAL | |
583 | Either | |
584 | .I dst | |
585 | or | |
586 | .I len | |
587 | was not a multiple of the system page size, or the range specified by | |
1ae6b2c7 | 588 | .I src |
1dca8e3d | 589 | and |
1ae6b2c7 | 590 | .I len |
1dca8e3d | 591 | or |
1ae6b2c7 | 592 | .I dst |
1dca8e3d | 593 | and |
1ae6b2c7 | 594 | .I len |
1dca8e3d MK |
595 | was invalid. |
596 | .TP | |
597 | .B EINVAL | |
598 | An invalid bit was specified in the | |
1ae6b2c7 | 599 | .I mode |
1dca8e3d | 600 | field. |
337405ef | 601 | .TP |
5191c688 | 602 | .BR ENOENT " (since Linux 4.11)" |
337405ef | 603 | The faulting process has changed |
5191c688 | 604 | its virtual memory layout simultaneously with an outstanding |
4190ca0a | 605 | .B UFFDIO_COPY |
337405ef MR |
606 | operation. |
607 | .TP | |
9dc44855 | 608 | .BR ENOSPC " (from Linux 4.11 until Linux 4.13)" |
e3ec1293 | 609 | The faulting process has exited at the time of a |
4190ca0a | 610 | .B UFFDIO_COPY |
e3ec1293 MR |
611 | operation. |
612 | .TP | |
613 | .BR ESRCH " (since Linux 4.13)" | |
5191c688 | 614 | The faulting process has exited at the time of a |
4190ca0a | 615 | .B UFFDIO_COPY |
337405ef | 616 | operation. |
97b6084b | 617 | .\" |
611ec3a3 | 618 | .SS UFFDIO_ZEROPAGE |
1b945386 | 619 | (Since Linux 4.3.) |
611ec3a3 | 620 | Zero out a memory range registered with userfaultfd. |
c6d039a3 | 621 | .P |
97b6084b MK |
622 | The requested range is specified by the |
623 | .I range | |
624 | field of the | |
625 | .I uffdio_zeropage | |
74bd6e93 MK |
626 | structure pointed to by |
627 | .IR argp : | |
c6d039a3 | 628 | .P |
97b6084b | 629 | .in +4n |
b8302363 | 630 | .EX |
97b6084b MK |
631 | struct uffdio_zeropage { |
632 | struct uffdio_range range; | |
115b4e0e AC |
633 | __u64 mode; /* Flags controlling behavior of zeropage */ |
634 | __s64 zeropage; /* Number of bytes zeroed, or negated error */ | |
97b6084b | 635 | }; |
b8302363 | 636 | .EE |
97b6084b | 637 | .in |
c6d039a3 | 638 | .P |
611ec3a3 | 639 | The following value may be bitwise ORed in |
1ae6b2c7 | 640 | .I mode |
405513d3 | 641 | to change the behavior of the |
99de80c5 | 642 | .B UFFDIO_ZEROPAGE |
97b6084b | 643 | operation: |
97b6084b MK |
644 | .TP |
645 | .B UFFDIO_ZEROPAGE_MODE_DONTWAKE | |
646 | Do not wake up the thread that waits for page-fault resolution. | |
c6d039a3 | 647 | .P |
97b6084b MK |
648 | The |
649 | .I zeropage | |
611ec3a3 | 650 | field is used by the kernel to return the number of bytes |
97b6084b MK |
651 | that was actually zeroed, |
652 | or an error in the same manner as | |
611ec3a3 MK |
653 | .BR UFFDIO_COPY . |
654 | .\" FIXME Why is the 'zeropage' field used to return error values? | |
53b95540 | 655 | .\" This should be explained in the manual page. |
611ec3a3 MK |
656 | If the value returned in the |
657 | .I zeropage | |
658 | field doesn't match the value that was specified in | |
659 | .IR range.len , | |
660 | the operation fails with the error | |
661 | .BR EAGAIN . | |
662 | The | |
663 | .I zeropage | |
664 | field is output-only; | |
665 | it is not read by the | |
99de80c5 | 666 | .B UFFDIO_ZEROPAGE |
611ec3a3 | 667 | operation. |
c6d039a3 | 668 | .P |
611ec3a3 MK |
669 | This |
670 | .BR ioctl (2) | |
671 | operation returns 0 on success. | |
672 | In this case, the entire area was zeroed. | |
673 | On error, \-1 is returned and | |
674 | .I errno | |
855d489a | 675 | is set to indicate the error. |
611ec3a3 MK |
676 | Possible errors include: |
677 | .TP | |
678 | .B EAGAIN | |
679 | The number of bytes zeroed (i.e., the value returned in the | |
680 | .I zeropage | |
681 | field) | |
682 | does not equal the value that was specified in the | |
683 | .I range.len | |
684 | field. | |
685 | .TP | |
686 | .B EINVAL | |
687 | Either | |
688 | .I range.start | |
689 | or | |
690 | .I range.len | |
242c2b6b MK |
691 | was not a multiple of the system page size; or |
692 | .I range.len | |
693 | was zero; or the range specified was invalid. | |
611ec3a3 MK |
694 | .TP |
695 | .B EINVAL | |
696 | An invalid bit was specified in the | |
1ae6b2c7 | 697 | .I mode |
611ec3a3 | 698 | field. |
e3ec1293 MR |
699 | .TP |
700 | .BR ESRCH " (since Linux 4.13)" | |
701 | The faulting process has exited at the time of a | |
4190ca0a | 702 | .B UFFDIO_ZEROPAGE |
e3ec1293 | 703 | operation. |
97b6084b | 704 | .\" |
637512ed | 705 | .SS UFFDIO_WAKE |
1b945386 | 706 | (Since Linux 4.3.) |
66e11c1f MK |
707 | Wake up the thread waiting for page-fault resolution on |
708 | a specified memory address range. | |
c6d039a3 | 709 | .P |
be7979d7 MK |
710 | The |
711 | .B UFFDIO_WAKE | |
712 | operation is used in conjunction with | |
1ae6b2c7 | 713 | .B UFFDIO_COPY |
be7979d7 | 714 | and |
1ae6b2c7 | 715 | .B UFFDIO_ZEROPAGE |
be7979d7 | 716 | operations that have the |
1ae6b2c7 | 717 | .B UFFDIO_COPY_MODE_DONTWAKE |
be7979d7 | 718 | or |
1ae6b2c7 | 719 | .B UFFDIO_ZEROPAGE_MODE_DONTWAKE |
be7979d7 MK |
720 | bit set in the |
721 | .I mode | |
722 | field. | |
723 | The userfault monitor can perform several | |
1ae6b2c7 | 724 | .B UFFDIO_COPY |
be7979d7 | 725 | and |
1ae6b2c7 | 726 | .B UFFDIO_ZEROPAGE |
be7979d7 MK |
727 | operations in a batch and then explicitly wake up the faulting thread using |
728 | .BR UFFDIO_WAKE . | |
c6d039a3 | 729 | .P |
74bd6e93 MK |
730 | The |
731 | .I argp | |
732 | argument is a pointer to a | |
733 | .I uffdio_range | |
66e11c1f | 734 | structure (shown above) that specifies the address range. |
c6d039a3 | 735 | .P |
97b6084b MK |
736 | This |
737 | .BR ioctl (2) | |
738 | operation returns 0 on success. | |
739 | On error, \-1 is returned and | |
740 | .I errno | |
855d489a | 741 | is set to indicate the error. |
97b6084b | 742 | Possible errors include: |
97b6084b MK |
743 | .TP |
744 | .B EINVAL | |
242c2b6b | 745 | The |
97b6084b MK |
746 | .I start |
747 | or the | |
748 | .I len | |
749 | field of the | |
750 | .I uffdio_range | |
242c2b6b MK |
751 | structure was not a multiple of the system page size; or |
752 | .I len | |
753 | was zero; or the specified range was otherwise invalid. | |
af205afd AR |
754 | .SS UFFDIO_WRITEPROTECT |
755 | (Since Linux 5.7.) | |
7a3d0845 | 756 | Write-protect or write-unprotect a userfaultfd-registered memory range |
f559fa36 PX |
757 | registered with mode |
758 | .BR UFFDIO_REGISTER_MODE_WP . | |
c6d039a3 | 759 | .P |
f559fa36 PX |
760 | The |
761 | .I argp | |
762 | argument is a pointer to a | |
763 | .I uffdio_writeprotect | |
764 | structure as shown below: | |
c6d039a3 | 765 | .P |
f559fa36 PX |
766 | .in +4n |
767 | .EX | |
768 | struct uffdio_writeprotect { | |
769 | struct uffdio_range range; /* Range to change write permission */ | |
115b4e0e | 770 | __u64 mode; /* Mode to change write permission */ |
f559fa36 PX |
771 | }; |
772 | .EE | |
773 | .in | |
c6d039a3 | 774 | .P |
1cf1ada5 | 775 | There are two mode bits that are supported in this structure: |
f559fa36 PX |
776 | .TP |
777 | .B UFFDIO_WRITEPROTECT_MODE_WP | |
778 | When this mode bit is set, | |
779 | the ioctl will be a write-protect operation upon the memory range specified by | |
780 | .IR range . | |
1cf1ada5 | 781 | Otherwise it will be a write-unprotect operation upon the specified range, |
7a3d0845 | 782 | which can be used to resolve a userfaultfd write-protect page fault. |
f559fa36 PX |
783 | .TP |
784 | .B UFFDIO_WRITEPROTECT_MODE_DONTWAKE | |
785 | When this mode bit is set, | |
786 | do not wake up any thread that waits for | |
787 | page-fault resolution after the operation. | |
7a3d0845 | 788 | This can be specified only if |
f559fa36 PX |
789 | .B UFFDIO_WRITEPROTECT_MODE_WP |
790 | is not specified. | |
c6d039a3 | 791 | .P |
f559fa36 PX |
792 | This |
793 | .BR ioctl (2) | |
794 | operation returns 0 on success. | |
795 | On error, \-1 is returned and | |
796 | .I errno | |
797 | is set to indicate the error. | |
798 | Possible errors include: | |
799 | .TP | |
800 | .B EINVAL | |
801 | The | |
802 | .I start | |
803 | or the | |
804 | .I len | |
805 | field of the | |
806 | .I uffdio_range | |
807 | structure was not a multiple of the system page size; or | |
808 | .I len | |
809 | was zero; or the specified range was otherwise invalid. | |
810 | .TP | |
811 | .B EAGAIN | |
812 | The process was interrupted; retry this call. | |
813 | .TP | |
814 | .B ENOENT | |
815 | The range specified in | |
816 | .I range | |
817 | is not valid. | |
818 | For example, the virtual address does not exist, | |
819 | or is not registered with userfaultfd write-protect mode. | |
820 | .TP | |
821 | .B EFAULT | |
822 | Encountered a generic fault during processing. | |
140b625a AR |
823 | .\" |
824 | .SS UFFDIO_CONTINUE | |
825 | (Since Linux 5.13.) | |
826 | Resolve a minor page fault | |
827 | by installing page table entries | |
828 | for existing pages in the page cache. | |
c6d039a3 | 829 | .P |
140b625a AR |
830 | The |
831 | .I argp | |
832 | argument is a pointer to a | |
833 | .I uffdio_continue | |
834 | structure as shown below: | |
c6d039a3 | 835 | .P |
140b625a AR |
836 | .in +4n |
837 | .EX | |
838 | struct uffdio_continue { | |
e5da16f1 AC |
839 | struct uffdio_range range; |
840 | /* Range to install PTEs for and continue */ | |
841 | __u64 mode; /* Flags controlling the behavior of continue */ | |
842 | __s64 mapped; /* Number of bytes mapped, or negated error */ | |
140b625a AR |
843 | }; |
844 | .EE | |
845 | .in | |
c6d039a3 | 846 | .P |
140b625a | 847 | The following value may be bitwise ORed in |
39a3ee3f | 848 | .I mode |
140b625a AR |
849 | to change the behavior of the |
850 | .B UFFDIO_CONTINUE | |
851 | operation: | |
852 | .TP | |
853 | .B UFFDIO_CONTINUE_MODE_DONTWAKE | |
854 | Do not wake up the thread that waits for page-fault resolution. | |
c6d039a3 | 855 | .P |
140b625a AR |
856 | The |
857 | .I mapped | |
858 | field is used by the kernel | |
859 | to return the number of bytes that were actually mapped, | |
860 | or an error in the same manner as | |
861 | .BR UFFDIO_COPY . | |
862 | If the value returned in the | |
863 | .I mapped | |
864 | field doesn't match the value that was specified in | |
865 | .IR range.len , | |
866 | the operation fails with the error | |
867 | .BR EAGAIN . | |
868 | The | |
869 | .I mapped | |
870 | field is output-only; | |
871 | it is not read by the | |
872 | .B UFFDIO_CONTINUE | |
873 | operation. | |
c6d039a3 | 874 | .P |
140b625a AR |
875 | This |
876 | .BR ioctl (2) | |
877 | operation returns 0 on success. | |
878 | In this case, | |
879 | the entire area was mapped. | |
880 | On error, \-1 is returned and | |
881 | .I errno | |
882 | is set to indicate the error. | |
883 | Possible errors include: | |
884 | .TP | |
885 | .B EAGAIN | |
886 | The number of bytes mapped | |
887 | (i.e., the value returned in the | |
888 | .I mapped | |
889 | field) | |
890 | does not equal the value that was specified in the | |
891 | .I range.len | |
892 | field. | |
893 | .TP | |
9ae03d21 AR |
894 | .B EEXIST |
895 | One or more pages were already mapped in the given range. | |
896 | .TP | |
897 | .B EFAULT | |
898 | No existing page could be found in the page cache for the given range. | |
899 | .TP | |
140b625a AR |
900 | .B EINVAL |
901 | Either | |
902 | .I range.start | |
903 | or | |
904 | .I range.len | |
905 | was not a multiple of the system page size; or | |
906 | .I range.len | |
907 | was zero; or the range specified was invalid. | |
908 | .TP | |
909 | .B EINVAL | |
910 | An invalid bit was specified in the | |
39a3ee3f | 911 | .I mode |
140b625a AR |
912 | field. |
913 | .TP | |
140b625a AR |
914 | .B ENOENT |
915 | The faulting process has changed its virtual memory layout simultaneously with | |
916 | an outstanding | |
917 | .B UFFDIO_CONTINUE | |
918 | operation. | |
919 | .TP | |
920 | .B ENOMEM | |
921 | Allocating memory needed to set up the page table mappings failed. | |
922 | .TP | |
f8dd996e | 923 | .B ESRCH |
140b625a AR |
924 | The faulting process has exited at the time of a |
925 | .B UFFDIO_CONTINUE | |
926 | operation. | |
927 | .\" | |
9ae03d21 AR |
928 | .SS UFFDIO_POISON |
929 | (Since Linux 6.6.) | |
930 | Mark an address range as "poisoned". | |
931 | Future accesses to these addresses will raise a | |
932 | .B SIGBUS | |
933 | signal. | |
934 | Unlike | |
935 | .BR MADV_HWPOISON , | |
936 | this works by installing page table entries, | |
937 | rather than "really" poisoning the underlying physical pages. | |
938 | This means it only affects this particular address space. | |
c6d039a3 | 939 | .P |
9ae03d21 AR |
940 | The |
941 | .I argp | |
942 | argument is a pointer to a | |
943 | .I uffdio_poison | |
944 | structure as shown below: | |
c6d039a3 | 945 | .P |
9ae03d21 AR |
946 | .in +4n |
947 | .EX | |
948 | struct uffdio_poison { | |
949 | struct uffdio_range range; | |
950 | /* Range to install poison PTE markers in */ | |
951 | __u64 mode; /* Flags controlling the behavior of poison */ | |
952 | __s64 updated; /* Number of bytes poisoned, or negated error */ | |
953 | }; | |
954 | .EE | |
955 | .in | |
c6d039a3 | 956 | .P |
9ae03d21 AR |
957 | The following value may be bitwise ORed in |
958 | .I mode | |
959 | to change the behavior of the | |
960 | .B UFFDIO_POISON | |
961 | operation: | |
962 | .TP | |
963 | .B UFFDIO_POISON_MODE_DONTWAKE | |
964 | Do not wake up the thread that waits for page-fault resolution. | |
c6d039a3 | 965 | .P |
9ae03d21 AR |
966 | The |
967 | .I updated | |
968 | field is used by the kernel | |
969 | to return the number of bytes that were actually poisoned, | |
970 | or an error in the same manner as | |
971 | .BR UFFDIO_COPY . | |
972 | If the value returned in the | |
973 | .I updated | |
974 | field doesn't match the value that was specified in | |
975 | .IR range.len , | |
976 | the operation fails with the error | |
977 | .BR EAGAIN . | |
978 | The | |
979 | .I updated | |
980 | field is output-only; | |
981 | it is not read by the | |
982 | .B UFFDIO_POISON | |
983 | operation. | |
c6d039a3 | 984 | .P |
9ae03d21 AR |
985 | This |
986 | .BR ioctl (2) | |
987 | operation returns 0 on success. | |
988 | In this case, | |
989 | the entire area was poisoned. | |
990 | On error, \-1 is returned and | |
991 | .I errno | |
992 | is set to indicate the error. | |
993 | Possible errors include: | |
994 | .TP | |
995 | .B EAGAIN | |
996 | The number of bytes poisoned | |
997 | (i.e., the value returned in the | |
998 | .I updated | |
999 | field) | |
1000 | does not equal the value that was specified in the | |
1001 | .I range.len | |
1002 | field. | |
1003 | .TP | |
1004 | .B EINVAL | |
1005 | Either | |
1006 | .I range.start | |
1007 | or | |
1008 | .I range.len | |
1009 | was not a multiple of the system page size; or | |
1010 | .I range.len | |
1011 | was zero; or the range specified was invalid. | |
1012 | .TP | |
1013 | .B EINVAL | |
1014 | An invalid bit was specified in the | |
1015 | .I mode | |
1016 | field. | |
1017 | .TP | |
1018 | .B EEXIST | |
1019 | One or more pages were already mapped in the given range. | |
1020 | .TP | |
1021 | .B ENOENT | |
1022 | The faulting process has changed its virtual memory layout simultaneously with | |
1023 | an outstanding | |
1024 | .B UFFDIO_POISON | |
1025 | operation. | |
1026 | .TP | |
1027 | .B ENOMEM | |
1028 | Allocating memory for page table entries failed. | |
1029 | .TP | |
1030 | .B ESRCH | |
1031 | The faulting process has exited at the time of a | |
1032 | .B UFFDIO_POISON | |
1033 | operation. | |
1034 | .\" | |
97b6084b MK |
1035 | .SH RETURN VALUE |
1036 | See descriptions of the individual operations, above. | |
1037 | .SH ERRORS | |
1038 | See descriptions of the individual operations, above. | |
24fa87ff MK |
1039 | In addition, the following general errors can occur for all of the |
1040 | operations described above: | |
1041 | .TP | |
1042 | .B EFAULT | |
1043 | .I argp | |
1044 | does not point to a valid memory address. | |
1045 | .TP | |
1046 | .B EINVAL | |
1047 | (For all operations except | |
1048 | .BR UFFDIO_API .) | |
1049 | The userfaultfd object has not yet been enabled (via the | |
1ae6b2c7 | 1050 | .B UFFDIO_API |
24fa87ff | 1051 | operation). |
3113c7f3 | 1052 | .SH STANDARDS |
4131356c | 1053 | Linux. |
53004b3b MR |
1054 | .SH BUGS |
1055 | In order to detect available userfault features and | |
5191c688 MK |
1056 | enable some subset of those features, |
1057 | the userfaultfd file descriptor must be closed after the first | |
1ae6b2c7 | 1058 | .B UFFDIO_API |
5191c688 | 1059 | operation that queries feature availability and reopened before |
53004b3b | 1060 | the second |
1ae6b2c7 | 1061 | .B UFFDIO_API |
5191c688 | 1062 | operation that actually enables the desired features. |
a14af333 | 1063 | .SH EXAMPLES |
e5efbe91 MK |
1064 | See |
1065 | .BR userfaultfd (2). | |
97b6084b MK |
1066 | .SH SEE ALSO |
1067 | .BR ioctl (2), | |
1068 | .BR mmap (2), | |
1069 | .BR userfaultfd (2) | |
c6d039a3 | 1070 | .P |
1ae6b2c7 | 1071 | .I Documentation/admin\-guide/mm/userfaultfd.rst |
97b6084b | 1072 | in the Linux kernel source tree |