]>
Commit | Line | Data |
---|---|---|
97b6084b MK |
1 | .\" Copyright (c) 2016, IBM Corporation. |
2 | .\" Written by Mike Rapoport <rppt@linux.vnet.ibm.com> | |
3 | .\" and Copyright (C) 2016 Michael Kerrisk <mtk.manpages@gmail.com> | |
4 | .\" | |
5fbde956 | 5 | .\" SPDX-License-Identifier: Linux-man-pages-copyleft |
97b6084b MK |
6 | .\" |
7 | .\" | |
45186a5d | 8 | .TH IOCTL_USERFAULTFD 2 2021-03-22 "Linux man-pages (unreleased)" |
97b6084b | 9 | .SH NAME |
7c4dfcac | 10 | ioctl_userfaultfd \- create a file descriptor for handling page faults in user |
97b6084b | 11 | space |
1d7a4349 AC |
12 | .SH LIBRARY |
13 | Standard C library | |
8fc3b2cf | 14 | .RI ( libc ", " \-lc ) |
97b6084b MK |
15 | .SH SYNOPSIS |
16 | .nf | |
cac89bc7 | 17 | .BR "#include <linux/userfaultfd.h>" " /* Definition of " UFFD* " constants */" |
97b6084b | 18 | .B #include <sys/ioctl.h> |
dbfe9c70 | 19 | .PP |
97b6084b MK |
20 | .BI "int ioctl(int " fd ", int " cmd ", ...);" |
21 | .fi | |
22 | .SH DESCRIPTION | |
23 | Various | |
24 | .BR ioctl (2) | |
25 | operations can be performed on a userfaultfd object (created by a call to | |
26 | .BR userfaultfd (2)) | |
27 | using calls of the form: | |
a458bc45 MK |
28 | .PP |
29 | .in +4n | |
30 | .EX | |
31 | ioctl(fd, cmd, argp); | |
32 | .EE | |
33 | .in | |
97b6084b MK |
34 | In the above, |
35 | .I fd | |
36 | is a file descriptor referring to a userfaultfd object, | |
37 | .I cmd | |
38 | is one of the commands listed below, and | |
39 | .I argp | |
40 | is a pointer to a data structure that is specific to | |
41 | .IR cmd . | |
efeece04 | 42 | .PP |
fe905357 | 43 | The various |
97b6084b | 44 | .BR ioctl (2) |
fe905357 MK |
45 | operations are described below. |
46 | The | |
f7111396 | 47 | .BR UFFDIO_API , |
fe905357 MK |
48 | .BR UFFDIO_REGISTER , |
49 | and | |
1ae6b2c7 | 50 | .B UFFDIO_UNREGISTER |
fe905357 MK |
51 | operations are used to |
52 | .I configure | |
53 | userfaultfd behavior. | |
54 | These operations allow the caller to choose what features will be enabled and | |
97b6084b | 55 | what kinds of events will be delivered to the application. |
fe905357 | 56 | The remaining operations are |
1ae6b2c7 | 57 | .I range |
fe905357 | 58 | operations. |
e8a846ae | 59 | These operations enable the calling application to resolve page-fault |
a18cc179 | 60 | events. |
637512ed MK |
61 | .\" |
62 | .SS UFFDIO_API | |
1b945386 | 63 | (Since Linux 4.3.) |
97b6084b | 64 | Enable operation of the userfaultfd and perform API handshake. |
efeece04 | 65 | .PP |
97b6084b | 66 | The |
74bd6e93 MK |
67 | .I argp |
68 | argument is a pointer to a | |
1ae6b2c7 | 69 | .I uffdio_api |
74bd6e93 | 70 | structure, defined as: |
b76974c1 | 71 | .PP |
97b6084b | 72 | .in +4n |
b76974c1 | 73 | .EX |
97b6084b | 74 | struct uffdio_api { |
2294974b AC |
75 | uint64_t api; /* Requested API version (input) */ |
76 | uint64_t features; /* Requested features (input/output) */ | |
77 | uint64_t ioctls; /* Available ioctl() operations (output) */ | |
97b6084b | 78 | }; |
b76974c1 | 79 | .EE |
97b6084b | 80 | .in |
b76974c1 | 81 | .PP |
97b6084b MK |
82 | The |
83 | .I api | |
84 | field denotes the API version requested by the application. | |
efeece04 | 85 | .PP |
9a3c2194 MK |
86 | The kernel verifies that it can support the requested API version, |
87 | and sets the | |
97b6084b MK |
88 | .I features |
89 | and | |
90 | .I ioctls | |
91 | fields to bit masks representing all the available features and the generic | |
405513d3 | 92 | .BR ioctl (2) |
97b6084b | 93 | operations available. |
efeece04 | 94 | .PP |
30dced33 MR |
95 | For Linux kernel versions before 4.11, the |
96 | .I features | |
97 | field must be initialized to zero before the call to | |
4190ca0a | 98 | .BR UFFDIO_API , |
1a053be3 | 99 | and zero (i.e., no feature bits) is placed in the |
30dced33 MR |
100 | .I features |
101 | field by the kernel upon return from | |
102 | .BR ioctl (2). | |
efeece04 | 103 | .PP |
30dced33 MR |
104 | Starting from Linux 4.11, the |
105 | .I features | |
1a053be3 | 106 | field can be used to ask whether particular features are supported |
30dced33 MR |
107 | and explicitly enable userfaultfd features that are disabled by default. |
108 | The kernel always reports all the available features in the | |
9a3c2194 MK |
109 | .I features |
110 | field. | |
efeece04 | 111 | .PP |
66487517 MR |
112 | To enable userfaultfd features the application should set |
113 | a bit corresponding to each feature it wants to enable in the | |
114 | .I features | |
115 | field. | |
116 | If the kernel supports all the requested features it will enable them. | |
117 | Otherwise it will zero out the returned | |
118 | .I uffdio_api | |
119 | structure and return | |
120 | .BR EINVAL . | |
30dced33 | 121 | .\" FIXME add more details about feature negotiation and enablement |
efeece04 | 122 | .PP |
364270d0 | 123 | The following feature bits may be set: |
30dced33 | 124 | .TP |
364270d0 | 125 | .BR UFFD_FEATURE_EVENT_FORK " (since Linux 4.11)" |
66487517 MR |
126 | When this feature is enabled, |
127 | the userfaultfd objects associated with a parent process are duplicated | |
128 | into the child process during | |
129 | .BR fork (2) | |
42630fa9 MK |
130 | and a |
131 | .B UFFD_EVENT_FORK | |
81bb1233 | 132 | event is delivered to the userfaultfd monitor. |
30dced33 | 133 | .TP |
364270d0 | 134 | .BR UFFD_FEATURE_EVENT_REMAP " (since Linux 4.11)" |
66487517 MR |
135 | If this feature is enabled, |
136 | when the faulting process invokes | |
42630fa9 | 137 | .BR mremap (2), |
66487517 | 138 | the userfaultfd monitor will receive an event of type |
42630fa9 | 139 | .BR UFFD_EVENT_REMAP . |
30dced33 | 140 | .TP |
364270d0 | 141 | .BR UFFD_FEATURE_EVENT_REMOVE " (since Linux 4.11)" |
66487517 MR |
142 | If this feature is enabled, |
143 | when the faulting process calls | |
42630fa9 MK |
144 | .BR madvise (2) |
145 | with the | |
146 | .B MADV_DONTNEED | |
66487517 | 147 | or |
42630fa9 MK |
148 | .B MADV_REMOVE |
149 | advice value to free a virtual memory area, | |
66487517 | 150 | the userfaultfd monitor will receive an event of type |
42630fa9 | 151 | .BR UFFD_EVENT_REMOVE . |
30dced33 | 152 | .TP |
364270d0 | 153 | .BR UFFD_FEATURE_EVENT_UNMAP " (since Linux 4.11)" |
66487517 MR |
154 | If this feature is enabled, |
155 | when the faulting process unmaps virtual memory either explicitly with | |
42630fa9 MK |
156 | .BR munmap (2), |
157 | or implicitly during either | |
66487517 MR |
158 | .BR mmap (2) |
159 | or | |
03659d7d | 160 | .BR mremap (2), |
66487517 | 161 | the userfaultfd monitor will receive an event of type |
42630fa9 | 162 | .BR UFFD_EVENT_UNMAP . |
30dced33 | 163 | .TP |
364270d0 | 164 | .BR UFFD_FEATURE_MISSING_HUGETLBFS " (since Linux 4.11)" |
66487517 MR |
165 | If this feature bit is set, |
166 | the kernel supports registering userfaultfd ranges on hugetlbfs | |
167 | virtual memory areas. | |
30dced33 | 168 | .TP |
364270d0 | 169 | .BR UFFD_FEATURE_MISSING_SHMEM " (since Linux 4.11)" |
66487517 | 170 | If this feature bit is set, |
41e9e648 MR |
171 | the kernel supports registering userfaultfd ranges on shared memory areas. |
172 | This includes all kernel shared memory APIs: | |
173 | System V shared memory, | |
5191c688 MK |
174 | .BR tmpfs (5), |
175 | shared mappings of | |
176 | .IR /dev/zero , | |
177 | .BR mmap (2) | |
178 | with the | |
f1008f5f | 179 | .B MAP_SHARED |
41e9e648 MR |
180 | flag set, |
181 | .BR memfd_create (2), | |
5191c688 | 182 | and so on. |
2c371032 | 183 | .TP |
364270d0 | 184 | .BR UFFD_FEATURE_SIGBUS " (since Linux 4.14)" |
1df2779d | 185 | .\" commit 2d6d6f5a09a96cc1fec7ed992b825e05f64cb50e |
1df2779d MK |
186 | If this feature bit is set, no page-fault events |
187 | .RB ( UFFD_EVENT_PAGEFAULT ) | |
188 | will be delivered. | |
189 | Instead, a | |
2c371032 | 190 | .B SIGBUS |
1df2779d MK |
191 | signal will be sent to the faulting process. |
192 | Applications using this | |
2c371032 PS |
193 | feature will not require the use of a userfaultfd monitor for processing |
194 | memory accesses to the regions registered with userfaultfd. | |
fbda69bb PX |
195 | .TP |
196 | .BR UFFD_FEATURE_THREAD_ID " (since Linux 4.14)" | |
197 | If this feature bit is set, | |
198 | .I uffd_msg.pagefault.feat.ptid | |
7a3d0845 | 199 | will be set to the faulted thread ID for each page-fault message. |
140b625a AR |
200 | .TP |
201 | .BR UFFD_FEATURE_MINOR_HUGETLBFS " (since Linux 5.13)" | |
202 | If this feature bit is set, | |
203 | the kernel supports registering userfaultfd ranges | |
204 | in minor mode on hugetlbfs-backed memory areas. | |
205 | .TP | |
206 | .BR UFFD_FEATURE_MINOR_SHMEM " (since Linux 5.14)" | |
207 | If this feature bit is set, | |
208 | the kernel supports registering userfaultfd ranges | |
209 | in minor mode on shmem-backed memory areas. | |
1df2779d | 210 | .PP |
9a3c2194 MK |
211 | The returned |
212 | .I ioctls | |
213 | field can contain the following bits: | |
214 | .\" FIXME This user-space API seems not fully polished. Why are there | |
53b95540 | 215 | .\" not constants defined for each of the bit-mask values listed below? |
9a3c2194 MK |
216 | .TP |
217 | .B 1 << _UFFDIO_API | |
218 | The | |
219 | .B UFFDIO_API | |
220 | operation is supported. | |
221 | .TP | |
222 | .B 1 << _UFFDIO_REGISTER | |
223 | The | |
224 | .B UFFDIO_REGISTER | |
225 | operation is supported. | |
226 | .TP | |
227 | .B 1 << _UFFDIO_UNREGISTER | |
228 | The | |
229 | .B UFFDIO_UNREGISTER | |
230 | operation is supported. | |
9a3c2194 | 231 | .PP |
97b6084b MK |
232 | This |
233 | .BR ioctl (2) | |
234 | operation returns 0 on success. | |
235 | On error, \-1 is returned and | |
236 | .I errno | |
855d489a | 237 | is set to indicate the error. |
97b6084b | 238 | Possible errors include: |
cd8389ff MK |
239 | .TP |
240 | .B EFAULT | |
241 | .I argp | |
242 | refers to an address that is outside the calling process's | |
243 | accessible address space. | |
97b6084b MK |
244 | .TP |
245 | .B EINVAL | |
ffbc7efc | 246 | The userfaultfd has already been enabled by a previous |
1ae6b2c7 | 247 | .B UFFDIO_API |
ffbc7efc MK |
248 | operation. |
249 | .TP | |
250 | .B EINVAL | |
9a3c2194 MK |
251 | The API version requested in the |
252 | .I api | |
253 | field is not supported by this kernel, or the | |
254 | .I features | |
66487517 MR |
255 | field passed to the kernel includes feature bits that are not supported |
256 | by the current kernel version. | |
ef443f07 | 257 | .\" FIXME In the above error case, the returned 'uffdio_api' structure is |
53b95540 | 258 | .\" zeroed out. Why is this done? This should be explained in the manual page. |
637512ed | 259 | .\" |
1c63cefd MK |
260 | .\" Mike Rapoport: |
261 | .\" In my understanding the uffdio_api | |
262 | .\" structure is zeroed to allow the caller | |
263 | .\" to distinguish the reasons for -EINVAL. | |
264 | .\" | |
637512ed | 265 | .SS UFFDIO_REGISTER |
1b945386 | 266 | (Since Linux 4.3.) |
97b6084b | 267 | Register a memory address range with the userfaultfd object. |
d87f377e | 268 | The pages in the range must be "compatible". |
140b625a AR |
269 | Please refer to the list of register modes below |
270 | for the compatible memory backends for each mode. | |
efeece04 | 271 | .PP |
97b6084b | 272 | The |
74bd6e93 MK |
273 | .I argp |
274 | argument is a pointer to a | |
97b6084b | 275 | .I uffdio_register |
74bd6e93 | 276 | structure, defined as: |
b76974c1 | 277 | .PP |
97b6084b | 278 | .in +4n |
b76974c1 | 279 | .EX |
97b6084b | 280 | struct uffdio_range { |
2294974b AC |
281 | uint64_t start; /* Start of range */ |
282 | uint64_t len; /* Length of range (bytes) */ | |
97b6084b MK |
283 | }; |
284 | ||
285 | struct uffdio_register { | |
286 | struct uffdio_range range; | |
2294974b AC |
287 | uint64_t mode; /* Desired mode of operation (input) */ |
288 | uint64_t ioctls; /* Available ioctl() operations (output) */ | |
97b6084b | 289 | }; |
b76974c1 | 290 | .EE |
97b6084b | 291 | .in |
efeece04 | 292 | .PP |
97b6084b MK |
293 | The |
294 | .I range | |
295 | field defines a memory range starting at | |
296 | .I start | |
297 | and continuing for | |
298 | .I len | |
299 | bytes that should be handled by the userfaultfd. | |
efeece04 | 300 | .PP |
97b6084b MK |
301 | The |
302 | .I mode | |
303 | field defines the mode of operation desired for this memory region. | |
304 | The following values may be bitwise ORed to set the userfaultfd mode for | |
305 | the specified range: | |
97b6084b MK |
306 | .TP |
307 | .B UFFDIO_REGISTER_MODE_MISSING | |
a23c93d8 | 308 | Track page faults on missing pages. |
140b625a AR |
309 | Since Linux 4.3, |
310 | only private anonymous ranges are compatible. | |
311 | Since Linux 4.11, | |
312 | hugetlbfs and shared memory ranges are also compatible. | |
97b6084b MK |
313 | .TP |
314 | .B UFFDIO_REGISTER_MODE_WP | |
315 | Track page faults on write-protected pages. | |
140b625a AR |
316 | Since Linux 5.7, |
317 | only private anonymous ranges are compatible. | |
318 | .TP | |
319 | .B UFFDIO_REGISTER_MODE_MINOR | |
320 | Track minor page faults. | |
321 | Since Linux 5.13, | |
322 | only hugetlbfs ranges are compatible. | |
323 | Since Linux 5.14, | |
324 | compatibility with shmem ranges was added. | |
52b794f1 | 325 | .PP |
264a0a30 | 326 | If the operation is successful, the kernel modifies the |
97b6084b | 327 | .I ioctls |
264a0a30 MK |
328 | bit-mask field to indicate which |
329 | .BR ioctl (2) | |
330 | operations are available for the specified range. | |
6edec894 IA |
331 | This returned bit mask can contain the following bits: |
332 | .TP | |
333 | .B 1 << _UFFDIO_COPY | |
334 | The | |
335 | .B UFFDIO_COPY | |
336 | operation is supported. | |
337 | .TP | |
338 | .B 1 << _UFFDIO_WAKE | |
339 | The | |
340 | .B UFFDIO_WAKE | |
341 | operation is supported. | |
342 | .TP | |
343 | .B 1 << _UFFDIO_WRITEPROTECT | |
344 | The | |
345 | .BR UFFDIO_WRITEPROTECT " operation is supported." | |
346 | .TP | |
347 | .B 1 << _UFFDIO_ZEROPAGE | |
348 | The | |
349 | .B UFFDIO_ZEROPAGE | |
350 | operation is supported. | |
140b625a AR |
351 | .TP |
352 | .B 1 << _UFFDIO_CONTINUE | |
353 | The | |
354 | .B UFFDIO_CONTINUE | |
355 | operation is supported. | |
efeece04 | 356 | .PP |
264a0a30 MK |
357 | This |
358 | .BR ioctl (2) | |
359 | operation returns 0 on success. | |
360 | On error, \-1 is returned and | |
361 | .I errno | |
855d489a | 362 | is set to indicate the error. |
264a0a30 MK |
363 | Possible errors include: |
364 | .\" FIXME Is the following error list correct? | |
365 | .\" | |
366 | .TP | |
367 | .B EBUSY | |
368 | A mapping in the specified range is registered with another | |
369 | userfaultfd object. | |
370 | .TP | |
cd8389ff MK |
371 | .B EFAULT |
372 | .I argp | |
373 | refers to an address that is outside the calling process's | |
374 | accessible address space. | |
375 | .TP | |
264a0a30 MK |
376 | .B EINVAL |
377 | An invalid or unsupported bit was specified in the | |
378 | .I mode | |
379 | field; or the | |
380 | .I mode | |
381 | field was zero. | |
382 | .TP | |
383 | .B EINVAL | |
384 | There is no mapping in the specified address range. | |
385 | .TP | |
386 | .B EINVAL | |
242c2b6b MK |
387 | .I range.start |
388 | or | |
389 | .I range.len | |
390 | is not a multiple of the system page size; or, | |
391 | .I range.len | |
392 | is zero; or these fields are otherwise invalid. | |
393 | .TP | |
394 | .B EINVAL | |
264a0a30 | 395 | There is an incompatible mapping in the specified address range. |
a440cc33 MK |
396 | .\" Mike Rapoport: |
397 | .\" ENOMEM if the process is exiting and the | |
99cf1681 | 398 | .\" mm_struct has gone by the time userfault grabs it. |
637512ed | 399 | .SS UFFDIO_UNREGISTER |
1b945386 | 400 | (Since Linux 4.3.) |
97b6084b | 401 | Unregister a memory address range from userfaultfd. |
1c63cefd MK |
402 | The pages in the range must be "compatible" (see the description of |
403 | .BR UFFDIO_REGISTER ). | |
efeece04 | 404 | .PP |
97b6084b | 405 | The address range to unregister is specified in the |
1ae6b2c7 | 406 | .I uffdio_range |
97b6084b MK |
407 | structure pointed to by |
408 | .IR argp . | |
efeece04 | 409 | .PP |
97b6084b MK |
410 | This |
411 | .BR ioctl (2) | |
412 | operation returns 0 on success. | |
413 | On error, \-1 is returned and | |
414 | .I errno | |
855d489a | 415 | is set to indicate the error. |
97b6084b | 416 | Possible errors include: |
97b6084b MK |
417 | .TP |
418 | .B EINVAL | |
419 | Either the | |
420 | .I start | |
421 | or the | |
422 | .I len | |
423 | field of the | |
424 | .I uffdio_range | |
242c2b6b MK |
425 | structure was not a multiple of the system page size; or the |
426 | .I len | |
427 | field was zero; or these fields were otherwise invalid. | |
97b6084b MK |
428 | .TP |
429 | .B EINVAL | |
430 | There was an incompatible mapping in the specified address range. | |
431 | .TP | |
432 | .B EINVAL | |
433 | There was no mapping in the specified address range. | |
637512ed MK |
434 | .\" |
435 | .SS UFFDIO_COPY | |
1b945386 | 436 | (Since Linux 4.3.) |
97b6084b MK |
437 | Atomically copy a continuous memory chunk into the userfault registered |
438 | range and optionally wake up the blocked thread. | |
439 | The source and destination addresses and the number of bytes to copy are | |
440 | specified by the | |
441 | .IR src ", " dst ", and " len | |
74bd6e93 MK |
442 | fields of the |
443 | .I uffdio_copy | |
444 | structure pointed to by | |
445 | .IR argp : | |
efeece04 | 446 | .PP |
97b6084b | 447 | .in +4n |
b76974c1 | 448 | .EX |
97b6084b | 449 | struct uffdio_copy { |
2294974b AC |
450 | uint64_t dst; /* Destination of copy */ |
451 | uint64_t src; /* Source of copy */ | |
452 | uint64_t len; /* Number of bytes to copy */ | |
453 | uint64_t mode; /* Flags controlling behavior of copy */ | |
454 | int64_t copy; /* Number of bytes copied, or negated error */ | |
97b6084b | 455 | }; |
b76974c1 | 456 | .EE |
97b6084b | 457 | .in |
637512ed | 458 | .PP |
1dca8e3d | 459 | The following values may be bitwise ORed in |
1ae6b2c7 | 460 | .I mode |
97b6084b MK |
461 | to change the behavior of the |
462 | .B UFFDIO_COPY | |
463 | operation: | |
97b6084b MK |
464 | .TP |
465 | .B UFFDIO_COPY_MODE_DONTWAKE | |
e8a846ae | 466 | Do not wake up the thread that waits for page-fault resolution. |
f559fa36 PX |
467 | .TP |
468 | .B UFFDIO_COPY_MODE_WP | |
469 | Copy the page with read-only permission. | |
470 | This allows the user to trap the next write to the page, | |
471 | which will block and generate another write-protect userfault message. | |
7a3d0845 | 472 | This is used only when both |
f559fa36 PX |
473 | .B UFFDIO_REGISTER_MODE_MISSING |
474 | and | |
475 | .B UFFDIO_REGISTER_MODE_WP | |
476 | modes are enabled for the registered range. | |
637512ed | 477 | .PP |
97b6084b MK |
478 | The |
479 | .I copy | |
1dca8e3d MK |
480 | field is used by the kernel to return the number of bytes |
481 | that was actually copied, or an error (a negated | |
482 | .IR errno -style | |
483 | value). | |
53b95540 MK |
484 | .\" FIXME Above: Why is the 'copy' field used to return error values? |
485 | .\" This should be explained in the manual page. | |
1dca8e3d MK |
486 | If the value returned in |
487 | .I copy | |
488 | doesn't match the value that was specified in | |
489 | .IR len , | |
490 | the operation fails with the error | |
491 | .BR EAGAIN . | |
492 | The | |
493 | .I copy | |
494 | field is output-only; | |
97b6084b MK |
495 | it is not read by the |
496 | .B UFFDIO_COPY | |
497 | operation. | |
efeece04 | 498 | .PP |
1dca8e3d MK |
499 | This |
500 | .BR ioctl (2) | |
501 | operation returns 0 on success. | |
502 | In this case, the entire area was copied. | |
503 | On error, \-1 is returned and | |
504 | .I errno | |
855d489a | 505 | is set to indicate the error. |
1dca8e3d MK |
506 | Possible errors include: |
507 | .TP | |
508 | .B EAGAIN | |
509 | The number of bytes copied (i.e., the value returned in the | |
510 | .I copy | |
511 | field) | |
512 | does not equal the value that was specified in the | |
513 | .I len | |
514 | field. | |
515 | .TP | |
516 | .B EINVAL | |
517 | Either | |
518 | .I dst | |
519 | or | |
520 | .I len | |
521 | was not a multiple of the system page size, or the range specified by | |
1ae6b2c7 | 522 | .I src |
1dca8e3d | 523 | and |
1ae6b2c7 | 524 | .I len |
1dca8e3d | 525 | or |
1ae6b2c7 | 526 | .I dst |
1dca8e3d | 527 | and |
1ae6b2c7 | 528 | .I len |
1dca8e3d MK |
529 | was invalid. |
530 | .TP | |
531 | .B EINVAL | |
532 | An invalid bit was specified in the | |
1ae6b2c7 | 533 | .I mode |
1dca8e3d | 534 | field. |
337405ef | 535 | .TP |
5191c688 | 536 | .BR ENOENT " (since Linux 4.11)" |
337405ef | 537 | The faulting process has changed |
5191c688 | 538 | its virtual memory layout simultaneously with an outstanding |
4190ca0a | 539 | .B UFFDIO_COPY |
337405ef MR |
540 | operation. |
541 | .TP | |
9dc44855 | 542 | .BR ENOSPC " (from Linux 4.11 until Linux 4.13)" |
e3ec1293 | 543 | The faulting process has exited at the time of a |
4190ca0a | 544 | .B UFFDIO_COPY |
e3ec1293 MR |
545 | operation. |
546 | .TP | |
547 | .BR ESRCH " (since Linux 4.13)" | |
5191c688 | 548 | The faulting process has exited at the time of a |
4190ca0a | 549 | .B UFFDIO_COPY |
337405ef | 550 | operation. |
97b6084b | 551 | .\" |
611ec3a3 | 552 | .SS UFFDIO_ZEROPAGE |
1b945386 | 553 | (Since Linux 4.3.) |
611ec3a3 | 554 | Zero out a memory range registered with userfaultfd. |
efeece04 | 555 | .PP |
97b6084b MK |
556 | The requested range is specified by the |
557 | .I range | |
558 | field of the | |
559 | .I uffdio_zeropage | |
74bd6e93 MK |
560 | structure pointed to by |
561 | .IR argp : | |
efeece04 | 562 | .PP |
97b6084b | 563 | .in +4n |
b8302363 | 564 | .EX |
97b6084b MK |
565 | struct uffdio_zeropage { |
566 | struct uffdio_range range; | |
2294974b AC |
567 | uint64_t mode; /* Flags controlling behavior of copy */ |
568 | int64_t zeropage; /* Number of bytes zeroed, or negated error */ | |
97b6084b | 569 | }; |
b8302363 | 570 | .EE |
97b6084b | 571 | .in |
637512ed | 572 | .PP |
611ec3a3 | 573 | The following value may be bitwise ORed in |
1ae6b2c7 | 574 | .I mode |
405513d3 | 575 | to change the behavior of the |
99de80c5 | 576 | .B UFFDIO_ZEROPAGE |
97b6084b | 577 | operation: |
97b6084b MK |
578 | .TP |
579 | .B UFFDIO_ZEROPAGE_MODE_DONTWAKE | |
580 | Do not wake up the thread that waits for page-fault resolution. | |
637512ed | 581 | .PP |
97b6084b MK |
582 | The |
583 | .I zeropage | |
611ec3a3 | 584 | field is used by the kernel to return the number of bytes |
97b6084b MK |
585 | that was actually zeroed, |
586 | or an error in the same manner as | |
611ec3a3 MK |
587 | .BR UFFDIO_COPY . |
588 | .\" FIXME Why is the 'zeropage' field used to return error values? | |
53b95540 | 589 | .\" This should be explained in the manual page. |
611ec3a3 MK |
590 | If the value returned in the |
591 | .I zeropage | |
592 | field doesn't match the value that was specified in | |
593 | .IR range.len , | |
594 | the operation fails with the error | |
595 | .BR EAGAIN . | |
596 | The | |
597 | .I zeropage | |
598 | field is output-only; | |
599 | it is not read by the | |
99de80c5 | 600 | .B UFFDIO_ZEROPAGE |
611ec3a3 | 601 | operation. |
efeece04 | 602 | .PP |
611ec3a3 MK |
603 | This |
604 | .BR ioctl (2) | |
605 | operation returns 0 on success. | |
606 | In this case, the entire area was zeroed. | |
607 | On error, \-1 is returned and | |
608 | .I errno | |
855d489a | 609 | is set to indicate the error. |
611ec3a3 MK |
610 | Possible errors include: |
611 | .TP | |
612 | .B EAGAIN | |
613 | The number of bytes zeroed (i.e., the value returned in the | |
614 | .I zeropage | |
615 | field) | |
616 | does not equal the value that was specified in the | |
617 | .I range.len | |
618 | field. | |
619 | .TP | |
620 | .B EINVAL | |
621 | Either | |
622 | .I range.start | |
623 | or | |
624 | .I range.len | |
242c2b6b MK |
625 | was not a multiple of the system page size; or |
626 | .I range.len | |
627 | was zero; or the range specified was invalid. | |
611ec3a3 MK |
628 | .TP |
629 | .B EINVAL | |
630 | An invalid bit was specified in the | |
1ae6b2c7 | 631 | .I mode |
611ec3a3 | 632 | field. |
e3ec1293 MR |
633 | .TP |
634 | .BR ESRCH " (since Linux 4.13)" | |
635 | The faulting process has exited at the time of a | |
4190ca0a | 636 | .B UFFDIO_ZEROPAGE |
e3ec1293 | 637 | operation. |
97b6084b | 638 | .\" |
637512ed | 639 | .SS UFFDIO_WAKE |
1b945386 | 640 | (Since Linux 4.3.) |
66e11c1f MK |
641 | Wake up the thread waiting for page-fault resolution on |
642 | a specified memory address range. | |
efeece04 | 643 | .PP |
be7979d7 MK |
644 | The |
645 | .B UFFDIO_WAKE | |
646 | operation is used in conjunction with | |
1ae6b2c7 | 647 | .B UFFDIO_COPY |
be7979d7 | 648 | and |
1ae6b2c7 | 649 | .B UFFDIO_ZEROPAGE |
be7979d7 | 650 | operations that have the |
1ae6b2c7 | 651 | .B UFFDIO_COPY_MODE_DONTWAKE |
be7979d7 | 652 | or |
1ae6b2c7 | 653 | .B UFFDIO_ZEROPAGE_MODE_DONTWAKE |
be7979d7 MK |
654 | bit set in the |
655 | .I mode | |
656 | field. | |
657 | The userfault monitor can perform several | |
1ae6b2c7 | 658 | .B UFFDIO_COPY |
be7979d7 | 659 | and |
1ae6b2c7 | 660 | .B UFFDIO_ZEROPAGE |
be7979d7 MK |
661 | operations in a batch and then explicitly wake up the faulting thread using |
662 | .BR UFFDIO_WAKE . | |
efeece04 | 663 | .PP |
74bd6e93 MK |
664 | The |
665 | .I argp | |
666 | argument is a pointer to a | |
667 | .I uffdio_range | |
66e11c1f | 668 | structure (shown above) that specifies the address range. |
efeece04 | 669 | .PP |
97b6084b MK |
670 | This |
671 | .BR ioctl (2) | |
672 | operation returns 0 on success. | |
673 | On error, \-1 is returned and | |
674 | .I errno | |
855d489a | 675 | is set to indicate the error. |
97b6084b | 676 | Possible errors include: |
97b6084b MK |
677 | .TP |
678 | .B EINVAL | |
242c2b6b | 679 | The |
97b6084b MK |
680 | .I start |
681 | or the | |
682 | .I len | |
683 | field of the | |
684 | .I uffdio_range | |
242c2b6b MK |
685 | structure was not a multiple of the system page size; or |
686 | .I len | |
687 | was zero; or the specified range was otherwise invalid. | |
f559fa36 | 688 | .SS UFFDIO_WRITEPROTECT (Since Linux 5.7) |
7a3d0845 | 689 | Write-protect or write-unprotect a userfaultfd-registered memory range |
f559fa36 PX |
690 | registered with mode |
691 | .BR UFFDIO_REGISTER_MODE_WP . | |
692 | .PP | |
693 | The | |
694 | .I argp | |
695 | argument is a pointer to a | |
696 | .I uffdio_writeprotect | |
697 | structure as shown below: | |
698 | .PP | |
699 | .in +4n | |
700 | .EX | |
701 | struct uffdio_writeprotect { | |
702 | struct uffdio_range range; /* Range to change write permission */ | |
2294974b | 703 | uint64_t mode; /* Mode to change write permission */ |
f559fa36 PX |
704 | }; |
705 | .EE | |
706 | .in | |
707 | .PP | |
1cf1ada5 | 708 | There are two mode bits that are supported in this structure: |
f559fa36 PX |
709 | .TP |
710 | .B UFFDIO_WRITEPROTECT_MODE_WP | |
711 | When this mode bit is set, | |
712 | the ioctl will be a write-protect operation upon the memory range specified by | |
713 | .IR range . | |
1cf1ada5 | 714 | Otherwise it will be a write-unprotect operation upon the specified range, |
7a3d0845 | 715 | which can be used to resolve a userfaultfd write-protect page fault. |
f559fa36 PX |
716 | .TP |
717 | .B UFFDIO_WRITEPROTECT_MODE_DONTWAKE | |
718 | When this mode bit is set, | |
719 | do not wake up any thread that waits for | |
720 | page-fault resolution after the operation. | |
7a3d0845 | 721 | This can be specified only if |
f559fa36 PX |
722 | .B UFFDIO_WRITEPROTECT_MODE_WP |
723 | is not specified. | |
724 | .PP | |
725 | This | |
726 | .BR ioctl (2) | |
727 | operation returns 0 on success. | |
728 | On error, \-1 is returned and | |
729 | .I errno | |
730 | is set to indicate the error. | |
731 | Possible errors include: | |
732 | .TP | |
733 | .B EINVAL | |
734 | The | |
735 | .I start | |
736 | or the | |
737 | .I len | |
738 | field of the | |
739 | .I uffdio_range | |
740 | structure was not a multiple of the system page size; or | |
741 | .I len | |
742 | was zero; or the specified range was otherwise invalid. | |
743 | .TP | |
744 | .B EAGAIN | |
745 | The process was interrupted; retry this call. | |
746 | .TP | |
747 | .B ENOENT | |
748 | The range specified in | |
749 | .I range | |
750 | is not valid. | |
751 | For example, the virtual address does not exist, | |
752 | or is not registered with userfaultfd write-protect mode. | |
753 | .TP | |
754 | .B EFAULT | |
755 | Encountered a generic fault during processing. | |
140b625a AR |
756 | .\" |
757 | .SS UFFDIO_CONTINUE | |
758 | (Since Linux 5.13.) | |
759 | Resolve a minor page fault | |
760 | by installing page table entries | |
761 | for existing pages in the page cache. | |
762 | .PP | |
763 | The | |
764 | .I argp | |
765 | argument is a pointer to a | |
766 | .I uffdio_continue | |
767 | structure as shown below: | |
768 | .PP | |
769 | .in +4n | |
770 | .EX | |
771 | struct uffdio_continue { | |
2294974b AC |
772 | struct uffdio_range range; |
773 | /* Range to install PTEs for and continue */ | |
774 | uint64_t mode; /* Flags controlling the behavior of continue */ | |
775 | int64_t mapped; /* Number of bytes mapped, or negated error */ | |
140b625a AR |
776 | }; |
777 | .EE | |
778 | .in | |
779 | .PP | |
780 | The following value may be bitwise ORed in | |
39a3ee3f | 781 | .I mode |
140b625a AR |
782 | to change the behavior of the |
783 | .B UFFDIO_CONTINUE | |
784 | operation: | |
785 | .TP | |
786 | .B UFFDIO_CONTINUE_MODE_DONTWAKE | |
787 | Do not wake up the thread that waits for page-fault resolution. | |
788 | .PP | |
789 | The | |
790 | .I mapped | |
791 | field is used by the kernel | |
792 | to return the number of bytes that were actually mapped, | |
793 | or an error in the same manner as | |
794 | .BR UFFDIO_COPY . | |
795 | If the value returned in the | |
796 | .I mapped | |
797 | field doesn't match the value that was specified in | |
798 | .IR range.len , | |
799 | the operation fails with the error | |
800 | .BR EAGAIN . | |
801 | The | |
802 | .I mapped | |
803 | field is output-only; | |
804 | it is not read by the | |
805 | .B UFFDIO_CONTINUE | |
806 | operation. | |
807 | .PP | |
808 | This | |
809 | .BR ioctl (2) | |
810 | operation returns 0 on success. | |
811 | In this case, | |
812 | the entire area was mapped. | |
813 | On error, \-1 is returned and | |
814 | .I errno | |
815 | is set to indicate the error. | |
816 | Possible errors include: | |
817 | .TP | |
818 | .B EAGAIN | |
819 | The number of bytes mapped | |
820 | (i.e., the value returned in the | |
821 | .I mapped | |
822 | field) | |
823 | does not equal the value that was specified in the | |
824 | .I range.len | |
825 | field. | |
826 | .TP | |
827 | .B EINVAL | |
828 | Either | |
829 | .I range.start | |
830 | or | |
831 | .I range.len | |
832 | was not a multiple of the system page size; or | |
833 | .I range.len | |
834 | was zero; or the range specified was invalid. | |
835 | .TP | |
836 | .B EINVAL | |
837 | An invalid bit was specified in the | |
39a3ee3f | 838 | .I mode |
140b625a AR |
839 | field. |
840 | .TP | |
841 | .B EEXIST | |
842 | One or more pages were already mapped in the given range. | |
843 | .TP | |
844 | .B ENOENT | |
845 | The faulting process has changed its virtual memory layout simultaneously with | |
846 | an outstanding | |
847 | .B UFFDIO_CONTINUE | |
848 | operation. | |
849 | .TP | |
850 | .B ENOMEM | |
851 | Allocating memory needed to set up the page table mappings failed. | |
852 | .TP | |
853 | .B EFAULT | |
854 | No existing page could be found in the page cache for the given range. | |
855 | .TP | |
f8dd996e | 856 | .B ESRCH |
140b625a AR |
857 | The faulting process has exited at the time of a |
858 | .B UFFDIO_CONTINUE | |
859 | operation. | |
860 | .\" | |
97b6084b MK |
861 | .SH RETURN VALUE |
862 | See descriptions of the individual operations, above. | |
863 | .SH ERRORS | |
864 | See descriptions of the individual operations, above. | |
24fa87ff MK |
865 | In addition, the following general errors can occur for all of the |
866 | operations described above: | |
867 | .TP | |
868 | .B EFAULT | |
869 | .I argp | |
870 | does not point to a valid memory address. | |
871 | .TP | |
872 | .B EINVAL | |
873 | (For all operations except | |
874 | .BR UFFDIO_API .) | |
875 | The userfaultfd object has not yet been enabled (via the | |
1ae6b2c7 | 876 | .B UFFDIO_API |
24fa87ff | 877 | operation). |
3113c7f3 | 878 | .SH STANDARDS |
97b6084b MK |
879 | These |
880 | .BR ioctl (2) | |
405513d3 | 881 | operations are Linux-specific. |
53004b3b MR |
882 | .SH BUGS |
883 | In order to detect available userfault features and | |
5191c688 MK |
884 | enable some subset of those features, |
885 | the userfaultfd file descriptor must be closed after the first | |
1ae6b2c7 | 886 | .B UFFDIO_API |
5191c688 | 887 | operation that queries feature availability and reopened before |
53004b3b | 888 | the second |
1ae6b2c7 | 889 | .B UFFDIO_API |
5191c688 | 890 | operation that actually enables the desired features. |
a14af333 | 891 | .SH EXAMPLES |
e5efbe91 MK |
892 | See |
893 | .BR userfaultfd (2). | |
97b6084b MK |
894 | .SH SEE ALSO |
895 | .BR ioctl (2), | |
896 | .BR mmap (2), | |
897 | .BR userfaultfd (2) | |
efeece04 | 898 | .PP |
1ae6b2c7 | 899 | .I Documentation/admin\-guide/mm/userfaultfd.rst |
97b6084b | 900 | in the Linux kernel source tree |