]> git.ipfire.org Git - thirdparty/man-pages.git/blob - man2/memfd_create.2
memfd_create.2: Add description of MFD_HUGETLB (hugetlbfs) support
[thirdparty/man-pages.git] / man2 / memfd_create.2
1 .\" Copyright (C) 2014 Michael Kerrisk <mtk.manpages@gmail.com>
2 .\" and Copyright (C) 2014 David Herrmann <dh.herrmann@gmail.com>
3 .\"
4 .\" %%%LICENSE_START(GPLv2+)
5 .\"
6 .\" This program is free software; you can redistribute it and/or modify
7 .\" it under the terms of the GNU General Public License as published by
8 .\" the Free Software Foundation; either version 2 of the License, or
9 .\" (at your option) any later version.
10 .\"
11 .\" This program is distributed in the hope that it will be useful,
12 .\" but WITHOUT ANY WARRANTY; without even the implied warranty of
13 .\" MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 .\" GNU General Public License for more details.
15 .\"
16 .\" You should have received a copy of the GNU General Public
17 .\" License along with this manual; if not, see
18 .\" <http://www.gnu.org/licenses/>.
19 .\" %%%LICENSE_END
20 .\"
21 .TH MEMFD_CREATE 2 2017-09-15 Linux "Linux Programmer's Manual"
22 .SH NAME
23 memfd_create \- create an anonymous file
24 .SH SYNOPSIS
25 .B #include <sys/memfd.h>
26 .PP
27 .BI "int memfd_create(const char *" name ", unsigned int " flags ");"
28 .PP
29 .IR Note :
30 There is no glibc wrapper for this system call; see NOTES.
31 .SH DESCRIPTION
32 .BR memfd_create ()
33 creates an anonymous file and returns a file descriptor that refers to it.
34 The file behaves like a regular file, and so can be modified,
35 truncated, memory-mapped, and so on.
36 However, unlike a regular file,
37 it lives in RAM and has a volatile backing storage.
38 Once all references to the file are dropped, it is automatically released.
39 Anonymous memory is used for all backing pages of the file.
40 Therefore, files created by
41 .BR memfd_create ()
42 have the same semantics as other anonymous
43 .\" David Herrmann:
44 .\" memfd uses VM_NORESERVE so each page is accounted on first access.
45 .\" This means, the overcommit-limits (see __vm_enough_memory()) and the
46 .\" memory-cgroup limits (mem_cgroup_try_charge()) are applied. Note that
47 .\" those are accounted on "current" and "current->mm", that is, the
48 .\" process doing the first page access.
49 memory allocations such as those allocated using
50 .BR mmap (2)
51 with the
52 .BR MAP_ANONYMOUS
53 flag.
54 .PP
55 The initial size of the file is set to 0.
56 Following the call, the file size should be set using
57 .BR ftruncate (2).
58 (Alternatively, the file may be populated by calls to
59 .BR write (2)
60 or similar.)
61 .PP
62 The name supplied in
63 .I name
64 is used as a filename and will be displayed
65 as the target of the corresponding symbolic link in the directory
66 .IR /proc/self/fd/ .
67 The displayed name is always prefixed with
68 .IR memfd:
69 and serves only for debugging purposes.
70 Names do not affect the behavior of the file descriptor,
71 and as such multiple files can have the same name without any side effects.
72 .PP
73 The following values may be bitwise ORed in
74 .IR flags
75 to change the behavior of
76 .BR memfd_create ():
77 .TP
78 .BR MFD_CLOEXEC
79 Set the close-on-exec
80 .RB ( FD_CLOEXEC )
81 flag on the new file descriptor.
82 See the description of the
83 .B O_CLOEXEC
84 flag in
85 .BR open (2)
86 for reasons why this may be useful.
87 .TP
88 .BR MFD_ALLOW_SEALING
89 Allow sealing operations on this file.
90 See the discussion of the
91 .B F_ADD_SEALS
92 and
93 .BR F_GET_SEALS
94 operations in
95 .BR fcntl (2),
96 and also NOTES, below.
97 The initial set of seals is empty.
98 If this flag is not set, the initial set of seals will be
99 .BR F_SEAL_SEAL ,
100 meaning that no other seals can be set on the file.
101 .\" FIXME Why is the MFD_ALLOW_SEALING behavior not simply the default?
102 .\" Is it worth adding some text explaining this?
103 .TP
104 .BR MFD_HUGETLB " (since Linux 4.14)"
105 The anonymous file will be created in the hugetlbfs filesystem using
106 huge pages. See the Linux kernel source file
107 .I Documentation/vm/hugetlbpage.txt
108 for more information about hugetlbfs. The hugetlbfs filesystem does
109 not support file sealing operations. Therefore, specifying both
110 .B MFD_HUGETLB
111 and
112 .B MFD_ALLOW_SEALING
113 will result in an error
114 .RB ( EINVAL )
115 being returned.
116
117 .TP
118 .BR MFD_HUGE_2MB ", " MFD_HUGE_1GB ", " "..."
119 Used in conjunction with
120 .B MFD_HUGETLB
121 to select alternative hugetlb page sizes (respectively, 2 MB, 1 GB, ...)
122 on systems that support multiple hugetlb page sizes. Definitions for known
123 huge page sizes are included in the header file
124 .IR <sys/memfd.h> .
125 .IP
126 For details on encoding huge page sizes not included in the header file,
127 see the discussion of the similarly named constants in
128 .BR mmap (2).
129
130 .PP
131 Unused bits in
132 .I flags
133 must be 0.
134 .PP
135 As its return value,
136 .BR memfd_create ()
137 returns a new file descriptor that can be used to refer to the file.
138 This file descriptor is opened for both reading and writing
139 .RB ( O_RDWR )
140 and
141 .B O_LARGEFILE
142 is set for the file descriptor.
143 .PP
144 With respect to
145 .BR fork (2)
146 and
147 .BR execve (2),
148 the usual semantics apply for the file descriptor created by
149 .BR memfd_create ().
150 A copy of the file descriptor is inherited by the child produced by
151 .BR fork (2)
152 and refers to the same file.
153 The file descriptor is preserved across
154 .BR execve (2),
155 unless the close-on-exec flag has been set.
156 .SH RETURN VALUE
157 On success,
158 .BR memfd_create ()
159 returns a new file descriptor.
160 On error, \-1 is returned and
161 .I errno
162 is set to indicate the error.
163 .SH ERRORS
164 .TP
165 .B EFAULT
166 The address in
167 .IR name
168 points to invalid memory.
169 .TP
170 .B EINVAL
171 An unsupported value was specified in one of the arguments:
172 .I flags
173 included unknown bits or combined \fBMFD_HUGETLB\fP with \fBMFD_ALLOW_SEALING\fP, or
174 .I name
175 was too long.
176 .TP
177 .B EMFILE
178 The per-process limit on the number of open file descriptors has been reached.
179 .TP
180 .B ENFILE
181 The system-wide limit on the total number of open files has been reached.
182 .TP
183 .B ENOMEM
184 There was insufficient memory to create a new anonymous file.
185 .SH VERSIONS
186 The
187 .BR memfd_create ()
188 system call first appeared in Linux 3.17.
189 .SH CONFORMING TO
190 The
191 .BR memfd_create ()
192 system call is Linux-specific.
193 .SH NOTES
194 Glibc does not provide a wrapper for this system call; call it using
195 .BR syscall (2).
196 .PP
197 .\" See also http://lwn.net/Articles/593918/
198 .\" and http://lwn.net/Articles/594919/ and http://lwn.net/Articles/591108/
199 The
200 .BR memfd_create ()
201 system call provides a simple alternative to manually mounting a
202 .BR tmpfs (5)
203 filesystem and creating and opening a file in that filesystem.
204 The primary purpose of
205 .BR memfd_create ()
206 is to create files and associated file descriptors that are
207 used with the file-sealing APIs provided by
208 .BR fcntl (2).
209 .PP
210 The
211 .BR memfd_create ()
212 system call also has uses without file sealing
213 (which is why file-sealing is disabled, unless explicitly requested with the
214 .BR MFD_ALLOW_SEALING
215 flag).
216 In particular, it can be used as an alternative to creating files in
217 .I /tmp
218 or as an alternative to using the
219 .BR open (2)
220 .B O_TMPFILE
221 flag in cases where there is no intention to actually link the
222 resulting file into the filesystem.
223 .SS File sealing
224 In the absence of file sealing,
225 processes that communicate via shared memory must either trust each other,
226 or take measures to deal with the possibility that an untrusted peer
227 may manipulate the shared memory region in problematic ways.
228 For example, an untrusted peer might modify the contents of the
229 shared memory at any time, or shrink the shared memory region.
230 The former possibility leaves the local process vulnerable to
231 time-of-check-to-time-of-use race conditions
232 (typically dealt with by copying data from
233 the shared memory region before checking and using it).
234 The latter possibility leaves the local process vulnerable to
235 .BR SIGBUS
236 signals when an attempt is made to access a now-nonexistent
237 location in the shared memory region.
238 (Dealing with this possibility necessitates the use of a handler for the
239 .BR SIGBUS
240 signal.)
241 .PP
242 Dealing with untrusted peers imposes extra complexity on
243 code that employs shared memory.
244 File sealing enables that extra complexity to be eliminated,
245 by allowing a process to operate secure in the knowledge that
246 its peer can't modify the shared memory in an undesired fashion.
247 .PP
248 An example of the usage of the sealing mechanism is as follows:
249 .IP 1. 3
250 The first process creates a
251 .BR tmpfs (5)
252 file using
253 .BR memfd_create ().
254 The call yields a file descriptor used in subsequent steps.
255 .IP 2.
256 The first process
257 sizes the file created in the previous step using
258 .BR ftruncate (2),
259 maps it using
260 .BR mmap (2),
261 and populates the shared memory with the desired data.
262 .IP 3.
263 The first process uses the
264 .BR fcntl (2)
265 .B F_ADD_SEALS
266 operation to place one or more seals on the file,
267 in order to restrict further modifications on the file.
268 (If placing the seal
269 .BR F_SEAL_WRITE ,
270 then it will be necessary to first unmap the shared writable mapping
271 created in the previous step.)
272 .IP 4.
273 A second process obtains a file descriptor for the
274 .BR tmpfs (5)
275 file and maps it.
276 Among the possible ways in which this could happen are the following:
277 .RS
278 .IP * 3
279 The process that called
280 .BR memfd_create ()
281 could transfer the resulting file descriptor to the second process
282 via a UNIX domain socket (see
283 .BR unix (7)
284 and
285 .BR cmsg (3)).
286 The second process then maps the file using
287 .BR mmap (2).
288 .IP *
289 The second process is created via
290 .BR fork (2)
291 and thus automatically inherits the file descriptor and mapping.
292 (Note that in this case and the next,
293 there is a natural trust relationship between the two processes,
294 since they are running under the same user ID.
295 Therefore, file sealing would not normally be necessary.)
296 .IP *
297 The second process opens the file
298 .IR /proc/<pid>/fd/<fd> ,
299 where
300 .I <pid>
301 is the PID of the first process (the one that called
302 .BR memfd_create ()),
303 and
304 .I <fd>
305 is the number of the file descriptor returned by the call to
306 .BR memfd_create ()
307 in that process.
308 The second process then maps the file using
309 .BR mmap (2).
310 .RE
311 .IP 5.
312 The second process uses the
313 .BR fcntl (2)
314 .B F_GET_SEALS
315 operation to retrieve the bit mask of seals
316 that has been applied to the file.
317 This bit mask can be inspected in order to determine
318 what kinds of restrictions have been placed on file modifications.
319 If desired, the second process can apply further seals
320 to impose additional restrictions (so long as the
321 .BR F_SEAL_SEAL
322 seal has not yet been applied).
323 .SH EXAMPLE
324 Below are shown two example programs that demonstrate the use of
325 .BR memfd_create ()
326 and the file sealing API.
327 .PP
328 The first program,
329 .IR t_memfd_create.c ,
330 creates a
331 .BR tmpfs (5)
332 file using
333 .BR memfd_create (),
334 sets a size for the file, maps it into memory,
335 and optionally places some seals on the file.
336 The program accepts up to three command-line arguments,
337 of which the first two are required.
338 The first argument is the name to associate with the file,
339 the second argument is the size to be set for the file,
340 and the optional third argument is a string of characters that specify
341 seals to be set on the file.
342 .PP
343 The second program,
344 .IR t_get_seals.c ,
345 can be used to open an existing file that was created via
346 .BR memfd_create ()
347 and inspect the set of seals that have been applied to that file.
348 .PP
349 The following shell session demonstrates the use of these programs.
350 First we create a
351 .BR tmpfs (5)
352 file and set some seals on it:
353 .PP
354 .in +4n
355 .EX
356 $ \fB./t_memfd_create my_memfd_file 4096 sw &\fP
357 [1] 11775
358 PID: 11775; fd: 3; /proc/11775/fd/3
359 .EE
360 .in
361 .PP
362 At this point, the
363 .I t_memfd_create
364 program continues to run in the background.
365 From another program, we can obtain a file descriptor for the
366 file created by
367 .BR memfd_create ()
368 by opening the
369 .IR /proc/[pid]/fd
370 file that corresponds to the file descriptor opened by
371 .BR memfd_create ().
372 Using that pathname, we inspect the content of the
373 .IR /proc/[pid]/fd
374 symbolic link, and use our
375 .I t_get_seals
376 program to view the seals that have been placed on the file:
377 .PP
378 .in +4n
379 .EX
380 $ \fBreadlink /proc/11775/fd/3\fP
381 /memfd:my_memfd_file (deleted)
382 $ \fB./t_get_seals /proc/11775/fd/3\fP
383 Existing seals: WRITE SHRINK
384 .EE
385 .in
386 .SS Program source: t_memfd_create.c
387 \&
388 .EX
389 #include <sys/memfd.h>
390 #include <fcntl.h>
391 #include <stdlib.h>
392 #include <unistd.h>
393 #include <string.h>
394 #include <stdio.h>
395
396 #define errExit(msg) do { perror(msg); exit(EXIT_FAILURE); \\
397 } while (0)
398
399 int
400 main(int argc, char *argv[])
401 {
402 int fd;
403 unsigned int seals;
404 char *addr;
405 char *name, *seals_arg;
406 ssize_t len;
407
408 if (argc < 3) {
409 fprintf(stderr, "%s name size [seals]\\n", argv[0]);
410 fprintf(stderr, "\\t\(aqseals\(aq can contain any of the "
411 "following characters:\\n");
412 fprintf(stderr, "\\t\\tg \- F_SEAL_GROW\\n");
413 fprintf(stderr, "\\t\\ts \- F_SEAL_SHRINK\\n");
414 fprintf(stderr, "\\t\\tw \- F_SEAL_WRITE\\n");
415 fprintf(stderr, "\\t\\tS \- F_SEAL_SEAL\\n");
416 exit(EXIT_FAILURE);
417 }
418
419 name = argv[1];
420 len = atoi(argv[2]);
421 seals_arg = argv[3];
422
423 /* Create an anonymous file in tmpfs; allow seals to be
424 placed on the file */
425
426 fd = memfd_create(name, MFD_ALLOW_SEALING);
427 if (fd == \-1)
428 errExit("memfd_create");
429
430 /* Size the file as specified on the command line */
431
432 if (ftruncate(fd, len) == \-1)
433 errExit("truncate");
434
435 printf("PID: %ld; fd: %d; /proc/%ld/fd/%d\\n",
436 (long) getpid(), fd, (long) getpid(), fd);
437
438 /* Code to map the file and populate the mapping with data
439 omitted */
440
441 /* If a \(aqseals\(aq command\-line argument was supplied, set some
442 seals on the file */
443
444 if (seals_arg != NULL) {
445 seals = 0;
446
447 if (strchr(seals_arg, \(aqg\(aq) != NULL)
448 seals |= F_SEAL_GROW;
449 if (strchr(seals_arg, \(aqs\(aq) != NULL)
450 seals |= F_SEAL_SHRINK;
451 if (strchr(seals_arg, \(aqw\(aq) != NULL)
452 seals |= F_SEAL_WRITE;
453 if (strchr(seals_arg, \(aqS\(aq) != NULL)
454 seals |= F_SEAL_SEAL;
455
456 if (fcntl(fd, F_ADD_SEALS, seals) == \-1)
457 errExit("fcntl");
458 }
459
460 /* Keep running, so that the file created by memfd_create()
461 continues to exist */
462
463 pause();
464
465 exit(EXIT_SUCCESS);
466 }
467 .EE
468 .SS Program source: t_get_seals.c
469 \&
470 .EX
471 #include <sys/memfd.h>
472 #include <fcntl.h>
473 #include <unistd.h>
474 #include <stdlib.h>
475 #include <string.h>
476 #include <stdio.h>
477
478 #define errExit(msg) do { perror(msg); exit(EXIT_FAILURE); \\
479 } while (0)
480
481 int
482 main(int argc, char *argv[])
483 {
484 int fd;
485 unsigned int seals;
486
487 if (argc != 2) {
488 fprintf(stderr, "%s /proc/PID/fd/FD\\n", argv[0]);
489 exit(EXIT_FAILURE);
490 }
491
492 fd = open(argv[1], O_RDWR);
493 if (fd == \-1)
494 errExit("open");
495
496 seals = fcntl(fd, F_GET_SEALS);
497 if (seals == \-1)
498 errExit("fcntl");
499
500 printf("Existing seals:");
501 if (seals & F_SEAL_SEAL)
502 printf(" SEAL");
503 if (seals & F_SEAL_GROW)
504 printf(" GROW");
505 if (seals & F_SEAL_WRITE)
506 printf(" WRITE");
507 if (seals & F_SEAL_SHRINK)
508 printf(" SHRINK");
509 printf("\\n");
510
511 /* Code to map the file and access the contents of the
512 resulting mapping omitted */
513
514 exit(EXIT_SUCCESS);
515 }
516 .EE
517 .SH SEE ALSO
518 .BR fcntl (2),
519 .BR ftruncate (2),
520 .BR mmap (2),
521 .BR shmget (2),
522 .BR shm_open (3)