]> git.ipfire.org Git - thirdparty/man-pages.git/blob - man2/memfd_create.2
memfd_create.2: ffix
[thirdparty/man-pages.git] / man2 / memfd_create.2
1 .\" Copyright (C) 2014 Michael Kerrisk <mtk.manpages@gmail.com>
2 .\" and Copyright (C) 2014 David Herrmann <dh.herrmann@gmail.com>
3 .\"
4 .\" %%%LICENSE_START(GPLv2+)
5 .\"
6 .\" This program is free software; you can redistribute it and/or modify
7 .\" it under the terms of the GNU General Public License as published by
8 .\" the Free Software Foundation; either version 2 of the License, or
9 .\" (at your option) any later version.
10 .\"
11 .\" This program is distributed in the hope that it will be useful,
12 .\" but WITHOUT ANY WARRANTY; without even the implied warranty of
13 .\" MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 .\" GNU General Public License for more details.
15 .\"
16 .\" You should have received a copy of the GNU General Public
17 .\" License along with this manual; if not, see
18 .\" <http://www.gnu.org/licenses/>.
19 .\" %%%LICENSE_END
20 .\"
21 .TH MEMFD_CREATE 2 2017-09-15 Linux "Linux Programmer's Manual"
22 .SH NAME
23 memfd_create \- create an anonymous file
24 .SH SYNOPSIS
25 .B #include <sys/memfd.h>
26 .PP
27 .BI "int memfd_create(const char *" name ", unsigned int " flags ");"
28 .PP
29 .IR Note :
30 There is no glibc wrapper for this system call; see NOTES.
31 .SH DESCRIPTION
32 .BR memfd_create ()
33 creates an anonymous file and returns a file descriptor that refers to it.
34 The file behaves like a regular file, and so can be modified,
35 truncated, memory-mapped, and so on.
36 However, unlike a regular file,
37 it lives in RAM and has a volatile backing storage.
38 Once all references to the file are dropped, it is automatically released.
39 Anonymous memory is used for all backing pages of the file.
40 Therefore, files created by
41 .BR memfd_create ()
42 have the same semantics as other anonymous
43 .\" David Herrmann:
44 .\" memfd uses VM_NORESERVE so each page is accounted on first access.
45 .\" This means, the overcommit-limits (see __vm_enough_memory()) and the
46 .\" memory-cgroup limits (mem_cgroup_try_charge()) are applied. Note that
47 .\" those are accounted on "current" and "current->mm", that is, the
48 .\" process doing the first page access.
49 memory allocations such as those allocated using
50 .BR mmap (2)
51 with the
52 .BR MAP_ANONYMOUS
53 flag.
54 .PP
55 The initial size of the file is set to 0.
56 Following the call, the file size should be set using
57 .BR ftruncate (2).
58 (Alternatively, the file may be populated by calls to
59 .BR write (2)
60 or similar.)
61 .PP
62 The name supplied in
63 .I name
64 is used as a filename and will be displayed
65 as the target of the corresponding symbolic link in the directory
66 .IR /proc/self/fd/ .
67 The displayed name is always prefixed with
68 .IR memfd:
69 and serves only for debugging purposes.
70 Names do not affect the behavior of the file descriptor,
71 and as such multiple files can have the same name without any side effects.
72 .PP
73 The following values may be bitwise ORed in
74 .IR flags
75 to change the behavior of
76 .BR memfd_create ():
77 .TP
78 .BR MFD_CLOEXEC
79 Set the close-on-exec
80 .RB ( FD_CLOEXEC )
81 flag on the new file descriptor.
82 See the description of the
83 .B O_CLOEXEC
84 flag in
85 .BR open (2)
86 for reasons why this may be useful.
87 .TP
88 .BR MFD_ALLOW_SEALING
89 Allow sealing operations on this file.
90 See the discussion of the
91 .B F_ADD_SEALS
92 and
93 .BR F_GET_SEALS
94 operations in
95 .BR fcntl (2),
96 and also NOTES, below.
97 The initial set of seals is empty.
98 If this flag is not set, the initial set of seals will be
99 .BR F_SEAL_SEAL ,
100 meaning that no other seals can be set on the file.
101 .\" FIXME Why is the MFD_ALLOW_SEALING behavior not simply the default?
102 .\" Is it worth adding some text explaining this?
103 .TP
104 .BR MFD_HUGETLB " (since Linux 4.14)"
105 .\" commit 749df87bd7bee5a79cef073f5d032ddb2b211de8
106 The anonymous file will be created in the hugetlbfs filesystem using
107 huge pages.
108 See the Linux kernel source file
109 .I Documentation/vm/hugetlbpage.txt
110 for more information about hugetlbfs.
111 The hugetlbfs filesystem does not support file sealing operations.
112 Therefore, specifying both
113 .B MFD_HUGETLB
114 and
115 .B MFD_ALLOW_SEALING
116 will result in an error
117 .RB ( EINVAL )
118 being returned.
119 .TP
120 .BR MFD_HUGE_2MB ", " MFD_HUGE_1GB ", " "..."
121 Used in conjunction with
122 .B MFD_HUGETLB
123 to select alternative hugetlb page sizes (respectively, 2 MB, 1 GB, ...)
124 on systems that support multiple hugetlb page sizes.
125 Definitions for known
126 huge page sizes are included in the header file
127 .IR <sys/memfd.h> .
128 .IP
129 For details on encoding huge page sizes not included in the header file,
130 see the discussion of the similarly named constants in
131 .BR mmap (2).
132 .PP
133 Unused bits in
134 .I flags
135 must be 0.
136 .PP
137 As its return value,
138 .BR memfd_create ()
139 returns a new file descriptor that can be used to refer to the file.
140 This file descriptor is opened for both reading and writing
141 .RB ( O_RDWR )
142 and
143 .B O_LARGEFILE
144 is set for the file descriptor.
145 .PP
146 With respect to
147 .BR fork (2)
148 and
149 .BR execve (2),
150 the usual semantics apply for the file descriptor created by
151 .BR memfd_create ().
152 A copy of the file descriptor is inherited by the child produced by
153 .BR fork (2)
154 and refers to the same file.
155 The file descriptor is preserved across
156 .BR execve (2),
157 unless the close-on-exec flag has been set.
158 .SH RETURN VALUE
159 On success,
160 .BR memfd_create ()
161 returns a new file descriptor.
162 On error, \-1 is returned and
163 .I errno
164 is set to indicate the error.
165 .SH ERRORS
166 .TP
167 .B EFAULT
168 The address in
169 .IR name
170 points to invalid memory.
171 .TP
172 .B EINVAL
173 An unsupported value was specified in one of the arguments:
174 .I flags
175 included unknown bits, or
176 .I name
177 was too long.
178 .TP
179 .B EMFILE
180 The per-process limit on the number of open file descriptors has been reached.
181 .TP
182 .B ENFILE
183 The system-wide limit on the total number of open files has been reached.
184 .TP
185 .B ENOMEM
186 There was insufficient memory to create a new anonymous file.
187 .SH VERSIONS
188 The
189 .BR memfd_create ()
190 system call first appeared in Linux 3.17.
191 .SH CONFORMING TO
192 The
193 .BR memfd_create ()
194 system call is Linux-specific.
195 .SH NOTES
196 Glibc does not provide a wrapper for this system call; call it using
197 .BR syscall (2).
198 .PP
199 .\" See also http://lwn.net/Articles/593918/
200 .\" and http://lwn.net/Articles/594919/ and http://lwn.net/Articles/591108/
201 The
202 .BR memfd_create ()
203 system call provides a simple alternative to manually mounting a
204 .BR tmpfs (5)
205 filesystem and creating and opening a file in that filesystem.
206 The primary purpose of
207 .BR memfd_create ()
208 is to create files and associated file descriptors that are
209 used with the file-sealing APIs provided by
210 .BR fcntl (2).
211 .PP
212 The
213 .BR memfd_create ()
214 system call also has uses without file sealing
215 (which is why file-sealing is disabled, unless explicitly requested with the
216 .BR MFD_ALLOW_SEALING
217 flag).
218 In particular, it can be used as an alternative to creating files in
219 .I /tmp
220 or as an alternative to using the
221 .BR open (2)
222 .B O_TMPFILE
223 flag in cases where there is no intention to actually link the
224 resulting file into the filesystem.
225 .SS File sealing
226 In the absence of file sealing,
227 processes that communicate via shared memory must either trust each other,
228 or take measures to deal with the possibility that an untrusted peer
229 may manipulate the shared memory region in problematic ways.
230 For example, an untrusted peer might modify the contents of the
231 shared memory at any time, or shrink the shared memory region.
232 The former possibility leaves the local process vulnerable to
233 time-of-check-to-time-of-use race conditions
234 (typically dealt with by copying data from
235 the shared memory region before checking and using it).
236 The latter possibility leaves the local process vulnerable to
237 .BR SIGBUS
238 signals when an attempt is made to access a now-nonexistent
239 location in the shared memory region.
240 (Dealing with this possibility necessitates the use of a handler for the
241 .BR SIGBUS
242 signal.)
243 .PP
244 Dealing with untrusted peers imposes extra complexity on
245 code that employs shared memory.
246 File sealing enables that extra complexity to be eliminated,
247 by allowing a process to operate secure in the knowledge that
248 its peer can't modify the shared memory in an undesired fashion.
249 .PP
250 An example of the usage of the sealing mechanism is as follows:
251 .IP 1. 3
252 The first process creates a
253 .BR tmpfs (5)
254 file using
255 .BR memfd_create ().
256 The call yields a file descriptor used in subsequent steps.
257 .IP 2.
258 The first process
259 sizes the file created in the previous step using
260 .BR ftruncate (2),
261 maps it using
262 .BR mmap (2),
263 and populates the shared memory with the desired data.
264 .IP 3.
265 The first process uses the
266 .BR fcntl (2)
267 .B F_ADD_SEALS
268 operation to place one or more seals on the file,
269 in order to restrict further modifications on the file.
270 (If placing the seal
271 .BR F_SEAL_WRITE ,
272 then it will be necessary to first unmap the shared writable mapping
273 created in the previous step.)
274 .IP 4.
275 A second process obtains a file descriptor for the
276 .BR tmpfs (5)
277 file and maps it.
278 Among the possible ways in which this could happen are the following:
279 .RS
280 .IP * 3
281 The process that called
282 .BR memfd_create ()
283 could transfer the resulting file descriptor to the second process
284 via a UNIX domain socket (see
285 .BR unix (7)
286 and
287 .BR cmsg (3)).
288 The second process then maps the file using
289 .BR mmap (2).
290 .IP *
291 The second process is created via
292 .BR fork (2)
293 and thus automatically inherits the file descriptor and mapping.
294 (Note that in this case and the next,
295 there is a natural trust relationship between the two processes,
296 since they are running under the same user ID.
297 Therefore, file sealing would not normally be necessary.)
298 .IP *
299 The second process opens the file
300 .IR /proc/<pid>/fd/<fd> ,
301 where
302 .I <pid>
303 is the PID of the first process (the one that called
304 .BR memfd_create ()),
305 and
306 .I <fd>
307 is the number of the file descriptor returned by the call to
308 .BR memfd_create ()
309 in that process.
310 The second process then maps the file using
311 .BR mmap (2).
312 .RE
313 .IP 5.
314 The second process uses the
315 .BR fcntl (2)
316 .B F_GET_SEALS
317 operation to retrieve the bit mask of seals
318 that has been applied to the file.
319 This bit mask can be inspected in order to determine
320 what kinds of restrictions have been placed on file modifications.
321 If desired, the second process can apply further seals
322 to impose additional restrictions (so long as the
323 .BR F_SEAL_SEAL
324 seal has not yet been applied).
325 .SH EXAMPLE
326 Below are shown two example programs that demonstrate the use of
327 .BR memfd_create ()
328 and the file sealing API.
329 .PP
330 The first program,
331 .IR t_memfd_create.c ,
332 creates a
333 .BR tmpfs (5)
334 file using
335 .BR memfd_create (),
336 sets a size for the file, maps it into memory,
337 and optionally places some seals on the file.
338 The program accepts up to three command-line arguments,
339 of which the first two are required.
340 The first argument is the name to associate with the file,
341 the second argument is the size to be set for the file,
342 and the optional third argument is a string of characters that specify
343 seals to be set on the file.
344 .PP
345 The second program,
346 .IR t_get_seals.c ,
347 can be used to open an existing file that was created via
348 .BR memfd_create ()
349 and inspect the set of seals that have been applied to that file.
350 .PP
351 The following shell session demonstrates the use of these programs.
352 First we create a
353 .BR tmpfs (5)
354 file and set some seals on it:
355 .PP
356 .in +4n
357 .EX
358 $ \fB./t_memfd_create my_memfd_file 4096 sw &\fP
359 [1] 11775
360 PID: 11775; fd: 3; /proc/11775/fd/3
361 .EE
362 .in
363 .PP
364 At this point, the
365 .I t_memfd_create
366 program continues to run in the background.
367 From another program, we can obtain a file descriptor for the
368 file created by
369 .BR memfd_create ()
370 by opening the
371 .IR /proc/[pid]/fd
372 file that corresponds to the file descriptor opened by
373 .BR memfd_create ().
374 Using that pathname, we inspect the content of the
375 .IR /proc/[pid]/fd
376 symbolic link, and use our
377 .I t_get_seals
378 program to view the seals that have been placed on the file:
379 .PP
380 .in +4n
381 .EX
382 $ \fBreadlink /proc/11775/fd/3\fP
383 /memfd:my_memfd_file (deleted)
384 $ \fB./t_get_seals /proc/11775/fd/3\fP
385 Existing seals: WRITE SHRINK
386 .EE
387 .in
388 .SS Program source: t_memfd_create.c
389 \&
390 .EX
391 #include <sys/memfd.h>
392 #include <fcntl.h>
393 #include <stdlib.h>
394 #include <unistd.h>
395 #include <string.h>
396 #include <stdio.h>
397
398 #define errExit(msg) do { perror(msg); exit(EXIT_FAILURE); \\
399 } while (0) /* print msg with the errno text, then exit with failure; do/while (0) makes it one statement */
400
401 int
402 main(int argc, char *argv[]) /* usage: name size [seals] */
403 {
404 int fd;
405 unsigned int seals;
406 char *addr; /* unused here; the mapping code is omitted from this listing */
407 char *name, *seals_arg;
408 ssize_t len;
409
410 if (argc < 3) {
411 fprintf(stderr, "%s name size [seals]\\n", argv[0]);
412 fprintf(stderr, "\\t\(aqseals\(aq can contain any of the "
413 "following characters:\\n");
414 fprintf(stderr, "\\t\\tg \- F_SEAL_GROW\\n");
415 fprintf(stderr, "\\t\\ts \- F_SEAL_SHRINK\\n");
416 fprintf(stderr, "\\t\\tw \- F_SEAL_WRITE\\n");
417 fprintf(stderr, "\\t\\tS \- F_SEAL_SEAL\\n");
418 exit(EXIT_FAILURE);
419 }
420
421 name = argv[1];
422 len = atoi(argv[2]); /* atoi() reports no errors; the size argument is not validated */
423 seals_arg = argv[3]; /* NULL when no seals argument was given, since argv[argc] is NULL */
424
425 /* Create an anonymous file in tmpfs; allow seals to be
426 placed on the file */
427
428 fd = memfd_create(name, MFD_ALLOW_SEALING);
429 if (fd == \-1)
430 errExit("memfd_create");
431
432 /* Size the file as specified on the command line */
433
434 if (ftruncate(fd, len) == \-1)
435 errExit("truncate"); /* NOTE(review): label says truncate, but the failing call is ftruncate() */
436
437 printf("PID: %ld; fd: %d; /proc/%ld/fd/%d\\n",
438 (long) getpid(), fd, (long) getpid(), fd);
439
440 /* Code to map the file and populate the mapping with data
441 omitted */
442
443 /* If a \(aqseals\(aq command\-line argument was supplied, set some
444 seals on the file */
445
446 if (seals_arg != NULL) {
447 seals = 0; /* build the seal mask from the letters found in seals_arg */
448
449 if (strchr(seals_arg, \(aqg\(aq) != NULL)
450 seals |= F_SEAL_GROW;
451 if (strchr(seals_arg, \(aqs\(aq) != NULL)
452 seals |= F_SEAL_SHRINK;
453 if (strchr(seals_arg, \(aqw\(aq) != NULL)
454 seals |= F_SEAL_WRITE;
455 if (strchr(seals_arg, \(aqS\(aq) != NULL)
456 seals |= F_SEAL_SEAL;
457
458 if (fcntl(fd, F_ADD_SEALS, seals) == \-1) /* apply all requested seals in one call */
459 errExit("fcntl");
460 }
461
462 /* Keep running, so that the file created by memfd_create()
463 continues to exist */
464
465 pause();
466
467 exit(EXIT_SUCCESS);
468 }
469 .EE
470 .SS Program source: t_get_seals.c
471 \&
472 .EX
473 #include <sys/memfd.h>
474 #include <fcntl.h>
475 #include <unistd.h>
476 #include <stdlib.h>
477 #include <string.h>
478 #include <stdio.h>
479
480 #define errExit(msg) do { perror(msg); exit(EXIT_FAILURE); \\
481 } while (0) /* print msg with the errno text, then exit with failure; do/while (0) makes it one statement */
482
483 int
484 main(int argc, char *argv[]) /* usage: /proc/PID/fd/FD */
485 {
486 int fd;
487 unsigned int seals;
488
489 if (argc != 2) {
490 fprintf(stderr, "%s /proc/PID/fd/FD\\n", argv[0]);
491 exit(EXIT_FAILURE);
492 }
493
494 fd = open(argv[1], O_RDWR); /* open the pathname given on the command line for reading and writing */
495 if (fd == \-1)
496 errExit("open");
497
498 seals = fcntl(fd, F_GET_SEALS); /* result is stored in an unsigned int */
499 if (seals == \-1) /* seals is unsigned; the error value is converted to unsigned for this comparison */
500 errExit("fcntl");
501
502 printf("Existing seals:"); /* one word is appended for each seal bit that is set */
503 if (seals & F_SEAL_SEAL)
504 printf(" SEAL");
505 if (seals & F_SEAL_GROW)
506 printf(" GROW");
507 if (seals & F_SEAL_WRITE)
508 printf(" WRITE");
509 if (seals & F_SEAL_SHRINK)
510 printf(" SHRINK");
511 printf("\\n");
512
513 /* Code to map the file and access the contents of the
514 resulting mapping omitted */
515
516 exit(EXIT_SUCCESS);
517 }
518 .EE
519 .SH SEE ALSO
520 .BR fcntl (2),
521 .BR ftruncate (2),
522 .BR mmap (2),
523 .BR shmget (2),
524 .BR shm_open (3)