]> git.ipfire.org Git - thirdparty/man-pages.git/blame - man2/memfd_create.2
Ready for 5.00
[thirdparty/man-pages.git] / man2 / memfd_create.2
CommitLineData
878cc348
MK
1.\" Copyright (C) 2014 Michael Kerrisk <mtk.manpages@gmail.com>
2.\" and Copyright (C) 2014 David Herrmann <dh.herrmann@gmail.com>
73fc0b53 3.\"
46832662 4.\" %%%LICENSE_START(GPLv2+)
771e13d4 5.\"
73fc0b53
DH
6.\" This program is free software; you can redistribute it and/or modify
7.\" it under the terms of the GNU General Public License as published by
8.\" the Free Software Foundation; either version 2 of the License, or
9.\" (at your option) any later version.
10.\"
11.\" This program is distributed in the hope that it will be useful,
12.\" but WITHOUT ANY WARRANTY; without even the implied warranty of
13.\" MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14.\" GNU General Public License for more details.
15.\"
16.\" You should have received a copy of the GNU General Public
17.\" License along with this manual; if not, see
18.\" <http://www.gnu.org/licenses/>.
19.\" %%%LICENSE_END
20.\"
9ba01802 21.TH MEMFD_CREATE 2 2019-03-06 Linux "Linux Programmer's Manual"
73fc0b53
DH
22.SH NAME
23memfd_create \- create an anonymous file
24.SH SYNOPSIS
d6d367c7
JS
25.nf
26.BR "#define _GNU_SOURCE" " /* See feature_test_macros(7) */"
6971614d 27.B #include <sys/mman.h>
68e4db0a 28.PP
73fc0b53
DH
29.BI "int memfd_create(const char *" name ", unsigned int " flags ");"
30.SH DESCRIPTION
31.BR memfd_create ()
f00ce3a0
MK
32creates an anonymous file and returns a file descriptor that refers to it.
33The file behaves like a regular file, and so can be modified,
afc5ca18 34truncated, memory-mapped, and so on.
f00ce3a0
MK
35However, unlike a regular file,
36it lives in RAM and has a volatile backing storage.
cb5b73cc
MK
37Once all references to the file are dropped, it is automatically released.
38Anonymous memory is used for all backing pages of the file.
f00ce3a0
MK
39Therefore, files created by
40.BR memfd_create ()
46832662
MK
41have the same semantics as other anonymous
42.\" David Herrmann:
43.\" memfd uses VM_NORESERVE so each page is accounted on first access.
44.\" This means, the overcommit-limits (see __vm_enough_memory()) and the
45.\" memory-cgroup limits (mem_cgroup_try_charge()) are applied. Note that
46.\" those are accounted on "current" and "current->mm", that is, the
47.\" process doing the first page access.
f00ce3a0 48memory allocations such as those allocated using
73fc0b53 49.BR mmap (2)
f00ce3a0
MK
50with the
51.BR MAP_ANONYMOUS
52flag.
efeece04 53.PP
73fc0b53 54The initial size of the file is set to 0.
f00ce3a0
MK
55Following the call, the file size should be set using
56.BR ftruncate (2).
46832662
MK
57(Alternatively, the file may be populated by calls to
58.BR write (2)
59or similar.)
efeece04 60.PP
f00ce3a0 61The name supplied in
73fc0b53 62.I name
46832662 63is used as a filename and will be displayed
f00ce3a0 64as the target of the corresponding symbolic link in the directory
73fc0b53 65.IR /proc/self/fd/ .
f00ce3a0
MK
66The displayed name is always prefixed with
67.IR memfd:
68and serves only for debugging purposes.
46832662 69Names do not affect the behavior of the file descriptor,
cb5b73cc 70and as such multiple files can have the same name without any side effects.
efeece04 71.PP
73fc0b53
DH
72The following values may be bitwise ORed in
73.IR flags
553deb41 74to change the behavior of
73fc0b53
DH
75.BR memfd_create ():
76.TP
77.BR MFD_CLOEXEC
78Set the close-on-exec
79.RB ( FD_CLOEXEC )
80flag on the new file descriptor.
81See the description of the
82.B O_CLOEXEC
83flag in
84.BR open (2)
cb5b73cc 85for reasons why this may be useful.
73fc0b53
DH
86.TP
87.BR MFD_ALLOW_SEALING
3a71dcd6 88Allow sealing operations on this file.
e8a0dfae 89See the discussion of the
73fc0b53
DH
90.B F_ADD_SEALS
91and
e8a0dfae
MK
92.BR F_GET_SEALS
93operations in
94.BR fcntl (2),
3a71dcd6 95and also NOTES, below.
cb5b73cc
MK
96The initial set of seals is empty.
97If this flag is not set, the initial set of seals will be
f00ce3a0
MK
98.BR F_SEAL_SEAL ,
99meaning that no other seals can be set on the file.
100.\" FIXME Why is the MFD_ALLOW_SEALING behavior not simply the default?
39b15554 101.\" Is it worth adding some text explaining this?
ce7fa2be
MK
102.TP
103.BR MFD_HUGETLB " (since Linux 4.14)"
d2cfa322 104.\" commit 749df87bd7bee5a79cef073f5d032ddb2b211de8
ce7fa2be 105The anonymous file will be created in the hugetlbfs filesystem using
d2cfa322
MK
106huge pages.
107See the Linux kernel source file
a2463bae 108.I Documentation/admin-guide/mm/hugetlbpage.rst
d2cfa322 109for more information about hugetlbfs.
659beec7
MAL
110.\" commit 47b9012ecdc747f6936395265e677d41e11a31ff
111Specifying both
ce7fa2be
MK
112.B MFD_HUGETLB
113and
114.B MFD_ALLOW_SEALING
88aa124a
MK
115in
116.I flags
659beec7 117is supported since Linux 4.16.
ce7fa2be
MK
118.TP
119.BR MFD_HUGE_2MB ", " MFD_HUGE_1GB ", " "..."
120Used in conjunction with
121.B MFD_HUGETLB
58988360 122to select alternative hugetlb page sizes (respectively, 2\ MB, 1\ GB, ...)
4332849f
MK
123on systems that support multiple hugetlb page sizes.
124Definitions for known
ce7fa2be 125huge page sizes are included in the header file
6971614d 126.IR <linux/memfd.h> .
4332849f 127.IP
ce7fa2be
MK
128For details on encoding huge page sizes not included in the header file,
129see the discussion of the similarly named constants in
130.BR mmap (2).
73fc0b53 131.PP
f00ce3a0
MK
132Unused bits in
133.I flags
134must be 0.
efeece04 135.PP
73fc0b53
DH
136As its return value,
137.BR memfd_create ()
138returns a new file descriptor that can be used to refer to the file.
f00ce3a0
MK
139This file descriptor is opened for both reading and writing
140.RB ( O_RDWR )
141and
142.B O_LARGEFILE
d9cb0d7d 143is set for the file descriptor.
efeece04 144.PP
f00ce3a0
MK
145With respect to
146.BR fork (2)
147and
148.BR execve (2),
149the usual semantics apply for the file descriptor created by
150.BR memfd_create ().
151A copy of the file descriptor is inherited by the child produced by
152.BR fork (2)
153and refers to the same file.
154The file descriptor is preserved across
73fc0b53
DH
155.BR execve (2),
156unless the close-on-exec flag has been set.
157.SH RETURN VALUE
158On success,
159.BR memfd_create ()
160returns a new file descriptor.
161On error, \-1 is returned and
162.I errno
163is set to indicate the error.
164.SH ERRORS
165.TP
f00ce3a0
MK
166.B EFAULT
167The address in
168.IR name
169points to invalid memory.
170.TP
73fc0b53 171.B EINVAL
20acd21a 172.I flags
0fd5731e
MK
173included unknown bits.
174.TP
175.B EINVAL
20acd21a
MK
176.I name
177was too long.
5ade353d
MK
178(The limit is
179.\" NAME_MAX - strlen("memfd:")
180249 bytes, excluding the terminating null byte.)
73fc0b53 181.TP
88aa124a
MK
182.B EINVAL
183Both
184.B MFD_HUGETLB
185and
186.B MFD_ALLOW_SEALING
187were specified in
188.IR flags .
189.TP
73fc0b53 190.B EMFILE
26c32fab 191The per-process limit on the number of open file descriptors has been reached.
73fc0b53
DH
192.TP
193.B ENFILE
cb5b73cc 194The system-wide limit on the total number of open files has been reached.
73fc0b53 195.TP
73fc0b53
DH
196.B ENOMEM
197There was insufficient memory to create a new anonymous file.
198.SH VERSIONS
f00ce3a0
MK
199The
200.BR memfd_create ()
3411d30b
MK
201system call first appeared in Linux 3.17;
202glibc support was added in version 2.27.
73fc0b53 203.SH CONFORMING TO
f00ce3a0 204The
73fc0b53 205.BR memfd_create ()
f00ce3a0 206system call is Linux-specific.
51fa3cbf 207.SH NOTES
efeece04 208.PP
51fa3cbf
MK
209.\" See also http://lwn.net/Articles/593918/
210.\" and http://lwn.net/Articles/594919/ and http://lwn.net/Articles/591108/
211The
212.BR memfd_create ()
213system call provides a simple alternative to manually mounting a
4e07c70f 214.BR tmpfs (5)
51fa3cbf
MK
215filesystem and creating and opening a file in that filesystem.
216The primary purpose of
217.BR memfd_create ()
218is to create files and associated file descriptors that are
219used with the file-sealing APIs provided by
220.BR fcntl (2).
efeece04 221.PP
46832662
MK
222The
223.BR memfd_create ()
224system call also has uses without file sealing
225(which is why file sealing is disabled, unless explicitly requested with the
226.BR MFD_ALLOW_SEALING
227flag).
228In particular, it can be used as an alternative to creating files in
229.I /tmp
230or as an alternative to using the
231.BR open (2)
232.B O_TMPFILE
233flag in cases where there is no intention to actually link the
234resulting file into the filesystem.
51fa3cbf
MK
235.SS File sealing
236In the absence of file sealing,
237processes that communicate via shared memory must either trust each other,
238or take measures to deal with the possibility that an untrusted peer
db61d4b2 239may manipulate the shared memory region in problematic ways.
51fa3cbf
MK
240For example, an untrusted peer might modify the contents of the
241shared memory at any time, or shrink the shared memory region.
242The former possibility leaves the local process vulnerable to
243time-of-check-to-time-of-use race conditions
244(typically dealt with by copying data from
245the shared memory region before checking and using it).
246The latter possibility leaves the local process vulnerable to
247.BR SIGBUS
248signals when an attempt is made to access a now-nonexistent
249location in the shared memory region.
250(Dealing with this possibility necessitates the use of a handler for the
251.BR SIGBUS
252signal.)
efeece04 253.PP
51fa3cbf
MK
254Dealing with untrusted peers imposes extra complexity on
255code that employs shared memory.
256File sealing enables that extra complexity to be eliminated,
257by allowing a process to operate secure in the knowledge that
258its peer can't modify the shared memory in an undesired fashion.
efeece04 259.PP
51fa3cbf 260An example of the usage of the sealing mechanism is as follows:
51fa3cbf
MK
261.IP 1. 3
262The first process creates a
4e07c70f 263.BR tmpfs (5)
771e13d4 264file using
51fa3cbf
MK
265.BR memfd_create ().
266The call yields a file descriptor used in subsequent steps.
267.IP 2.
268The first process
269sizes the file created in the previous step using
270.BR ftruncate (2),
271maps it using
272.BR mmap (2),
273and populates the shared memory with the desired data.
274.IP 3.
275The first process uses the
276.BR fcntl (2)
277.B F_ADD_SEALS
278operation to place one or more seals on the file,
279in order to restrict further modifications on the file.
280(If placing the seal
281.BR F_SEAL_WRITE ,
282then it will be necessary to first unmap the shared writable mapping
283created in the previous step.)
284.IP 4.
285A second process obtains a file descriptor for the
4e07c70f 286.BR tmpfs (5)
51fa3cbf 287file and maps it.
46832662 288Among the possible ways in which this could happen are the following:
51fa3cbf
MK
289.RS
290.IP * 3
46832662
MK
291The process that called
292.BR memfd_create ()
293could transfer the resulting file descriptor to the second process
294via a UNIX domain socket (see
295.BR unix (7)
296and
297.BR cmsg (3)).
298The second process then maps the file using
299.BR mmap (2).
300.IP *
51fa3cbf
MK
301The second process is created via
302.BR fork (2)
303and thus automatically inherits the file descriptor and mapping.
46832662
MK
304(Note that in this case and the next,
305there is a natural trust relationship between the two processes,
306since they are running under the same user ID.
307Therefore, file sealing would not normally be necessary.)
51fa3cbf 308.IP *
771e13d4 309The second process opens the file
9aae8d48 310.IR /proc/<pid>/fd/<fd> ,
51fa3cbf
MK
311where
312.I <pid>
313is the PID of the first process (the one that called
314.BR memfd_create ()),
315and
316.I <fd>
317is the number of the file descriptor returned by the call to
318.BR memfd_create ()
319in that process.
320The second process then maps the file using
321.BR mmap (2).
322.RE
323.IP 5.
324The second process uses the
325.BR fcntl (2)
326.B F_GET_SEALS
4f32648e
MK
327operation to retrieve the bit mask of seals
328that has been applied to the file.
329This bit mask can be inspected in order to determine
330what kinds of restrictions have been placed on file modifications.
51fa3cbf
MK
331If desired, the second process can apply further seals
332to impose additional restrictions (so long as the
333.BR F_SEAL_SEAL
334seal has not yet been applied).
878cc348
MK
335.SH EXAMPLE
336Below are shown two example programs that demonstrate the use of
337.BR memfd_create ()
338and the file sealing API.
efeece04 339.PP
878cc348
MK
340The first program,
341.IR t_memfd_create.c ,
342creates a
4e07c70f 343.BR tmpfs (5)
878cc348
MK
344file using
345.BR memfd_create (),
346sets a size for the file, maps it into memory,
347and optionally places some seals on the file.
348The program accepts up to three command-line arguments,
349of which the first two are required.
350The first argument is the name to associate with the file,
351the second argument is the size to be set for the file,
e57f8d34 352and the optional third argument is a string of characters that specify
878cc348 353seals to be set on the file.
efeece04 354.PP
878cc348
MK
355The second program,
356.IR t_get_seals.c ,
357can be used to open an existing file that was created via
358.BR memfd_create ()
359and inspect the set of seals that have been applied to that file.
efeece04 360.PP
878cc348
MK
361The following shell session demonstrates the use of these programs.
362First we create a
4e07c70f 363.BR tmpfs (5)
878cc348 364file and set some seals on it:
efeece04 365.PP
878cc348 366.in +4n
b8302363 367.EX
878cc348
MK
368$ \fB./t_memfd_create my_memfd_file 4096 sw &\fP
369[1] 11775
370PID: 11775; fd: 3; /proc/11775/fd/3
b8302363 371.EE
878cc348 372.in
efeece04 373.PP
878cc348
MK
374At this point, the
375.I t_memfd_create
376program continues to run in the background.
377From another program, we can obtain a file descriptor for the
46832662
MK
378file created by
379.BR memfd_create ()
380by opening the
750653a8 381.IR /proc/[pid]/fd
d9cb0d7d 382file that corresponds to the file descriptor opened by
878cc348
MK
383.BR memfd_create ().
384Using that pathname, we inspect the content of the
750653a8 385.IR /proc/[pid]/fd
878cc348
MK
386symbolic link, and use our
387.I t_get_seals
388program to view the seals that have been placed on the file:
efeece04 389.PP
878cc348 390.in +4n
b8302363 391.EX
878cc348
MK
392$ \fBreadlink /proc/11775/fd/3\fP
393/memfd:my_memfd_file (deleted)
394$ \fB./t_get_seals /proc/11775/fd/3\fP
395Existing seals: WRITE SHRINK
b8302363 396.EE
878cc348
MK
397.in
398.SS Program source: t_memfd_create.c
399\&
408731d4 400.EX
d6d367c7 401#define _GNU_SOURCE
6971614d 402#include <sys/mman.h>
878cc348
MK
403#include <fcntl.h>
404#include <stdlib.h>
405#include <unistd.h>
406#include <string.h>
407#include <stdio.h>
408
d1a71985 409#define errExit(msg) do { perror(msg); exit(EXIT_FAILURE); \e
878cc348
MK
410 } while (0)
411
412int
413main(int argc, char *argv[])
414{
415 int fd;
416 unsigned int seals;
417 char *addr;
418 char *name, *seals_arg;
419 ssize_t len;
420
421 if (argc < 3) {
d1a71985
MK
422 fprintf(stderr, "%s name size [seals]\en", argv[0]);
423 fprintf(stderr, "\et\(aqseals\(aq can contain any of the "
424 "following characters:\en");
425 fprintf(stderr, "\et\etg \- F_SEAL_GROW\en");
426 fprintf(stderr, "\et\ets \- F_SEAL_SHRINK\en");
427 fprintf(stderr, "\et\etw \- F_SEAL_WRITE\en");
428 fprintf(stderr, "\et\etS \- F_SEAL_SEAL\en");
878cc348
MK
429 exit(EXIT_FAILURE);
430 }
431
432 name = argv[1];
433 len = atoi(argv[2]);
434 seals_arg = argv[3];
435
436 /* Create an anonymous file in tmpfs; allow seals to be
437 placed on the file */
438
439 fd = memfd_create(name, MFD_ALLOW_SEALING);
440 if (fd == \-1)
441 errExit("memfd_create");
442
443 /* Size the file as specified on the command line */
444
445 if (ftruncate(fd, len) == \-1)
446 errExit("truncate");
447
d1a71985 448 printf("PID: %ld; fd: %d; /proc/%ld/fd/%d\en",
878cc348
MK
449 (long) getpid(), fd, (long) getpid(), fd);
450
451 /* Code to map the file and populate the mapping with data
452 omitted */
453
454 /* If a \(aqseals\(aq command\-line argument was supplied, set some
455 seals on the file */
456
457 if (seals_arg != NULL) {
458 seals = 0;
459
460 if (strchr(seals_arg, \(aqg\(aq) != NULL)
461 seals |= F_SEAL_GROW;
462 if (strchr(seals_arg, \(aqs\(aq) != NULL)
463 seals |= F_SEAL_SHRINK;
464 if (strchr(seals_arg, \(aqw\(aq) != NULL)
465 seals |= F_SEAL_WRITE;
466 if (strchr(seals_arg, \(aqS\(aq) != NULL)
467 seals |= F_SEAL_SEAL;
468
469 if (fcntl(fd, F_ADD_SEALS, seals) == \-1)
470 errExit("fcntl");
471 }
472
473 /* Keep running, so that the file created by memfd_create()
474 continues to exist */
475
476 pause();
477
478 exit(EXIT_SUCCESS);
479}
408731d4 480.EE
878cc348
MK
481.SS Program source: t_get_seals.c
482\&
408731d4 483.EX
d6d367c7 484#define _GNU_SOURCE
6971614d 485#include <sys/mman.h>
878cc348
MK
486#include <fcntl.h>
487#include <unistd.h>
488#include <stdlib.h>
489#include <string.h>
490#include <stdio.h>
491
d1a71985 492#define errExit(msg) do { perror(msg); exit(EXIT_FAILURE); \e
878cc348
MK
493 } while (0)
494
495int
496main(int argc, char *argv[])
497{
498 int fd;
499 unsigned int seals;
500
501 if (argc != 2) {
d1a71985 502 fprintf(stderr, "%s /proc/PID/fd/FD\en", argv[0]);
878cc348
MK
503 exit(EXIT_FAILURE);
504 }
505
506 fd = open(argv[1], O_RDWR);
507 if (fd == \-1)
508 errExit("open");
509
510 seals = fcntl(fd, F_GET_SEALS);
511 if (seals == \-1)
512 errExit("fcntl");
513
514 printf("Existing seals:");
515 if (seals & F_SEAL_SEAL)
516 printf(" SEAL");
517 if (seals & F_SEAL_GROW)
518 printf(" GROW");
519 if (seals & F_SEAL_WRITE)
520 printf(" WRITE");
521 if (seals & F_SEAL_SHRINK)
522 printf(" SHRINK");
d1a71985 523 printf("\en");
878cc348
MK
524
525 /* Code to map the file and access the contents of the
526 resulting mapping omitted */
527
528 exit(EXIT_SUCCESS);
529}
408731d4 530.EE
73fc0b53 531.SH SEE ALSO
73fc0b53 532.BR fcntl (2),
3a71dcd6
MK
533.BR ftruncate (2),
534.BR mmap (2),
46832662
MK
535.BR shmget (2),
536.BR shm_open (3)