]> git.ipfire.org Git - thirdparty/man-pages.git/blob - man2/memfd_create.2
memfd_create.2: Update hugetlb file-sealing support
[thirdparty/man-pages.git] / man2 / memfd_create.2
1 .\" Copyright (C) 2014 Michael Kerrisk <mtk.manpages@gmail.com>
2 .\" and Copyright (C) 2014 David Herrmann <dh.herrmann@gmail.com>
3 .\"
4 .\" %%%LICENSE_START(GPLv2+)
5 .\"
6 .\" This program is free software; you can redistribute it and/or modify
7 .\" it under the terms of the GNU General Public License as published by
8 .\" the Free Software Foundation; either version 2 of the License, or
9 .\" (at your option) any later version.
10 .\"
11 .\" This program is distributed in the hope that it will be useful,
12 .\" but WITHOUT ANY WARRANTY; without even the implied warranty of
13 .\" MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 .\" GNU General Public License for more details.
15 .\"
16 .\" You should have received a copy of the GNU General Public
17 .\" License along with this manual; if not, see
18 .\" <http://www.gnu.org/licenses/>.
19 .\" %%%LICENSE_END
20 .\"
21 .TH MEMFD_CREATE 2 2018-02-02 Linux "Linux Programmer's Manual"
22 .SH NAME
23 memfd_create \- create an anonymous file
24 .SH SYNOPSIS
25 .nf
26 .BR "#define _GNU_SOURCE" " /* See feature_test_macros(7) */"
27 .B #include <sys/mman.h>
28 .PP
29 .BI "int memfd_create(const char *" name ", unsigned int " flags ");"
30 .SH DESCRIPTION
31 .BR memfd_create ()
32 creates an anonymous file and returns a file descriptor that refers to it.
33 The file behaves like a regular file, and so can be modified,
34 truncated, memory-mapped, and so on.
35 However, unlike a regular file,
36 it lives in RAM and has a volatile backing storage.
37 Once all references to the file are dropped, it is automatically released.
38 Anonymous memory is used for all backing pages of the file.
39 Therefore, files created by
40 .BR memfd_create ()
41 have the same semantics as other anonymous
42 .\" David Herrmann:
43 .\" memfd uses VM_NORESERVE so each page is accounted on first access.
44 .\" This means, the overcommit-limits (see __vm_enough_memory()) and the
45 .\" memory-cgroup limits (mem_cgroup_try_charge()) are applied. Note that
46 .\" those are accounted on "current" and "current->mm", that is, the
47 .\" process doing the first page access.
48 memory allocations such as those allocated using
49 .BR mmap (2)
50 with the
51 .BR MAP_ANONYMOUS
52 flag.
53 .PP
54 The initial size of the file is set to 0.
55 Following the call, the file size should be set using
56 .BR ftruncate (2).
57 (Alternatively, the file may be populated by calls to
58 .BR write (2)
59 or similar.)
60 .PP
61 The name supplied in
62 .I name
63 is used as a filename and will be displayed
64 as the target of the corresponding symbolic link in the directory
65 .IR /proc/self/fd/ .
66 The displayed name is always prefixed with
67 .IR memfd:
68 and serves only for debugging purposes.
69 Names do not affect the behavior of the file descriptor,
70 and as such multiple files can have the same name without any side effects.
71 .PP
72 The following values may be bitwise ORed in
73 .IR flags
74 to change the behavior of
75 .BR memfd_create ():
76 .TP
77 .BR MFD_CLOEXEC
78 Set the close-on-exec
79 .RB ( FD_CLOEXEC )
80 flag on the new file descriptor.
81 See the description of the
82 .B O_CLOEXEC
83 flag in
84 .BR open (2)
85 for reasons why this may be useful.
86 .TP
87 .BR MFD_ALLOW_SEALING
88 Allow sealing operations on this file.
89 See the discussion of the
90 .B F_ADD_SEALS
91 and
92 .BR F_GET_SEALS
93 operations in
94 .BR fcntl (2),
95 and also NOTES, below.
96 The initial set of seals is empty.
97 If this flag is not set, the initial set of seals will be
98 .BR F_SEAL_SEAL ,
99 meaning that no other seals can be set on the file.
100 .\" FIXME Why is the MFD_ALLOW_SEALING behavior not simply the default?
101 .\" Is it worth adding some text explaining this?
102 .TP
103 .BR MFD_HUGETLB " (since Linux 4.14)"
104 .\" commit 749df87bd7bee5a79cef073f5d032ddb2b211de8
105 The anonymous file will be created in the hugetlbfs filesystem using
106 huge pages.
107 See the Linux kernel source file
108 .I Documentation/vm/hugetlbpage.txt
109 for more information about hugetlbfs.
110 .\" commit 47b9012ecdc747f6936395265e677d41e11a31ff
111 Specifying both
112 .B MFD_HUGETLB
113 and
114 .B MFD_ALLOW_SEALING
115 in
116 .I flags
117 is supported since Linux 4.16.
118 .TP
119 .BR MFD_HUGE_2MB ", " MFD_HUGE_1GB ", " "..."
120 Used in conjunction with
121 .B MFD_HUGETLB
122 to select alternative hugetlb page sizes (respectively, 2\ MB, 1\ GB, ...)
123 on systems that support multiple hugetlb page sizes.
124 Definitions for known
125 huge page sizes are included in the header file
126 .IR <linux/memfd.h> .
127 .IP
128 For details on encoding huge page sizes not included in the header file,
129 see the discussion of the similarly named constants in
130 .BR mmap (2).
131 .PP
132 Unused bits in
133 .I flags
134 must be 0.
135 .PP
136 As its return value,
137 .BR memfd_create ()
138 returns a new file descriptor that can be used to refer to the file.
139 This file descriptor is opened for both reading and writing
140 .RB ( O_RDWR )
141 and
142 .B O_LARGEFILE
143 is set for the file descriptor.
144 .PP
145 With respect to
146 .BR fork (2)
147 and
148 .BR execve (2),
149 the usual semantics apply for the file descriptor created by
150 .BR memfd_create ().
151 A copy of the file descriptor is inherited by the child produced by
152 .BR fork (2)
153 and refers to the same file.
154 The file descriptor is preserved across
155 .BR execve (2),
156 unless the close-on-exec flag has been set.
157 .SH RETURN VALUE
158 On success,
159 .BR memfd_create ()
160 returns a new file descriptor.
161 On error, \-1 is returned and
162 .I errno
163 is set to indicate the error.
164 .SH ERRORS
165 .TP
166 .B EFAULT
167 The address in
168 .IR name
169 points to invalid memory.
170 .TP
171 .B EINVAL
172 .I flags
173 included unknown bits.
174 .TP
175 .B EINVAL
176 .I name
177 was too long.
178 (The limit is
179 .\" NAME_MAX - strlen("memfd:")
180 249 bytes, excluding the terminating null byte.)
181 .TP
182 .B EINVAL
183 Both
184 .B MFD_HUGETLB
185 and
186 .B MFD_ALLOW_SEALING
187 were specified in
188 .IR flags " (before Linux 4.16)."
189 .TP
190 .B EMFILE
191 The per-process limit on the number of open file descriptors has been reached.
192 .TP
193 .B ENFILE
194 The system-wide limit on the total number of open files has been reached.
195 .TP
196 .B ENOMEM
197 There was insufficient memory to create a new anonymous file.
198 .SH VERSIONS
199 The
200 .BR memfd_create ()
201 system call first appeared in Linux 3.17;
202 glibc support was added in version 2.27.
203 .SH CONFORMING TO
204 The
205 .BR memfd_create ()
206 system call is Linux-specific.
207 .SH NOTES
208 .PP
209 .\" See also http://lwn.net/Articles/593918/
210 .\" and http://lwn.net/Articles/594919/ and http://lwn.net/Articles/591108/
211 The
212 .BR memfd_create ()
213 system call provides a simple alternative to manually mounting a
214 .BR tmpfs (5)
215 filesystem and creating and opening a file in that filesystem.
216 The primary purpose of
217 .BR memfd_create ()
218 is to create files and associated file descriptors that are
219 used with the file-sealing APIs provided by
220 .BR fcntl (2).
221 .PP
222 The
223 .BR memfd_create ()
224 system call also has uses without file sealing
225 (which is why file-sealing is disabled, unless explicitly requested with the
226 .BR MFD_ALLOW_SEALING
227 flag).
228 In particular, it can be used as an alternative to creating files in
229 .I /tmp
230 or as an alternative to using the
231 .BR open (2)
232 .B O_TMPFILE
233 flag in cases where there is no intention to actually link the
234 resulting file into the filesystem.
235 .SS File sealing
236 In the absence of file sealing,
237 processes that communicate via shared memory must either trust each other,
238 or take measures to deal with the possibility that an untrusted peer
239 may manipulate the shared memory region in problematic ways.
240 For example, an untrusted peer might modify the contents of the
241 shared memory at any time, or shrink the shared memory region.
242 The former possibility leaves the local process vulnerable to
243 time-of-check-to-time-of-use race conditions
244 (typically dealt with by copying data from
245 the shared memory region before checking and using it).
246 The latter possibility leaves the local process vulnerable to
247 .BR SIGBUS
248 signals when an attempt is made to access a now-nonexistent
249 location in the shared memory region.
250 (Dealing with this possibility necessitates the use of a handler for the
251 .BR SIGBUS
252 signal.)
253 .PP
254 Dealing with untrusted peers imposes extra complexity on
255 code that employs shared memory.
256 Memory sealing enables that extra complexity to be eliminated,
257 by allowing a process to operate secure in the knowledge that
258 its peer can't modify the shared memory in an undesired fashion.
259 .PP
260 An example of the usage of the sealing mechanism is as follows:
261 .IP 1. 3
262 The first process creates a
263 .BR tmpfs (5)
264 file using
265 .BR memfd_create ().
266 The call yields a file descriptor used in subsequent steps.
267 .IP 2.
268 The first process
269 sizes the file created in the previous step using
270 .BR ftruncate (2),
271 maps it using
272 .BR mmap (2),
273 and populates the shared memory with the desired data.
274 .IP 3.
275 The first process uses the
276 .BR fcntl (2)
277 .B F_ADD_SEALS
278 operation to place one or more seals on the file,
279 in order to restrict further modifications on the file.
280 (If placing the seal
281 .BR F_SEAL_WRITE ,
282 then it will be necessary to first unmap the shared writable mapping
283 created in the previous step.)
284 .IP 4.
285 A second process obtains a file descriptor for the
286 .BR tmpfs (5)
287 file and maps it.
288 Among the possible ways in which this could happen are the following:
289 .RS
290 .IP * 3
291 The process that called
292 .BR memfd_create ()
293 could transfer the resulting file descriptor to the second process
294 via a UNIX domain socket (see
295 .BR unix (7)
296 and
297 .BR cmsg (3)).
298 The second process then maps the file using
299 .BR mmap (2).
300 .IP *
301 The second process is created via
302 .BR fork (2)
303 and thus automatically inherits the file descriptor and mapping.
304 (Note that in this case and the next,
305 there is a natural trust relationship between the two processes,
306 since they are running under the same user ID.
307 Therefore, file sealing would not normally be necessary.)
308 .IP *
309 The second process opens the file
310 .IR /proc/<pid>/fd/<fd> ,
311 where
312 .I <pid>
313 is the PID of the first process (the one that called
314 .BR memfd_create ()),
315 and
316 .I <fd>
317 is the number of the file descriptor returned by the call to
318 .BR memfd_create ()
319 in that process.
320 The second process then maps the file using
321 .BR mmap (2).
322 .RE
323 .IP 5.
324 The second process uses the
325 .BR fcntl (2)
326 .B F_GET_SEALS
327 operation to retrieve the bit mask of seals
328 that has been applied to the file.
329 This bit mask can be inspected in order to determine
330 what kinds of restrictions have been placed on file modifications.
331 If desired, the second process can apply further seals
332 to impose additional restrictions (so long as the
333 .BR F_SEAL_SEAL
334 seal has not yet been applied).
335 .SH EXAMPLE
336 Below are shown two example programs that demonstrate the use of
337 .BR memfd_create ()
338 and the file sealing API.
339 .PP
340 The first program,
341 .IR t_memfd_create.c ,
342 creates a
343 .BR tmpfs (5)
344 file using
345 .BR memfd_create (),
346 sets a size for the file, maps it into memory,
347 and optionally places some seals on the file.
348 The program accepts up to three command-line arguments,
349 of which the first two are required.
350 The first argument is the name to associate with the file,
351 the second argument is the size to be set for the file,
352 and the optional third argument is a string of characters that specify
353 seals to be set on the file.
354 .PP
355 The second program,
356 .IR t_get_seals.c ,
357 can be used to open an existing file that was created via
358 .BR memfd_create ()
359 and inspect the set of seals that have been applied to that file.
360 .PP
361 The following shell session demonstrates the use of these programs.
362 First we create a
363 .BR tmpfs (5)
364 file and set some seals on it:
365 .PP
366 .in +4n
367 .EX
368 $ \fB./t_memfd_create my_memfd_file 4096 sw &\fP
369 [1] 11775
370 PID: 11775; fd: 3; /proc/11775/fd/3
371 .EE
372 .in
373 .PP
374 At this point, the
375 .I t_memfd_create
376 program continues to run in the background.
377 From another program, we can obtain a file descriptor for the
378 file created by
379 .BR memfd_create ()
380 by opening the
381 .IR /proc/[pid]/fd
382 file that corresponds to the file descriptor opened by
383 .BR memfd_create ().
384 Using that pathname, we inspect the content of the
385 .IR /proc/[pid]/fd
386 symbolic link, and use our
387 .I t_get_seals
388 program to view the seals that have been placed on the file:
389 .PP
390 .in +4n
391 .EX
392 $ \fBreadlink /proc/11775/fd/3\fP
393 /memfd:my_memfd_file (deleted)
394 $ \fB./t_get_seals /proc/11775/fd/3\fP
395 Existing seals: WRITE SHRINK
396 .EE
397 .in
398 .SS Program source: t_memfd_create.c
399 \&
400 .EX
401 #define _GNU_SOURCE
402 #include <sys/mman.h>
403 #include <fcntl.h>
404 #include <stdlib.h>
405 #include <unistd.h>
406 #include <string.h>
407 #include <stdio.h>
408
409 #define errExit(msg) do { perror(msg); exit(EXIT_FAILURE); \\
410 } while (0)
411
412 int
413 main(int argc, char *argv[])
414 {
415 int fd;
416 unsigned int seals;
417 char *addr;
418 char *name, *seals_arg;
419 ssize_t len;
420
421 if (argc < 3) {
422 fprintf(stderr, "%s name size [seals]\\n", argv[0]);
423 fprintf(stderr, "\\t\(aqseals\(aq can contain any of the "
424 "following characters:\\n");
425 fprintf(stderr, "\\t\\tg \- F_SEAL_GROW\\n");
426 fprintf(stderr, "\\t\\ts \- F_SEAL_SHRINK\\n");
427 fprintf(stderr, "\\t\\tw \- F_SEAL_WRITE\\n");
428 fprintf(stderr, "\\t\\tS \- F_SEAL_SEAL\\n");
429 exit(EXIT_FAILURE);
430 }
431
432 name = argv[1];
433 len = atoi(argv[2]);
434 seals_arg = argv[3];
435
436 /* Create an anonymous file in tmpfs; allow seals to be
437 placed on the file */
438
439 fd = memfd_create(name, MFD_ALLOW_SEALING);
440 if (fd == \-1)
441 errExit("memfd_create");
442
443 /* Size the file as specified on the command line */
444
445 if (ftruncate(fd, len) == \-1)
446 errExit("truncate");
447
448 printf("PID: %ld; fd: %d; /proc/%ld/fd/%d\\n",
449 (long) getpid(), fd, (long) getpid(), fd);
450
451 /* Code to map the file and populate the mapping with data
452 omitted */
453
454 /* If a \(aqseals\(aq command\-line argument was supplied, set some
455 seals on the file */
456
457 if (seals_arg != NULL) {
458 seals = 0;
459
460 if (strchr(seals_arg, \(aqg\(aq) != NULL)
461 seals |= F_SEAL_GROW;
462 if (strchr(seals_arg, \(aqs\(aq) != NULL)
463 seals |= F_SEAL_SHRINK;
464 if (strchr(seals_arg, \(aqw\(aq) != NULL)
465 seals |= F_SEAL_WRITE;
466 if (strchr(seals_arg, \(aqS\(aq) != NULL)
467 seals |= F_SEAL_SEAL;
468
469 if (fcntl(fd, F_ADD_SEALS, seals) == \-1)
470 errExit("fcntl");
471 }
472
473 /* Keep running, so that the file created by memfd_create()
474 continues to exist */
475
476 pause();
477
478 exit(EXIT_SUCCESS);
479 }
480 .EE
481 .SS Program source: t_get_seals.c
482 \&
483 .EX
484 #define _GNU_SOURCE
485 #include <sys/mman.h>
486 #include <fcntl.h>
487 #include <unistd.h>
488 #include <stdlib.h>
489 #include <string.h>
490 #include <stdio.h>
491
492 #define errExit(msg) do { perror(msg); exit(EXIT_FAILURE); \\
493 } while (0)
494
495 int
496 main(int argc, char *argv[])
497 {
498 int fd;
499 unsigned int seals;
500
501 if (argc != 2) {
502 fprintf(stderr, "%s /proc/PID/fd/FD\\n", argv[0]);
503 exit(EXIT_FAILURE);
504 }
505
506 fd = open(argv[1], O_RDWR);
507 if (fd == \-1)
508 errExit("open");
509
510 seals = fcntl(fd, F_GET_SEALS);
511 if (seals == \-1)
512 errExit("fcntl");
513
514 printf("Existing seals:");
515 if (seals & F_SEAL_SEAL)
516 printf(" SEAL");
517 if (seals & F_SEAL_GROW)
518 printf(" GROW");
519 if (seals & F_SEAL_WRITE)
520 printf(" WRITE");
521 if (seals & F_SEAL_SHRINK)
522 printf(" SHRINK");
523 printf("\\n");
524
525 /* Code to map the file and access the contents of the
526 resulting mapping omitted */
527
528 exit(EXIT_SUCCESS);
529 }
530 .EE
531 .SH SEE ALSO
532 .BR fcntl (2),
533 .BR ftruncate (2),
534 .BR mmap (2),
535 .BR shmget (2),
536 .BR shm_open (3)