-.\" Copyright (C) 2014 David Herrmann <dh.herrmann@gmail.com>
-.\" and Copyright (C) 2014 Michael Kerrisk <mtk.manpages@gmail.com>
+.\" Copyright (C) 2014 Michael Kerrisk <mtk.manpages@gmail.com>
+.\" and Copyright (C) 2014 David Herrmann <dh.herrmann@gmail.com>
.\"
-.\" %%%LICENSE_START(GPLv2+_SW_3_PARA)
+.\" %%%LICENSE_START(GPLv2+)
.\"
-.\" FIXME What is _SW_3_PARA?
-.\"
.\" This program is free software; you can redistribute it and/or modify
.\" it under the terms of the GNU General Public License as published by
.\" the Free Software Foundation; either version 2 of the License, or
.\" <http://www.gnu.org/licenses/>.
.\" %%%LICENSE_END
.\"
-.TH MEMFD_CREATE 2 2014-07-08 Linux "Linux Programmer's Manual"
+.TH MEMFD_CREATE 2 2018-02-02 Linux "Linux Programmer's Manual"
.SH NAME
memfd_create \- create an anonymous file
.SH SYNOPSIS
-.B #include <sys/memfd.h>
-.sp
+.nf
+.BR "#define _GNU_SOURCE" " /* See feature_test_macros(7) */"
+.B #include <sys/mman.h>
+.PP
.BI "int memfd_create(const char *" name ", unsigned int " flags ");"
.SH DESCRIPTION
.BR memfd_create ()
truncated, memory-mapped, and so on.
However, unlike a regular file,
it lives in RAM and has a volatile backing storage.
-.\" FIXME In the following sentence I changed "released" to
-.\" "destroyed". Okay?
Once all references to the file are dropped, it is automatically released.
Anonymous memory is used for all backing pages of the file.
-.\" FIXME In the following sentence I changed "they" to
-.\" "files created by memfd_create()". Okay?
Therefore, files created by
.BR memfd_create ()
-are subject to the same restrictions as other anonymous
-.\" FIXME Can you give some examples of some of the restrictions please.
+have the same semantics as other anonymous
+.\" David Herrmann:
+.\" memfd uses VM_NORESERVE so each page is accounted on first access.
+.\" This means, the overcommit-limits (see __vm_enough_memory()) and the
+.\" memory-cgroup limits (mem_cgroup_try_charge()) are applied. Note that
+.\" those are accounted on "current" and "current->mm", that is, the
+.\" process doing the first page access.
memory allocations such as those allocated using
.BR mmap (2)
with the
.BR MAP_ANONYMOUS
flag.
-
+.PP
The initial size of the file is set to 0.
-.\" FIXME I added the following sentence. Please review.
Following the call, the file size should be set using
.BR ftruncate (2).
-
+(Alternatively, the file may be populated by calls to
+.BR write (2)
+or similar.)
+.PP
The name supplied in
.I name
-is used as an internal filename and will be displayed
-.\" FIXME What does "internal" in the previous line mean?
+is used as a filename and will be displayed
as the target of the corresponding symbolic link in the directory
-.\" FIXME I added the previous line. Is it correct?
.IR /proc/self/fd/ .
-.\" FIXME In the next line, I added "as displayed in that
The displayed name is always prefixed with
.IR memfd:
and serves only for debugging purposes.
-Names do not affect the behavior of the memfd,
-.\" FIXME The term "memfd" appears here without having previously been
-.\" defined. Would the correct definition of "the memfd" be
-.\" "the file descriptor created by memfd_create"?
+Names do not affect the behavior of the file descriptor,
and as such multiple files can have the same name without any side effects.
-
+.PP
The following values may be bitwise ORed in
.IR flags
-to change the behaviour of
+to change the behavior of
.BR memfd_create ():
.TP
.BR MFD_CLOEXEC
.TP
.BR MFD_ALLOW_SEALING
Allow sealing operations on this file.
-See
-.BR fcntl (2)
-with
+See the discussion of the
.B F_ADD_SEALS
and
-.BR F_GET_SEALS ,
+.BR F_GET_SEALS
+operations in
+.BR fcntl (2),
and also NOTES, below.
The initial set of seals is empty.
If this flag is not set, the initial set of seals will be
.BR F_SEAL_SEAL ,
meaning that no other seals can be set on the file.
.\" FIXME Why is the MFD_ALLOW_SEALING behavior not simply the default?
-.\" Is it worth adding some text explaining this?
+.\" Is it worth adding some text explaining this?
+.TP
+.BR MFD_HUGETLB " (since Linux 4.14)"
+.\" commit 749df87bd7bee5a79cef073f5d032ddb2b211de8
+The anonymous file will be created in the hugetlbfs filesystem using
+huge pages.
+See the Linux kernel source file
+.I Documentation/admin-guide/mm/hugetlbpage.rst
+for more information about hugetlbfs.
+.\" commit 47b9012ecdc747f6936395265e677d41e11a31ff
+Specifying both
+.B MFD_HUGETLB
+and
+.B MFD_ALLOW_SEALING
+in
+.I flags
+is supported since Linux 4.16.
+.TP
+.BR MFD_HUGE_2MB ", " MFD_HUGE_1GB ", " "..."
+Used in conjunction with
+.B MFD_HUGETLB
+to select alternative hugetlb page sizes (respectively, 2\ MB, 1\ GB, ...)
+on systems that support multiple hugetlb page sizes.
+Definitions for known
+huge page sizes are included in the header file
+.IR <linux/memfd.h> .
+.IP
+For details on encoding huge page sizes not included in the header file,
+see the discussion of the similarly named constants in
+.BR mmap (2).
.PP
Unused bits in
.I flags
must be 0.
-
+.PP
As its return value,
.BR memfd_create ()
returns a new file descriptor that can be used to refer to the file.
.RB ( O_RDWR )
and
.B O_LARGEFILE
-is set for the descriptor.
-
+is set for the file descriptor.
+.PP
With respect to
.BR fork (2)
and
points to invalid memory.
.TP
.B EINVAL
-An unsupported value was specified in one of the arguments:
.I flags
-included unknown bits, or
+included unknown bits.
+.TP
+.B EINVAL
.I name
was too long.
+(The limit is
+.\" NAME_MAX - strlen("memfd:")
+249 bytes, excluding the terminating null byte.)
+.TP
+.B EINVAL
+Both
+.B MFD_HUGETLB
+and
+.B MFD_ALLOW_SEALING
+were specified in
+.I flags
+on a kernel older than Linux 4.16.
.TP
.B EMFILE
-The per-process limit on open file descriptors has been reached.
+The per-process limit on the number of open file descriptors has been reached.
.TP
.B ENFILE
The system-wide limit on the total number of open files has been reached.
.SH VERSIONS
The
.BR memfd_create ()
-system call first appeared in Linux 3.17.
-.\" FIXME . When glibc support appears, update the following sentence:
-Support in the GNU C library is pending.
+system call first appeared in Linux 3.17;
+glibc support was added in version 2.27.
.SH CONFORMING TO
The
.BR memfd_create ()
system call is Linux-specific.
-.\" FIXME I added the NOTES section below. Please review.
.SH NOTES
+.PP
.\" See also http://lwn.net/Articles/593918/
.\" and http://lwn.net/Articles/594919/ and http://lwn.net/Articles/591108/
The
.BR memfd_create ()
system call provides a simple alternative to manually mounting a
-.I tmpfs
+.BR tmpfs (5)
filesystem and creating and opening a file in that filesystem.
The primary purpose of
.BR memfd_create ()
is to create files and associated file descriptors that are
used with the file-sealing APIs provided by
.BR fcntl (2).
+.PP
+The
+.BR memfd_create ()
+system call also has uses without file sealing
+(which is why file sealing is disabled, unless explicitly requested with the
+.BR MFD_ALLOW_SEALING
+flag).
+In particular, it can be used as an alternative to creating files in
+.I /tmp
+or as an alternative to using the
+.BR open (2)
+.B O_TMPFILE
+flag in cases where there is no intention to actually link the
+resulting file into the filesystem.
.SS File sealing
In the absence of file sealing,
processes that communicate via shared memory must either trust each other,
(Dealing with this possibility necessitates the use of a handler for the
.BR SIGBUS
signal.)
-
+.PP
Dealing with untrusted peers imposes extra complexity on
code that employs shared memory.
Memory sealing enables that extra complexity to be eliminated,
by allowing a process to operate secure in the knowledge that
its peer can't modify the shared memory in an undesired fashion.
-
+.PP
An example of the usage of the sealing mechanism is as follows:
-
.IP 1. 3
The first process creates a
-.I tmpfs
-file using
+.BR tmpfs (5)
+file using
.BR memfd_create ().
The call yields a file descriptor used in subsequent steps.
.IP 2.
created in the previous step.)
.IP 4.
A second process obtains a file descriptor for the
-.I tmpfs
+.BR tmpfs (5)
file and maps it.
-This could happen in one of two ways:
+Among the possible ways in which this could happen are the following:
.RS
.IP * 3
+The process that called
+.BR memfd_create ()
+could transfer the resulting file descriptor to the second process
+via a UNIX domain socket (see
+.BR unix (7)
+and
+.BR cmsg (3)).
+The second process then maps the file using
+.BR mmap (2).
+.IP *
The second process is created via
.BR fork (2)
and thus automatically inherits the file descriptor and mapping.
+(Note that in this case and the next,
+there is a natural trust relationship between the two processes,
+since they are running under the same user ID.
+Therefore, file sealing would not normally be necessary.)
.IP *
-The second process opens the file
-.IR /proc/<pd>/fd/<fd> ,
+The second process opens the file
+.IR /proc/<pid>/fd/<fd> ,
where
.I <pid>
is the PID of the first process (the one that called
to impose additional restrictions (so long as the
.BR F_SEAL_SEAL
seal has not yet been applied).
-.\"
-.\" FIXME Do we have any nice example program that could go in the man page?
+.SH EXAMPLE
+Below are shown two example programs that demonstrate the use of
+.BR memfd_create ()
+and the file sealing API.
+.PP
+The first program,
+.IR t_memfd_create.c ,
+creates a
+.BR tmpfs (5)
+file using
+.BR memfd_create (),
+sets a size for the file, maps it into memory,
+and optionally places some seals on the file.
+The program accepts up to three command-line arguments,
+of which the first two are required.
+The first argument is the name to associate with the file,
+the second argument is the size to be set for the file,
+and the optional third argument is a string of characters that specify
+seals to be set on the file.
+.PP
+The second program,
+.IR t_get_seals.c ,
+can be used to open an existing file that was created via
+.BR memfd_create ()
+and inspect the set of seals that have been applied to that file.
+.PP
+The following shell session demonstrates the use of these programs.
+First we create a
+.BR tmpfs (5)
+file and set some seals on it:
+.PP
+.in +4n
+.EX
+$ \fB./t_memfd_create my_memfd_file 4096 sw &\fP
+[1] 11775
+PID: 11775; fd: 3; /proc/11775/fd/3
+.EE
+.in
+.PP
+At this point, the
+.I t_memfd_create
+program continues to run in the background.
+From another program, we can obtain a file descriptor for the
+file created by
+.BR memfd_create ()
+by opening the
+.IR /proc/[pid]/fd
+file that corresponds to the file descriptor opened by
+.BR memfd_create ().
+Using that pathname, we inspect the content of the
+.IR /proc/[pid]/fd
+symbolic link, and use our
+.I t_get_seals
+program to view the seals that have been placed on the file:
+.PP
+.in +4n
+.EX
+$ \fBreadlink /proc/11775/fd/3\fP
+/memfd:my_memfd_file (deleted)
+$ \fB./t_get_seals /proc/11775/fd/3\fP
+Existing seals: WRITE SHRINK
+.EE
+.in
+.SS Program source: t_memfd_create.c
+\&
+.EX
+#define _GNU_SOURCE
+#include <sys/mman.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <stdio.h>
+
+#define errExit(msg) do { perror(msg); exit(EXIT_FAILURE); \\
+ } while (0)
+
+int
+main(int argc, char *argv[])
+{
+ int fd;
+ unsigned int seals;
+ char *addr;
+ char *name, *seals_arg;
+ ssize_t len;
+
+ if (argc < 3) {
+ fprintf(stderr, "%s name size [seals]\\n", argv[0]);
+ fprintf(stderr, "\\t\(aqseals\(aq can contain any of the "
+ "following characters:\\n");
+ fprintf(stderr, "\\t\\tg \- F_SEAL_GROW\\n");
+ fprintf(stderr, "\\t\\ts \- F_SEAL_SHRINK\\n");
+ fprintf(stderr, "\\t\\tw \- F_SEAL_WRITE\\n");
+ fprintf(stderr, "\\t\\tS \- F_SEAL_SEAL\\n");
+ exit(EXIT_FAILURE);
+ }
+
+ name = argv[1];
+ len = atoi(argv[2]);
+ seals_arg = argv[3];
+
+ /* Create an anonymous file in tmpfs; allow seals to be
+ placed on the file */
+
+ fd = memfd_create(name, MFD_ALLOW_SEALING);
+ if (fd == \-1)
+ errExit("memfd_create");
+
+ /* Size the file as specified on the command line */
+
+ if (ftruncate(fd, len) == \-1)
+ errExit("truncate");
+
+ printf("PID: %ld; fd: %d; /proc/%ld/fd/%d\\n",
+ (long) getpid(), fd, (long) getpid(), fd);
+
+ /* Code to map the file and populate the mapping with data
+ omitted */
+
+ /* If a \(aqseals\(aq command\-line argument was supplied, set some
+ seals on the file */
+
+ if (seals_arg != NULL) {
+ seals = 0;
+
+ if (strchr(seals_arg, \(aqg\(aq) != NULL)
+ seals |= F_SEAL_GROW;
+ if (strchr(seals_arg, \(aqs\(aq) != NULL)
+ seals |= F_SEAL_SHRINK;
+ if (strchr(seals_arg, \(aqw\(aq) != NULL)
+ seals |= F_SEAL_WRITE;
+ if (strchr(seals_arg, \(aqS\(aq) != NULL)
+ seals |= F_SEAL_SEAL;
+
+ if (fcntl(fd, F_ADD_SEALS, seals) == \-1)
+ errExit("fcntl");
+ }
+
+ /* Keep running, so that the file created by memfd_create()
+ continues to exist */
+
+ pause();
+
+ exit(EXIT_SUCCESS);
+}
+.EE
+.SS Program source: t_get_seals.c
+\&
+.EX
+#define _GNU_SOURCE
+#include <sys/mman.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+
+#define errExit(msg) do { perror(msg); exit(EXIT_FAILURE); \\
+ } while (0)
+
+int
+main(int argc, char *argv[])
+{
+ int fd;
+ unsigned int seals;
+
+ if (argc != 2) {
+ fprintf(stderr, "%s /proc/PID/fd/FD\\n", argv[0]);
+ exit(EXIT_FAILURE);
+ }
+
+ fd = open(argv[1], O_RDWR);
+ if (fd == \-1)
+ errExit("open");
+
+ seals = fcntl(fd, F_GET_SEALS);
+ if (seals == \-1)
+ errExit("fcntl");
+
+ printf("Existing seals:");
+ if (seals & F_SEAL_SEAL)
+ printf(" SEAL");
+ if (seals & F_SEAL_GROW)
+ printf(" GROW");
+ if (seals & F_SEAL_WRITE)
+ printf(" WRITE");
+ if (seals & F_SEAL_SHRINK)
+ printf(" SHRINK");
+ printf("\\n");
+
+ /* Code to map the file and access the contents of the
+ resulting mapping omitted */
+
+ exit(EXIT_SUCCESS);
+}
+.EE
.SH SEE ALSO
.BR fcntl (2),
.BR ftruncate (2),
.BR mmap (2),
-.\" FIXME Why the reference to shmget(2) in particular (and not,
-.\" e.g., shm_open(3))?
-.BR shmget (2)
+.BR shmget (2),
+.BR shm_open (3)