1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
8 #include <linux/memfd.h>
11 #include "alloc-util.h"
13 #include "data-fd-util.h"
17 #include "memfd-util.h"
18 #include "missing_mman.h"
19 #include "missing_syscall.h"
20 #include "tmpfile-util.h"
22 /* When the data is smaller than or equal to 64K, try to place the copy in a memfd/pipe */
23 #define DATA_FD_MEMORY_LIMIT (64U*1024U)
25 /* If memfd/pipe didn't work out, then let's use a file in /tmp up to a size of 1M. If it's larger than that, use /var/tmp instead. */
26 #define DATA_FD_TMP_LIMIT (1024U*1024U)
28 int acquire_data_fd(const void *data
, size_t size
, unsigned flags
) {
29 _cleanup_close_pair_
int pipefds
[2] = PIPE_EBADF
;
30 char pattern
[] = "/dev/shm/data-fd-XXXXXX";
31 _cleanup_close_
int fd
= -EBADF
;
36 assert(data
|| size
== 0);
38 /* Acquire a read-only file descriptor that when read from returns the specified data. This is much more
39 * complex than I wish it was. But here's why:
41 * a) First we try to use memfds. They are the best option, as we can seal them nicely to make them
42 * read-only. Unfortunately they require kernel 3.17, and – at the time of writing – we still support 3.14.
44 * b) Then, we try classic pipes. They are the second best option, as we can close the writing side, retaining
45 * a nicely read-only fd in the reading side. However, they are by default quite small, and unprivileged
46 * clients can only bump their size to a system-wide limit, which might be quite low.
48 * c) Then, we try an O_TMPFILE file in /dev/shm (that dir is the only suitable one known to exist from
49 * earliest boot on). To make it read-only we open the fd a second time with O_RDONLY via
50 * /proc/self/fd/<fd>. Unfortunately O_TMPFILE is not available on older kernels on tmpfs.
52 * d) Finally, we try creating a regular file in /dev/shm, which we then delete.
54 * It sucks a bit that depending on the situation we return very different objects here, but that's Linux I
57 if (size
== 0 && ((flags
& ACQUIRE_NO_DEV_NULL
) == 0))
58 /* As a special case, return /dev/null if we have been called for an empty data block */
59 return RET_NERRNO(open("/dev/null", O_RDONLY
|O_CLOEXEC
|O_NOCTTY
));
61 if ((flags
& ACQUIRE_NO_MEMFD
) == 0) {
62 fd
= memfd_new("data-fd");
66 n
= write(fd
, data
, size
);
69 if ((size_t) n
!= size
)
72 f
= lseek(fd
, 0, SEEK_SET
);
76 r
= memfd_set_sealed(fd
);
84 if ((flags
& ACQUIRE_NO_PIPE
) == 0) {
85 if (pipe2(pipefds
, O_CLOEXEC
|O_NONBLOCK
) < 0)
88 isz
= fcntl(pipefds
[1], F_GETPIPE_SZ
, 0);
92 if ((size_t) isz
< size
) {
94 if (isz
< 0 || (size_t) isz
!= size
)
97 /* Try to bump the pipe size */
98 (void) fcntl(pipefds
[1], F_SETPIPE_SZ
, isz
);
100 /* See if that worked */
101 isz
= fcntl(pipefds
[1], F_GETPIPE_SZ
, 0);
105 if ((size_t) isz
< size
)
109 n
= write(pipefds
[1], data
, size
);
112 if ((size_t) n
!= size
)
115 (void) fd_nonblock(pipefds
[0], false);
117 return TAKE_FD(pipefds
[0]);
121 if ((flags
& ACQUIRE_NO_TMPFILE
) == 0) {
122 fd
= open("/dev/shm", O_RDWR
|O_TMPFILE
|O_CLOEXEC
, 0500);
124 goto try_dev_shm_without_o_tmpfile
;
126 n
= write(fd
, data
, size
);
129 if ((size_t) n
!= size
)
132 /* Let's reopen the thing, in order to get an O_RDONLY fd for the original O_RDWR one */
133 return fd_reopen(fd
, O_RDONLY
|O_CLOEXEC
);
136 try_dev_shm_without_o_tmpfile
:
137 if ((flags
& ACQUIRE_NO_REGULAR
) == 0) {
138 fd
= mkostemp_safe(pattern
);
142 n
= write(fd
, data
, size
);
145 goto unlink_and_return
;
147 if ((size_t) n
!= size
) {
149 goto unlink_and_return
;
152 /* Let's reopen the thing, in order to get an O_RDONLY fd for the original O_RDWR one */
153 r
= open(pattern
, O_RDONLY
|O_CLOEXEC
);
158 (void) unlink(pattern
);
165 int copy_data_fd(int fd
) {
166 _cleanup_close_
int copy_fd
= -EBADF
, tmp_fd
= -EBADF
;
167 _cleanup_free_
void *remains
= NULL
;
168 size_t remains_size
= 0;
173 /* Creates a 'data' fd from the specified source fd, containing all the same data in a read-only fashion, but
174 * independent of it (i.e. the source fd can be closed and unmounted after this call succeeded). Tries to be
175 * somewhat smart about where to place the data. In the best case uses a memfd(). If memfd() are not supported
176 * uses a pipe instead. For larger data will use an unlinked file in /tmp, and for even larger data one in
179 if (fstat(fd
, &st
) < 0)
182 /* For now, let's only accept regular files, sockets, pipes and char devices */
183 if (S_ISDIR(st
.st_mode
))
185 if (S_ISLNK(st
.st_mode
))
187 if (!S_ISREG(st
.st_mode
) && !S_ISSOCK(st
.st_mode
) && !S_ISFIFO(st
.st_mode
) && !S_ISCHR(st
.st_mode
))
190 /* If we have reason to believe the data is bounded in size, then let's use memfds or pipes as backing fd. Note
191 * that we use the reported regular file size only as a hint, given that there are plenty special files in
192 * /proc and /sys which report a zero file size but can be read from. */
194 if (!S_ISREG(st
.st_mode
) || st
.st_size
< DATA_FD_MEMORY_LIMIT
) {
196 /* Try a memfd first */
197 copy_fd
= memfd_new("data-fd");
201 r
= copy_bytes(fd
, copy_fd
, DATA_FD_MEMORY_LIMIT
, 0);
205 f
= lseek(copy_fd
, 0, SEEK_SET
);
210 /* Did it fit into the limit? If so, we are done. */
211 r
= memfd_set_sealed(copy_fd
);
215 return TAKE_FD(copy_fd
);
218 /* Hmm, pity, this didn't fit. Let's fall back to /tmp then, see below */
221 _cleanup_(close_pairp
) int pipefds
[2] = PIPE_EBADF
;
224 /* If memfds aren't available, use a pipe. Set O_NONBLOCK so that we will get EAGAIN rather
225 * than block indefinitely when we hit the pipe size limit */
227 if (pipe2(pipefds
, O_CLOEXEC
|O_NONBLOCK
) < 0)
230 isz
= fcntl(pipefds
[1], F_GETPIPE_SZ
, 0);
234 /* Try to enlarge the pipe size if necessary */
235 if ((size_t) isz
< DATA_FD_MEMORY_LIMIT
) {
237 (void) fcntl(pipefds
[1], F_SETPIPE_SZ
, DATA_FD_MEMORY_LIMIT
);
239 isz
= fcntl(pipefds
[1], F_GETPIPE_SZ
, 0);
244 if ((size_t) isz
>= DATA_FD_MEMORY_LIMIT
) {
246 r
= copy_bytes_full(fd
, pipefds
[1], DATA_FD_MEMORY_LIMIT
, 0, &remains
, &remains_size
, NULL
, NULL
);
247 if (r
< 0 && r
!= -EAGAIN
)
248 return r
; /* If we get EAGAIN it could be because of the source or because of
249 * the destination fd, we can't know, as sendfile() and friends won't
250 * tell us. Hence, treat this as reason to fall back, just to be
253 /* Everything fit in, yay! */
254 (void) fd_nonblock(pipefds
[0], false);
256 return TAKE_FD(pipefds
[0]);
259 /* Things didn't fit in. But we read data into the pipe, let's remember that, so that
260 * when writing the new file we incorporate this first. */
261 copy_fd
= TAKE_FD(pipefds
[0]);
266 /* If we have reason to believe this will fit fine in /tmp, then use that as first fallback. */
267 if ((!S_ISREG(st
.st_mode
) || st
.st_size
< DATA_FD_TMP_LIMIT
) &&
268 (DATA_FD_MEMORY_LIMIT
+ remains_size
) < DATA_FD_TMP_LIMIT
) {
271 tmp_fd
= open_tmpfile_unlinkable(NULL
/* NULL as directory means /tmp */, O_RDWR
|O_CLOEXEC
);
276 /* If we tried a memfd/pipe first and it ended up being too large, then copy this into the
277 * temporary file first. */
279 r
= copy_bytes(copy_fd
, tmp_fd
, UINT64_MAX
, 0);
286 if (remains_size
> 0) {
287 /* If there were remaining bytes (i.e. read into memory, but not written out yet) from the
288 * failed copy operation, let's flush them out next. */
290 r
= loop_write(tmp_fd
, remains
, remains_size
, false);
295 r
= copy_bytes(fd
, tmp_fd
, DATA_FD_TMP_LIMIT
- DATA_FD_MEMORY_LIMIT
- remains_size
, COPY_REFLINK
);
299 goto finish
; /* Yay, it fit in */
301 /* It didn't fit in. Let's not forget to use what we already used */
302 f
= lseek(tmp_fd
, 0, SEEK_SET
);
306 close_and_replace(copy_fd
, tmp_fd
);
308 remains
= mfree(remains
);
312 /* As last fallback use /var/tmp */
313 r
= var_tmp_dir(&td
);
317 tmp_fd
= open_tmpfile_unlinkable(td
, O_RDWR
|O_CLOEXEC
);
322 /* If we tried a memfd/pipe first, or a file in /tmp, and it ended up being too large, then copy this
323 * into the temporary file first. */
324 r
= copy_bytes(copy_fd
, tmp_fd
, UINT64_MAX
, COPY_REFLINK
);
331 if (remains_size
> 0) {
332 /* Then, copy in any read but not yet written bytes. */
333 r
= loop_write(tmp_fd
, remains
, remains_size
, false);
338 /* Copy in the rest */
339 r
= copy_bytes(fd
, tmp_fd
, UINT64_MAX
, COPY_REFLINK
);
346 /* Now convert the O_RDWR file descriptor into an O_RDONLY one (and as side effect seek to the beginning of the
349 return fd_reopen(tmp_fd
, O_RDONLY
|O_CLOEXEC
);
352 int memfd_clone_fd(int fd
, const char *name
, int mode
) {
353 _cleanup_close_
int mfd
= -EBADF
;
357 /* Creates a clone of a regular file in a memfd. Unlike copy_data_fd() this returns strictly a memfd
358 * (and if it can't it will fail). Thus the resulting fd is seekable, and definitely reports as
363 assert(IN_SET(mode
& O_ACCMODE
, O_RDONLY
, O_RDWR
));
364 assert((mode
& ~(O_RDONLY
|O_RDWR
|O_CLOEXEC
)) == 0);
366 ro
= (mode
& O_ACCMODE
) == O_RDONLY
;
368 mfd
= memfd_create(name
,
369 ((FLAGS_SET(mode
, O_CLOEXEC
) || ro
) ? MFD_CLOEXEC
: 0) |
370 (ro
? MFD_ALLOW_SEALING
: 0));
374 r
= copy_bytes(fd
, mfd
, UINT64_MAX
, COPY_REFLINK
);
379 _cleanup_close_
int rfd
= -EBADF
;
381 r
= memfd_set_sealed(mfd
);
385 rfd
= fd_reopen(mfd
, mode
);
392 off_t f
= lseek(mfd
, 0, SEEK_SET
);