1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
8 #include <linux/memfd.h>
11 #include "alloc-util.h"
13 #include "data-fd-util.h"
17 #include "memfd-util.h"
18 #include "missing_mman.h"
19 #include "missing_syscall.h"
20 #include "tmpfile-util.h"
22 /* When the data is smaller or equal to 64K, try to place the copy in a memfd/pipe */
23 #define DATA_FD_MEMORY_LIMIT (64U*1024U)
25 /* If memfd/pipe didn't work out, then let's use a file in /tmp up to a size of 1M. If it's larger than that, use /var/tmp instead. */
26 #define DATA_FD_TMP_LIMIT (1024U*1024U)
/* acquire_data_fd(): hand out a file descriptor that yields the 'size' bytes at 'data' when read from.
 *
 * Parameters:
 *   data  - the payload; may be NULL only when size is 0 (asserted at the top of the body).
 *   size  - number of payload bytes.
 *   flags - ACQUIRE_NO_* bits. Each bit that is set disables one strategy: ACQUIRE_NO_DEV_NULL,
 *           ACQUIRE_NO_MEMFD, ACQUIRE_NO_PIPE, ACQUIRE_NO_TMPFILE, ACQUIRE_NO_REGULAR.
 *
 * Strategies, in the order they appear in the body:
 *   1. size == 0: open /dev/null with O_RDONLY|O_CLOEXEC|O_NOCTTY.
 *   2. memfd_new_and_seal("data-fd", data, size).
 *   3. pipe2(): the payload is written to the write end and the read end is returned.
 *   4. An O_TMPFILE file in /dev/shm, reopened O_RDONLY|O_CLOEXEC through fd_reopen().
 *   5. A mkostemp_safe() file "/dev/shm/data-fd-XXXXXX", reopened the same way and unlinked.
 *
 * Returns: the new file descriptor on success.
 * NOTE(review): the error returns are not visible in this excerpt; presumably a negative errno-style
 * value, as suggested by the RET_NERRNO() wrapper around open() - confirm against the full file.
 *
 * NOTE(review): this excerpt lacks several statements (local declarations of isz/n/r, error checks,
 * some labels and closing braces). The inline notes below describe only what is visible. */
28 int acquire_data_fd(const void *data
, size_t size
, unsigned flags
) {
29 _cleanup_close_pair_
int pipefds
[2] = EBADF_PAIR
;
30 _cleanup_close_
int fd
= -EBADF
;
34 assert(data
|| size
== 0);
36 /* Acquire a read-only file descriptor that when read from returns the specified data. This is much more
37 * complex than I wish it was. But here's why:
39 * a) First we try to use memfds. They are the best option, as we can seal them nicely to make them
40 * read-only. Unfortunately they require kernel 3.17, and – at the time of writing – we still support 3.14.
42 * b) Then, we try classic pipes. They are the second-best option, as we can close the writing side, retaining
43 * a nicely read-only fd in the reading side. However, they are by default quite small, and unprivileged
44 * clients can only bump their size to a system-wide limit, which might be quite low.
46 * c) Then, we try an O_TMPFILE file in /dev/shm (that dir is the only suitable one known to exist from
47 * earliest boot on). To make it read-only we open the fd a second time with O_RDONLY via
48 * /proc/self/<fd>. Unfortunately O_TMPFILE is not available on older kernels on tmpfs.
50 * d) Finally, we try creating a regular file in /dev/shm, which we then delete.
52 * It sucks a bit that depending on the situation we return very different objects here, but that's Linux I
55 if (size
== 0 && ((flags
& ACQUIRE_NO_DEV_NULL
) == 0))
56 /* As a special case, return /dev/null if we have been called for an empty data block */
57 return RET_NERRNO(open("/dev/null", O_RDONLY
|O_CLOEXEC
|O_NOCTTY
));
/* Preferred strategy: a memfd named "data-fd", unless the caller opted out. */
59 if ((flags
& ACQUIRE_NO_MEMFD
) == 0) {
60 fd
= memfd_new_and_seal("data-fd", data
, size
);
/* NOTE(review): the statement guarded by this check is not visible here; presumably a "not
 * supported" failure falls through to the pipe strategy instead of being returned - confirm. */
62 if (ERRNO_IS_NOT_SUPPORTED(fd
))
/* Next strategy: a pipe. Both ends are created with O_NONBLOCK. */
72 if ((flags
& ACQUIRE_NO_PIPE
) == 0) {
73 if (pipe2(pipefds
, O_CLOEXEC
|O_NONBLOCK
) < 0)
76 isz
= fcntl(pipefds
[1], F_GETPIPE_SZ
, 0);
/* Is the current pipe buffer smaller than the payload? */
80 if ((size_t) isz
< size
) {
/* NOTE(review): reads like a guard that 'size' survives conversion to int, since 'isz' is what
 * gets passed to F_SETPIPE_SZ below; the assignment of 'isz' itself is not visible - confirm. */
82 if (isz
< 0 || (size_t) isz
!= size
)
85 /* Try to bump the pipe size */
86 (void) fcntl(pipefds
[1], F_SETPIPE_SZ
, isz
);
88 /* See if that worked */
89 isz
= fcntl(pipefds
[1], F_GETPIPE_SZ
, 0);
93 if ((size_t) isz
< size
)
97 n
= write(pipefds
[1], data
, size
);
/* A short write is detected by comparing the byte count with 'size'. */
100 if ((size_t) n
!= size
)
/* The read end is switched back to blocking mode before it is handed to the caller. */
103 (void) fd_nonblock(pipefds
[0], false);
105 return TAKE_FD(pipefds
[0]);
/* Next strategy: an unnamed O_TMPFILE file in /dev/shm, created with mode 0500. */
109 if ((flags
& ACQUIRE_NO_TMPFILE
) == 0) {
110 fd
= open("/dev/shm", O_RDWR
|O_TMPFILE
|O_CLOEXEC
, 0500);
/* NOTE(review): the condition guarding this jump is not visible; presumably taken when the
 * open() above failed - confirm. */
112 goto try_dev_shm_without_o_tmpfile
;
114 n
= write(fd
, data
, size
);
117 if ((size_t) n
!= size
)
120 /* Let's reopen the thing, in order to get an O_RDONLY fd for the original O_RDWR one */
121 return fd_reopen(fd
, O_RDONLY
|O_CLOEXEC
);
124 try_dev_shm_without_o_tmpfile
:
/* Last strategy visible here: a named temporary file in /dev/shm, removed again by unlink() below. */
125 if ((flags
& ACQUIRE_NO_REGULAR
) == 0) {
126 char pattern
[] = "/dev/shm/data-fd-XXXXXX";
128 fd
= mkostemp_safe(pattern
);
132 n
= write(fd
, data
, size
);
135 goto unlink_and_return
;
137 if ((size_t) n
!= size
) {
139 goto unlink_and_return
;
142 /* Let's reopen the thing, in order to get an O_RDONLY fd for the original O_RDWR one */
143 r
= fd_reopen(fd
, O_RDONLY
|O_CLOEXEC
);
/* The name is removed from /dev/shm; the result of unlink() is deliberately ignored. */
146 (void) unlink(pattern
);
/* copy_data_fd(): create a "data" fd that carries the same bytes as the source 'fd'.
 *
 * Parameter:
 *   fd - source descriptor. It is fstat()ed; S_ISDIR and S_ISLNK are tested separately, followed by a
 *        test for anything that is not a regular file, socket, FIFO or character device.
 *
 * Backing store, in the order the body tries them:
 *   - a memfd ("data-fd") that is sealed with memfd_set_sealed(), or a pipe whose size is raised to
 *     DATA_FD_MEMORY_LIMIT. This stage is only entered when the source is not a regular file or
 *     reports st_size below DATA_FD_MEMORY_LIMIT;
 *   - a temporary file from open_tmpfile_unlinkable(NULL, ...), i.e. in /tmp, when the source is not
 *     a regular file or reports st_size below DATA_FD_TMP_LIMIT and the bytes already buffered leave room;
 *   - a temporary file from open_tmpfile_unlinkable() in the directory reported by var_tmp_dir().
 * When a stage overflows, the next stage first copies what 'copy_fd' already holds, then the
 * buffered 'remains', then the rest of the source.
 *
 * Returns: the memfd, the pipe read end, or the result of fd_reopen(tmp_fd, O_RDONLY|O_CLOEXEC).
 * NOTE(review): most error returns are not visible in this excerpt; the one that is visible
 * returns 'r' unchanged - confirm the convention against the full file.
 *
 * NOTE(review): this excerpt lacks several statements (declarations of st/td/r/f/isz, error
 * checks, the 'finish' label, closing braces). The inline notes describe only what is visible. */
153 int copy_data_fd(int fd
) {
154 _cleanup_close_
int copy_fd
= -EBADF
, tmp_fd
= -EBADF
;
155 _cleanup_free_
void *remains
= NULL
;
156 size_t remains_size
= 0;
161 /* Creates a 'data' fd from the specified source fd, containing all the same data in a read-only fashion, but
162 * independent of it (i.e. the source fd can be closed and unmounted after this call succeeded). Tries to be
163 * somewhat smart about where to place the data. In the best case uses a memfd. If memfds are not supported
164 * uses a pipe instead. For larger data will use an unlinked file in /tmp, and for even larger data one in
167 if (fstat(fd
, &st
) < 0)
170 /* For now, let's only accept regular files, sockets, pipes and char devices */
171 if (S_ISDIR(st
.st_mode
))
173 if (S_ISLNK(st
.st_mode
))
175 if (!S_ISREG(st
.st_mode
) && !S_ISSOCK(st
.st_mode
) && !S_ISFIFO(st
.st_mode
) && !S_ISCHR(st
.st_mode
))
178 /* If we have reason to believe the data is bounded in size, then let's use memfds or pipes as backing fd. Note
179 * that we use the reported regular file size only as a hint, given that there are plenty of special files in
180 * /proc and /sys which report a zero file size but can be read from. */
182 if (!S_ISREG(st
.st_mode
) || st
.st_size
< DATA_FD_MEMORY_LIMIT
) {
184 /* Try a memfd first */
185 copy_fd
= memfd_new("data-fd");
/* NOTE(review): presumably copies at most DATA_FD_MEMORY_LIMIT bytes and reports through 'r'
 * whether the source was exhausted - confirm against copy_bytes(). */
189 r
= copy_bytes(fd
, copy_fd
, DATA_FD_MEMORY_LIMIT
, 0);
/* Rewind the memfd to offset 0. */
193 f
= lseek(copy_fd
, 0, SEEK_SET
);
198 /* Did it fit into the limit? If so, we are done. */
199 r
= memfd_set_sealed(copy_fd
);
203 return TAKE_FD(copy_fd
);
206 /* Hmm, pity, this didn't fit. Let's fall back to /tmp then, see below */
209 _cleanup_close_pair_
int pipefds
[2] = EBADF_PAIR
;
212 /* If memfds aren't available, use a pipe. Set O_NONBLOCK so that we will get EAGAIN rather
213 * than block indefinitely when we hit the pipe size limit */
215 if (pipe2(pipefds
, O_CLOEXEC
|O_NONBLOCK
) < 0)
218 isz
= fcntl(pipefds
[1], F_GETPIPE_SZ
, 0);
222 /* Try to enlarge the pipe size if necessary */
223 if ((size_t) isz
< DATA_FD_MEMORY_LIMIT
) {
225 (void) fcntl(pipefds
[1], F_SETPIPE_SZ
, DATA_FD_MEMORY_LIMIT
);
227 isz
= fcntl(pipefds
[1], F_GETPIPE_SZ
, 0);
/* The pipe is only used when its buffer can hold DATA_FD_MEMORY_LIMIT bytes. */
232 if ((size_t) isz
>= DATA_FD_MEMORY_LIMIT
) {
234 r
= copy_bytes_full(fd
, pipefds
[1], DATA_FD_MEMORY_LIMIT
, 0, &remains
, &remains_size
, NULL
, NULL
);
235 if (r
< 0 && r
!= -EAGAIN
)
236 return r
; /* If we get EAGAIN it could be because of the source or because of
237 * the destination fd, we can't know, as sendfile() and friends won't
238 * tell us. Hence, treat this as reason to fall back, just to be
241 /* Everything fit in, yay! */
242 (void) fd_nonblock(pipefds
[0], false);
244 return TAKE_FD(pipefds
[0]);
247 /* Things didn't fit in. But we read data into the pipe, let's remember that, so that
248 * when writing the new file we incorporate this first. */
249 copy_fd
= TAKE_FD(pipefds
[0]);
254 /* If we have reason to believe this will fit fine in /tmp, then use that as first fallback. */
255 if ((!S_ISREG(st
.st_mode
) || st
.st_size
< DATA_FD_TMP_LIMIT
) &&
256 (DATA_FD_MEMORY_LIMIT
+ remains_size
) < DATA_FD_TMP_LIMIT
) {
259 tmp_fd
= open_tmpfile_unlinkable(NULL
/* NULL as directory means /tmp */, O_RDWR
|O_CLOEXEC
);
264 /* If we tried a memfd/pipe first and it ended up being too large, then copy this into the
265 * temporary file first. */
267 r
= copy_bytes(copy_fd
, tmp_fd
, UINT64_MAX
, 0);
274 if (remains_size
> 0) {
275 /* If there were remaining bytes (i.e. read into memory, but not written out yet) from the
276 * failed copy operation, let's flush them out next. */
278 r
= loop_write(tmp_fd
, remains
, remains_size
);
/* The budget for the rest of the source is DATA_FD_TMP_LIMIT minus what the earlier stage and
 * 'remains' may already account for. */
283 r
= copy_bytes(fd
, tmp_fd
, DATA_FD_TMP_LIMIT
- DATA_FD_MEMORY_LIMIT
- remains_size
, COPY_REFLINK
);
287 goto finish
; /* Yay, it fit in */
289 /* It didn't fit in. Let's not forget to use what we already used */
290 f
= lseek(tmp_fd
, 0, SEEK_SET
);
/* NOTE(review): presumably closes the old copy_fd and moves tmp_fd into its place, so that the
 * /var/tmp stage below reads from the /tmp file - confirm against close_and_replace(). */
294 close_and_replace(copy_fd
, tmp_fd
);
/* The buffered remainder was written to the /tmp file above, so the buffer is released here. */
296 remains
= mfree(remains
);
300 /* As last fallback use /var/tmp */
301 r
= var_tmp_dir(&td
);
305 tmp_fd
= open_tmpfile_unlinkable(td
, O_RDWR
|O_CLOEXEC
);
310 /* If we tried a memfd/pipe first, or a file in /tmp, and it ended up being too large, then copy this
311 * into the temporary file first. */
312 r
= copy_bytes(copy_fd
, tmp_fd
, UINT64_MAX
, COPY_REFLINK
);
319 if (remains_size
> 0) {
320 /* Then, copy in any read but not yet written bytes. */
321 r
= loop_write(tmp_fd
, remains
, remains_size
);
326 /* Copy in the rest */
327 r
= copy_bytes(fd
, tmp_fd
, UINT64_MAX
, COPY_REFLINK
);
334 /* Now convert the O_RDWR file descriptor into an O_RDONLY one (and as side effect seek to the beginning of the
337 return fd_reopen(tmp_fd
, O_RDONLY
|O_CLOEXEC
);
340 int memfd_clone_fd(int fd
, const char *name
, int mode
) {
341 _cleanup_close_
int mfd
= -EBADF
;
346 /* Creates a clone of a regular file in a memfd. Unlike copy_data_fd() this returns strictly a memfd
347 * (and if it can't it will fail). Thus the resulting fd is seekable, and definitely reports as
352 assert(IN_SET(mode
& O_ACCMODE
, O_RDONLY
, O_RDWR
));
353 assert((mode
& ~(O_RDONLY
|O_RDWR
|O_CLOEXEC
)) == 0);
355 if (fstat(fd
, &st
) < 0)
358 ro
= (mode
& O_ACCMODE
) == O_RDONLY
;
359 exec
= st
.st_mode
& 0111;
361 mfd
= memfd_create_wrapper(name
,
362 ((FLAGS_SET(mode
, O_CLOEXEC
) || ro
) ? MFD_CLOEXEC
: 0) |
363 (ro
? MFD_ALLOW_SEALING
: 0) |
364 (exec
? MFD_EXEC
: MFD_NOEXEC_SEAL
));
368 r
= copy_bytes(fd
, mfd
, UINT64_MAX
, COPY_REFLINK
);
373 _cleanup_close_
int rfd
= -EBADF
;
375 r
= memfd_set_sealed(mfd
);
379 rfd
= fd_reopen(mfd
, mode
);
386 off_t f
= lseek(mfd
, 0, SEEK_SET
);