]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/shared/data-fd-util.c
tree-wide: introduce PIPE_EBADF macro
[thirdparty/systemd.git] / src / shared / data-fd-util.c
1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
2
3 #include <fcntl.h>
4 #include <sys/stat.h>
5 #include <sys/types.h>
6 #include <unistd.h>
7 #if HAVE_LINUX_MEMFD_H
8 #include <linux/memfd.h>
9 #endif
10
11 #include "alloc-util.h"
12 #include "copy.h"
13 #include "data-fd-util.h"
14 #include "fd-util.h"
15 #include "fs-util.h"
16 #include "io-util.h"
17 #include "memfd-util.h"
18 #include "missing_mman.h"
19 #include "missing_syscall.h"
20 #include "tmpfile-util.h"
21
22 /* When the data is smaller or equal to 64K, try to place the copy in a memfd/pipe */
23 #define DATA_FD_MEMORY_LIMIT (64U*1024U)
24
25 /* If memfd/pipe didn't work out, then let's use a file in /tmp up to a size of 1M. If it's larger than that use /var/tmp instead. */
26 #define DATA_FD_TMP_LIMIT (1024U*1024U)
27
28 int acquire_data_fd(const void *data, size_t size, unsigned flags) {
29 _cleanup_close_pair_ int pipefds[2] = PIPE_EBADF;
30 char pattern[] = "/dev/shm/data-fd-XXXXXX";
31 _cleanup_close_ int fd = -EBADF;
32 int isz = 0, r;
33 ssize_t n;
34 off_t f;
35
36 assert(data || size == 0);
37
38 /* Acquire a read-only file descriptor that when read from returns the specified data. This is much more
39 * complex than I wish it was. But here's why:
40 *
41 * a) First we try to use memfds. They are the best option, as we can seal them nicely to make them
42 * read-only. Unfortunately they require kernel 3.17, and – at the time of writing – we still support 3.14.
43 *
44 * b) Then, we try classic pipes. They are the second best options, as we can close the writing side, retaining
45 * a nicely read-only fd in the reading side. However, they are by default quite small, and unprivileged
46 * clients can only bump their size to a system-wide limit, which might be quite low.
47 *
48 * c) Then, we try an O_TMPFILE file in /dev/shm (that dir is the only suitable one known to exist from
49 * earliest boot on). To make it read-only we open the fd a second time with O_RDONLY via
50 * /proc/self/<fd>. Unfortunately O_TMPFILE is not available on older kernels on tmpfs.
51 *
52 * d) Finally, we try creating a regular file in /dev/shm, which we then delete.
53 *
54 * It sucks a bit that depending on the situation we return very different objects here, but that's Linux I
55 * figure. */
56
57 if (size == 0 && ((flags & ACQUIRE_NO_DEV_NULL) == 0))
58 /* As a special case, return /dev/null if we have been called for an empty data block */
59 return RET_NERRNO(open("/dev/null", O_RDONLY|O_CLOEXEC|O_NOCTTY));
60
61 if ((flags & ACQUIRE_NO_MEMFD) == 0) {
62 fd = memfd_new("data-fd");
63 if (fd < 0)
64 goto try_pipe;
65
66 n = write(fd, data, size);
67 if (n < 0)
68 return -errno;
69 if ((size_t) n != size)
70 return -EIO;
71
72 f = lseek(fd, 0, SEEK_SET);
73 if (f != 0)
74 return -errno;
75
76 r = memfd_set_sealed(fd);
77 if (r < 0)
78 return r;
79
80 return TAKE_FD(fd);
81 }
82
83 try_pipe:
84 if ((flags & ACQUIRE_NO_PIPE) == 0) {
85 if (pipe2(pipefds, O_CLOEXEC|O_NONBLOCK) < 0)
86 return -errno;
87
88 isz = fcntl(pipefds[1], F_GETPIPE_SZ, 0);
89 if (isz < 0)
90 return -errno;
91
92 if ((size_t) isz < size) {
93 isz = (int) size;
94 if (isz < 0 || (size_t) isz != size)
95 return -E2BIG;
96
97 /* Try to bump the pipe size */
98 (void) fcntl(pipefds[1], F_SETPIPE_SZ, isz);
99
100 /* See if that worked */
101 isz = fcntl(pipefds[1], F_GETPIPE_SZ, 0);
102 if (isz < 0)
103 return -errno;
104
105 if ((size_t) isz < size)
106 goto try_dev_shm;
107 }
108
109 n = write(pipefds[1], data, size);
110 if (n < 0)
111 return -errno;
112 if ((size_t) n != size)
113 return -EIO;
114
115 (void) fd_nonblock(pipefds[0], false);
116
117 return TAKE_FD(pipefds[0]);
118 }
119
120 try_dev_shm:
121 if ((flags & ACQUIRE_NO_TMPFILE) == 0) {
122 fd = open("/dev/shm", O_RDWR|O_TMPFILE|O_CLOEXEC, 0500);
123 if (fd < 0)
124 goto try_dev_shm_without_o_tmpfile;
125
126 n = write(fd, data, size);
127 if (n < 0)
128 return -errno;
129 if ((size_t) n != size)
130 return -EIO;
131
132 /* Let's reopen the thing, in order to get an O_RDONLY fd for the original O_RDWR one */
133 return fd_reopen(fd, O_RDONLY|O_CLOEXEC);
134 }
135
136 try_dev_shm_without_o_tmpfile:
137 if ((flags & ACQUIRE_NO_REGULAR) == 0) {
138 fd = mkostemp_safe(pattern);
139 if (fd < 0)
140 return fd;
141
142 n = write(fd, data, size);
143 if (n < 0) {
144 r = -errno;
145 goto unlink_and_return;
146 }
147 if ((size_t) n != size) {
148 r = -EIO;
149 goto unlink_and_return;
150 }
151
152 /* Let's reopen the thing, in order to get an O_RDONLY fd for the original O_RDWR one */
153 r = open(pattern, O_RDONLY|O_CLOEXEC);
154 if (r < 0)
155 r = -errno;
156
157 unlink_and_return:
158 (void) unlink(pattern);
159 return r;
160 }
161
162 return -EOPNOTSUPP;
163 }
164
165 int copy_data_fd(int fd) {
166 _cleanup_close_ int copy_fd = -EBADF, tmp_fd = -EBADF;
167 _cleanup_free_ void *remains = NULL;
168 size_t remains_size = 0;
169 const char *td;
170 struct stat st;
171 int r;
172
173 /* Creates a 'data' fd from the specified source fd, containing all the same data in a read-only fashion, but
174 * independent of it (i.e. the source fd can be closed and unmounted after this call succeeded). Tries to be
175 * somewhat smart about where to place the data. In the best case uses a memfd(). If memfd() are not supported
176 * uses a pipe instead. For larger data will use an unlinked file in /tmp, and for even larger data one in
177 * /var/tmp. */
178
179 if (fstat(fd, &st) < 0)
180 return -errno;
181
182 /* For now, let's only accept regular files, sockets, pipes and char devices */
183 if (S_ISDIR(st.st_mode))
184 return -EISDIR;
185 if (S_ISLNK(st.st_mode))
186 return -ELOOP;
187 if (!S_ISREG(st.st_mode) && !S_ISSOCK(st.st_mode) && !S_ISFIFO(st.st_mode) && !S_ISCHR(st.st_mode))
188 return -EBADFD;
189
190 /* If we have reason to believe the data is bounded in size, then let's use memfds or pipes as backing fd. Note
191 * that we use the reported regular file size only as a hint, given that there are plenty special files in
192 * /proc and /sys which report a zero file size but can be read from. */
193
194 if (!S_ISREG(st.st_mode) || st.st_size < DATA_FD_MEMORY_LIMIT) {
195
196 /* Try a memfd first */
197 copy_fd = memfd_new("data-fd");
198 if (copy_fd >= 0) {
199 off_t f;
200
201 r = copy_bytes(fd, copy_fd, DATA_FD_MEMORY_LIMIT, 0);
202 if (r < 0)
203 return r;
204
205 f = lseek(copy_fd, 0, SEEK_SET);
206 if (f != 0)
207 return -errno;
208
209 if (r == 0) {
210 /* Did it fit into the limit? If so, we are done. */
211 r = memfd_set_sealed(copy_fd);
212 if (r < 0)
213 return r;
214
215 return TAKE_FD(copy_fd);
216 }
217
218 /* Hmm, pity, this didn't fit. Let's fall back to /tmp then, see below */
219
220 } else {
221 _cleanup_(close_pairp) int pipefds[2] = PIPE_EBADF;
222 int isz;
223
224 /* If memfds aren't available, use a pipe. Set O_NONBLOCK so that we will get EAGAIN rather
225 * then block indefinitely when we hit the pipe size limit */
226
227 if (pipe2(pipefds, O_CLOEXEC|O_NONBLOCK) < 0)
228 return -errno;
229
230 isz = fcntl(pipefds[1], F_GETPIPE_SZ, 0);
231 if (isz < 0)
232 return -errno;
233
234 /* Try to enlarge the pipe size if necessary */
235 if ((size_t) isz < DATA_FD_MEMORY_LIMIT) {
236
237 (void) fcntl(pipefds[1], F_SETPIPE_SZ, DATA_FD_MEMORY_LIMIT);
238
239 isz = fcntl(pipefds[1], F_GETPIPE_SZ, 0);
240 if (isz < 0)
241 return -errno;
242 }
243
244 if ((size_t) isz >= DATA_FD_MEMORY_LIMIT) {
245
246 r = copy_bytes_full(fd, pipefds[1], DATA_FD_MEMORY_LIMIT, 0, &remains, &remains_size, NULL, NULL);
247 if (r < 0 && r != -EAGAIN)
248 return r; /* If we get EAGAIN it could be because of the source or because of
249 * the destination fd, we can't know, as sendfile() and friends won't
250 * tell us. Hence, treat this as reason to fall back, just to be
251 * sure. */
252 if (r == 0) {
253 /* Everything fit in, yay! */
254 (void) fd_nonblock(pipefds[0], false);
255
256 return TAKE_FD(pipefds[0]);
257 }
258
259 /* Things didn't fit in. But we read data into the pipe, let's remember that, so that
260 * when writing the new file we incorporate this first. */
261 copy_fd = TAKE_FD(pipefds[0]);
262 }
263 }
264 }
265
266 /* If we have reason to believe this will fit fine in /tmp, then use that as first fallback. */
267 if ((!S_ISREG(st.st_mode) || st.st_size < DATA_FD_TMP_LIMIT) &&
268 (DATA_FD_MEMORY_LIMIT + remains_size) < DATA_FD_TMP_LIMIT) {
269 off_t f;
270
271 tmp_fd = open_tmpfile_unlinkable(NULL /* NULL as directory means /tmp */, O_RDWR|O_CLOEXEC);
272 if (tmp_fd < 0)
273 return tmp_fd;
274
275 if (copy_fd >= 0) {
276 /* If we tried a memfd/pipe first and it ended up being too large, then copy this into the
277 * temporary file first. */
278
279 r = copy_bytes(copy_fd, tmp_fd, UINT64_MAX, 0);
280 if (r < 0)
281 return r;
282
283 assert(r == 0);
284 }
285
286 if (remains_size > 0) {
287 /* If there were remaining bytes (i.e. read into memory, but not written out yet) from the
288 * failed copy operation, let's flush them out next. */
289
290 r = loop_write(tmp_fd, remains, remains_size, false);
291 if (r < 0)
292 return r;
293 }
294
295 r = copy_bytes(fd, tmp_fd, DATA_FD_TMP_LIMIT - DATA_FD_MEMORY_LIMIT - remains_size, COPY_REFLINK);
296 if (r < 0)
297 return r;
298 if (r == 0)
299 goto finish; /* Yay, it fit in */
300
301 /* It didn't fit in. Let's not forget to use what we already used */
302 f = lseek(tmp_fd, 0, SEEK_SET);
303 if (f != 0)
304 return -errno;
305
306 close_and_replace(copy_fd, tmp_fd);
307
308 remains = mfree(remains);
309 remains_size = 0;
310 }
311
312 /* As last fallback use /var/tmp */
313 r = var_tmp_dir(&td);
314 if (r < 0)
315 return r;
316
317 tmp_fd = open_tmpfile_unlinkable(td, O_RDWR|O_CLOEXEC);
318 if (tmp_fd < 0)
319 return tmp_fd;
320
321 if (copy_fd >= 0) {
322 /* If we tried a memfd/pipe first, or a file in /tmp, and it ended up being too large, than copy this
323 * into the temporary file first. */
324 r = copy_bytes(copy_fd, tmp_fd, UINT64_MAX, COPY_REFLINK);
325 if (r < 0)
326 return r;
327
328 assert(r == 0);
329 }
330
331 if (remains_size > 0) {
332 /* Then, copy in any read but not yet written bytes. */
333 r = loop_write(tmp_fd, remains, remains_size, false);
334 if (r < 0)
335 return r;
336 }
337
338 /* Copy in the rest */
339 r = copy_bytes(fd, tmp_fd, UINT64_MAX, COPY_REFLINK);
340 if (r < 0)
341 return r;
342
343 assert(r == 0);
344
345 finish:
346 /* Now convert the O_RDWR file descriptor into an O_RDONLY one (and as side effect seek to the beginning of the
347 * file again */
348
349 return fd_reopen(tmp_fd, O_RDONLY|O_CLOEXEC);
350 }
351
352 int memfd_clone_fd(int fd, const char *name, int mode) {
353 _cleanup_close_ int mfd = -EBADF;
354 bool ro;
355 int r;
356
357 /* Creates a clone of a regular file in a memfd. Unlike copy_data_fd() this returns strictly a memfd
358 * (and if it can't it will fail). Thus the resulting fd is seekable, and definitely reports as
359 * S_ISREG. */
360
361 assert(fd >= 0);
362 assert(name);
363 assert(IN_SET(mode & O_ACCMODE, O_RDONLY, O_RDWR));
364 assert((mode & ~(O_RDONLY|O_RDWR|O_CLOEXEC)) == 0);
365
366 ro = (mode & O_ACCMODE) == O_RDONLY;
367
368 mfd = memfd_create(name,
369 ((FLAGS_SET(mode, O_CLOEXEC) || ro) ? MFD_CLOEXEC : 0) |
370 (ro ? MFD_ALLOW_SEALING : 0));
371 if (mfd < 0)
372 return -errno;
373
374 r = copy_bytes(fd, mfd, UINT64_MAX, COPY_REFLINK);
375 if (r < 0)
376 return r;
377
378 if (ro) {
379 _cleanup_close_ int rfd = -EBADF;
380
381 r = memfd_set_sealed(mfd);
382 if (r < 0)
383 return r;
384
385 rfd = fd_reopen(mfd, mode);
386 if (rfd < 0)
387 return rfd;
388
389 return TAKE_FD(rfd);
390 }
391
392 off_t f = lseek(mfd, 0, SEEK_SET);
393 if (f < 0)
394 return -errno;
395
396 return TAKE_FD(mfd);
397 }