]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/shared/data-fd-util.c
Merge pull request #24670 from keszybz/early-boot-ordering
[thirdparty/systemd.git] / src / shared / data-fd-util.c
1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
2
3 #include <fcntl.h>
4 #include <sys/stat.h>
5 #include <sys/types.h>
6 #include <unistd.h>
7
8 #include "alloc-util.h"
9 #include "copy.h"
10 #include "data-fd-util.h"
11 #include "fd-util.h"
12 #include "fs-util.h"
13 #include "io-util.h"
14 #include "memfd-util.h"
15 #include "tmpfile-util.h"
16
17 /* When the data is smaller or equal to 64K, try to place the copy in a memfd/pipe */
18 #define DATA_FD_MEMORY_LIMIT (64U*1024U)
19
20 /* If memfd/pipe didn't work out, then let's use a file in /tmp up to a size of 1M. If it's larger than that, use /var/tmp instead. */
21 #define DATA_FD_TMP_LIMIT (1024U*1024U)
22
23 int acquire_data_fd(const void *data, size_t size, unsigned flags) {
24 _cleanup_close_pair_ int pipefds[2] = { -1, -1 };
25 char pattern[] = "/dev/shm/data-fd-XXXXXX";
26 _cleanup_close_ int fd = -1;
27 int isz = 0, r;
28 ssize_t n;
29 off_t f;
30
31 assert(data || size == 0);
32
33 /* Acquire a read-only file descriptor that when read from returns the specified data. This is much more
34 * complex than I wish it was. But here's why:
35 *
36 * a) First we try to use memfds. They are the best option, as we can seal them nicely to make them
37 * read-only. Unfortunately they require kernel 3.17, and – at the time of writing – we still support 3.14.
38 *
39 * b) Then, we try classic pipes. They are the second best options, as we can close the writing side, retaining
40 * a nicely read-only fd in the reading side. However, they are by default quite small, and unprivileged
41 * clients can only bump their size to a system-wide limit, which might be quite low.
42 *
43 * c) Then, we try an O_TMPFILE file in /dev/shm (that dir is the only suitable one known to exist from
44 * earliest boot on). To make it read-only we open the fd a second time with O_RDONLY via
45 * /proc/self/<fd>. Unfortunately O_TMPFILE is not available on older kernels on tmpfs.
46 *
47 * d) Finally, we try creating a regular file in /dev/shm, which we then delete.
48 *
49 * It sucks a bit that depending on the situation we return very different objects here, but that's Linux I
50 * figure. */
51
52 if (size == 0 && ((flags & ACQUIRE_NO_DEV_NULL) == 0))
53 /* As a special case, return /dev/null if we have been called for an empty data block */
54 return RET_NERRNO(open("/dev/null", O_RDONLY|O_CLOEXEC|O_NOCTTY));
55
56 if ((flags & ACQUIRE_NO_MEMFD) == 0) {
57 fd = memfd_new("data-fd");
58 if (fd < 0)
59 goto try_pipe;
60
61 n = write(fd, data, size);
62 if (n < 0)
63 return -errno;
64 if ((size_t) n != size)
65 return -EIO;
66
67 f = lseek(fd, 0, SEEK_SET);
68 if (f != 0)
69 return -errno;
70
71 r = memfd_set_sealed(fd);
72 if (r < 0)
73 return r;
74
75 return TAKE_FD(fd);
76 }
77
78 try_pipe:
79 if ((flags & ACQUIRE_NO_PIPE) == 0) {
80 if (pipe2(pipefds, O_CLOEXEC|O_NONBLOCK) < 0)
81 return -errno;
82
83 isz = fcntl(pipefds[1], F_GETPIPE_SZ, 0);
84 if (isz < 0)
85 return -errno;
86
87 if ((size_t) isz < size) {
88 isz = (int) size;
89 if (isz < 0 || (size_t) isz != size)
90 return -E2BIG;
91
92 /* Try to bump the pipe size */
93 (void) fcntl(pipefds[1], F_SETPIPE_SZ, isz);
94
95 /* See if that worked */
96 isz = fcntl(pipefds[1], F_GETPIPE_SZ, 0);
97 if (isz < 0)
98 return -errno;
99
100 if ((size_t) isz < size)
101 goto try_dev_shm;
102 }
103
104 n = write(pipefds[1], data, size);
105 if (n < 0)
106 return -errno;
107 if ((size_t) n != size)
108 return -EIO;
109
110 (void) fd_nonblock(pipefds[0], false);
111
112 return TAKE_FD(pipefds[0]);
113 }
114
115 try_dev_shm:
116 if ((flags & ACQUIRE_NO_TMPFILE) == 0) {
117 fd = open("/dev/shm", O_RDWR|O_TMPFILE|O_CLOEXEC, 0500);
118 if (fd < 0)
119 goto try_dev_shm_without_o_tmpfile;
120
121 n = write(fd, data, size);
122 if (n < 0)
123 return -errno;
124 if ((size_t) n != size)
125 return -EIO;
126
127 /* Let's reopen the thing, in order to get an O_RDONLY fd for the original O_RDWR one */
128 return fd_reopen(fd, O_RDONLY|O_CLOEXEC);
129 }
130
131 try_dev_shm_without_o_tmpfile:
132 if ((flags & ACQUIRE_NO_REGULAR) == 0) {
133 fd = mkostemp_safe(pattern);
134 if (fd < 0)
135 return fd;
136
137 n = write(fd, data, size);
138 if (n < 0) {
139 r = -errno;
140 goto unlink_and_return;
141 }
142 if ((size_t) n != size) {
143 r = -EIO;
144 goto unlink_and_return;
145 }
146
147 /* Let's reopen the thing, in order to get an O_RDONLY fd for the original O_RDWR one */
148 r = open(pattern, O_RDONLY|O_CLOEXEC);
149 if (r < 0)
150 r = -errno;
151
152 unlink_and_return:
153 (void) unlink(pattern);
154 return r;
155 }
156
157 return -EOPNOTSUPP;
158 }
159
160 int copy_data_fd(int fd) {
161 _cleanup_close_ int copy_fd = -1, tmp_fd = -1;
162 _cleanup_free_ void *remains = NULL;
163 size_t remains_size = 0;
164 const char *td;
165 struct stat st;
166 int r;
167
168 /* Creates a 'data' fd from the specified source fd, containing all the same data in a read-only fashion, but
169 * independent of it (i.e. the source fd can be closed and unmounted after this call succeeded). Tries to be
170 * somewhat smart about where to place the data. In the best case uses a memfd(). If memfd() are not supported
171 * uses a pipe instead. For larger data will use an unlinked file in /tmp, and for even larger data one in
172 * /var/tmp. */
173
174 if (fstat(fd, &st) < 0)
175 return -errno;
176
177 /* For now, let's only accept regular files, sockets, pipes and char devices */
178 if (S_ISDIR(st.st_mode))
179 return -EISDIR;
180 if (S_ISLNK(st.st_mode))
181 return -ELOOP;
182 if (!S_ISREG(st.st_mode) && !S_ISSOCK(st.st_mode) && !S_ISFIFO(st.st_mode) && !S_ISCHR(st.st_mode))
183 return -EBADFD;
184
185 /* If we have reason to believe the data is bounded in size, then let's use memfds or pipes as backing fd. Note
186 * that we use the reported regular file size only as a hint, given that there are plenty special files in
187 * /proc and /sys which report a zero file size but can be read from. */
188
189 if (!S_ISREG(st.st_mode) || st.st_size < DATA_FD_MEMORY_LIMIT) {
190
191 /* Try a memfd first */
192 copy_fd = memfd_new("data-fd");
193 if (copy_fd >= 0) {
194 off_t f;
195
196 r = copy_bytes(fd, copy_fd, DATA_FD_MEMORY_LIMIT, 0);
197 if (r < 0)
198 return r;
199
200 f = lseek(copy_fd, 0, SEEK_SET);
201 if (f != 0)
202 return -errno;
203
204 if (r == 0) {
205 /* Did it fit into the limit? If so, we are done. */
206 r = memfd_set_sealed(copy_fd);
207 if (r < 0)
208 return r;
209
210 return TAKE_FD(copy_fd);
211 }
212
213 /* Hmm, pity, this didn't fit. Let's fall back to /tmp then, see below */
214
215 } else {
216 _cleanup_(close_pairp) int pipefds[2] = { -1, -1 };
217 int isz;
218
219 /* If memfds aren't available, use a pipe. Set O_NONBLOCK so that we will get EAGAIN rather
220 * then block indefinitely when we hit the pipe size limit */
221
222 if (pipe2(pipefds, O_CLOEXEC|O_NONBLOCK) < 0)
223 return -errno;
224
225 isz = fcntl(pipefds[1], F_GETPIPE_SZ, 0);
226 if (isz < 0)
227 return -errno;
228
229 /* Try to enlarge the pipe size if necessary */
230 if ((size_t) isz < DATA_FD_MEMORY_LIMIT) {
231
232 (void) fcntl(pipefds[1], F_SETPIPE_SZ, DATA_FD_MEMORY_LIMIT);
233
234 isz = fcntl(pipefds[1], F_GETPIPE_SZ, 0);
235 if (isz < 0)
236 return -errno;
237 }
238
239 if ((size_t) isz >= DATA_FD_MEMORY_LIMIT) {
240
241 r = copy_bytes_full(fd, pipefds[1], DATA_FD_MEMORY_LIMIT, 0, &remains, &remains_size, NULL, NULL);
242 if (r < 0 && r != -EAGAIN)
243 return r; /* If we get EAGAIN it could be because of the source or because of
244 * the destination fd, we can't know, as sendfile() and friends won't
245 * tell us. Hence, treat this as reason to fall back, just to be
246 * sure. */
247 if (r == 0) {
248 /* Everything fit in, yay! */
249 (void) fd_nonblock(pipefds[0], false);
250
251 return TAKE_FD(pipefds[0]);
252 }
253
254 /* Things didn't fit in. But we read data into the pipe, let's remember that, so that
255 * when writing the new file we incorporate this first. */
256 copy_fd = TAKE_FD(pipefds[0]);
257 }
258 }
259 }
260
261 /* If we have reason to believe this will fit fine in /tmp, then use that as first fallback. */
262 if ((!S_ISREG(st.st_mode) || st.st_size < DATA_FD_TMP_LIMIT) &&
263 (DATA_FD_MEMORY_LIMIT + remains_size) < DATA_FD_TMP_LIMIT) {
264 off_t f;
265
266 tmp_fd = open_tmpfile_unlinkable(NULL /* NULL as directory means /tmp */, O_RDWR|O_CLOEXEC);
267 if (tmp_fd < 0)
268 return tmp_fd;
269
270 if (copy_fd >= 0) {
271 /* If we tried a memfd/pipe first and it ended up being too large, then copy this into the
272 * temporary file first. */
273
274 r = copy_bytes(copy_fd, tmp_fd, UINT64_MAX, 0);
275 if (r < 0)
276 return r;
277
278 assert(r == 0);
279 }
280
281 if (remains_size > 0) {
282 /* If there were remaining bytes (i.e. read into memory, but not written out yet) from the
283 * failed copy operation, let's flush them out next. */
284
285 r = loop_write(tmp_fd, remains, remains_size, false);
286 if (r < 0)
287 return r;
288 }
289
290 r = copy_bytes(fd, tmp_fd, DATA_FD_TMP_LIMIT - DATA_FD_MEMORY_LIMIT - remains_size, COPY_REFLINK);
291 if (r < 0)
292 return r;
293 if (r == 0)
294 goto finish; /* Yay, it fit in */
295
296 /* It didn't fit in. Let's not forget to use what we already used */
297 f = lseek(tmp_fd, 0, SEEK_SET);
298 if (f != 0)
299 return -errno;
300
301 close_and_replace(copy_fd, tmp_fd);
302
303 remains = mfree(remains);
304 remains_size = 0;
305 }
306
307 /* As last fallback use /var/tmp */
308 r = var_tmp_dir(&td);
309 if (r < 0)
310 return r;
311
312 tmp_fd = open_tmpfile_unlinkable(td, O_RDWR|O_CLOEXEC);
313 if (tmp_fd < 0)
314 return tmp_fd;
315
316 if (copy_fd >= 0) {
317 /* If we tried a memfd/pipe first, or a file in /tmp, and it ended up being too large, than copy this
318 * into the temporary file first. */
319 r = copy_bytes(copy_fd, tmp_fd, UINT64_MAX, COPY_REFLINK);
320 if (r < 0)
321 return r;
322
323 assert(r == 0);
324 }
325
326 if (remains_size > 0) {
327 /* Then, copy in any read but not yet written bytes. */
328 r = loop_write(tmp_fd, remains, remains_size, false);
329 if (r < 0)
330 return r;
331 }
332
333 /* Copy in the rest */
334 r = copy_bytes(fd, tmp_fd, UINT64_MAX, COPY_REFLINK);
335 if (r < 0)
336 return r;
337
338 assert(r == 0);
339
340 finish:
341 /* Now convert the O_RDWR file descriptor into an O_RDONLY one (and as side effect seek to the beginning of the
342 * file again */
343
344 return fd_reopen(tmp_fd, O_RDONLY|O_CLOEXEC);
345 }