]>
Commit | Line | Data |
---|---|---|
6a818c3c ZJS |
1 | /* SPDX-License-Identifier: LGPL-2.1-or-later */ |
2 | ||
3 | #include <fcntl.h> | |
4 | #include <sys/stat.h> | |
5 | #include <sys/types.h> | |
6 | #include <unistd.h> | |
6bea3d8e LP |
7 | #if HAVE_LINUX_MEMFD_H |
8 | #include <linux/memfd.h> | |
9 | #endif | |
6a818c3c ZJS |
10 | |
11 | #include "alloc-util.h" | |
12 | #include "copy.h" | |
13 | #include "data-fd-util.h" | |
14 | #include "fd-util.h" | |
15 | #include "fs-util.h" | |
16 | #include "io-util.h" | |
17 | #include "memfd-util.h" | |
6bea3d8e LP |
18 | #include "missing_mman.h" |
19 | #include "missing_syscall.h" | |
6a818c3c ZJS |
20 | #include "tmpfile-util.h" |
21 | ||
22 | /* When the data is smaller or equal to 64K, try to place the copy in a memfd/pipe */ | |
23 | #define DATA_FD_MEMORY_LIMIT (64U*1024U) | |
24 | ||
25 | /* If memfd/pipe didn't work out, then let's use a file in /tmp up to a size of 1M. If it's larger than that use /var/tmp instead. */ | |
26 | #define DATA_FD_TMP_LIMIT (1024U*1024U) | |
27 | ||
28 | int acquire_data_fd(const void *data, size_t size, unsigned flags) { | |
19ee48a6 | 29 | _cleanup_close_pair_ int pipefds[2] = PIPE_EBADF; |
6a818c3c | 30 | char pattern[] = "/dev/shm/data-fd-XXXXXX"; |
254d1313 | 31 | _cleanup_close_ int fd = -EBADF; |
6a818c3c ZJS |
32 | int isz = 0, r; |
33 | ssize_t n; | |
34 | off_t f; | |
35 | ||
36 | assert(data || size == 0); | |
37 | ||
38 | /* Acquire a read-only file descriptor that when read from returns the specified data. This is much more | |
39 | * complex than I wish it was. But here's why: | |
40 | * | |
41 | * a) First we try to use memfds. They are the best option, as we can seal them nicely to make them | |
42 | * read-only. Unfortunately they require kernel 3.17, and – at the time of writing – we still support 3.14. | |
43 | * | |
44 | * b) Then, we try classic pipes. They are the second best options, as we can close the writing side, retaining | |
45 | * a nicely read-only fd in the reading side. However, they are by default quite small, and unprivileged | |
46 | * clients can only bump their size to a system-wide limit, which might be quite low. | |
47 | * | |
48 | * c) Then, we try an O_TMPFILE file in /dev/shm (that dir is the only suitable one known to exist from | |
49 | * earliest boot on). To make it read-only we open the fd a second time with O_RDONLY via | |
50 | * /proc/self/<fd>. Unfortunately O_TMPFILE is not available on older kernels on tmpfs. | |
51 | * | |
52 | * d) Finally, we try creating a regular file in /dev/shm, which we then delete. | |
53 | * | |
54 | * It sucks a bit that depending on the situation we return very different objects here, but that's Linux I | |
55 | * figure. */ | |
56 | ||
7c248223 | 57 | if (size == 0 && ((flags & ACQUIRE_NO_DEV_NULL) == 0)) |
6a818c3c | 58 | /* As a special case, return /dev/null if we have been called for an empty data block */ |
7c248223 | 59 | return RET_NERRNO(open("/dev/null", O_RDONLY|O_CLOEXEC|O_NOCTTY)); |
6a818c3c ZJS |
60 | |
61 | if ((flags & ACQUIRE_NO_MEMFD) == 0) { | |
62 | fd = memfd_new("data-fd"); | |
63 | if (fd < 0) | |
64 | goto try_pipe; | |
65 | ||
66 | n = write(fd, data, size); | |
67 | if (n < 0) | |
68 | return -errno; | |
69 | if ((size_t) n != size) | |
70 | return -EIO; | |
71 | ||
72 | f = lseek(fd, 0, SEEK_SET); | |
73 | if (f != 0) | |
74 | return -errno; | |
75 | ||
76 | r = memfd_set_sealed(fd); | |
77 | if (r < 0) | |
78 | return r; | |
79 | ||
80 | return TAKE_FD(fd); | |
81 | } | |
82 | ||
83 | try_pipe: | |
84 | if ((flags & ACQUIRE_NO_PIPE) == 0) { | |
85 | if (pipe2(pipefds, O_CLOEXEC|O_NONBLOCK) < 0) | |
86 | return -errno; | |
87 | ||
88 | isz = fcntl(pipefds[1], F_GETPIPE_SZ, 0); | |
89 | if (isz < 0) | |
90 | return -errno; | |
91 | ||
92 | if ((size_t) isz < size) { | |
93 | isz = (int) size; | |
94 | if (isz < 0 || (size_t) isz != size) | |
95 | return -E2BIG; | |
96 | ||
97 | /* Try to bump the pipe size */ | |
98 | (void) fcntl(pipefds[1], F_SETPIPE_SZ, isz); | |
99 | ||
100 | /* See if that worked */ | |
101 | isz = fcntl(pipefds[1], F_GETPIPE_SZ, 0); | |
102 | if (isz < 0) | |
103 | return -errno; | |
104 | ||
105 | if ((size_t) isz < size) | |
106 | goto try_dev_shm; | |
107 | } | |
108 | ||
109 | n = write(pipefds[1], data, size); | |
110 | if (n < 0) | |
111 | return -errno; | |
112 | if ((size_t) n != size) | |
113 | return -EIO; | |
114 | ||
115 | (void) fd_nonblock(pipefds[0], false); | |
116 | ||
117 | return TAKE_FD(pipefds[0]); | |
118 | } | |
119 | ||
120 | try_dev_shm: | |
121 | if ((flags & ACQUIRE_NO_TMPFILE) == 0) { | |
122 | fd = open("/dev/shm", O_RDWR|O_TMPFILE|O_CLOEXEC, 0500); | |
123 | if (fd < 0) | |
124 | goto try_dev_shm_without_o_tmpfile; | |
125 | ||
126 | n = write(fd, data, size); | |
127 | if (n < 0) | |
128 | return -errno; | |
129 | if ((size_t) n != size) | |
130 | return -EIO; | |
131 | ||
132 | /* Let's reopen the thing, in order to get an O_RDONLY fd for the original O_RDWR one */ | |
133 | return fd_reopen(fd, O_RDONLY|O_CLOEXEC); | |
134 | } | |
135 | ||
136 | try_dev_shm_without_o_tmpfile: | |
137 | if ((flags & ACQUIRE_NO_REGULAR) == 0) { | |
138 | fd = mkostemp_safe(pattern); | |
139 | if (fd < 0) | |
140 | return fd; | |
141 | ||
142 | n = write(fd, data, size); | |
143 | if (n < 0) { | |
144 | r = -errno; | |
145 | goto unlink_and_return; | |
146 | } | |
147 | if ((size_t) n != size) { | |
148 | r = -EIO; | |
149 | goto unlink_and_return; | |
150 | } | |
151 | ||
152 | /* Let's reopen the thing, in order to get an O_RDONLY fd for the original O_RDWR one */ | |
153 | r = open(pattern, O_RDONLY|O_CLOEXEC); | |
154 | if (r < 0) | |
155 | r = -errno; | |
156 | ||
157 | unlink_and_return: | |
158 | (void) unlink(pattern); | |
159 | return r; | |
160 | } | |
161 | ||
162 | return -EOPNOTSUPP; | |
163 | } | |
164 | ||
165 | int copy_data_fd(int fd) { | |
254d1313 | 166 | _cleanup_close_ int copy_fd = -EBADF, tmp_fd = -EBADF; |
6a818c3c ZJS |
167 | _cleanup_free_ void *remains = NULL; |
168 | size_t remains_size = 0; | |
169 | const char *td; | |
170 | struct stat st; | |
171 | int r; | |
172 | ||
173 | /* Creates a 'data' fd from the specified source fd, containing all the same data in a read-only fashion, but | |
174 | * independent of it (i.e. the source fd can be closed and unmounted after this call succeeded). Tries to be | |
175 | * somewhat smart about where to place the data. In the best case uses a memfd(). If memfd() are not supported | |
176 | * uses a pipe instead. For larger data will use an unlinked file in /tmp, and for even larger data one in | |
177 | * /var/tmp. */ | |
178 | ||
179 | if (fstat(fd, &st) < 0) | |
180 | return -errno; | |
181 | ||
182 | /* For now, let's only accept regular files, sockets, pipes and char devices */ | |
183 | if (S_ISDIR(st.st_mode)) | |
184 | return -EISDIR; | |
185 | if (S_ISLNK(st.st_mode)) | |
186 | return -ELOOP; | |
187 | if (!S_ISREG(st.st_mode) && !S_ISSOCK(st.st_mode) && !S_ISFIFO(st.st_mode) && !S_ISCHR(st.st_mode)) | |
188 | return -EBADFD; | |
189 | ||
190 | /* If we have reason to believe the data is bounded in size, then let's use memfds or pipes as backing fd. Note | |
191 | * that we use the reported regular file size only as a hint, given that there are plenty special files in | |
192 | * /proc and /sys which report a zero file size but can be read from. */ | |
193 | ||
194 | if (!S_ISREG(st.st_mode) || st.st_size < DATA_FD_MEMORY_LIMIT) { | |
195 | ||
196 | /* Try a memfd first */ | |
197 | copy_fd = memfd_new("data-fd"); | |
198 | if (copy_fd >= 0) { | |
199 | off_t f; | |
200 | ||
201 | r = copy_bytes(fd, copy_fd, DATA_FD_MEMORY_LIMIT, 0); | |
202 | if (r < 0) | |
203 | return r; | |
204 | ||
205 | f = lseek(copy_fd, 0, SEEK_SET); | |
206 | if (f != 0) | |
207 | return -errno; | |
208 | ||
209 | if (r == 0) { | |
210 | /* Did it fit into the limit? If so, we are done. */ | |
211 | r = memfd_set_sealed(copy_fd); | |
212 | if (r < 0) | |
213 | return r; | |
214 | ||
215 | return TAKE_FD(copy_fd); | |
216 | } | |
217 | ||
218 | /* Hmm, pity, this didn't fit. Let's fall back to /tmp then, see below */ | |
219 | ||
220 | } else { | |
19ee48a6 | 221 | _cleanup_(close_pairp) int pipefds[2] = PIPE_EBADF; |
6a818c3c ZJS |
222 | int isz; |
223 | ||
224 | /* If memfds aren't available, use a pipe. Set O_NONBLOCK so that we will get EAGAIN rather | |
225 | * then block indefinitely when we hit the pipe size limit */ | |
226 | ||
227 | if (pipe2(pipefds, O_CLOEXEC|O_NONBLOCK) < 0) | |
228 | return -errno; | |
229 | ||
230 | isz = fcntl(pipefds[1], F_GETPIPE_SZ, 0); | |
231 | if (isz < 0) | |
232 | return -errno; | |
233 | ||
234 | /* Try to enlarge the pipe size if necessary */ | |
235 | if ((size_t) isz < DATA_FD_MEMORY_LIMIT) { | |
236 | ||
237 | (void) fcntl(pipefds[1], F_SETPIPE_SZ, DATA_FD_MEMORY_LIMIT); | |
238 | ||
239 | isz = fcntl(pipefds[1], F_GETPIPE_SZ, 0); | |
240 | if (isz < 0) | |
241 | return -errno; | |
242 | } | |
243 | ||
244 | if ((size_t) isz >= DATA_FD_MEMORY_LIMIT) { | |
245 | ||
246 | r = copy_bytes_full(fd, pipefds[1], DATA_FD_MEMORY_LIMIT, 0, &remains, &remains_size, NULL, NULL); | |
247 | if (r < 0 && r != -EAGAIN) | |
248 | return r; /* If we get EAGAIN it could be because of the source or because of | |
249 | * the destination fd, we can't know, as sendfile() and friends won't | |
250 | * tell us. Hence, treat this as reason to fall back, just to be | |
251 | * sure. */ | |
252 | if (r == 0) { | |
253 | /* Everything fit in, yay! */ | |
254 | (void) fd_nonblock(pipefds[0], false); | |
255 | ||
256 | return TAKE_FD(pipefds[0]); | |
257 | } | |
258 | ||
259 | /* Things didn't fit in. But we read data into the pipe, let's remember that, so that | |
260 | * when writing the new file we incorporate this first. */ | |
261 | copy_fd = TAKE_FD(pipefds[0]); | |
262 | } | |
263 | } | |
264 | } | |
265 | ||
266 | /* If we have reason to believe this will fit fine in /tmp, then use that as first fallback. */ | |
267 | if ((!S_ISREG(st.st_mode) || st.st_size < DATA_FD_TMP_LIMIT) && | |
268 | (DATA_FD_MEMORY_LIMIT + remains_size) < DATA_FD_TMP_LIMIT) { | |
269 | off_t f; | |
270 | ||
271 | tmp_fd = open_tmpfile_unlinkable(NULL /* NULL as directory means /tmp */, O_RDWR|O_CLOEXEC); | |
272 | if (tmp_fd < 0) | |
273 | return tmp_fd; | |
274 | ||
275 | if (copy_fd >= 0) { | |
276 | /* If we tried a memfd/pipe first and it ended up being too large, then copy this into the | |
277 | * temporary file first. */ | |
278 | ||
279 | r = copy_bytes(copy_fd, tmp_fd, UINT64_MAX, 0); | |
280 | if (r < 0) | |
281 | return r; | |
282 | ||
283 | assert(r == 0); | |
284 | } | |
285 | ||
286 | if (remains_size > 0) { | |
287 | /* If there were remaining bytes (i.e. read into memory, but not written out yet) from the | |
288 | * failed copy operation, let's flush them out next. */ | |
289 | ||
290 | r = loop_write(tmp_fd, remains, remains_size, false); | |
291 | if (r < 0) | |
292 | return r; | |
293 | } | |
294 | ||
295 | r = copy_bytes(fd, tmp_fd, DATA_FD_TMP_LIMIT - DATA_FD_MEMORY_LIMIT - remains_size, COPY_REFLINK); | |
296 | if (r < 0) | |
297 | return r; | |
298 | if (r == 0) | |
299 | goto finish; /* Yay, it fit in */ | |
300 | ||
301 | /* It didn't fit in. Let's not forget to use what we already used */ | |
302 | f = lseek(tmp_fd, 0, SEEK_SET); | |
303 | if (f != 0) | |
304 | return -errno; | |
305 | ||
ee3455cf | 306 | close_and_replace(copy_fd, tmp_fd); |
6a818c3c ZJS |
307 | |
308 | remains = mfree(remains); | |
309 | remains_size = 0; | |
310 | } | |
311 | ||
312 | /* As last fallback use /var/tmp */ | |
313 | r = var_tmp_dir(&td); | |
314 | if (r < 0) | |
315 | return r; | |
316 | ||
317 | tmp_fd = open_tmpfile_unlinkable(td, O_RDWR|O_CLOEXEC); | |
318 | if (tmp_fd < 0) | |
319 | return tmp_fd; | |
320 | ||
321 | if (copy_fd >= 0) { | |
322 | /* If we tried a memfd/pipe first, or a file in /tmp, and it ended up being too large, than copy this | |
323 | * into the temporary file first. */ | |
324 | r = copy_bytes(copy_fd, tmp_fd, UINT64_MAX, COPY_REFLINK); | |
325 | if (r < 0) | |
326 | return r; | |
327 | ||
328 | assert(r == 0); | |
329 | } | |
330 | ||
331 | if (remains_size > 0) { | |
332 | /* Then, copy in any read but not yet written bytes. */ | |
333 | r = loop_write(tmp_fd, remains, remains_size, false); | |
334 | if (r < 0) | |
335 | return r; | |
336 | } | |
337 | ||
338 | /* Copy in the rest */ | |
339 | r = copy_bytes(fd, tmp_fd, UINT64_MAX, COPY_REFLINK); | |
340 | if (r < 0) | |
341 | return r; | |
342 | ||
343 | assert(r == 0); | |
344 | ||
345 | finish: | |
346 | /* Now convert the O_RDWR file descriptor into an O_RDONLY one (and as side effect seek to the beginning of the | |
347 | * file again */ | |
348 | ||
349 | return fd_reopen(tmp_fd, O_RDONLY|O_CLOEXEC); | |
350 | } | |
6bea3d8e LP |
351 | |
352 | int memfd_clone_fd(int fd, const char *name, int mode) { | |
353 | _cleanup_close_ int mfd = -EBADF; | |
354 | bool ro; | |
355 | int r; | |
356 | ||
357 | /* Creates a clone of a regular file in a memfd. Unlike copy_data_fd() this returns strictly a memfd | |
358 | * (and if it can't it will fail). Thus the resulting fd is seekable, and definitely reports as | |
359 | * S_ISREG. */ | |
360 | ||
361 | assert(fd >= 0); | |
362 | assert(name); | |
363 | assert(IN_SET(mode & O_ACCMODE, O_RDONLY, O_RDWR)); | |
364 | assert((mode & ~(O_RDONLY|O_RDWR|O_CLOEXEC)) == 0); | |
365 | ||
366 | ro = (mode & O_ACCMODE) == O_RDONLY; | |
367 | ||
368 | mfd = memfd_create(name, | |
369 | ((FLAGS_SET(mode, O_CLOEXEC) || ro) ? MFD_CLOEXEC : 0) | | |
370 | (ro ? MFD_ALLOW_SEALING : 0)); | |
371 | if (mfd < 0) | |
372 | return -errno; | |
373 | ||
374 | r = copy_bytes(fd, mfd, UINT64_MAX, COPY_REFLINK); | |
375 | if (r < 0) | |
376 | return r; | |
377 | ||
378 | if (ro) { | |
254d1313 | 379 | _cleanup_close_ int rfd = -EBADF; |
6bea3d8e LP |
380 | |
381 | r = memfd_set_sealed(mfd); | |
382 | if (r < 0) | |
383 | return r; | |
384 | ||
385 | rfd = fd_reopen(mfd, mode); | |
386 | if (rfd < 0) | |
387 | return rfd; | |
388 | ||
389 | return TAKE_FD(rfd); | |
390 | } | |
391 | ||
392 | off_t f = lseek(mfd, 0, SEEK_SET); | |
393 | if (f < 0) | |
394 | return -errno; | |
395 | ||
396 | return TAKE_FD(mfd); | |
397 | } |