]>
Commit | Line | Data |
---|---|---|
6a818c3c ZJS |
1 | /* SPDX-License-Identifier: LGPL-2.1-or-later */ |
2 | ||
3 | #include <fcntl.h> | |
4 | #include <sys/stat.h> | |
5 | #include <sys/types.h> | |
6 | #include <unistd.h> | |
6bea3d8e LP |
7 | #if HAVE_LINUX_MEMFD_H |
8 | #include <linux/memfd.h> | |
9 | #endif | |
6a818c3c ZJS |
10 | |
11 | #include "alloc-util.h" | |
12 | #include "copy.h" | |
13 | #include "data-fd-util.h" | |
14 | #include "fd-util.h" | |
15 | #include "fs-util.h" | |
16 | #include "io-util.h" | |
17 | #include "memfd-util.h" | |
6bea3d8e LP |
18 | #include "missing_mman.h" |
19 | #include "missing_syscall.h" | |
6a818c3c ZJS |
20 | #include "tmpfile-util.h" |
21 | ||
22 | /* When the data is smaller or equal to 64K, try to place the copy in a memfd/pipe */ | |
23 | #define DATA_FD_MEMORY_LIMIT (64U*1024U) | |
24 | ||
25 | /* If memfd/pipe didn't work out, then let's use a file in /tmp up to a size of 1M. If it's larger than that use /var/tmp instead. */ | |
26 | #define DATA_FD_TMP_LIMIT (1024U*1024U) | |
27 | ||
28 | int acquire_data_fd(const void *data, size_t size, unsigned flags) { | |
71136404 | 29 | _cleanup_close_pair_ int pipefds[2] = EBADF_PAIR; |
254d1313 | 30 | _cleanup_close_ int fd = -EBADF; |
6a818c3c ZJS |
31 | int isz = 0, r; |
32 | ssize_t n; | |
6a818c3c ZJS |
33 | |
34 | assert(data || size == 0); | |
35 | ||
36 | /* Acquire a read-only file descriptor that when read from returns the specified data. This is much more | |
37 | * complex than I wish it was. But here's why: | |
38 | * | |
39 | * a) First we try to use memfds. They are the best option, as we can seal them nicely to make them | |
40 | * read-only. Unfortunately they require kernel 3.17, and – at the time of writing – we still support 3.14. | |
41 | * | |
42 | * b) Then, we try classic pipes. They are the second best options, as we can close the writing side, retaining | |
43 | * a nicely read-only fd in the reading side. However, they are by default quite small, and unprivileged | |
44 | * clients can only bump their size to a system-wide limit, which might be quite low. | |
45 | * | |
46 | * c) Then, we try an O_TMPFILE file in /dev/shm (that dir is the only suitable one known to exist from | |
47 | * earliest boot on). To make it read-only we open the fd a second time with O_RDONLY via | |
48 | * /proc/self/<fd>. Unfortunately O_TMPFILE is not available on older kernels on tmpfs. | |
49 | * | |
50 | * d) Finally, we try creating a regular file in /dev/shm, which we then delete. | |
51 | * | |
52 | * It sucks a bit that depending on the situation we return very different objects here, but that's Linux I | |
53 | * figure. */ | |
54 | ||
7c248223 | 55 | if (size == 0 && ((flags & ACQUIRE_NO_DEV_NULL) == 0)) |
6a818c3c | 56 | /* As a special case, return /dev/null if we have been called for an empty data block */ |
7c248223 | 57 | return RET_NERRNO(open("/dev/null", O_RDONLY|O_CLOEXEC|O_NOCTTY)); |
6a818c3c ZJS |
58 | |
59 | if ((flags & ACQUIRE_NO_MEMFD) == 0) { | |
44777d7a DDM |
60 | fd = memfd_new_and_seal("data-fd", data, size); |
61 | if (fd < 0) { | |
62 | if (ERRNO_IS_NOT_SUPPORTED(fd)) | |
63 | goto try_pipe; | |
6a818c3c | 64 | |
44777d7a DDM |
65 | return fd; |
66 | } | |
6a818c3c ZJS |
67 | |
68 | return TAKE_FD(fd); | |
69 | } | |
70 | ||
71 | try_pipe: | |
72 | if ((flags & ACQUIRE_NO_PIPE) == 0) { | |
73 | if (pipe2(pipefds, O_CLOEXEC|O_NONBLOCK) < 0) | |
74 | return -errno; | |
75 | ||
76 | isz = fcntl(pipefds[1], F_GETPIPE_SZ, 0); | |
77 | if (isz < 0) | |
78 | return -errno; | |
79 | ||
80 | if ((size_t) isz < size) { | |
81 | isz = (int) size; | |
82 | if (isz < 0 || (size_t) isz != size) | |
83 | return -E2BIG; | |
84 | ||
85 | /* Try to bump the pipe size */ | |
86 | (void) fcntl(pipefds[1], F_SETPIPE_SZ, isz); | |
87 | ||
88 | /* See if that worked */ | |
89 | isz = fcntl(pipefds[1], F_GETPIPE_SZ, 0); | |
90 | if (isz < 0) | |
91 | return -errno; | |
92 | ||
93 | if ((size_t) isz < size) | |
94 | goto try_dev_shm; | |
95 | } | |
96 | ||
97 | n = write(pipefds[1], data, size); | |
98 | if (n < 0) | |
99 | return -errno; | |
100 | if ((size_t) n != size) | |
101 | return -EIO; | |
102 | ||
103 | (void) fd_nonblock(pipefds[0], false); | |
104 | ||
105 | return TAKE_FD(pipefds[0]); | |
106 | } | |
107 | ||
108 | try_dev_shm: | |
109 | if ((flags & ACQUIRE_NO_TMPFILE) == 0) { | |
110 | fd = open("/dev/shm", O_RDWR|O_TMPFILE|O_CLOEXEC, 0500); | |
111 | if (fd < 0) | |
112 | goto try_dev_shm_without_o_tmpfile; | |
113 | ||
114 | n = write(fd, data, size); | |
115 | if (n < 0) | |
116 | return -errno; | |
117 | if ((size_t) n != size) | |
118 | return -EIO; | |
119 | ||
120 | /* Let's reopen the thing, in order to get an O_RDONLY fd for the original O_RDWR one */ | |
121 | return fd_reopen(fd, O_RDONLY|O_CLOEXEC); | |
122 | } | |
123 | ||
124 | try_dev_shm_without_o_tmpfile: | |
125 | if ((flags & ACQUIRE_NO_REGULAR) == 0) { | |
274a3b00 LP |
126 | char pattern[] = "/dev/shm/data-fd-XXXXXX"; |
127 | ||
6a818c3c ZJS |
128 | fd = mkostemp_safe(pattern); |
129 | if (fd < 0) | |
130 | return fd; | |
131 | ||
132 | n = write(fd, data, size); | |
133 | if (n < 0) { | |
134 | r = -errno; | |
135 | goto unlink_and_return; | |
136 | } | |
137 | if ((size_t) n != size) { | |
138 | r = -EIO; | |
139 | goto unlink_and_return; | |
140 | } | |
141 | ||
142 | /* Let's reopen the thing, in order to get an O_RDONLY fd for the original O_RDWR one */ | |
274a3b00 | 143 | r = fd_reopen(fd, O_RDONLY|O_CLOEXEC); |
6a818c3c ZJS |
144 | |
145 | unlink_and_return: | |
146 | (void) unlink(pattern); | |
147 | return r; | |
148 | } | |
149 | ||
150 | return -EOPNOTSUPP; | |
151 | } | |
152 | ||
153 | int copy_data_fd(int fd) { | |
254d1313 | 154 | _cleanup_close_ int copy_fd = -EBADF, tmp_fd = -EBADF; |
6a818c3c ZJS |
155 | _cleanup_free_ void *remains = NULL; |
156 | size_t remains_size = 0; | |
157 | const char *td; | |
158 | struct stat st; | |
159 | int r; | |
160 | ||
161 | /* Creates a 'data' fd from the specified source fd, containing all the same data in a read-only fashion, but | |
162 | * independent of it (i.e. the source fd can be closed and unmounted after this call succeeded). Tries to be | |
163 | * somewhat smart about where to place the data. In the best case uses a memfd(). If memfd() are not supported | |
164 | * uses a pipe instead. For larger data will use an unlinked file in /tmp, and for even larger data one in | |
165 | * /var/tmp. */ | |
166 | ||
167 | if (fstat(fd, &st) < 0) | |
168 | return -errno; | |
169 | ||
170 | /* For now, let's only accept regular files, sockets, pipes and char devices */ | |
171 | if (S_ISDIR(st.st_mode)) | |
172 | return -EISDIR; | |
173 | if (S_ISLNK(st.st_mode)) | |
174 | return -ELOOP; | |
175 | if (!S_ISREG(st.st_mode) && !S_ISSOCK(st.st_mode) && !S_ISFIFO(st.st_mode) && !S_ISCHR(st.st_mode)) | |
176 | return -EBADFD; | |
177 | ||
178 | /* If we have reason to believe the data is bounded in size, then let's use memfds or pipes as backing fd. Note | |
179 | * that we use the reported regular file size only as a hint, given that there are plenty special files in | |
180 | * /proc and /sys which report a zero file size but can be read from. */ | |
181 | ||
182 | if (!S_ISREG(st.st_mode) || st.st_size < DATA_FD_MEMORY_LIMIT) { | |
183 | ||
184 | /* Try a memfd first */ | |
185 | copy_fd = memfd_new("data-fd"); | |
186 | if (copy_fd >= 0) { | |
187 | off_t f; | |
188 | ||
189 | r = copy_bytes(fd, copy_fd, DATA_FD_MEMORY_LIMIT, 0); | |
190 | if (r < 0) | |
191 | return r; | |
192 | ||
193 | f = lseek(copy_fd, 0, SEEK_SET); | |
194 | if (f != 0) | |
195 | return -errno; | |
196 | ||
197 | if (r == 0) { | |
198 | /* Did it fit into the limit? If so, we are done. */ | |
199 | r = memfd_set_sealed(copy_fd); | |
200 | if (r < 0) | |
201 | return r; | |
202 | ||
203 | return TAKE_FD(copy_fd); | |
204 | } | |
205 | ||
206 | /* Hmm, pity, this didn't fit. Let's fall back to /tmp then, see below */ | |
207 | ||
208 | } else { | |
71136404 | 209 | _cleanup_close_pair_ int pipefds[2] = EBADF_PAIR; |
6a818c3c ZJS |
210 | int isz; |
211 | ||
212 | /* If memfds aren't available, use a pipe. Set O_NONBLOCK so that we will get EAGAIN rather | |
213 | * then block indefinitely when we hit the pipe size limit */ | |
214 | ||
215 | if (pipe2(pipefds, O_CLOEXEC|O_NONBLOCK) < 0) | |
216 | return -errno; | |
217 | ||
218 | isz = fcntl(pipefds[1], F_GETPIPE_SZ, 0); | |
219 | if (isz < 0) | |
220 | return -errno; | |
221 | ||
222 | /* Try to enlarge the pipe size if necessary */ | |
223 | if ((size_t) isz < DATA_FD_MEMORY_LIMIT) { | |
224 | ||
225 | (void) fcntl(pipefds[1], F_SETPIPE_SZ, DATA_FD_MEMORY_LIMIT); | |
226 | ||
227 | isz = fcntl(pipefds[1], F_GETPIPE_SZ, 0); | |
228 | if (isz < 0) | |
229 | return -errno; | |
230 | } | |
231 | ||
232 | if ((size_t) isz >= DATA_FD_MEMORY_LIMIT) { | |
233 | ||
234 | r = copy_bytes_full(fd, pipefds[1], DATA_FD_MEMORY_LIMIT, 0, &remains, &remains_size, NULL, NULL); | |
235 | if (r < 0 && r != -EAGAIN) | |
236 | return r; /* If we get EAGAIN it could be because of the source or because of | |
237 | * the destination fd, we can't know, as sendfile() and friends won't | |
238 | * tell us. Hence, treat this as reason to fall back, just to be | |
239 | * sure. */ | |
240 | if (r == 0) { | |
241 | /* Everything fit in, yay! */ | |
242 | (void) fd_nonblock(pipefds[0], false); | |
243 | ||
244 | return TAKE_FD(pipefds[0]); | |
245 | } | |
246 | ||
247 | /* Things didn't fit in. But we read data into the pipe, let's remember that, so that | |
248 | * when writing the new file we incorporate this first. */ | |
249 | copy_fd = TAKE_FD(pipefds[0]); | |
250 | } | |
251 | } | |
252 | } | |
253 | ||
254 | /* If we have reason to believe this will fit fine in /tmp, then use that as first fallback. */ | |
255 | if ((!S_ISREG(st.st_mode) || st.st_size < DATA_FD_TMP_LIMIT) && | |
256 | (DATA_FD_MEMORY_LIMIT + remains_size) < DATA_FD_TMP_LIMIT) { | |
257 | off_t f; | |
258 | ||
259 | tmp_fd = open_tmpfile_unlinkable(NULL /* NULL as directory means /tmp */, O_RDWR|O_CLOEXEC); | |
260 | if (tmp_fd < 0) | |
261 | return tmp_fd; | |
262 | ||
263 | if (copy_fd >= 0) { | |
264 | /* If we tried a memfd/pipe first and it ended up being too large, then copy this into the | |
265 | * temporary file first. */ | |
266 | ||
267 | r = copy_bytes(copy_fd, tmp_fd, UINT64_MAX, 0); | |
268 | if (r < 0) | |
269 | return r; | |
270 | ||
271 | assert(r == 0); | |
272 | } | |
273 | ||
274 | if (remains_size > 0) { | |
275 | /* If there were remaining bytes (i.e. read into memory, but not written out yet) from the | |
276 | * failed copy operation, let's flush them out next. */ | |
277 | ||
e22c60a9 | 278 | r = loop_write(tmp_fd, remains, remains_size); |
6a818c3c ZJS |
279 | if (r < 0) |
280 | return r; | |
281 | } | |
282 | ||
283 | r = copy_bytes(fd, tmp_fd, DATA_FD_TMP_LIMIT - DATA_FD_MEMORY_LIMIT - remains_size, COPY_REFLINK); | |
284 | if (r < 0) | |
285 | return r; | |
286 | if (r == 0) | |
287 | goto finish; /* Yay, it fit in */ | |
288 | ||
289 | /* It didn't fit in. Let's not forget to use what we already used */ | |
290 | f = lseek(tmp_fd, 0, SEEK_SET); | |
291 | if (f != 0) | |
292 | return -errno; | |
293 | ||
ee3455cf | 294 | close_and_replace(copy_fd, tmp_fd); |
6a818c3c ZJS |
295 | |
296 | remains = mfree(remains); | |
297 | remains_size = 0; | |
298 | } | |
299 | ||
300 | /* As last fallback use /var/tmp */ | |
301 | r = var_tmp_dir(&td); | |
302 | if (r < 0) | |
303 | return r; | |
304 | ||
305 | tmp_fd = open_tmpfile_unlinkable(td, O_RDWR|O_CLOEXEC); | |
306 | if (tmp_fd < 0) | |
307 | return tmp_fd; | |
308 | ||
309 | if (copy_fd >= 0) { | |
310 | /* If we tried a memfd/pipe first, or a file in /tmp, and it ended up being too large, than copy this | |
311 | * into the temporary file first. */ | |
312 | r = copy_bytes(copy_fd, tmp_fd, UINT64_MAX, COPY_REFLINK); | |
313 | if (r < 0) | |
314 | return r; | |
315 | ||
316 | assert(r == 0); | |
317 | } | |
318 | ||
319 | if (remains_size > 0) { | |
320 | /* Then, copy in any read but not yet written bytes. */ | |
e22c60a9 | 321 | r = loop_write(tmp_fd, remains, remains_size); |
6a818c3c ZJS |
322 | if (r < 0) |
323 | return r; | |
324 | } | |
325 | ||
326 | /* Copy in the rest */ | |
327 | r = copy_bytes(fd, tmp_fd, UINT64_MAX, COPY_REFLINK); | |
328 | if (r < 0) | |
329 | return r; | |
330 | ||
331 | assert(r == 0); | |
332 | ||
333 | finish: | |
334 | /* Now convert the O_RDWR file descriptor into an O_RDONLY one (and as side effect seek to the beginning of the | |
335 | * file again */ | |
336 | ||
337 | return fd_reopen(tmp_fd, O_RDONLY|O_CLOEXEC); | |
338 | } | |
6bea3d8e LP |
339 | |
340 | int memfd_clone_fd(int fd, const char *name, int mode) { | |
341 | _cleanup_close_ int mfd = -EBADF; | |
c29715a8 TW |
342 | struct stat st; |
343 | bool ro, exec; | |
6bea3d8e LP |
344 | int r; |
345 | ||
346 | /* Creates a clone of a regular file in a memfd. Unlike copy_data_fd() this returns strictly a memfd | |
347 | * (and if it can't it will fail). Thus the resulting fd is seekable, and definitely reports as | |
348 | * S_ISREG. */ | |
349 | ||
350 | assert(fd >= 0); | |
351 | assert(name); | |
352 | assert(IN_SET(mode & O_ACCMODE, O_RDONLY, O_RDWR)); | |
353 | assert((mode & ~(O_RDONLY|O_RDWR|O_CLOEXEC)) == 0); | |
354 | ||
c29715a8 TW |
355 | if (fstat(fd, &st) < 0) |
356 | return -errno; | |
357 | ||
6bea3d8e | 358 | ro = (mode & O_ACCMODE) == O_RDONLY; |
c29715a8 | 359 | exec = st.st_mode & 0111; |
6bea3d8e | 360 | |
c29715a8 TW |
361 | mfd = memfd_create_wrapper(name, |
362 | ((FLAGS_SET(mode, O_CLOEXEC) || ro) ? MFD_CLOEXEC : 0) | | |
363 | (ro ? MFD_ALLOW_SEALING : 0) | | |
364 | (exec ? MFD_EXEC : MFD_NOEXEC_SEAL)); | |
6bea3d8e | 365 | if (mfd < 0) |
c29715a8 | 366 | return mfd; |
6bea3d8e LP |
367 | |
368 | r = copy_bytes(fd, mfd, UINT64_MAX, COPY_REFLINK); | |
369 | if (r < 0) | |
370 | return r; | |
371 | ||
372 | if (ro) { | |
254d1313 | 373 | _cleanup_close_ int rfd = -EBADF; |
6bea3d8e LP |
374 | |
375 | r = memfd_set_sealed(mfd); | |
376 | if (r < 0) | |
377 | return r; | |
378 | ||
379 | rfd = fd_reopen(mfd, mode); | |
380 | if (rfd < 0) | |
381 | return rfd; | |
382 | ||
383 | return TAKE_FD(rfd); | |
384 | } | |
385 | ||
386 | off_t f = lseek(mfd, 0, SEEK_SET); | |
387 | if (f < 0) | |
388 | return -errno; | |
389 | ||
390 | return TAKE_FD(mfd); | |
391 | } |