]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/basic/fd-util.c
Merge pull request #11827 from keszybz/pkgconfig-variables
[thirdparty/systemd.git] / src / basic / fd-util.c
CommitLineData
53e1b683 1/* SPDX-License-Identifier: LGPL-2.1+ */
3ffd4af2 2
11c3a366
TA
3#include <errno.h>
4#include <fcntl.h>
5#include <sys/resource.h>
6#include <sys/socket.h>
7#include <sys/stat.h>
8#include <unistd.h>
9
4960ce43
LP
10#include "alloc-util.h"
11#include "copy.h"
8fb3f009 12#include "dirent-util.h"
3ffd4af2 13#include "fd-util.h"
a548e14d 14#include "fileio.h"
4aeb20f5 15#include "fs-util.h"
4960ce43 16#include "io-util.h"
11c3a366 17#include "macro.h"
a548e14d 18#include "memfd-util.h"
11c3a366 19#include "missing.h"
93cc7779 20#include "parse-util.h"
11c3a366 21#include "path-util.h"
df0ff127 22#include "process-util.h"
93cc7779 23#include "socket-util.h"
4aeb20f5 24#include "stdio-util.h"
3ffd4af2 25#include "util.h"
e4de7287 26#include "tmpfile-util.h"
3ffd4af2
LP
27
/* Closes 'fd' with a single close() call and never retries. Returns 0 on success, and also when close()
 * was interrupted (EINTR). Any other failure is returned as a negative errno-style code. */
int close_nointr(int fd) {
        int r;

        assert(fd >= 0);

        r = close(fd);
        if (r < 0 && errno != EINTR)
                return -errno;

        /*
         * EINTR is deliberately treated as success: a retry loop is the wrong thing to do on Linux.
         *
         * http://lkml.indiana.edu/hypermail/linux/kernel/0509.1/0877.html
         * https://bugzilla.gnome.org/show_bug.cgi?id=682819
         * http://utcc.utoronto.ca/~cks/space/blog/unix/CloseEINTR
         * https://sites.google.com/site/michaelsafyan/software-engineering/checkforeintrwheninvokingclosethinkagain
         */
        return 0;
}
48
49int safe_close(int fd) {
50
51 /*
52 * Like close_nointr() but cannot fail. Guarantees errno is
53 * unchanged. Is a NOP with negative fds passed, and returns
54 * -1, so that it can be used in this syntax:
55 *
56 * fd = safe_close(fd);
57 */
58
59 if (fd >= 0) {
60 PROTECT_ERRNO;
61
62 /* The kernel might return pretty much any error code
63 * via close(), but the fd will be closed anyway. The
64 * only condition we want to check for here is whether
65 * the fd was invalid at all... */
66
67 assert_se(close_nointr(fd) != -EBADF);
68 }
69
70 return -1;
71}
72
/* Closes both fds of the pair 'p' via safe_close() and resets both slots to -1. */
void safe_close_pair(int p[static 2]) {
        assert(p);

        if (p[0] != p[1]) {
                p[0] = safe_close(p[0]);
                p[1] = safe_close(p[1]);
        } else
                /* Both directions share one and the same fd, hence close it only once */
                p[0] = p[1] = safe_close(p[0]);
}
86
da6053d0
LP
/* Runs safe_close() on each of the first 'n_fd' entries of 'fds', in array order. The array itself is not
 * modified. */
void close_many(const int fds[], size_t n_fd) {
        size_t k = 0;

        assert(fds || n_fd <= 0);

        while (k < n_fd)
                safe_close(fds[k++]);
}
95
/* The stdio counterpart of close_nointr(): calls fclose() once, returns 0 on success or if interrupted
 * (EINTR), and a negative errno-style code on any other failure. */
int fclose_nointr(FILE *f) {
        assert(f);

        if (fclose(f) != 0 && errno != EINTR)
                return -errno;

        return 0;
}
109
110FILE* safe_fclose(FILE *f) {
111
112 /* Same as safe_close(), but for fclose() */
113
114 if (f) {
115 PROTECT_ERRNO;
116
6dce3bb4 117 assert_se(fclose_nointr(f) != -EBADF);
3ffd4af2
LP
118 }
119
120 return NULL;
121}
122
123DIR* safe_closedir(DIR *d) {
124
125 if (d) {
126 PROTECT_ERRNO;
127
128 assert_se(closedir(d) >= 0 || errno != EBADF);
129 }
130
131 return NULL;
132}
133
/* Turns O_NONBLOCK on (nonblock == true) or off (nonblock == false) for 'fd'. Returns 0 on success, or a
 * negative errno-style code if querying or updating the file status flags failed. */
int fd_nonblock(int fd, bool nonblock) {
        int current, wanted;

        assert(fd >= 0);

        current = fcntl(fd, F_GETFL, 0);
        if (current < 0)
                return -errno;

        wanted = nonblock ? (current | O_NONBLOCK) : (current & ~O_NONBLOCK);

        /* Skip the second fcntl() call if the flag already has the requested state */
        if (wanted != current && fcntl(fd, F_SETFL, wanted) < 0)
                return -errno;

        return 0;
}
156
/* Turns FD_CLOEXEC on (cloexec == true) or off (cloexec == false) for 'fd'. Returns 0 on success, or a
 * negative errno-style code if querying or updating the fd flags failed. */
int fd_cloexec(int fd, bool cloexec) {
        int current, wanted;

        assert(fd >= 0);

        current = fcntl(fd, F_GETFD, 0);
        if (current < 0)
                return -errno;

        wanted = cloexec ? (current | FD_CLOEXEC) : (current & ~FD_CLOEXEC);

        /* Skip the second fcntl() call if the flag already has the requested state */
        if (wanted != current && fcntl(fd, F_SETFD, wanted) < 0)
                return -errno;

        return 0;
}
179
da6053d0
LP
180_pure_ static bool fd_in_set(int fd, const int fdset[], size_t n_fdset) {
181 size_t i;
3ffd4af2
LP
182
183 assert(n_fdset == 0 || fdset);
184
185 for (i = 0; i < n_fdset; i++)
186 if (fdset[i] == fd)
187 return true;
188
189 return false;
190}
191
498e265d
LP
static int get_max_fd(void) {
        struct rlimit rl;
        rlim_t limit;

        /* Returns the highest possible fd, based on RLIMIT_NOFILE, but enforcing FD_SETSIZE-1 as lower
         * boundary and INT_MAX as upper boundary. Returns a negative errno-style code if the resource
         * limit cannot be queried. */

        if (getrlimit(RLIMIT_NOFILE, &rl) < 0)
                return -errno;

        /* Go by whichever of the soft and the hard limit is larger */
        limit = rl.rlim_cur > rl.rlim_max ? rl.rlim_cur : rl.rlim_max;

        /* Let's always cover at least 1024 fds */
        if (limit < FD_SETSIZE)
                return FD_SETSIZE-1;

        /* Saturate on overflow. After all fds are "int", hence can never be above INT_MAX */
        if (limit == RLIM_INFINITY || limit > INT_MAX)
                return INT_MAX;

        return (int) (limit - 1);
}
212
da6053d0 213int close_all_fds(const int except[], size_t n_except) {
3ffd4af2
LP
214 _cleanup_closedir_ DIR *d = NULL;
215 struct dirent *de;
216 int r = 0;
217
218 assert(n_except == 0 || except);
219
220 d = opendir("/proc/self/fd");
221 if (!d) {
37bc14de 222 int fd, max_fd;
3ffd4af2 223
498e265d
LP
224 /* When /proc isn't available (for example in chroots) the fallback is brute forcing through
225 * the fd table */
37bc14de 226
498e265d
LP
227 max_fd = get_max_fd();
228 if (max_fd < 0)
229 return max_fd;
37bc14de
LP
230
231 for (fd = 3; fd >= 0; fd = fd < max_fd ? fd + 1 : -1) {
e43bc9f5 232 int q;
3ffd4af2
LP
233
234 if (fd_in_set(fd, except, n_except))
235 continue;
236
e43bc9f5
LP
237 q = close_nointr(fd);
238 if (q < 0 && q != -EBADF && r >= 0)
239 r = q;
3ffd4af2
LP
240 }
241
242 return r;
243 }
244
8fb3f009 245 FOREACH_DIRENT(de, d, return -errno) {
e43bc9f5 246 int fd = -1, q;
3ffd4af2 247
3ffd4af2
LP
248 if (safe_atoi(de->d_name, &fd) < 0)
249 /* Let's better ignore this, just in case */
250 continue;
251
252 if (fd < 3)
253 continue;
254
255 if (fd == dirfd(d))
256 continue;
257
258 if (fd_in_set(fd, except, n_except))
259 continue;
260
e43bc9f5
LP
261 q = close_nointr(fd);
262 if (q < 0 && q != -EBADF && r >= 0) /* Valgrind has its own FD and doesn't want to have it closed */
263 r = q;
3ffd4af2
LP
264 }
265
266 return r;
267}
268
269int same_fd(int a, int b) {
270 struct stat sta, stb;
271 pid_t pid;
272 int r, fa, fb;
273
274 assert(a >= 0);
275 assert(b >= 0);
276
277 /* Compares two file descriptors. Note that semantics are
278 * quite different depending on whether we have kcmp() or we
279 * don't. If we have kcmp() this will only return true for
280 * dup()ed file descriptors, but not otherwise. If we don't
281 * have kcmp() this will also return true for two fds of the same
282 * file, created by separate open() calls. Since we use this
283 * call mostly for filtering out duplicates in the fd store
284 * this difference hopefully doesn't matter too much. */
285
286 if (a == b)
287 return true;
288
289 /* Try to use kcmp() if we have it. */
df0ff127 290 pid = getpid_cached();
3ffd4af2
LP
291 r = kcmp(pid, pid, KCMP_FILE, a, b);
292 if (r == 0)
293 return true;
294 if (r > 0)
295 return false;
9e2acd1d 296 if (!IN_SET(errno, ENOSYS, EACCES, EPERM))
3ffd4af2
LP
297 return -errno;
298
299 /* We don't have kcmp(), use fstat() instead. */
300 if (fstat(a, &sta) < 0)
301 return -errno;
302
303 if (fstat(b, &stb) < 0)
304 return -errno;
305
306 if ((sta.st_mode & S_IFMT) != (stb.st_mode & S_IFMT))
307 return false;
308
309 /* We consider all device fds different, since two device fds
310 * might refer to quite different device contexts even though
311 * they share the same inode and backing dev_t. */
312
313 if (S_ISCHR(sta.st_mode) || S_ISBLK(sta.st_mode))
314 return false;
315
316 if (sta.st_dev != stb.st_dev || sta.st_ino != stb.st_ino)
317 return false;
318
319 /* The fds refer to the same inode on disk, let's also check
320 * if they have the same fd flags. This is useful to
321 * distinguish the read and write side of a pipe created with
322 * pipe(). */
323 fa = fcntl(a, F_GETFL);
324 if (fa < 0)
325 return -errno;
326
327 fb = fcntl(b, F_GETFL);
328 if (fb < 0)
329 return -errno;
330
331 return fa == fb;
332}
333
334void cmsg_close_all(struct msghdr *mh) {
335 struct cmsghdr *cmsg;
336
337 assert(mh);
338
339 CMSG_FOREACH(cmsg, mh)
340 if (cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type == SCM_RIGHTS)
341 close_many((int*) CMSG_DATA(cmsg), (cmsg->cmsg_len - CMSG_LEN(0)) / sizeof(int));
342}
4fee3975
LP
343
bool fdname_is_valid(const char *s) {
        const char *e;

        /* Checks whether 's' is acceptable as a name in $LISTEN_FDNAMES. Basically everything ASCII that is
         * not a control character is fine, with the exception of ":", which serves as field separator in
         * $LISTEN_FDNAMES.
         *
         * The empty string is explicitly valid, NULL is not. Names are limited to 255 characters. */

        if (!s)
                return false;

        for (e = s; *e; e++)
                if (*e < ' ' || *e >= 127 || *e == ':')
                        return false;

        /* 'e' now points to the terminating NUL byte, hence e - s is the length of the name */
        return e - s < 256;
}
4aeb20f5
LP
370
371int fd_get_path(int fd, char **ret) {
f267719c 372 char procfs_path[STRLEN("/proc/self/fd/") + DECIMAL_STR_MAX(int)];
a0fe2a2d 373 int r;
4aeb20f5 374
f267719c
LP
375 xsprintf(procfs_path, "/proc/self/fd/%i", fd);
376 r = readlink_malloc(procfs_path, ret);
377 if (r == -ENOENT) {
378 /* ENOENT can mean two things: that the fd does not exist or that /proc is not mounted. Let's make
379 * things debuggable and distuingish the two. */
4aeb20f5 380
f267719c
LP
381 if (access("/proc/self/fd/", F_OK) < 0)
382 /* /proc is not available or not set up properly, we're most likely in some chroot
383 * environment. */
384 return errno == ENOENT ? -EOPNOTSUPP : -errno;
a0fe2a2d 385
f267719c
LP
386 return -EBADF; /* The directory exists, hence it's the fd that doesn't. */
387 }
a0fe2a2d
LP
388
389 return r;
4aeb20f5 390}
046a82c1
LP
391
392int move_fd(int from, int to, int cloexec) {
393 int r;
394
395 /* Move fd 'from' to 'to', make sure FD_CLOEXEC remains equal if requested, and release the old fd. If
396 * 'cloexec' is passed as -1, the original FD_CLOEXEC is inherited for the new fd. If it is 0, it is turned
397 * off, if it is > 0 it is turned on. */
398
399 if (from < 0)
400 return -EBADF;
401 if (to < 0)
402 return -EBADF;
403
404 if (from == to) {
405
406 if (cloexec >= 0) {
407 r = fd_cloexec(to, cloexec);
408 if (r < 0)
409 return r;
410 }
411
412 return to;
413 }
414
415 if (cloexec < 0) {
416 int fl;
417
418 fl = fcntl(from, F_GETFD, 0);
419 if (fl < 0)
420 return -errno;
421
422 cloexec = !!(fl & FD_CLOEXEC);
423 }
424
425 r = dup3(from, to, cloexec ? O_CLOEXEC : 0);
426 if (r < 0)
427 return -errno;
428
429 assert(r == to);
430
431 safe_close(from);
432
433 return to;
434}
a548e14d
LP
435
436int acquire_data_fd(const void *data, size_t size, unsigned flags) {
437
a548e14d
LP
438 _cleanup_close_pair_ int pipefds[2] = { -1, -1 };
439 char pattern[] = "/dev/shm/data-fd-XXXXXX";
440 _cleanup_close_ int fd = -1;
441 int isz = 0, r;
442 ssize_t n;
443 off_t f;
444
445 assert(data || size == 0);
446
447 /* Acquire a read-only file descriptor that when read from returns the specified data. This is much more
448 * complex than I wish it was. But here's why:
449 *
450 * a) First we try to use memfds. They are the best option, as we can seal them nicely to make them
451 * read-only. Unfortunately they require kernel 3.17, and – at the time of writing – we still support 3.14.
452 *
453 * b) Then, we try classic pipes. They are the second best options, as we can close the writing side, retaining
454 * a nicely read-only fd in the reading side. However, they are by default quite small, and unprivileged
455 * clients can only bump their size to a system-wide limit, which might be quite low.
456 *
457 * c) Then, we try an O_TMPFILE file in /dev/shm (that dir is the only suitable one known to exist from
458 * earliest boot on). To make it read-only we open the fd a second time with O_RDONLY via
459 * /proc/self/<fd>. Unfortunately O_TMPFILE is not available on older kernels on tmpfs.
460 *
461 * d) Finally, we try creating a regular file in /dev/shm, which we then delete.
462 *
463 * It sucks a bit that depending on the situation we return very different objects here, but that's Linux I
464 * figure. */
465
466 if (size == 0 && ((flags & ACQUIRE_NO_DEV_NULL) == 0)) {
467 /* As a special case, return /dev/null if we have been called for an empty data block */
468 r = open("/dev/null", O_RDONLY|O_CLOEXEC|O_NOCTTY);
469 if (r < 0)
470 return -errno;
471
472 return r;
473 }
474
475 if ((flags & ACQUIRE_NO_MEMFD) == 0) {
476 fd = memfd_new("data-fd");
477 if (fd < 0)
478 goto try_pipe;
479
480 n = write(fd, data, size);
481 if (n < 0)
482 return -errno;
483 if ((size_t) n != size)
484 return -EIO;
485
486 f = lseek(fd, 0, SEEK_SET);
487 if (f != 0)
488 return -errno;
489
490 r = memfd_set_sealed(fd);
491 if (r < 0)
492 return r;
493
c10d6bdb 494 return TAKE_FD(fd);
a548e14d
LP
495 }
496
497try_pipe:
498 if ((flags & ACQUIRE_NO_PIPE) == 0) {
499 if (pipe2(pipefds, O_CLOEXEC|O_NONBLOCK) < 0)
500 return -errno;
501
502 isz = fcntl(pipefds[1], F_GETPIPE_SZ, 0);
503 if (isz < 0)
504 return -errno;
505
506 if ((size_t) isz < size) {
507 isz = (int) size;
508 if (isz < 0 || (size_t) isz != size)
509 return -E2BIG;
510
511 /* Try to bump the pipe size */
512 (void) fcntl(pipefds[1], F_SETPIPE_SZ, isz);
513
514 /* See if that worked */
515 isz = fcntl(pipefds[1], F_GETPIPE_SZ, 0);
516 if (isz < 0)
517 return -errno;
518
519 if ((size_t) isz < size)
520 goto try_dev_shm;
521 }
522
523 n = write(pipefds[1], data, size);
524 if (n < 0)
525 return -errno;
526 if ((size_t) n != size)
527 return -EIO;
528
529 (void) fd_nonblock(pipefds[0], false);
530
c10d6bdb 531 return TAKE_FD(pipefds[0]);
a548e14d
LP
532 }
533
534try_dev_shm:
535 if ((flags & ACQUIRE_NO_TMPFILE) == 0) {
536 fd = open("/dev/shm", O_RDWR|O_TMPFILE|O_CLOEXEC, 0500);
537 if (fd < 0)
538 goto try_dev_shm_without_o_tmpfile;
539
540 n = write(fd, data, size);
541 if (n < 0)
542 return -errno;
543 if ((size_t) n != size)
544 return -EIO;
545
546 /* Let's reopen the thing, in order to get an O_RDONLY fd for the original O_RDWR one */
f2324783 547 return fd_reopen(fd, O_RDONLY|O_CLOEXEC);
a548e14d
LP
548 }
549
550try_dev_shm_without_o_tmpfile:
551 if ((flags & ACQUIRE_NO_REGULAR) == 0) {
552 fd = mkostemp_safe(pattern);
553 if (fd < 0)
554 return fd;
555
556 n = write(fd, data, size);
557 if (n < 0) {
558 r = -errno;
559 goto unlink_and_return;
560 }
561 if ((size_t) n != size) {
562 r = -EIO;
563 goto unlink_and_return;
564 }
565
566 /* Let's reopen the thing, in order to get an O_RDONLY fd for the original O_RDWR one */
567 r = open(pattern, O_RDONLY|O_CLOEXEC);
568 if (r < 0)
569 r = -errno;
570
571 unlink_and_return:
572 (void) unlink(pattern);
573 return r;
574 }
575
576 return -EOPNOTSUPP;
577}
7fe2903c 578
4960ce43
LP
579/* When the data is smaller or equal to 64K, try to place the copy in a memfd/pipe */
580#define DATA_FD_MEMORY_LIMIT (64U*1024U)
581
/* If memfd/pipe didn't work out, then let's use a file in /tmp up to a size of 1M. If it's larger than that use /var/tmp instead. */
583#define DATA_FD_TMP_LIMIT (1024U*1024U)
584
585int fd_duplicate_data_fd(int fd) {
586
587 _cleanup_close_ int copy_fd = -1, tmp_fd = -1;
588 _cleanup_free_ void *remains = NULL;
4960ce43
LP
589 size_t remains_size = 0;
590 const char *td;
591 struct stat st;
592 int r;
593
594 /* Creates a 'data' fd from the specified source fd, containing all the same data in a read-only fashion, but
595 * independent of it (i.e. the source fd can be closed and unmounted after this call succeeded). Tries to be
596 * somewhat smart about where to place the data. In the best case uses a memfd(). If memfd() are not supported
597 * uses a pipe instead. For larger data will use an unlinked file in /tmp, and for even larger data one in
598 * /var/tmp. */
599
600 if (fstat(fd, &st) < 0)
601 return -errno;
602
603 /* For now, let's only accept regular files, sockets, pipes and char devices */
604 if (S_ISDIR(st.st_mode))
605 return -EISDIR;
606 if (S_ISLNK(st.st_mode))
607 return -ELOOP;
608 if (!S_ISREG(st.st_mode) && !S_ISSOCK(st.st_mode) && !S_ISFIFO(st.st_mode) && !S_ISCHR(st.st_mode))
609 return -EBADFD;
610
611 /* If we have reason to believe the data is bounded in size, then let's use memfds or pipes as backing fd. Note
612 * that we use the reported regular file size only as a hint, given that there are plenty special files in
613 * /proc and /sys which report a zero file size but can be read from. */
614
615 if (!S_ISREG(st.st_mode) || st.st_size < DATA_FD_MEMORY_LIMIT) {
616
617 /* Try a memfd first */
618 copy_fd = memfd_new("data-fd");
619 if (copy_fd >= 0) {
620 off_t f;
621
622 r = copy_bytes(fd, copy_fd, DATA_FD_MEMORY_LIMIT, 0);
623 if (r < 0)
624 return r;
625
626 f = lseek(copy_fd, 0, SEEK_SET);
627 if (f != 0)
628 return -errno;
629
630 if (r == 0) {
631 /* Did it fit into the limit? If so, we are done. */
632 r = memfd_set_sealed(copy_fd);
633 if (r < 0)
634 return r;
635
636 return TAKE_FD(copy_fd);
637 }
638
639 /* Hmm, pity, this didn't fit. Let's fall back to /tmp then, see below */
640
641 } else {
642 _cleanup_(close_pairp) int pipefds[2] = { -1, -1 };
643 int isz;
644
645 /* If memfds aren't available, use a pipe. Set O_NONBLOCK so that we will get EAGAIN rather
646 * then block indefinitely when we hit the pipe size limit */
647
648 if (pipe2(pipefds, O_CLOEXEC|O_NONBLOCK) < 0)
649 return -errno;
650
651 isz = fcntl(pipefds[1], F_GETPIPE_SZ, 0);
652 if (isz < 0)
653 return -errno;
654
655 /* Try to enlarge the pipe size if necessary */
656 if ((size_t) isz < DATA_FD_MEMORY_LIMIT) {
657
658 (void) fcntl(pipefds[1], F_SETPIPE_SZ, DATA_FD_MEMORY_LIMIT);
659
660 isz = fcntl(pipefds[1], F_GETPIPE_SZ, 0);
661 if (isz < 0)
662 return -errno;
663 }
664
665 if ((size_t) isz >= DATA_FD_MEMORY_LIMIT) {
666
b3cade0c 667 r = copy_bytes_full(fd, pipefds[1], DATA_FD_MEMORY_LIMIT, 0, &remains, &remains_size, NULL, NULL);
4960ce43
LP
668 if (r < 0 && r != -EAGAIN)
669 return r; /* If we get EAGAIN it could be because of the source or because of
670 * the destination fd, we can't know, as sendfile() and friends won't
671 * tell us. Hence, treat this as reason to fall back, just to be
672 * sure. */
673 if (r == 0) {
674 /* Everything fit in, yay! */
675 (void) fd_nonblock(pipefds[0], false);
676
677 return TAKE_FD(pipefds[0]);
678 }
679
680 /* Things didn't fit in. But we read data into the pipe, let's remember that, so that
681 * when writing the new file we incorporate this first. */
682 copy_fd = TAKE_FD(pipefds[0]);
683 }
684 }
685 }
686
687 /* If we have reason to believe this will fit fine in /tmp, then use that as first fallback. */
688 if ((!S_ISREG(st.st_mode) || st.st_size < DATA_FD_TMP_LIMIT) &&
689 (DATA_FD_MEMORY_LIMIT + remains_size) < DATA_FD_TMP_LIMIT) {
690 off_t f;
691
692 tmp_fd = open_tmpfile_unlinkable(NULL /* NULL as directory means /tmp */, O_RDWR|O_CLOEXEC);
693 if (tmp_fd < 0)
694 return tmp_fd;
695
696 if (copy_fd >= 0) {
697 /* If we tried a memfd/pipe first and it ended up being too large, then copy this into the
698 * temporary file first. */
699
700 r = copy_bytes(copy_fd, tmp_fd, UINT64_MAX, 0);
701 if (r < 0)
702 return r;
703
704 assert(r == 0);
705 }
706
707 if (remains_size > 0) {
708 /* If there were remaining bytes (i.e. read into memory, but not written out yet) from the
709 * failed copy operation, let's flush them out next. */
710
711 r = loop_write(tmp_fd, remains, remains_size, false);
712 if (r < 0)
713 return r;
714 }
715
716 r = copy_bytes(fd, tmp_fd, DATA_FD_TMP_LIMIT - DATA_FD_MEMORY_LIMIT - remains_size, COPY_REFLINK);
717 if (r < 0)
718 return r;
719 if (r == 0)
720 goto finish; /* Yay, it fit in */
721
722 /* It didn't fit in. Let's not forget to use what we already used */
723 f = lseek(tmp_fd, 0, SEEK_SET);
724 if (f != 0)
725 return -errno;
726
727 safe_close(copy_fd);
728 copy_fd = TAKE_FD(tmp_fd);
729
730 remains = mfree(remains);
731 remains_size = 0;
732 }
733
734 /* As last fallback use /var/tmp */
735 r = var_tmp_dir(&td);
736 if (r < 0)
737 return r;
738
739 tmp_fd = open_tmpfile_unlinkable(td, O_RDWR|O_CLOEXEC);
740 if (tmp_fd < 0)
741 return tmp_fd;
742
743 if (copy_fd >= 0) {
744 /* If we tried a memfd/pipe first, or a file in /tmp, and it ended up being too large, than copy this
745 * into the temporary file first. */
746 r = copy_bytes(copy_fd, tmp_fd, UINT64_MAX, COPY_REFLINK);
747 if (r < 0)
748 return r;
749
750 assert(r == 0);
751 }
752
753 if (remains_size > 0) {
754 /* Then, copy in any read but not yet written bytes. */
755 r = loop_write(tmp_fd, remains, remains_size, false);
756 if (r < 0)
757 return r;
758 }
759
760 /* Copy in the rest */
761 r = copy_bytes(fd, tmp_fd, UINT64_MAX, COPY_REFLINK);
762 if (r < 0)
763 return r;
764
765 assert(r == 0);
766
767finish:
768 /* Now convert the O_RDWR file descriptor into an O_RDONLY one (and as side effect seek to the beginning of the
769 * file again */
770
771 return fd_reopen(tmp_fd, O_RDONLY|O_CLOEXEC);
772}
773
7fe2903c
LP
774int fd_move_above_stdio(int fd) {
775 int flags, copy;
776 PROTECT_ERRNO;
777
778 /* Moves the specified file descriptor if possible out of the range [0…2], i.e. the range of
779 * stdin/stdout/stderr. If it can't be moved outside of this range the original file descriptor is
780 * returned. This call is supposed to be used for long-lasting file descriptors we allocate in our code that
781 * might get loaded into foreign code, and where we want ensure our fds are unlikely used accidentally as
782 * stdin/stdout/stderr of unrelated code.
783 *
784 * Note that this doesn't fix any real bugs, it just makes it less likely that our code will be affected by
785 * buggy code from others that mindlessly invokes 'fprintf(stderr, …' or similar in places where stderr has
786 * been closed before.
787 *
788 * This function is written in a "best-effort" and "least-impact" style. This means whenever we encounter an
789 * error we simply return the original file descriptor, and we do not touch errno. */
790
791 if (fd < 0 || fd > 2)
792 return fd;
793
794 flags = fcntl(fd, F_GETFD, 0);
795 if (flags < 0)
796 return fd;
797
798 if (flags & FD_CLOEXEC)
799 copy = fcntl(fd, F_DUPFD_CLOEXEC, 3);
800 else
801 copy = fcntl(fd, F_DUPFD, 3);
802 if (copy < 0)
803 return fd;
804
805 assert(copy > 2);
806
807 (void) close(fd);
808 return copy;
809}
aa11e28b
LP
810
811int rearrange_stdio(int original_input_fd, int original_output_fd, int original_error_fd) {
812
813 int fd[3] = { /* Put together an array of fds we work on */
814 original_input_fd,
815 original_output_fd,
816 original_error_fd
817 };
818
819 int r, i,
820 null_fd = -1, /* if we open /dev/null, we store the fd to it here */
821 copy_fd[3] = { -1, -1, -1 }; /* This contains all fds we duplicate here temporarily, and hence need to close at the end */
822 bool null_readable, null_writable;
823
824 /* Sets up stdin, stdout, stderr with the three file descriptors passed in. If any of the descriptors is
825 * specified as -1 it will be connected with /dev/null instead. If any of the file descriptors is passed as
826 * itself (e.g. stdin as STDIN_FILENO) it is left unmodified, but the O_CLOEXEC bit is turned off should it be
827 * on.
828 *
829 * Note that if any of the passed file descriptors are > 2 they will be closed — both on success and on
830 * failure! Thus, callers should assume that when this function returns the input fds are invalidated.
831 *
832 * Note that when this function fails stdin/stdout/stderr might remain half set up!
833 *
834 * O_CLOEXEC is turned off for all three file descriptors (which is how it should be for
835 * stdin/stdout/stderr). */
836
837 null_readable = original_input_fd < 0;
838 null_writable = original_output_fd < 0 || original_error_fd < 0;
839
840 /* First step, open /dev/null once, if we need it */
841 if (null_readable || null_writable) {
842
843 /* Let's open this with O_CLOEXEC first, and convert it to non-O_CLOEXEC when we move the fd to the final position. */
844 null_fd = open("/dev/null", (null_readable && null_writable ? O_RDWR :
845 null_readable ? O_RDONLY : O_WRONLY) | O_CLOEXEC);
846 if (null_fd < 0) {
847 r = -errno;
848 goto finish;
849 }
850
851 /* If this fd is in the 0…2 range, let's move it out of it */
852 if (null_fd < 3) {
853 int copy;
854
855 copy = fcntl(null_fd, F_DUPFD_CLOEXEC, 3); /* Duplicate this with O_CLOEXEC set */
856 if (copy < 0) {
857 r = -errno;
858 goto finish;
859 }
860
861 safe_close(null_fd);
862 null_fd = copy;
863 }
864 }
865
866 /* Let's assemble fd[] with the fds to install in place of stdin/stdout/stderr */
867 for (i = 0; i < 3; i++) {
868
869 if (fd[i] < 0)
870 fd[i] = null_fd; /* A negative parameter means: connect this one to /dev/null */
871 else if (fd[i] != i && fd[i] < 3) {
872 /* This fd is in the 0…2 territory, but not at its intended place, move it out of there, so that we can work there. */
873 copy_fd[i] = fcntl(fd[i], F_DUPFD_CLOEXEC, 3); /* Duplicate this with O_CLOEXEC set */
874 if (copy_fd[i] < 0) {
875 r = -errno;
876 goto finish;
877 }
878
879 fd[i] = copy_fd[i];
880 }
881 }
882
883 /* At this point we now have the fds to use in fd[], and they are all above the stdio range, so that we
884 * have freedom to move them around. If the fds already were at the right places then the specific fds are
885 * -1. Let's now move them to the right places. This is the point of no return. */
886 for (i = 0; i < 3; i++) {
887
888 if (fd[i] == i) {
889
890 /* fd is already in place, but let's make sure O_CLOEXEC is off */
891 r = fd_cloexec(i, false);
892 if (r < 0)
893 goto finish;
894
895 } else {
896 assert(fd[i] > 2);
897
898 if (dup2(fd[i], i) < 0) { /* Turns off O_CLOEXEC on the new fd. */
899 r = -errno;
900 goto finish;
901 }
902 }
903 }
904
905 r = 0;
906
907finish:
908 /* Close the original fds, but only if they were outside of the stdio range. Also, properly check for the same
909 * fd passed in multiple times. */
910 safe_close_above_stdio(original_input_fd);
911 if (original_output_fd != original_input_fd)
912 safe_close_above_stdio(original_output_fd);
913 if (original_error_fd != original_input_fd && original_error_fd != original_output_fd)
914 safe_close_above_stdio(original_error_fd);
915
916 /* Close the copies we moved > 2 */
917 for (i = 0; i < 3; i++)
918 safe_close(copy_fd[i]);
919
920 /* Close our null fd, if it's > 2 */
921 safe_close_above_stdio(null_fd);
922
923 return r;
924}
f2324783
LP
925
926int fd_reopen(int fd, int flags) {
927 char procfs_path[STRLEN("/proc/self/fd/") + DECIMAL_STR_MAX(int)];
928 int new_fd;
929
930 /* Reopens the specified fd with new flags. This is useful for convert an O_PATH fd into a regular one, or to
931 * turn O_RDWR fds into O_RDONLY fds.
932 *
933 * This doesn't work on sockets (since they cannot be open()ed, ever).
934 *
935 * This implicitly resets the file read index to 0. */
936
937 xsprintf(procfs_path, "/proc/self/fd/%i", fd);
938 new_fd = open(procfs_path, flags);
939 if (new_fd < 0)
940 return -errno;
941
942 return new_fd;
943}
9264cc39
LP
944
945int read_nr_open(void) {
946 _cleanup_free_ char *nr_open = NULL;
947 int r;
948
949 /* Returns the kernel's current fd limit, either by reading it of /proc/sys if that works, or using the
950 * hard-coded default compiled-in value of current kernels (1M) if not. This call will never fail. */
951
952 r = read_one_line_file("/proc/sys/fs/nr_open", &nr_open);
953 if (r < 0)
954 log_debug_errno(r, "Failed to read /proc/sys/fs/nr_open, ignoring: %m");
955 else {
956 int v;
957
958 r = safe_atoi(nr_open, &v);
959 if (r < 0)
960 log_debug_errno(r, "Failed to parse /proc/sys/fs/nr_open value '%s', ignoring: %m", nr_open);
961 else
962 return v;
963 }
964
965 /* If we fail, fallback to the hard-coded kernel limit of 1024 * 1024. */
966 return 1024 * 1024;
967}