1 /* SPDX-License-Identifier: LGPL-2.1+ */
3 This file is part of systemd.
5 Copyright 2010 Lennart Poettering
7 systemd is free software; you can redistribute it and/or modify it
8 under the terms of the GNU Lesser General Public License as published by
9 the Free Software Foundation; either version 2.1 of the License, or
10 (at your option) any later version.
12 systemd is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 Lesser General Public License for more details.
17 You should have received a copy of the GNU Lesser General Public License
18 along with systemd; If not, see <http://www.gnu.org/licenses/>.
23 #include <sys/resource.h>
24 #include <sys/socket.h>
28 #include "dirent-util.h"
33 #include "memfd-util.h"
35 #include "parse-util.h"
36 #include "path-util.h"
37 #include "process-util.h"
38 #include "socket-util.h"
39 #include "stdio-util.h"
42 int close_nointr(int fd
) {
49 * Just ignore EINTR; a retry loop is the wrong thing to do on
52 * http://lkml.indiana.edu/hypermail/linux/kernel/0509.1/0877.html
53 * https://bugzilla.gnome.org/show_bug.cgi?id=682819
54 * http://utcc.utoronto.ca/~cks/space/blog/unix/CloseEINTR
55 * https://sites.google.com/site/michaelsafyan/software-engineering/checkforeintrwheninvokingclosethinkagain
63 int safe_close(int fd
) {
66 * Like close_nointr() but cannot fail. Guarantees errno is
67 * unchanged. Is a NOP with negative fds passed, and returns
68 * -1, so that it can be used in this syntax:
70 * fd = safe_close(fd);
76 /* The kernel might return pretty much any error code
77 * via close(), but the fd will be closed anyway. The
78 * only condition we want to check for here is whether
79 * the fd was invalid at all... */
81 assert_se(close_nointr(fd
) != -EBADF
);
87 void safe_close_pair(int p
[]) {
91 /* Special case pairs which use the same fd in both
93 p
[0] = p
[1] = safe_close(p
[0]);
97 p
[0] = safe_close(p
[0]);
98 p
[1] = safe_close(p
[1]);
101 void close_many(const int fds
[], unsigned n_fd
) {
104 assert(fds
|| n_fd
<= 0);
106 for (i
= 0; i
< n_fd
; i
++)
110 int fclose_nointr(FILE *f
) {
113 /* Same as close_nointr(), but for fclose() */
FILE* safe_fclose(FILE *f) {

        /* Same as safe_close(), but for fclose(): closes the stream if one is passed, is a NOP for NULL,
         * leaves errno untouched, and always returns NULL so that it can be used in this syntax:
         *
         * f = safe_fclose(f);
         */

        if (f) {
                int saved_errno = errno; /* callers rely on errno being preserved across this call */

                /* fclose_nointr() mirrors close_nointr() and reports failures as negative errno-style
                 * values, hence we must compare against -EBADF here. Comparing against the positive
                 * EBADF would never match and would silently disable this check. As with safe_close(),
                 * the only failure we care about is the stream having been invalid in the first place. */
                assert_se(fclose_nointr(f) != -EBADF);

                errno = saved_errno;
        }

        return NULL;
}
137 DIR* safe_closedir(DIR *d
) {
142 assert_se(closedir(d
) >= 0 || errno
!= EBADF
);
148 int fd_nonblock(int fd
, bool nonblock
) {
153 flags
= fcntl(fd
, F_GETFL
, 0);
158 nflags
= flags
| O_NONBLOCK
;
160 nflags
= flags
& ~O_NONBLOCK
;
165 if (fcntl(fd
, F_SETFL
, nflags
) < 0)
171 int fd_cloexec(int fd
, bool cloexec
) {
176 flags
= fcntl(fd
, F_GETFD
, 0);
181 nflags
= flags
| FD_CLOEXEC
;
183 nflags
= flags
& ~FD_CLOEXEC
;
188 if (fcntl(fd
, F_SETFD
, nflags
) < 0)
194 _pure_
static bool fd_in_set(int fd
, const int fdset
[], unsigned n_fdset
) {
197 assert(n_fdset
== 0 || fdset
);
199 for (i
= 0; i
< n_fdset
; i
++)
206 int close_all_fds(const int except
[], unsigned n_except
) {
207 _cleanup_closedir_
DIR *d
= NULL
;
211 assert(n_except
== 0 || except
);
213 d
= opendir("/proc/self/fd");
218 /* When /proc isn't available (for example in chroots)
219 * the fallback is brute forcing through the fd
222 assert_se(getrlimit(RLIMIT_NOFILE
, &rl
) >= 0);
223 for (fd
= 3; fd
< (int) rl
.rlim_max
; fd
++) {
226 if (fd_in_set(fd
, except
, n_except
))
229 q
= close_nointr(fd
);
230 if (q
< 0 && q
!= -EBADF
&& r
>= 0)
237 FOREACH_DIRENT(de
, d
, return -errno
) {
240 if (safe_atoi(de
->d_name
, &fd
) < 0)
241 /* Let's better ignore this, just in case */
250 if (fd_in_set(fd
, except
, n_except
))
253 q
= close_nointr(fd
);
254 if (q
< 0 && q
!= -EBADF
&& r
>= 0) /* Valgrind has its own FD and doesn't want to have it closed */
261 int same_fd(int a
, int b
) {
262 struct stat sta
, stb
;
269 /* Compares two file descriptors. Note that semantics are
270 * quite different depending on whether we have kcmp() or we
271 * don't. If we have kcmp() this will only return true for
272 * dup()ed file descriptors, but not otherwise. If we don't
273 * have kcmp() this will also return true for two fds of the same
274 * file, created by separate open() calls. Since we use this
275 * call mostly for filtering out duplicates in the fd store
276 * this difference hopefully doesn't matter too much. */
281 /* Try to use kcmp() if we have it. */
282 pid
= getpid_cached();
283 r
= kcmp(pid
, pid
, KCMP_FILE
, a
, b
);
291 /* We don't have kcmp(), use fstat() instead. */
292 if (fstat(a
, &sta
) < 0)
295 if (fstat(b
, &stb
) < 0)
298 if ((sta
.st_mode
& S_IFMT
) != (stb
.st_mode
& S_IFMT
))
301 /* We consider all device fds different, since two device fds
302 * might refer to quite different device contexts even though
303 * they share the same inode and backing dev_t. */
305 if (S_ISCHR(sta
.st_mode
) || S_ISBLK(sta
.st_mode
))
308 if (sta
.st_dev
!= stb
.st_dev
|| sta
.st_ino
!= stb
.st_ino
)
311 /* The fds refer to the same inode on disk, let's also check
312 * if they have the same fd flags. This is useful to
313 * distinguish the read and write side of a pipe created with
315 fa
= fcntl(a
, F_GETFL
);
319 fb
= fcntl(b
, F_GETFL
);
326 void cmsg_close_all(struct msghdr
*mh
) {
327 struct cmsghdr
*cmsg
;
331 CMSG_FOREACH(cmsg
, mh
)
332 if (cmsg
->cmsg_level
== SOL_SOCKET
&& cmsg
->cmsg_type
== SCM_RIGHTS
)
333 close_many((int*) CMSG_DATA(cmsg
), (cmsg
->cmsg_len
- CMSG_LEN(0)) / sizeof(int));
336 bool fdname_is_valid(const char *s
) {
339 /* Validates a name for $LISTEN_FDNAMES. We basically allow
340 * everything ASCII that's not a control character. Also, as
341 * special exception the ":" character is not allowed, as we
342 * use that as field separator in $LISTEN_FDNAMES.
344 * Note that the empty string is explicitly allowed
345 * here. However, we limit the length of the names to 255
351 for (p
= s
; *p
; p
++) {
363 int fd_get_path(int fd
, char **ret
) {
364 char procfs_path
[STRLEN("/proc/self/fd/") + DECIMAL_STR_MAX(int)];
367 xsprintf(procfs_path
, "/proc/self/fd/%i", fd
);
369 r
= readlink_malloc(procfs_path
, ret
);
371 if (r
== -ENOENT
) /* If the file doesn't exist the fd is invalid */
377 int move_fd(int from
, int to
, int cloexec
) {
380 /* Move fd 'from' to 'to', make sure FD_CLOEXEC remains equal if requested, and release the old fd. If
381 * 'cloexec' is passed as -1, the original FD_CLOEXEC is inherited for the new fd. If it is 0, it is turned
382 * off, if it is > 0 it is turned on. */
392 r
= fd_cloexec(to
, cloexec
);
403 fl
= fcntl(from
, F_GETFD
, 0);
407 cloexec
= !!(fl
& FD_CLOEXEC
);
410 r
= dup3(from
, to
, cloexec
? O_CLOEXEC
: 0);
421 int acquire_data_fd(const void *data
, size_t size
, unsigned flags
) {
423 char procfs_path
[STRLEN("/proc/self/fd/") + DECIMAL_STR_MAX(int)];
424 _cleanup_close_pair_
int pipefds
[2] = { -1, -1 };
425 char pattern
[] = "/dev/shm/data-fd-XXXXXX";
426 _cleanup_close_
int fd
= -1;
431 assert(data
|| size
== 0);
433 /* Acquire a read-only file descriptor that when read from returns the specified data. This is much more
434 * complex than I wish it was. But here's why:
436 * a) First we try to use memfds. They are the best option, as we can seal them nicely to make them
437 * read-only. Unfortunately they require kernel 3.17, and – at the time of writing – we still support 3.14.
439 * b) Then, we try classic pipes. They are the second best option, as we can close the writing side, retaining
440 * a nicely read-only fd in the reading side. However, they are by default quite small, and unprivileged
441 * clients can only bump their size to a system-wide limit, which might be quite low.
443 * c) Then, we try an O_TMPFILE file in /dev/shm (that dir is the only suitable one known to exist from
444 * earliest boot on). To make it read-only we open the fd a second time with O_RDONLY via
445 * /proc/self/fd/<fd>. Unfortunately O_TMPFILE is not available on older kernels on tmpfs.
447 * d) Finally, we try creating a regular file in /dev/shm, which we then delete.
449 * It sucks a bit that depending on the situation we return very different objects here, but that's Linux I
452 if (size
== 0 && ((flags
& ACQUIRE_NO_DEV_NULL
) == 0)) {
453 /* As a special case, return /dev/null if we have been called for an empty data block */
454 r
= open("/dev/null", O_RDONLY
|O_CLOEXEC
|O_NOCTTY
);
461 if ((flags
& ACQUIRE_NO_MEMFD
) == 0) {
462 fd
= memfd_new("data-fd");
466 n
= write(fd
, data
, size
);
469 if ((size_t) n
!= size
)
472 f
= lseek(fd
, 0, SEEK_SET
);
476 r
= memfd_set_sealed(fd
);
487 if ((flags
& ACQUIRE_NO_PIPE
) == 0) {
488 if (pipe2(pipefds
, O_CLOEXEC
|O_NONBLOCK
) < 0)
491 isz
= fcntl(pipefds
[1], F_GETPIPE_SZ
, 0);
495 if ((size_t) isz
< size
) {
497 if (isz
< 0 || (size_t) isz
!= size
)
500 /* Try to bump the pipe size */
501 (void) fcntl(pipefds
[1], F_SETPIPE_SZ
, isz
);
503 /* See if that worked */
504 isz
= fcntl(pipefds
[1], F_GETPIPE_SZ
, 0);
508 if ((size_t) isz
< size
)
512 n
= write(pipefds
[1], data
, size
);
515 if ((size_t) n
!= size
)
518 (void) fd_nonblock(pipefds
[0], false);
527 if ((flags
& ACQUIRE_NO_TMPFILE
) == 0) {
528 fd
= open("/dev/shm", O_RDWR
|O_TMPFILE
|O_CLOEXEC
, 0500);
530 goto try_dev_shm_without_o_tmpfile
;
532 n
= write(fd
, data
, size
);
535 if ((size_t) n
!= size
)
538 /* Let's reopen the thing, in order to get an O_RDONLY fd for the original O_RDWR one */
539 xsprintf(procfs_path
, "/proc/self/fd/%i", fd
);
540 r
= open(procfs_path
, O_RDONLY
|O_CLOEXEC
);
547 try_dev_shm_without_o_tmpfile
:
548 if ((flags
& ACQUIRE_NO_REGULAR
) == 0) {
549 fd
= mkostemp_safe(pattern
);
553 n
= write(fd
, data
, size
);
556 goto unlink_and_return
;
558 if ((size_t) n
!= size
) {
560 goto unlink_and_return
;
563 /* Let's reopen the thing, in order to get an O_RDONLY fd for the original O_RDWR one */
564 r
= open(pattern
, O_RDONLY
|O_CLOEXEC
);
569 (void) unlink(pattern
);
576 int fd_move_above_stdio(int fd
) {
580 /* Moves the specified file descriptor if possible out of the range [0…2], i.e. the range of
581 * stdin/stdout/stderr. If it can't be moved outside of this range the original file descriptor is
582 * returned. This call is supposed to be used for long-lasting file descriptors we allocate in our code that
583 * might get loaded into foreign code, and where we want to ensure our fds are unlikely to be used accidentally as
584 * stdin/stdout/stderr of unrelated code.
586 * Note that this doesn't fix any real bugs, it just makes it less likely that our code will be affected by
587 * buggy code from others that mindlessly invokes 'fprintf(stderr, …' or similar in places where stderr has
588 * been closed before.
590 * This function is written in a "best-effort" and "least-impact" style. This means whenever we encounter an
591 * error we simply return the original file descriptor, and we do not touch errno. */
593 if (fd
< 0 || fd
> 2)
596 flags
= fcntl(fd
, F_GETFD
, 0);
600 if (flags
& FD_CLOEXEC
)
601 copy
= fcntl(fd
, F_DUPFD_CLOEXEC
, 3);
603 copy
= fcntl(fd
, F_DUPFD
, 3);
613 int rearrange_stdio(int original_input_fd
, int original_output_fd
, int original_error_fd
) {
615 int fd
[3] = { /* Put together an array of fds we work on */
622 null_fd
= -1, /* if we open /dev/null, we store the fd to it here */
623 copy_fd
[3] = { -1, -1, -1 }; /* This contains all fds we duplicate here temporarily, and hence need to close at the end */
624 bool null_readable
, null_writable
;
626 /* Sets up stdin, stdout, stderr with the three file descriptors passed in. If any of the descriptors is
627 * specified as -1 it will be connected with /dev/null instead. If any of the file descriptors is passed as
628 * itself (e.g. stdin as STDIN_FILENO) it is left unmodified, but the O_CLOEXEC bit is turned off should it be
631 * Note that if any of the passed file descriptors are > 2 they will be closed — both on success and on
632 * failure! Thus, callers should assume that when this function returns the input fds are invalidated.
634 * Note that when this function fails stdin/stdout/stderr might remain half set up!
636 * O_CLOEXEC is turned off for all three file descriptors (which is how it should be for
637 * stdin/stdout/stderr). */
639 null_readable
= original_input_fd
< 0;
640 null_writable
= original_output_fd
< 0 || original_error_fd
< 0;
642 /* First step, open /dev/null once, if we need it */
643 if (null_readable
|| null_writable
) {
645 /* Let's open this with O_CLOEXEC first, and convert it to non-O_CLOEXEC when we move the fd to the final position. */
646 null_fd
= open("/dev/null", (null_readable
&& null_writable
? O_RDWR
:
647 null_readable
? O_RDONLY
: O_WRONLY
) | O_CLOEXEC
);
653 /* If this fd is in the 0…2 range, let's move it out of it */
657 copy
= fcntl(null_fd
, F_DUPFD_CLOEXEC
, 3); /* Duplicate this with O_CLOEXEC set */
668 /* Let's assemble fd[] with the fds to install in place of stdin/stdout/stderr */
669 for (i
= 0; i
< 3; i
++) {
672 fd
[i
] = null_fd
; /* A negative parameter means: connect this one to /dev/null */
673 else if (fd
[i
] != i
&& fd
[i
] < 3) {
674 /* This fd is in the 0…2 territory, but not at its intended place, move it out of there, so that we can work there. */
675 copy_fd
[i
] = fcntl(fd
[i
], F_DUPFD_CLOEXEC
, 3); /* Duplicate this with O_CLOEXEC set */
676 if (copy_fd
[i
] < 0) {
685 /* At this point we now have the fds to use in fd[], and they are all above the stdio range, so that we
686 * have freedom to move them around. If the fds already were at the right places then the specific fds are
687 * -1. Let's now move them to the right places. This is the point of no return. */
688 for (i
= 0; i
< 3; i
++) {
692 /* fd is already in place, but let's make sure O_CLOEXEC is off */
693 r
= fd_cloexec(i
, false);
700 if (dup2(fd
[i
], i
) < 0) { /* Turns off O_CLOEXEC on the new fd. */
710 /* Close the original fds, but only if they were outside of the stdio range. Also, properly check for the same
711 * fd passed in multiple times. */
712 safe_close_above_stdio(original_input_fd
);
713 if (original_output_fd
!= original_input_fd
)
714 safe_close_above_stdio(original_output_fd
);
715 if (original_error_fd
!= original_input_fd
&& original_error_fd
!= original_output_fd
)
716 safe_close_above_stdio(original_error_fd
);
718 /* Close the copies we moved > 2 */
719 for (i
= 0; i
< 3; i
++)
720 safe_close(copy_fd
[i
]);
722 /* Close our null fd, if it's > 2 */
723 safe_close_above_stdio(null_fd
);