[thirdparty/systemd.git] / src / basic / fd-util.c

/* SPDX-License-Identifier: LGPL-2.1+ */
/***
  This file is part of systemd.

  Copyright 2010 Lennart Poettering

  systemd is free software; you can redistribute it and/or modify it
  under the terms of the GNU Lesser General Public License as published by
  the Free Software Foundation; either version 2.1 of the License, or
  (at your option) any later version.

  systemd is distributed in the hope that it will be useful, but
  WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  Lesser General Public License for more details.

  You should have received a copy of the GNU Lesser General Public License
  along with systemd; If not, see <http://www.gnu.org/licenses/>.
***/

#include <errno.h>
#include <fcntl.h>
#include <limits.h>
#include <sys/resource.h>
#include <sys/socket.h>
#include <sys/stat.h>
#include <unistd.h>

#include "dirent-util.h"
#include "fd-util.h"
#include "fileio.h"
#include "fs-util.h"
#include "macro.h"
#include "memfd-util.h"
#include "missing.h"
#include "parse-util.h"
#include "path-util.h"
#include "process-util.h"
#include "socket-util.h"
#include "stdio-util.h"
#include "util.h"

/* Closes 'fd' exactly once. Returns 0 on success or if close() was interrupted (EINTR), and a negative
 * errno-style value for every other failure. */
int close_nointr(int fd) {
        int saved;

        assert(fd >= 0);

        if (close(fd) >= 0)
                return 0;

        saved = errno;

        /*
         * EINTR is deliberately treated as success: retrying close() in a
         * loop is the wrong thing to do on Linux.
         *
         * http://lkml.indiana.edu/hypermail/linux/kernel/0509.1/0877.html
         * https://bugzilla.gnome.org/show_bug.cgi?id=682819
         * http://utcc.utoronto.ca/~cks/space/blog/unix/CloseEINTR
         * https://sites.google.com/site/michaelsafyan/software-engineering/checkforeintrwheninvokingclosethinkagain
         */
        return saved == EINTR ? 0 : -saved;
}

int safe_close(int fd) {

        /*
         * Variant of close_nointr() that cannot fail and leaves errno
         * as it found it. Negative fds are silently ignored. The
         * return value is always -1, which allows the idiom:
         *
         * fd = safe_close(fd);
         */

        if (fd >= 0) {
                PROTECT_ERRNO;
                int q;

                /* close() may report all kinds of errors, but the fd
                 * is gone afterwards regardless. The one thing worth
                 * catching is an fd that was never valid in the first
                 * place. */

                q = close_nointr(fd);
                assert_se(q != -EBADF);
        }

        return -1;
}

/* Closes both fds of the pair 'p' via safe_close() and overwrites both slots with its return value. */
void safe_close_pair(int p[]) {
        bool shared;

        assert(p);

        /* Some pairs use one and the same fd for both directions; such
         * an fd must be closed only once. */
        shared = p[0] == p[1];

        p[0] = safe_close(p[0]);
        p[1] = shared ? p[0] : safe_close(p[1]);
}

/* Calls safe_close() on each of the first 'n_fd' entries of 'fds', in order. 'fds' may be NULL only if
 * 'n_fd' is zero. The array itself is not modified. */
void close_many(const int fds[], unsigned n_fd) {
        unsigned idx = 0;

        assert(fds || n_fd <= 0);

        while (idx < n_fd) {
                safe_close(fds[idx]);
                idx++;
        }
}

/* fclose() counterpart of close_nointr(): returns 0 on success or EINTR, a negative errno-style value
 * for any other failure. */
int fclose_nointr(FILE *f) {
        int saved;

        assert(f);

        if (fclose(f) == 0)
                return 0;

        saved = errno;

        /* As with close(), an interrupted fclose() is not retried and counts as success */
        return saved == EINTR ? 0 : -saved;
}

FILE* safe_fclose(FILE *f) {

        /* Same as safe_close(), but for fclose(): closes 'f' if it is non-NULL, leaves errno unchanged and
         * always returns NULL, so that it can be used in this syntax:
         *
         * f = safe_fclose(f);
         */

        if (f) {
                PROTECT_ERRNO;

                /* fclose_nointr() reports failures as negative errno values, hence the check has to be
                 * against -EBADF (a comparison with plain EBADF could never trigger). */
                assert_se(fclose_nointr(f) != -EBADF);
        }

        return NULL;
}

DIR* safe_closedir(DIR *d) {

        /* closedir() counterpart of safe_close(): a NULL 'd' is ignored, errno is left unchanged, and the
         * return value is always NULL. */

        if (d) {
                PROTECT_ERRNO;
                int q;

                /* Any failure other than EBADF is tolerated */
                q = closedir(d);
                assert_se(!(q < 0 && errno == EBADF));
        }

        return NULL;
}

/* Turns O_NONBLOCK on 'fd' on or off, depending on 'nonblock'. The flags are only written back if they
 * actually change. Returns 0 on success, a negative errno-style value if fcntl() fails. */
int fd_nonblock(int fd, bool nonblock) {
        int current, wanted;

        assert(fd >= 0);

        current = fcntl(fd, F_GETFL, 0);
        if (current < 0)
                return -errno;

        wanted = nonblock ? (current | O_NONBLOCK) : (current & ~O_NONBLOCK);

        /* Skip the second syscall if nothing would change */
        if (wanted != current && fcntl(fd, F_SETFL, wanted) < 0)
                return -errno;

        return 0;
}

/* Turns FD_CLOEXEC on 'fd' on or off, depending on 'cloexec'. The flags are only written back if they
 * actually change. Returns 0 on success, a negative errno-style value if fcntl() fails. */
int fd_cloexec(int fd, bool cloexec) {
        int current, wanted;

        assert(fd >= 0);

        current = fcntl(fd, F_GETFD, 0);
        if (current < 0)
                return -errno;

        wanted = cloexec ? (current | FD_CLOEXEC) : (current & ~FD_CLOEXEC);

        /* Skip the second syscall if nothing would change */
        if (wanted != current && fcntl(fd, F_SETFD, wanted) < 0)
                return -errno;

        return 0;
}

/* Clears FD_CLOEXEC on stdin, stdout and stderr, in that order. Failures are deliberately ignored. */
void stdio_unset_cloexec(void) {
        static const int stdio_fds[] = { STDIN_FILENO, STDOUT_FILENO, STDERR_FILENO };
        size_t i;

        for (i = 0; i < sizeof(stdio_fds) / sizeof(stdio_fds[0]); i++)
                (void) fd_cloexec(stdio_fds[i], false);
}

/* Returns true if 'fd' occurs among the first 'n_fdset' entries of 'fdset'. 'fdset' may be NULL only if
 * 'n_fdset' is zero. */
_pure_ static bool fd_in_set(int fd, const int fdset[], unsigned n_fdset) {
        unsigned idx = 0;

        assert(n_fdset == 0 || fdset);

        while (idx < n_fdset) {
                if (fdset[idx] == fd)
                        return true;

                idx++;
        }

        return false;
}

/* Closes all file descriptors >= 3 of the calling process, except those listed in the first 'n_except'
 * entries of 'except' ('except' may be NULL only if 'n_except' is zero).
 *
 * Returns 0 on success. If closing an fd fails with anything other than EBADF, the first such error is
 * returned as a negative errno-style value, but the remaining fds are still processed. A failure to
 * enumerate /proc/self/fd is returned as -errno right away. */
int close_all_fds(const int except[], unsigned n_except) {
        _cleanup_closedir_ DIR *d = NULL;
        struct dirent *de;
        int r = 0;

        assert(n_except == 0 || except);

        d = opendir("/proc/self/fd");
        if (!d) {
                struct rlimit rl;
                int fd, end, q;

                /* When /proc isn't available (for example in chroots)
                 * the fallback is brute forcing through the fd
                 * table */

                assert_se(getrlimit(RLIMIT_NOFILE, &rl) >= 0);

                /* rlim_max may be RLIM_INFINITY or otherwise exceed the range of 'int'. A plain cast would
                 * then yield a negative or bogus bound and the loop would close nothing at all, hence clamp
                 * the (exclusive) upper bound to INT_MAX. */
                if (rl.rlim_max == RLIM_INFINITY || rl.rlim_max > (rlim_t) INT_MAX)
                        end = INT_MAX;
                else
                        end = (int) rl.rlim_max;

                for (fd = 3; fd < end; fd++) {

                        if (fd_in_set(fd, except, n_except))
                                continue;

                        /* EBADF just means the slot wasn't in use; remember only the first real error */
                        q = close_nointr(fd);
                        if (q < 0 && q != -EBADF && r == 0)
                                r = q;
                }

                return r;
        }

        FOREACH_DIRENT(de, d, return -errno) {
                int fd = -1, q;

                if (safe_atoi(de->d_name, &fd) < 0)
                        /* Let's better ignore this, just in case */
                        continue;

                if (fd < 3)
                        continue;

                /* Don't close the fd we are using for the enumeration itself */
                if (fd == dirfd(d))
                        continue;

                if (fd_in_set(fd, except, n_except))
                        continue;

                q = close_nointr(fd);

                /* Valgrind has its own FD and doesn't want to have it closed */
                if (q < 0 && q != -EBADF && r == 0)
                        r = q;
        }

        return r;
}

/* Checks whether the fds 'a' and 'b' refer to the same thing. Returns > 0 if so, 0 if not, and a
 * negative errno-style value on failure. */
int same_fd(int a, int b) {
        struct stat st_a, st_b;
        int cmp, flags_a, flags_b;
        pid_t self;

        assert(a >= 0);
        assert(b >= 0);

        /* The precise meaning of "same" depends on whether kcmp() is
         * available. With kcmp() only dup()ed fds compare equal.
         * Without it, two fds obtained through independent open()
         * calls on the same file compare equal as well. Since this
         * call is mostly used for filtering out duplicates in the fd
         * store this difference hopefully doesn't matter too much. */

        if (a == b)
                return true;

        /* Preferred path: ask the kernel directly. */
        self = getpid_cached();
        cmp = kcmp(self, self, KCMP_FILE, a, b);
        if (cmp >= 0)
                return cmp == 0;
        if (errno != ENOSYS)
                return -errno;

        /* No kcmp() around, fall back to comparing fstat() data. */
        if (fstat(a, &st_a) < 0 || fstat(b, &st_b) < 0)
                return -errno;

        if ((st_a.st_mode & S_IFMT) != (st_b.st_mode & S_IFMT))
                return false;

        /* Device fds are always considered different: two of them may
         * refer to quite different device contexts even if inode and
         * backing dev_t are shared. */
        if (S_ISCHR(st_a.st_mode) || S_ISBLK(st_a.st_mode))
                return false;

        if (st_a.st_dev != st_b.st_dev)
                return false;
        if (st_a.st_ino != st_b.st_ino)
                return false;

        /* Same inode. Also compare the file status flags, which
         * allows telling apart the read and the write side of a pipe
         * created with pipe(). */
        flags_a = fcntl(a, F_GETFL);
        if (flags_a < 0)
                return -errno;

        flags_b = fcntl(b, F_GETFL);
        if (flags_b < 0)
                return -errno;

        return flags_a == flags_b;
}

void cmsg_close_all(struct msghdr *mh) {
        struct cmsghdr *cmsg;

        assert(mh);

        CMSG_FOREACH(cmsg, mh)
                if (cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type == SCM_RIGHTS)
                        close_many((int*) CMSG_DATA(cmsg), (cmsg->cmsg_len - CMSG_LEN(0)) / sizeof(int));
}

bool fdname_is_valid(const char *s) {
        size_t len = 0;

        /* Checks whether 's' is acceptable as a name in
         * $LISTEN_FDNAMES. Accepted are ASCII characters that are
         * not control characters, with the exception of ":", which
         * serves as field separator in $LISTEN_FDNAMES.
         *
         * The empty string passes the check, NULL does not. Names
         * may be at most 255 characters long. */

        if (!s)
                return false;

        while (s[len] != '\0') {
                char c = s[len];

                if (c < ' ' || c >= 127 || c == ':')
                        return false;

                len++;
        }

        return len <= 255;
}

/* Reads the target of the /proc/self/fd/<fd> symlink into '*ret' via readlink_malloc() and passes on its
 * return value, except that -ENOENT is reported as -EBADF. */
int fd_get_path(int fd, char **ret) {
        char procfs_path[STRLEN("/proc/self/fd/") + DECIMAL_STR_MAX(int)];
        int r;

        xsprintf(procfs_path, "/proc/self/fd/%i", fd);

        r = readlink_malloc(procfs_path, ret);

        /* A missing entry in /proc/self/fd/ means the fd itself is invalid */
        return r == -ENOENT ? -EBADF : r;
}

int move_fd(int from, int to, int cloexec) {
        int r;

        /* Moves fd 'from' to fd number 'to' and releases 'from'. 'cloexec' selects the FD_CLOEXEC state of
         * the result: < 0 keeps whatever 'from' had, 0 turns it off, > 0 turns it on. Returns 'to' on
         * success, a negative errno-style value on failure. */

        if (from < 0 || to < 0)
                return -EBADF;

        if (from == to) {
                /* Nothing to move, at most the flag needs adjusting */
                if (cloexec < 0)
                        return to;

                r = fd_cloexec(to, cloexec);
                return r < 0 ? r : to;
        }

        if (cloexec < 0) {
                int fdflags;

                /* Inherit the flag from the source fd */
                fdflags = fcntl(from, F_GETFD, 0);
                if (fdflags < 0)
                        return -errno;

                cloexec = (fdflags & FD_CLOEXEC) != 0;
        }

        r = dup3(from, to, cloexec ? O_CLOEXEC : 0);
        if (r < 0)
                return -errno;

        assert(r == to);

        safe_close(from);

        return to;
}

/* Returns a file descriptor that yields the 'size' bytes at 'data' when read from.
 *
 * 'data' may be NULL only if 'size' is zero. 'flags' is a mask of ACQUIRE_NO_* bits, each of which
 * disables one of the strategies tried below (ACQUIRE_NO_DEV_NULL, ACQUIRE_NO_MEMFD, ACQUIRE_NO_PIPE,
 * ACQUIRE_NO_TMPFILE, ACQUIRE_NO_REGULAR).
 *
 * Returns the new fd (>= 0) on success, or a negative errno-style value on failure; -EOPNOTSUPP is
 * returned if no strategy was left to try. */
int acquire_data_fd(const void *data, size_t size, unsigned flags) {

        char procfs_path[STRLEN("/proc/self/fd/") + DECIMAL_STR_MAX(int)];
        _cleanup_close_pair_ int pipefds[2] = { -1, -1 };
        char pattern[] = "/dev/shm/data-fd-XXXXXX";
        _cleanup_close_ int fd = -1;
        int isz = 0, r;
        ssize_t n;
        off_t f;

        assert(data || size == 0);

        /* Acquire a read-only file descriptor that when read from returns the specified data. This is much more
         * complex than I wish it was. But here's why:
         *
         * a) First we try to use memfds. They are the best option, as we can seal them nicely to make them
         *    read-only. Unfortunately they require kernel 3.17, and – at the time of writing – we still support 3.14.
         *
         * b) Then, we try classic pipes. They are the second best options, as we can close the writing side, retaining
         *    a nicely read-only fd in the reading side. However, they are by default quite small, and unprivileged
         *    clients can only bump their size to a system-wide limit, which might be quite low.
         *
         * c) Then, we try an O_TMPFILE file in /dev/shm (that dir is the only suitable one known to exist from
         *    earliest boot on). To make it read-only we open the fd a second time with O_RDONLY via
         *    /proc/self/fd/<fd>. Unfortunately O_TMPFILE is not available on older kernels on tmpfs.
         *
         * d) Finally, we try creating a regular file in /dev/shm, which we then delete.
         *
         * It sucks a bit that depending on the situation we return very different objects here, but that's Linux I
         * figure. */

        if (size == 0 && ((flags & ACQUIRE_NO_DEV_NULL) == 0)) {
                /* As a special case, return /dev/null if we have been called for an empty data block */
                r = open("/dev/null", O_RDONLY|O_CLOEXEC|O_NOCTTY);
                if (r < 0)
                        return -errno;

                return r;
        }

        /* Strategy a): memfd */
        if ((flags & ACQUIRE_NO_MEMFD) == 0) {
                fd = memfd_new("data-fd");
                if (fd < 0)
                        goto try_pipe;

                /* A short write is treated as an error, not retried */
                n = write(fd, data, size);
                if (n < 0)
                        return -errno;
                if ((size_t) n != size)
                        return -EIO;

                /* Rewind, so that the reader starts at the beginning of the data */
                f = lseek(fd, 0, SEEK_SET);
                if (f != 0)
                        return -errno;

                r = memfd_set_sealed(fd);
                if (r < 0)
                        return r;

                /* Hand the fd over to the caller; resetting 'fd' to -1 presumably keeps the
                 * _cleanup_close_ handler from closing it on return */
                r = fd;
                fd = -1;

                return r;
        }

        /* Strategy b): pipe */
try_pipe:
        if ((flags & ACQUIRE_NO_PIPE) == 0) {
                if (pipe2(pipefds, O_CLOEXEC|O_NONBLOCK) < 0)
                        return -errno;

                isz = fcntl(pipefds[1], F_GETPIPE_SZ, 0);
                if (isz < 0)
                        return -errno;

                if ((size_t) isz < size) {
                        /* The pipe size is passed to fcntl() as an int, refuse sizes that don't fit */
                        isz = (int) size;
                        if (isz < 0 || (size_t) isz != size)
                                return -E2BIG;

                        /* Try to bump the pipe size */
                        (void) fcntl(pipefds[1], F_SETPIPE_SZ, isz);

                        /* See if that worked */
                        isz = fcntl(pipefds[1], F_GETPIPE_SZ, 0);
                        if (isz < 0)
                                return -errno;

                        /* Pipe is still too small for the data, move on to the next strategy */
                        if ((size_t) isz < size)
                                goto try_dev_shm;
                }

                n = write(pipefds[1], data, size);
                if (n < 0)
                        return -errno;
                if ((size_t) n != size)
                        return -EIO;

                /* The pipe was created with O_NONBLOCK; turn that off again on the read end we return */
                (void) fd_nonblock(pipefds[0], false);

                /* Hand out the read end only; the write end stays in pipefds[1], presumably closed by
                 * the _cleanup_close_pair_ handler on return */
                r = pipefds[0];
                pipefds[0] = -1;

                return r;
        }

        /* Strategy c): O_TMPFILE in /dev/shm */
try_dev_shm:
        if ((flags & ACQUIRE_NO_TMPFILE) == 0) {
                fd = open("/dev/shm", O_RDWR|O_TMPFILE|O_CLOEXEC, 0500);
                if (fd < 0)
                        goto try_dev_shm_without_o_tmpfile;

                n = write(fd, data, size);
                if (n < 0)
                        return -errno;
                if ((size_t) n != size)
                        return -EIO;

                /* Let's reopen the thing, in order to get an O_RDONLY fd for the original O_RDWR one */
                xsprintf(procfs_path, "/proc/self/fd/%i", fd);
                r = open(procfs_path, O_RDONLY|O_CLOEXEC);
                if (r < 0)
                        return -errno;

                return r;
        }

        /* Strategy d): regular temporary file in /dev/shm, unlinked before returning */
try_dev_shm_without_o_tmpfile:
        if ((flags & ACQUIRE_NO_REGULAR) == 0) {
                fd = mkostemp_safe(pattern);
                if (fd < 0)
                        return fd;

                n = write(fd, data, size);
                if (n < 0) {
                        r = -errno;
                        goto unlink_and_return;
                }
                if ((size_t) n != size) {
                        r = -EIO;
                        goto unlink_and_return;
                }

                /* Let's reopen the thing, in order to get an O_RDONLY fd for the original O_RDWR one */
                r = open(pattern, O_RDONLY|O_CLOEXEC);
                if (r < 0)
                        r = -errno;

                /* The file name is removed on success and on failure alike */
        unlink_and_return:
                (void) unlink(pattern);
                return r;
        }

        return -EOPNOTSUPP;
}
Commit	Line	Data
53e1b683	1	/* SPDX-License-Identifier: LGPL-2.1+ */
3ffd4af2 LP	2	/***
	3	This file is part of systemd.
	4
	5	Copyright 2010 Lennart Poettering
	6
	7	systemd is free software; you can redistribute it and/or modify it
	8	under the terms of the GNU Lesser General Public License as published by
	9	the Free Software Foundation; either version 2.1 of the License, or
	10	(at your option) any later version.
	11
	12	systemd is distributed in the hope that it will be useful, but
	13	WITHOUT ANY WARRANTY; without even the implied warranty of
	14	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	15	Lesser General Public License for more details.
	16
	17	You should have received a copy of the GNU Lesser General Public License
	18	along with systemd; If not, see <http://www.gnu.org/licenses/>.
	19	***/
	20
11c3a366 TA	21	#include <errno.h>
	22	#include <fcntl.h>
	23	#include <sys/resource.h>
	24	#include <sys/socket.h>
	25	#include <sys/stat.h>
	26	#include <unistd.h>
	27
8fb3f009	28	#include "dirent-util.h"
3ffd4af2	29	#include "fd-util.h"
a548e14d	30	#include "fileio.h"
4aeb20f5	31	#include "fs-util.h"
11c3a366	32	#include "macro.h"
a548e14d	33	#include "memfd-util.h"
11c3a366	34	#include "missing.h"
93cc7779	35	#include "parse-util.h"
11c3a366	36	#include "path-util.h"
df0ff127	37	#include "process-util.h"
93cc7779	38	#include "socket-util.h"
4aeb20f5	39	#include "stdio-util.h"
3ffd4af2 LP	40	#include "util.h"
	41
	42	int close_nointr(int fd) {
	43	assert(fd >= 0);
	44
	45	if (close(fd) >= 0)
	46	return 0;
	47
	48	/*
	49	* Just ignore EINTR; a retry loop is the wrong thing to do on
	50	* Linux.
	51	*
	52	* http://lkml.indiana.edu/hypermail/linux/kernel/0509.1/0877.html
	53	* https://bugzilla.gnome.org/show_bug.cgi?id=682819
	54	* http://utcc.utoronto.ca/~cks/space/blog/unix/CloseEINTR
	55	* https://sites.google.com/site/michaelsafyan/software-engineering/checkforeintrwheninvokingclosethinkagain
	56	*/
	57	if (errno == EINTR)
	58	return 0;
	59
	60	return -errno;
	61	}
	62
	63	int safe_close(int fd) {
	64
	65	/*
	66	* Like close_nointr() but cannot fail. Guarantees errno is
	67	* unchanged. Is a NOP with negative fds passed, and returns
	68	* -1, so that it can be used in this syntax:
	69	*
	70	* fd = safe_close(fd);
	71	*/
	72
	73	if (fd >= 0) {
	74	PROTECT_ERRNO;
	75
	76	/* The kernel might return pretty much any error code
	77	* via close(), but the fd will be closed anyway. The
	78	* only condition we want to check for here is whether
	79	* the fd was invalid at all... */
	80
	81	assert_se(close_nointr(fd) != -EBADF);
	82	}
	83
	84	return -1;
	85	}
	86
	87	void safe_close_pair(int p[]) {
	88	assert(p);
	89
	90	if (p[0] == p[1]) {
	91	/* Special case pairs which use the same fd in both
	92	* directions... */
	93	p[0] = p[1] = safe_close(p[0]);
	94	return;
	95	}
	96
	97	p[0] = safe_close(p[0]);
	98	p[1] = safe_close(p[1]);
	99	}
	100
	101	void close_many(const int fds[], unsigned n_fd) {
	102	unsigned i;
	103
104	assert(fds \|\| n_fd <= 0);
105
106	for (i = 0; i < n_fd; i++)
107	safe_close(fds[i]);
108	}
109
110	int fclose_nointr(FILE *f) {
111	assert(f);
112
113	/* Same as close_nointr(), but for fclose() */
114
115	if (fclose(f) == 0)
116	return 0;
117
118	if (errno == EINTR)
119	return 0;
120
121	return -errno;
122	}
123
124	FILE* safe_fclose(FILE *f) {
125
126	/* Same as safe_close(), but for fclose() */
127
128	if (f) {
129	PROTECT_ERRNO;
130
131	assert_se(fclose_nointr(f) != EBADF);
132	}
133
134	return NULL;
135	}
136
137	DIR* safe_closedir(DIR *d) {
138
139	if (d) {
140	PROTECT_ERRNO;
141
142	assert_se(closedir(d) >= 0 \|\| errno != EBADF);
143	}
144
145	return NULL;
146	}
147
148	int fd_nonblock(int fd, bool nonblock) {
149	int flags, nflags;
150
151	assert(fd >= 0);
152
153	flags = fcntl(fd, F_GETFL, 0);
154	if (flags < 0)
155	return -errno;
156
157	if (nonblock)
158	nflags = flags \| O_NONBLOCK;
159	else
160	nflags = flags & ~O_NONBLOCK;
161
162	if (nflags == flags)
163	return 0;
164
165	if (fcntl(fd, F_SETFL, nflags) < 0)
166	return -errno;
167
168	return 0;
169	}
170
171	int fd_cloexec(int fd, bool cloexec) {
172	int flags, nflags;
173
174	assert(fd >= 0);
175
176	flags = fcntl(fd, F_GETFD, 0);
177	if (flags < 0)
178	return -errno;
179
180	if (cloexec)
181	nflags = flags \| FD_CLOEXEC;
182	else
183	nflags = flags & ~FD_CLOEXEC;
184
185	if (nflags == flags)
186	return 0;
187
188	if (fcntl(fd, F_SETFD, nflags) < 0)
189	return -errno;
190
191	return 0;
192	}
193
3b9a1d87	194	void stdio_unset_cloexec(void) {
61ccf772 LP	195	(void) fd_cloexec(STDIN_FILENO, false);
	196	(void) fd_cloexec(STDOUT_FILENO, false);
	197	(void) fd_cloexec(STDERR_FILENO, false);
3b9a1d87 AK	198	}
3b9a1d87 AK	199
3ffd4af2 LP	200	_pure_ static bool fd_in_set(int fd, const int fdset[], unsigned n_fdset) {
	201	unsigned i;
	202
	203	assert(n_fdset == 0 \|\| fdset);
	204
	205	for (i = 0; i < n_fdset; i++)
	206	if (fdset[i] == fd)
	207	return true;
	208
	209	return false;
	210	}
	211
	212	int close_all_fds(const int except[], unsigned n_except) {
	213	_cleanup_closedir_ DIR *d = NULL;
	214	struct dirent *de;
	215	int r = 0;
	216
	217	assert(n_except == 0 \|\| except);
	218
	219	d = opendir("/proc/self/fd");
	220	if (!d) {
	221	int fd;
	222	struct rlimit rl;
	223
	224	/* When /proc isn't available (for example in chroots)
	225	* the fallback is brute forcing through the fd
	226	* table */
	227
	228	assert_se(getrlimit(RLIMIT_NOFILE, &rl) >= 0);
	229	for (fd = 3; fd < (int) rl.rlim_max; fd ++) {
	230
	231	if (fd_in_set(fd, except, n_except))
	232	continue;
	233
	234	if (close_nointr(fd) < 0)
	235	if (errno != EBADF && r == 0)
	236	r = -errno;
	237	}
	238
	239	return r;
	240	}
	241
8fb3f009	242	FOREACH_DIRENT(de, d, return -errno) {
3ffd4af2 LP	243	int fd = -1;
3ffd4af2 LP	244
3ffd4af2 LP	245	if (safe_atoi(de->d_name, &fd) < 0)
	246	/* Let's better ignore this, just in case */
	247	continue;
	248
	249	if (fd < 3)
	250	continue;
	251
	252	if (fd == dirfd(d))
	253	continue;
	254
	255	if (fd_in_set(fd, except, n_except))
	256	continue;
	257
	258	if (close_nointr(fd) < 0) {
	259	/* Valgrind has its own FD and doesn't want to have it closed */
	260	if (errno != EBADF && r == 0)
	261	r = -errno;
	262	}
	263	}
	264
	265	return r;
	266	}
	267
	268	int same_fd(int a, int b) {
	269	struct stat sta, stb;
	270	pid_t pid;
	271	int r, fa, fb;
	272
	273	assert(a >= 0);
	274	assert(b >= 0);
	275
	276	/* Compares two file descriptors. Note that semantics are
	277	* quite different depending on whether we have kcmp() or we
	278	* don't. If we have kcmp() this will only return true for
	279	* dup()ed file descriptors, but not otherwise. If we don't
	280	* have kcmp() this will also return true for two fds of the same
	281	* file, created by separate open() calls. Since we use this
	282	* call mostly for filtering out duplicates in the fd store
	283	* this difference hopefully doesn't matter too much. */
	284
	285	if (a == b)
	286	return true;
	287
	288	/* Try to use kcmp() if we have it. */
df0ff127	289	pid = getpid_cached();
3ffd4af2 LP	290	r = kcmp(pid, pid, KCMP_FILE, a, b);
	291	if (r == 0)
	292	return true;
	293	if (r > 0)
	294	return false;
	295	if (errno != ENOSYS)
	296	return -errno;
	297
	298	/* We don't have kcmp(), use fstat() instead. */
	299	if (fstat(a, &sta) < 0)
	300	return -errno;
	301
	302	if (fstat(b, &stb) < 0)
	303	return -errno;
	304
	305	if ((sta.st_mode & S_IFMT) != (stb.st_mode & S_IFMT))
	306	return false;
	307
	308	/* We consider all device fds different, since two device fds
	309	* might refer to quite different device contexts even though
	310	* they share the same inode and backing dev_t. */
	311
	312	if (S_ISCHR(sta.st_mode) \|\| S_ISBLK(sta.st_mode))
	313	return false;
	314
	315	if (sta.st_dev != stb.st_dev \|\| sta.st_ino != stb.st_ino)
	316	return false;
	317
	318	/* The fds refer to the same inode on disk, let's also check
	319	* if they have the same fd flags. This is useful to
	320	* distinguish the read and write side of a pipe created with
	321	* pipe(). */
	322	fa = fcntl(a, F_GETFL);
	323	if (fa < 0)
	324	return -errno;
	325
	326	fb = fcntl(b, F_GETFL);
	327	if (fb < 0)
	328	return -errno;
	329
	330	return fa == fb;
	331	}
	332
	333	void cmsg_close_all(struct msghdr *mh) {
	334	struct cmsghdr *cmsg;
	335
	336	assert(mh);
	337
	338	CMSG_FOREACH(cmsg, mh)
	339	if (cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type == SCM_RIGHTS)
	340	close_many((int*) CMSG_DATA(cmsg), (cmsg->cmsg_len - CMSG_LEN(0)) / sizeof(int));
	341	}
4fee3975 LP	342
	343	bool fdname_is_valid(const char *s) {
	344	const char *p;
	345
	346	/* Validates a name for $LISTEN_FDNAMES. We basically allow
	347	* everything ASCII that's not a control character. Also, as
	348	* special exception the ":" character is not allowed, as we
	349	* use that as field separator in $LISTEN_FDNAMES.
	350	*
	351	* Note that the empty string is explicitly allowed
	352	* here. However, we limit the length of the names to 255
	353	* characters. */
	354
	355	if (!s)
	356	return false;
	357
	358	for (p = s; *p; p++) {
	359	if (*p < ' ')
	360	return false;
	361	if (*p >= 127)
	362	return false;
	363	if (*p == ':')
	364	return false;
	365	}
	366
	367	return p - s < 256;
	368	}
4aeb20f5 LP	369
4aeb20f5 LP	370	int fd_get_path(int fd, char **ret) {
dbcb4a90	371	char procfs_path[STRLEN("/proc/self/fd/") + DECIMAL_STR_MAX(int)];
a0fe2a2d	372	int r;
4aeb20f5 LP	373
	374	xsprintf(procfs_path, "/proc/self/fd/%i", fd);
	375
a0fe2a2d LP	376	r = readlink_malloc(procfs_path, ret);
	377
	378	if (r == -ENOENT) /* If the file doesn't exist the fd is invalid */
	379	return -EBADF;
	380
	381	return r;
4aeb20f5	382	}
046a82c1 LP	383
	384	int move_fd(int from, int to, int cloexec) {
	385	int r;
	386
	387	/* Move fd 'from' to 'to', make sure FD_CLOEXEC remains equal if requested, and release the old fd. If
	388	* 'cloexec' is passed as -1, the original FD_CLOEXEC is inherited for the new fd. If it is 0, it is turned
	389	* off, if it is > 0 it is turned on. */
	390
	391	if (from < 0)
	392	return -EBADF;
	393	if (to < 0)
	394	return -EBADF;
	395
	396	if (from == to) {
	397
	398	if (cloexec >= 0) {
	399	r = fd_cloexec(to, cloexec);
	400	if (r < 0)
	401	return r;
	402	}
	403
	404	return to;
	405	}
	406
	407	if (cloexec < 0) {
	408	int fl;
	409
	410	fl = fcntl(from, F_GETFD, 0);
	411	if (fl < 0)
	412	return -errno;
	413
	414	cloexec = !!(fl & FD_CLOEXEC);
	415	}
	416
	417	r = dup3(from, to, cloexec ? O_CLOEXEC : 0);
	418	if (r < 0)
	419	return -errno;
	420
	421	assert(r == to);
	422
	423	safe_close(from);
	424
	425	return to;
	426	}
a548e14d LP	427
	428	int acquire_data_fd(const void *data, size_t size, unsigned flags) {
	429
fbd0b64f	430	char procfs_path[STRLEN("/proc/self/fd/") + DECIMAL_STR_MAX(int)];
a548e14d LP	431	_cleanup_close_pair_ int pipefds[2] = { -1, -1 };
	432	char pattern[] = "/dev/shm/data-fd-XXXXXX";
	433	_cleanup_close_ int fd = -1;
	434	int isz = 0, r;
	435	ssize_t n;
	436	off_t f;
	437
	438	assert(data \|\| size == 0);
	439
	440	/* Acquire a read-only file descriptor that when read from returns the specified data. This is much more
	441	* complex than I wish it was. But here's why:
	442	*
	443	* a) First we try to use memfds. They are the best option, as we can seal them nicely to make them
	444	* read-only. Unfortunately they require kernel 3.17, and – at the time of writing – we still support 3.14.
	445	*
	446	* b) Then, we try classic pipes. They are the second best options, as we can close the writing side, retaining
	447	* a nicely read-only fd in the reading side. However, they are by default quite small, and unprivileged
	448	* clients can only bump their size to a system-wide limit, which might be quite low.
	449	*
	450	* c) Then, we try an O_TMPFILE file in /dev/shm (that dir is the only suitable one known to exist from
	451	* earliest boot on). To make it read-only we open the fd a second time with O_RDONLY via
	452	* /proc/self/<fd>. Unfortunately O_TMPFILE is not available on older kernels on tmpfs.
	453	*
	454	* d) Finally, we try creating a regular file in /dev/shm, which we then delete.
	455	*
	456	* It sucks a bit that depending on the situation we return very different objects here, but that's Linux I
	457	* figure. */
	458
	459	if (size == 0 && ((flags & ACQUIRE_NO_DEV_NULL) == 0)) {
	460	/* As a special case, return /dev/null if we have been called for an empty data block */
	461	r = open("/dev/null", O_RDONLY\|O_CLOEXEC\|O_NOCTTY);
	462	if (r < 0)
	463	return -errno;
	464
	465	return r;
	466	}
	467
	468	if ((flags & ACQUIRE_NO_MEMFD) == 0) {
	469	fd = memfd_new("data-fd");
	470	if (fd < 0)
	471	goto try_pipe;
	472
	473	n = write(fd, data, size);
	474	if (n < 0)
	475	return -errno;
	476	if ((size_t) n != size)
	477	return -EIO;
	478
	479	f = lseek(fd, 0, SEEK_SET);
	480	if (f != 0)
	481	return -errno;
	482
	483	r = memfd_set_sealed(fd);
	484	if (r < 0)
	485	return r;
	486
	487	r = fd;
	488	fd = -1;
	489
	490	return r;
	491	}
	492
	493	try_pipe:
	494	if ((flags & ACQUIRE_NO_PIPE) == 0) {
495	if (pipe2(pipefds, O_CLOEXEC\|O_NONBLOCK) < 0)
496	return -errno;
497
498	isz = fcntl(pipefds[1], F_GETPIPE_SZ, 0);
499	if (isz < 0)
500	return -errno;
501
502	if ((size_t) isz < size) {
503	isz = (int) size;
504	if (isz < 0 \|\| (size_t) isz != size)
505	return -E2BIG;
506
507	/* Try to bump the pipe size */
508	(void) fcntl(pipefds[1], F_SETPIPE_SZ, isz);
509
510	/* See if that worked */
511	isz = fcntl(pipefds[1], F_GETPIPE_SZ, 0);
512	if (isz < 0)
513	return -errno;
514
515	if ((size_t) isz < size)
516	goto try_dev_shm;
517	}
518
519	n = write(pipefds[1], data, size);
520	if (n < 0)
521	return -errno;
522	if ((size_t) n != size)
523	return -EIO;
524
525	(void) fd_nonblock(pipefds[0], false);
526
527	r = pipefds[0];
528	pipefds[0] = -1;
529
530	return r;
531	}
532
533	try_dev_shm:
534	if ((flags & ACQUIRE_NO_TMPFILE) == 0) {
535	fd = open("/dev/shm", O_RDWR\|O_TMPFILE\|O_CLOEXEC, 0500);
536	if (fd < 0)
537	goto try_dev_shm_without_o_tmpfile;
538
539	n = write(fd, data, size);
540	if (n < 0)
541	return -errno;
542	if ((size_t) n != size)
543	return -EIO;
544
545	/* Let's reopen the thing, in order to get an O_RDONLY fd for the original O_RDWR one */
546	xsprintf(procfs_path, "/proc/self/fd/%i", fd);
547	r = open(procfs_path, O_RDONLY\|O_CLOEXEC);
548	if (r < 0)
549	return -errno;
550
551	return r;
552	}
553
554	try_dev_shm_without_o_tmpfile:
555	if ((flags & ACQUIRE_NO_REGULAR) == 0) {
556	fd = mkostemp_safe(pattern);
557	if (fd < 0)
558	return fd;
559
560	n = write(fd, data, size);
561	if (n < 0) {
562	r = -errno;
563	goto unlink_and_return;
564	}
565	if ((size_t) n != size) {
566	r = -EIO;
567	goto unlink_and_return;
568	}
569
570	/* Let's reopen the thing, in order to get an O_RDONLY fd for the original O_RDWR one */
571	r = open(pattern, O_RDONLY\|O_CLOEXEC);
572	if (r < 0)
573	r = -errno;
574
575	unlink_and_return:
576	(void) unlink(pattern);
577	return r;
578	}
579
580	return -EOPNOTSUPP;
581	}