]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/basic/fd-util.c
man/systemd-sysext: list ephemeral/ephemeral-import in the list of options
[thirdparty/systemd.git] / src / basic / fd-util.c
CommitLineData
db9ecf05 1/* SPDX-License-Identifier: LGPL-2.1-or-later */
3ffd4af2 2
11c3a366 3#include <fcntl.h>
fbf68368 4#include <linux/fs.h>
0f633e52 5#include <linux/kcmp.h>
65ddc2c5 6#include <sys/ioctl.h>
11c3a366 7#include <sys/resource.h>
11c3a366
TA
8#include <sys/stat.h>
9#include <unistd.h>
10
4960ce43 11#include "alloc-util.h"
8fb3f009 12#include "dirent-util.h"
0c15577a 13#include "errno-util.h"
3ffd4af2 14#include "fd-util.h"
a548e14d 15#include "fileio.h"
0c15577a 16#include "format-util.h"
4aeb20f5 17#include "fs-util.h"
93a1f792 18#include "log.h"
0499585f 19#include "missing_fcntl.h"
f5947a5e 20#include "missing_syscall.h"
af423b4b 21#include "mountpoint-util.h"
93cc7779 22#include "parse-util.h"
11c3a366 23#include "path-util.h"
df0ff127 24#include "process-util.h"
93cc7779 25#include "socket-util.h"
b8cfa2da 26#include "sort-util.h"
f8606626 27#include "stat-util.h"
4aeb20f5 28#include "stdio-util.h"
0c15577a 29#include "string-util.h"
3ffd4af2 30
6a461d1f
ZJS
/* Upper bound on the number of fd values the brute-force fallback is willing to iterate over
 * when /proc/self/fd/ cannot be used to enumerate the open descriptors. */
#define MAX_FD_LOOP_LIMIT (1024 * 1024)
34
3ffd4af2
LP
/* Closes 'fd'. Returns 0 on success, or a negative errno-style value on failure.
 *
 * EINTR is reported as success and never retried: a retry loop is the wrong thing to do on Linux.
 * Background reading:
 *
 * http://lkml.indiana.edu/hypermail/linux/kernel/0509.1/0877.html
 * https://bugzilla.gnome.org/show_bug.cgi?id=682819
 * http://utcc.utoronto.ca/~cks/space/blog/unix/CloseEINTR
 * https://sites.google.com/site/michaelsafyan/software-engineering/checkforeintrwheninvokingclosethinkagain
 */
int close_nointr(int fd) {
        assert(fd >= 0);

        if (close(fd) < 0 && errno != EINTR)
                return -errno;

        return 0;
}
55
56int safe_close(int fd) {
3ffd4af2 57 /*
254d1313
ZJS
58 * Like close_nointr() but cannot fail. Guarantees errno is unchanged. Is a noop for negative fds,
59 * and returns -EBADF, so that it can be used in this syntax:
3ffd4af2
LP
60 *
61 * fd = safe_close(fd);
62 */
63
64 if (fd >= 0) {
65 PROTECT_ERRNO;
66
67 /* The kernel might return pretty much any error code
68 * via close(), but the fd will be closed anyway. The
69 * only condition we want to check for here is whether
70 * the fd was invalid at all... */
71
72 assert_se(close_nointr(fd) != -EBADF);
73 }
74
254d1313 75 return -EBADF;
3ffd4af2
LP
76}
77
/* Closes both fds of a pair via safe_close() and overwrites both slots with its return value. */
void safe_close_pair(int p[static 2]) {
        assert(p);

        if (p[0] != p[1]) {
                p[0] = safe_close(p[0]);
                p[1] = safe_close(p[1]);
                return;
        }

        /* Both slots carry the same fd (a pair using one fd for both directions): close it only once. */
        p[0] = p[1] = safe_close(p[0]);
}
91
1276e633
MY
/* Passes each of the 'n_fds' entries of 'fds' to safe_close(). The array itself is not modified. */
void close_many(const int fds[], size_t n_fds) {
        assert(fds || n_fds == 0);

        for (size_t i = 0; i < n_fds; i++)
                safe_close(fds[i]);
}
98
1276e633
MY
/* Like close_many(), but additionally overwrites every array slot with the value returned by
 * safe_close(), so that the array no longer references the closed fds. */
void close_many_unset(int fds[], size_t n_fds) {
        assert(fds || n_fds == 0);

        for (size_t i = 0; i < n_fds; i++)
                fds[i] = safe_close(fds[i]);
}
105
/* Closes all 'n_fds' entries of 'fds' and then releases the array itself with free(). */
void close_many_and_free(int *fds, size_t n_fds) {
        assert(fds || n_fds == 0);

        for (size_t i = 0; i < n_fds; i++)
                safe_close(fds[i]);

        free(fds);
}
112
3ffd4af2
LP
/* fclose() counterpart of close_nointr(): returns 0 on success or when fclose() failed with EINTR,
 * and otherwise whatever errno_or_else(EIO) yields (presumably -errno, with -EIO as fallback when
 * errno was not set — confirm against errno-util.h). */
int fclose_nointr(FILE *f) {
        assert(f);

        /* Extra safety: a FILE* that does not wrap an fd might not set errno properly on failure.
         * Clear it up front so that a stale value from earlier cannot confuse the checks below. */
        errno = 0;

        if (fclose(f) != 0 && errno != EINTR)
                return errno_or_else(EIO);

        return 0;
}
129
130FILE* safe_fclose(FILE *f) {
131
132 /* Same as safe_close(), but for fclose() */
133
134 if (f) {
135 PROTECT_ERRNO;
136
6dce3bb4 137 assert_se(fclose_nointr(f) != -EBADF);
3ffd4af2
LP
138 }
139
140 return NULL;
141}
142
143DIR* safe_closedir(DIR *d) {
144
145 if (d) {
146 PROTECT_ERRNO;
147
148 assert_se(closedir(d) >= 0 || errno != EBADF);
149 }
150
151 return NULL;
152}
153
/* Turns O_NONBLOCK on or off for 'fd'.
 *
 * Returns 0 if the flag already had the requested state, 1 if it was changed, and a negative
 * errno-style value if querying or updating the file status flags failed. */
int fd_nonblock(int fd, bool nonblock) {
        assert(fd >= 0);

        int current = fcntl(fd, F_GETFL, 0);
        if (current < 0)
                return -errno;

        int wanted = UPDATE_FLAG(current, O_NONBLOCK, nonblock);
        if (wanted == current)
                return 0; /* Nothing to do, spare us the second syscall */

        return fcntl(fd, F_SETFL, wanted) < 0 ? -errno : 1;
}
172
3b1e80f7
LP
/* Turns O_NONBLOCK off on stdin, stdout and stderr, in that order. All three are attempted even if
 * an earlier one fails; the results are combined with RET_GATHER() (presumably retaining the first
 * error — confirm against errno-util.h). */
int stdio_disable_nonblock(void) {
        const int stdio_fds[3] = { STDIN_FILENO, STDOUT_FILENO, STDERR_FILENO };
        int ret = 0;

        /* stdin/stdout/stderr really should not have O_NONBLOCK set: it confuses apps if left on, as
         * write()s might unexpectedly fail with EAGAIN. */

        for (size_t i = 0; i < 3; i++)
                RET_GATHER(ret, fd_nonblock(stdio_fds[i], false));

        return ret;
}
185
3ffd4af2
LP
/* Turns FD_CLOEXEC on or off for 'fd'.
 *
 * Returns 0 without a second syscall if the flag already had the requested state; otherwise the
 * outcome of the F_SETFD call as converted by RET_NERRNO(), or a negative errno-style value if the
 * flags could not be queried. */
int fd_cloexec(int fd, bool cloexec) {
        assert(fd >= 0);

        int current = fcntl(fd, F_GETFD, 0);
        if (current < 0)
                return -errno;

        int wanted = UPDATE_FLAG(current, FD_CLOEXEC, cloexec);
        if (wanted == current)
                return 0;

        return RET_NERRNO(fcntl(fd, F_SETFD, wanted));
}
201
/* Applies fd_cloexec() to every non-negative entry of 'fds'. Processing continues after failures;
 * the individual results are combined with RET_GATHER(). */
int fd_cloexec_many(const int fds[], size_t n_fds, bool cloexec) {
        int r = 0;

        assert(fds || n_fds == 0);

        for (size_t i = 0; i < n_fds; i++)
                if (fds[i] >= 0) /* Negative entries are already invalidated, skip them gracefully */
                        RET_GATHER(r, fd_cloexec(fds[i], cloexec));

        return r;
}
216
1276e633
MY
/* Returns true if the (valid) fd is listed in 'fds'. Negative array entries never match. */
static bool fd_in_set(int fd, const int fds[], size_t n_fds) {
        assert(fd >= 0);
        assert(fds || n_fds == 0);

        for (size_t i = 0; i < n_fds; i++)
                if (fds[i] >= 0 && fds[i] == fd)
                        return true;

        return false;
}
231
/* Returns the highest possible fd number, derived from RLIMIT_NOFILE (the larger of the soft and the
 * hard limit), but never less than FD_SETSIZE-1 and never more than INT_MAX. Returns a negative
 * errno-style value if the resource limit cannot be queried. */
int get_max_fd(void) {
        struct rlimit rl;

        if (getrlimit(RLIMIT_NOFILE, &rl) < 0)
                return -errno;

        rlim_t limit = rl.rlim_cur > rl.rlim_max ? rl.rlim_cur : rl.rlim_max;

        /* Let's always cover at least FD_SETSIZE fds */
        if (limit < FD_SETSIZE)
                return FD_SETSIZE-1;

        /* Saturate on overflow: fds are "int" after all, hence can never exceed INT_MAX */
        if (limit == RLIM_INFINITY || limit > INT_MAX)
                return INT_MAX;

        return (int) (limit - 1);
}
252
5cfa0798 253static int close_all_fds_frugal(const int except[], size_t n_except) {
11966552
LP
254 int max_fd, r = 0;
255
1276e633 256 assert(except || n_except == 0);
11966552
LP
257
258 /* This is the inner fallback core of close_all_fds(). This never calls malloc() or opendir() or so
259 * and hence is safe to be called in signal handler context. Most users should call close_all_fds(),
260 * but when we assume we are called from signal handler context, then use this simpler call
261 * instead. */
262
263 max_fd = get_max_fd();
264 if (max_fd < 0)
265 return max_fd;
266
267 /* Refuse to do the loop over more too many elements. It's better to fail immediately than to
268 * spin the CPU for a long time. */
269 if (max_fd > MAX_FD_LOOP_LIMIT)
270 return log_debug_errno(SYNTHETIC_ERRNO(EPERM),
1276e633 271 "Refusing to loop over %d potential fds.", max_fd);
11966552 272
5bb1d7fb 273 for (int fd = 3; fd >= 0; fd = fd < max_fd ? fd + 1 : -EBADF) {
11966552
LP
274 int q;
275
276 if (fd_in_set(fd, except, n_except))
277 continue;
278
279 q = close_nointr(fd);
1276e633
MY
280 if (q != -EBADF)
281 RET_GATHER(r, q);
11966552
LP
282 }
283
284 return r;
285}
286
/* Whether close_range() is believed to be usable. Starts out optimistic and is cleared the first time
 * the call fails with a "not supported" or "no privilege" class of error. */
static bool have_close_range = true; /* Assume we live in the future */

/* Handles the common cases of "keep nothing" and "keep exactly one fd" directly via close_range(),
 * which requires no sorting of the exception list.
 *
 * Returns > 0 if the special casing worked and everything was closed, 0 if the caller has to use a
 * different strategy, and a negative errno-style value on any other close_range() failure. */
static int close_all_fds_special_case(const int except[], size_t n_except) {
        assert(n_except == 0 || except);

        if (!have_close_range)
                return 0;

        /* Minor optimization: a single exception that is an invalid fd is the same as no exception */
        if (n_except == 1 && except[0] < 0)
                n_except = 0;

        if (n_except > 1)
                return 0;

        if (n_except == 0) {
                /* Close everything. Yay! */
                if (close_range(3, INT_MAX, 0) >= 0)
                        return 1;
        } else {
                /* Close all but exactly one: first the range below the fd to keep, then the range
                 * above it. The second call is only attempted if the first did not fail. */
                int keep = except[0];
                bool ok = true;

                if (keep > 3 && close_range(3, keep - 1, 0) < 0)
                        ok = false;

                if (ok && keep < INT_MAX && close_range(MAX(3, keep + 1), -1, 0) < 0)
                        ok = false;

                if (ok)
                        return 1;
        }

        /* close_range() failed, errno tells us why */
        if (ERRNO_IS_NOT_SUPPORTED(errno) || ERRNO_IS_PRIVILEGE(errno)) {
                have_close_range = false;
                return 0;
        }

        return -errno;
}
c85cb3bc 337
5cfa0798
LP
/* Closes all fds >= 3 except those listed in 'except': tries the close_range() special cases first
 * and otherwise falls back to the plain loop in close_all_fds_frugal(). Returns 0 on success or a
 * negative errno-style value. */
int close_all_fds_without_malloc(const int except[], size_t n_except) {
        assert(n_except == 0 || except);

        int r = close_all_fds_special_case(except, n_except);
        if (r != 0)
                return r < 0 ? r : 0; /* either a hard error, or the special case worked */

        return close_all_fds_frugal(except, n_except);
}
b8cfa2da 351
5cfa0798
LP
352int close_all_fds(const int except[], size_t n_except) {
353 _cleanup_closedir_ DIR *d = NULL;
5cfa0798
LP
354 int r = 0;
355
356 assert(n_except == 0 || except);
357
358 r = close_all_fds_special_case(except, n_except);
359 if (r < 0)
360 return r;
361 if (r > 0) /* special case worked! */
362 return 0;
363
364 if (have_close_range) {
365 _cleanup_free_ int *sorted_malloc = NULL;
366 size_t n_sorted;
367 int *sorted;
368
369 /* In the best case we have close_range() to close all fds between a start and an end fd,
370 * which we can use on the "inverted" exception array, i.e. all intervals between all
371 * adjacent pairs from the sorted exception array. This changes loop complexity from O(n)
372 * where n is number of open fds to O(m⋅log(m)) where m is the number of fds to keep
373 * open. Given that we assume n ≫ m that's preferable to us. */
b8cfa2da 374
5cfa0798
LP
375 assert(n_except < SIZE_MAX);
376 n_sorted = n_except + 1;
c85cb3bc 377
5cfa0798
LP
378 if (n_sorted > 64) /* Use heap for large numbers of fds, stack otherwise */
379 sorted = sorted_malloc = new(int, n_sorted);
380 else
381 sorted = newa(int, n_sorted);
c85cb3bc 382
5cfa0798
LP
383 if (sorted) {
384 memcpy(sorted, except, n_except * sizeof(int));
c85cb3bc 385
5cfa0798
LP
386 /* Let's add fd 2 to the list of fds, to simplify the loop below, as this
387 * allows us to cover the head of the array the same way as the body */
388 sorted[n_sorted-1] = 2;
b8cfa2da 389
5cfa0798
LP
390 typesafe_qsort(sorted, n_sorted, cmp_int);
391
392 for (size_t i = 0; i < n_sorted-1; i++) {
393 int start, end;
b8cfa2da 394
5cfa0798
LP
395 start = MAX(sorted[i], 2); /* The first three fds shall always remain open */
396 end = MAX(sorted[i+1], 2);
b8cfa2da 397
5cfa0798 398 assert(end >= start);
b8cfa2da 399
5cfa0798
LP
400 if (end - start <= 1)
401 continue;
b8cfa2da 402
5cfa0798
LP
403 /* Close everything between the start and end fds (both of which shall stay open) */
404 if (close_range(start + 1, end - 1, 0) < 0) {
c85cb3bc
LP
405 if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno))
406 return -errno;
b8cfa2da 407
c85cb3bc 408 have_close_range = false;
5cfa0798 409 break;
c85cb3bc
LP
410 }
411 }
5cfa0798
LP
412
413 if (have_close_range) {
414 /* The loop succeeded. Let's now close everything beyond the end */
415
416 if (sorted[n_sorted-1] >= INT_MAX) /* Dont let the addition below overflow */
417 return 0;
418
39d69836 419 if (close_range(sorted[n_sorted-1] + 1, INT_MAX, 0) >= 0)
5cfa0798
LP
420 return 0;
421
422 if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno))
423 return -errno;
424
425 have_close_range = false;
426 }
b8cfa2da 427 }
c85cb3bc
LP
428
429 /* Fallback on OOM or if close_range() is not supported */
b8cfa2da
LP
430 }
431
e7e7c07c 432 d = opendir("/proc/self/fd");
11966552 433 if (!d)
5cfa0798 434 return close_all_fds_frugal(except, n_except); /* ultimate fallback if /proc/ is not available */
3ffd4af2 435
c85cb3bc 436 FOREACH_DIRENT(de, d, return -errno) {
254d1313 437 int fd = -EBADF, q;
3ffd4af2 438
1f6639ea
LP
439 if (!IN_SET(de->d_type, DT_LNK, DT_UNKNOWN))
440 continue;
441
e652663a
DT
442 fd = parse_fd(de->d_name);
443 if (fd < 0)
c85cb3bc
LP
444 /* Let's better ignore this, just in case */
445 continue;
3ffd4af2 446
c85cb3bc
LP
447 if (fd < 3)
448 continue;
3ffd4af2 449
c85cb3bc
LP
450 if (fd == dirfd(d))
451 continue;
3ffd4af2
LP
452
453 if (fd_in_set(fd, except, n_except))
454 continue;
455
e43bc9f5 456 q = close_nointr(fd);
c85cb3bc 457 if (q < 0 && q != -EBADF && r >= 0) /* Valgrind has its own FD and doesn't want to have it closed */
e43bc9f5 458 r = q;
3ffd4af2
LP
459 }
460
461 return r;
462}
463
85f660d4
AV
/* Shifts around the fds in the provided array such that they all end up packed next to each other,
 * in order, starting at fd 3 (SD_LISTEN_FDS_START, per the original note). This must be called
 * after close_all_fds(); it is likely to freeze up otherwise. You should probably use safe_fork_full
 * with FORK_CLOSE_ALL_FDS|FORK_PACK_FDS set, to ensure that this is done correctly. The fds array is
 * modified in place with the new fd numbers.
 *
 * Returns 0 on success, or a negative errno-style value if duplicating an fd failed. */
int pack_fds(int fds[], size_t n_fds) {
        if (n_fds == 0)
                return 0;

        assert(fds);

        int count = (int) n_fds, start = 0, restart_from;

        do {
                restart_from = -1;

                for (int i = start; i < count; i++) {
                        int target = i + 3;

                        /* Already at right index? */
                        if (fds[i] == target)
                                continue;

                        int nfd = fcntl(fds[i], F_DUPFD, target);
                        if (nfd < 0)
                                return -errno;

                        safe_close(fds[i]);
                        fds[i] = nfd;

                        /* Hmm, the fd number we wanted isn't free? Then remember the first such
                         * index and make another pass starting from there. */
                        if (nfd != target && restart_from < 0)
                                restart_from = i;
                }

                start = restart_from;
        } while (restart_from >= 0);

        assert(fds[0] == 3);

        return 0;
}
510
6056663a
LP
/* Checks whether 'fd' refers to an open file descriptor. Returns 0 if so, -EBADF for negative
 * values, and -errno if the F_GETFD query on it fails. */
int fd_validate(int fd) {
        if (fd < 0)
                return -EBADF;

        return fcntl(fd, F_GETFD) < 0 ? -errno : 0;
}
520
3ffd4af2
LP
521int same_fd(int a, int b) {
522 struct stat sta, stb;
523 pid_t pid;
524 int r, fa, fb;
525
526 assert(a >= 0);
527 assert(b >= 0);
528
675e7fc2 529 /* Compares two file descriptors. Note that semantics are quite different depending on whether we
e7f90534
LP
530 * have F_DUPFD_QUERY/kcmp() or we don't. If we have F_DUPFD_QUERY/kcmp() this will only return true
531 * for dup()ed file descriptors, but not otherwise. If we don't have F_DUPFD_QUERY/kcmp() this will
532 * also return true for two fds of the same file, created by separate open() calls. Since we use this
533 * call mostly for filtering out duplicates in the fd store this difference hopefully doesn't matter
534 * too much.
535 *
536 * Guarantees that if either of the passed fds is not allocated we'll return -EBADF. */
537
538 if (a == b) {
539 /* Let's validate that the fd is valid */
540 r = fd_validate(a);
541 if (r < 0)
542 return r;
3ffd4af2 543
3ffd4af2 544 return true;
e7f90534
LP
545 }
546
547 /* Try to use F_DUPFD_QUERY if we have it first, as it is the nicest API */
548 r = fcntl(a, F_DUPFD_QUERY, b);
549 if (r > 0)
550 return true;
551 if (r == 0) {
552 /* The kernel will return 0 in case the first fd is allocated, but the 2nd is not. (Which is different in the kcmp() case) Explicitly validate it hence. */
553 r = fd_validate(b);
554 if (r < 0)
555 return r;
556
557 return false;
558 }
559 /* On old kernels (< 6.10) that do not support F_DUPFD_QUERY this will return EINVAL for regular fds, and EBADF on O_PATH fds. Confusing. */
560 if (errno == EBADF) {
561 /* EBADF could mean two things: the first fd is not valid, or it is valid and is O_PATH and
562 * F_DUPFD_QUERY is not supported. Let's validate the fd explicitly, to distinguish this
563 * case. */
564 r = fd_validate(a);
565 if (r < 0)
566 return r;
567
568 /* If the fd is valid, but we got EBADF, then let's try kcmp(). */
569 } else if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno) && errno != EINVAL)
570 return -errno;
3ffd4af2
LP
571
572 /* Try to use kcmp() if we have it. */
df0ff127 573 pid = getpid_cached();
3ffd4af2 574 r = kcmp(pid, pid, KCMP_FILE, a, b);
e7f90534
LP
575 if (r >= 0)
576 return !r;
675e7fc2 577 if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno))
3ffd4af2
LP
578 return -errno;
579
e7f90534 580 /* We have neither F_DUPFD_QUERY nor kcmp(), use fstat() instead. */
3ffd4af2
LP
581 if (fstat(a, &sta) < 0)
582 return -errno;
583
584 if (fstat(b, &stb) < 0)
585 return -errno;
586
a9dac7a6 587 if (!stat_inode_same(&sta, &stb))
3ffd4af2
LP
588 return false;
589
675e7fc2
LP
590 /* We consider all device fds different, since two device fds might refer to quite different device
591 * contexts even though they share the same inode and backing dev_t. */
3ffd4af2
LP
592
593 if (S_ISCHR(sta.st_mode) || S_ISBLK(sta.st_mode))
594 return false;
595
a9dac7a6
LP
596 /* The fds refer to the same inode on disk, let's also check if they have the same fd flags. This is
597 * useful to distinguish the read and write side of a pipe created with pipe(). */
3ffd4af2
LP
598 fa = fcntl(a, F_GETFL);
599 if (fa < 0)
600 return -errno;
601
602 fb = fcntl(b, F_GETFL);
603 if (fb < 0)
604 return -errno;
605
606 return fa == fb;
607}
608
4fee3975
LP
609bool fdname_is_valid(const char *s) {
610 const char *p;
611
612 /* Validates a name for $LISTEN_FDNAMES. We basically allow
613 * everything ASCII that's not a control character. Also, as
614 * special exception the ":" character is not allowed, as we
615 * use that as field separator in $LISTEN_FDNAMES.
616 *
617 * Note that the empty string is explicitly allowed
618 * here. However, we limit the length of the names to 255
619 * characters. */
620
621 if (!s)
622 return false;
623
624 for (p = s; *p; p++) {
625 if (*p < ' ')
626 return false;
627 if (*p >= 127)
628 return false;
629 if (*p == ':')
630 return false;
631 }
632
ae3f4bae 633 return p - s <= FDNAME_MAX;
4fee3975 634}
4aeb20f5
LP
635
636int fd_get_path(int fd, char **ret) {
a0fe2a2d 637 int r;
4aeb20f5 638
46693a79
YW
639 assert(fd >= 0 || fd == AT_FDCWD);
640
641 if (fd == AT_FDCWD)
642 return safe_getcwd(ret);
643
ddb6eeaf 644 r = readlink_malloc(FORMAT_PROC_FD_PATH(fd), ret);
d19b3c5d
MY
645 if (r == -ENOENT)
646 return proc_fd_enoent_errno();
a0fe2a2d 647 return r;
4aeb20f5 648}
046a82c1
LP
649
650int move_fd(int from, int to, int cloexec) {
651 int r;
652
653 /* Move fd 'from' to 'to', make sure FD_CLOEXEC remains equal if requested, and release the old fd. If
654 * 'cloexec' is passed as -1, the original FD_CLOEXEC is inherited for the new fd. If it is 0, it is turned
655 * off, if it is > 0 it is turned on. */
656
657 if (from < 0)
658 return -EBADF;
659 if (to < 0)
660 return -EBADF;
661
662 if (from == to) {
663
664 if (cloexec >= 0) {
665 r = fd_cloexec(to, cloexec);
666 if (r < 0)
667 return r;
668 }
669
670 return to;
671 }
672
673 if (cloexec < 0) {
674 int fl;
675
676 fl = fcntl(from, F_GETFD, 0);
677 if (fl < 0)
678 return -errno;
679
1276e633 680 cloexec = FLAGS_SET(fl, FD_CLOEXEC);
046a82c1
LP
681 }
682
683 r = dup3(from, to, cloexec ? O_CLOEXEC : 0);
684 if (r < 0)
685 return -errno;
686
687 assert(r == to);
688
689 safe_close(from);
690
691 return to;
692}
a548e14d 693
7fe2903c
LP
694int fd_move_above_stdio(int fd) {
695 int flags, copy;
696 PROTECT_ERRNO;
697
698 /* Moves the specified file descriptor if possible out of the range [0…2], i.e. the range of
699 * stdin/stdout/stderr. If it can't be moved outside of this range the original file descriptor is
700 * returned. This call is supposed to be used for long-lasting file descriptors we allocate in our code that
701 * might get loaded into foreign code, and where we want ensure our fds are unlikely used accidentally as
702 * stdin/stdout/stderr of unrelated code.
703 *
704 * Note that this doesn't fix any real bugs, it just makes it less likely that our code will be affected by
705 * buggy code from others that mindlessly invokes 'fprintf(stderr, …' or similar in places where stderr has
706 * been closed before.
707 *
708 * This function is written in a "best-effort" and "least-impact" style. This means whenever we encounter an
709 * error we simply return the original file descriptor, and we do not touch errno. */
710
711 if (fd < 0 || fd > 2)
712 return fd;
713
714 flags = fcntl(fd, F_GETFD, 0);
715 if (flags < 0)
716 return fd;
717
718 if (flags & FD_CLOEXEC)
719 copy = fcntl(fd, F_DUPFD_CLOEXEC, 3);
720 else
721 copy = fcntl(fd, F_DUPFD, 3);
722 if (copy < 0)
723 return fd;
724
725 assert(copy > 2);
726
727 (void) close(fd);
728 return copy;
729}
aa11e28b
LP
730
731int rearrange_stdio(int original_input_fd, int original_output_fd, int original_error_fd) {
351293b3
ZJS
732 int fd[3] = { original_input_fd, /* Put together an array of fds we work on */
733 original_output_fd,
734 original_error_fd },
735 null_fd = -EBADF, /* If we open /dev/null, we store the fd to it here */
71136404 736 copy_fd[3] = EBADF_TRIPLET, /* This contains all fds we duplicate here
351293b3
ZJS
737 * temporarily, and hence need to close at the end. */
738 r;
aa11e28b
LP
739 bool null_readable, null_writable;
740
254d1313
ZJS
741 /* Sets up stdin, stdout, stderr with the three file descriptors passed in. If any of the descriptors
742 * is specified as -EBADF it will be connected with /dev/null instead. If any of the file descriptors
743 * is passed as itself (e.g. stdin as STDIN_FILENO) it is left unmodified, but the O_CLOEXEC bit is
744 * turned off should it be on.
aa11e28b 745 *
254d1313
ZJS
746 * Note that if any of the passed file descriptors are > 2 they will be closed — both on success and
747 * on failure! Thus, callers should assume that when this function returns the input fds are
748 * invalidated.
aa11e28b
LP
749 *
750 * Note that when this function fails stdin/stdout/stderr might remain half set up!
751 *
752 * O_CLOEXEC is turned off for all three file descriptors (which is how it should be for
753 * stdin/stdout/stderr). */
754
755 null_readable = original_input_fd < 0;
756 null_writable = original_output_fd < 0 || original_error_fd < 0;
757
758 /* First step, open /dev/null once, if we need it */
759 if (null_readable || null_writable) {
760
761 /* Let's open this with O_CLOEXEC first, and convert it to non-O_CLOEXEC when we move the fd to the final position. */
762 null_fd = open("/dev/null", (null_readable && null_writable ? O_RDWR :
763 null_readable ? O_RDONLY : O_WRONLY) | O_CLOEXEC);
764 if (null_fd < 0) {
765 r = -errno;
766 goto finish;
767 }
768
769 /* If this fd is in the 0…2 range, let's move it out of it */
770 if (null_fd < 3) {
771 int copy;
772
773 copy = fcntl(null_fd, F_DUPFD_CLOEXEC, 3); /* Duplicate this with O_CLOEXEC set */
774 if (copy < 0) {
775 r = -errno;
776 goto finish;
777 }
778
ee3455cf 779 close_and_replace(null_fd, copy);
aa11e28b
LP
780 }
781 }
782
783 /* Let's assemble fd[] with the fds to install in place of stdin/stdout/stderr */
4199866a 784 for (int i = 0; i < 3; i++)
aa11e28b
LP
785 if (fd[i] < 0)
786 fd[i] = null_fd; /* A negative parameter means: connect this one to /dev/null */
787 else if (fd[i] != i && fd[i] < 3) {
788 /* This fd is in the 0…2 territory, but not at its intended place, move it out of there, so that we can work there. */
789 copy_fd[i] = fcntl(fd[i], F_DUPFD_CLOEXEC, 3); /* Duplicate this with O_CLOEXEC set */
790 if (copy_fd[i] < 0) {
791 r = -errno;
792 goto finish;
793 }
794
795 fd[i] = copy_fd[i];
796 }
aa11e28b 797
254d1313
ZJS
798 /* At this point we now have the fds to use in fd[], and they are all above the stdio range, so that
799 * we have freedom to move them around. If the fds already were at the right places then the specific
800 * fds are -EBADF. Let's now move them to the right places. This is the point of no return. */
4199866a 801 for (int i = 0; i < 3; i++)
aa11e28b 802 if (fd[i] == i) {
aa11e28b
LP
803 /* fd is already in place, but let's make sure O_CLOEXEC is off */
804 r = fd_cloexec(i, false);
805 if (r < 0)
806 goto finish;
aa11e28b
LP
807 } else {
808 assert(fd[i] > 2);
809
810 if (dup2(fd[i], i) < 0) { /* Turns off O_CLOEXEC on the new fd. */
811 r = -errno;
812 goto finish;
813 }
814 }
aa11e28b
LP
815
816 r = 0;
817
818finish:
819 /* Close the original fds, but only if they were outside of the stdio range. Also, properly check for the same
820 * fd passed in multiple times. */
821 safe_close_above_stdio(original_input_fd);
822 if (original_output_fd != original_input_fd)
823 safe_close_above_stdio(original_output_fd);
824 if (original_error_fd != original_input_fd && original_error_fd != original_output_fd)
825 safe_close_above_stdio(original_error_fd);
826
827 /* Close the copies we moved > 2 */
cf45db55 828 close_many(copy_fd, 3);
aa11e28b
LP
829
830 /* Close our null fd, if it's > 2 */
831 safe_close_above_stdio(null_fd);
832
833 return r;
834}
f2324783
LP
835
836int fd_reopen(int fd, int flags) {
42db4a8d 837 assert(fd >= 0 || fd == AT_FDCWD);
05314c9c 838 assert(!FLAGS_SET(flags, O_CREAT));
42db4a8d 839
f2324783
LP
840 /* Reopens the specified fd with new flags. This is useful for convert an O_PATH fd into a regular one, or to
841 * turn O_RDWR fds into O_RDONLY fds.
842 *
843 * This doesn't work on sockets (since they cannot be open()ed, ever).
844 *
42db4a8d
LP
845 * This implicitly resets the file read index to 0.
846 *
fdb583e6
LP
847 * If AT_FDCWD is specified as file descriptor gets an fd to the current cwd.
848 *
849 * If the specified file descriptor refers to a symlink via O_PATH, then this function cannot be used
850 * to follow that symlink. Because we cannot have non-O_PATH fds to symlinks reopening it without
851 * O_PATH will always result in -ELOOP. Or in other words: if you have an O_PATH fd to a symlink you
852 * can reopen it only if you pass O_PATH again. */
853
854 if (FLAGS_SET(flags, O_NOFOLLOW))
855 /* O_NOFOLLOW is not allowed in fd_reopen(), because after all this is primarily implemented
856 * via a symlink-based interface in /proc/self/fd. Let's refuse this here early. Note that
857 * the kernel would generate ELOOP here too, hence this manual check is mostly redundant –
858 * the only reason we add it here is so that the O_DIRECTORY special case (see below) behaves
859 * the same way as the non-O_DIRECTORY case. */
860 return -ELOOP;
f2324783 861
05314c9c 862 if (FLAGS_SET(flags, O_DIRECTORY) || fd == AT_FDCWD)
b4f73d1e
LP
863 /* If we shall reopen the fd as directory we can just go via "." and thus bypass the whole
864 * magic /proc/ directory, and make ourselves independent of that being mounted. */
05314c9c 865 return RET_NERRNO(openat(fd, ".", flags | O_DIRECTORY));
42db4a8d 866
05314c9c 867 int new_fd = open(FORMAT_PROC_FD_PATH(fd), flags);
f8606626
LP
868 if (new_fd < 0) {
869 if (errno != ENOENT)
870 return -errno;
871
d19b3c5d 872 return proc_fd_enoent_errno();
f8606626 873 }
f2324783
LP
874
875 return new_fd;
876}
9264cc39 877
b8e25bff
LE
878int fd_reopen_propagate_append_and_position(int fd, int flags) {
879 /* Invokes fd_reopen(fd, flags), but propagates O_APPEND if set on original fd, and also tries to
880 * keep current file position.
881 *
882 * You should use this if the original fd potentially is O_APPEND, otherwise we get rather
883 * "unexpected" behavior. Unless you intentionally want to overwrite pre-existing data, and have
884 * your output overwritten by the next user.
885 *
886 * Use case: "systemd-run --pty >> some-log".
887 *
888 * The "keep position" part is obviously nonsense for the O_APPEND case, but should reduce surprises
889 * if someone carefully pre-positioned the passed in original input or non-append output FDs. */
890
891 assert(fd >= 0);
892 assert(!(flags & (O_APPEND|O_DIRECTORY)));
893
894 int existing_flags = fcntl(fd, F_GETFL);
895 if (existing_flags < 0)
896 return -errno;
897
898 int new_fd = fd_reopen(fd, flags | (existing_flags & O_APPEND));
899 if (new_fd < 0)
900 return new_fd;
901
2599b32e 902 /* Try to adjust the offset, but ignore errors. */
b8e25bff 903 off_t p = lseek(fd, 0, SEEK_CUR);
2599b32e
MY
904 if (p > 0) {
905 off_t new_p = lseek(new_fd, p, SEEK_SET);
906 if (new_p < 0)
907 log_debug_errno(errno,
908 "Failed to propagate file position for re-opened fd %d, ignoring: %m",
909 fd);
910 else if (new_p != p)
911 log_debug("Failed to propagate file position for re-opened fd %d (%lld != %lld), ignoring.",
912 fd, (long long) new_p, (long long) p);
913 }
b8e25bff
LE
914
915 return new_fd;
916}
917
5f5865f0
LP
/* Invokes fd_reopen(fd, flags), but only if the existing F_GETFL flags don't match the specified
 * flags (masked by the specified mask). This is useful for converting O_PATH fds into real fds if
 * needed, but only then.
 *
 * Returns the fd to use from now on, or a negative errno-style value on failure:
 *
 *   - if the flags already match, 'fd' itself is returned and *ret_new_fd is set to -EBADF;
 *   - if the fd had to be reopened, the new fd is both returned and stored in *ret_new_fd.
 *
 * On failure *ret_new_fd is left untouched. 'ret_new_fd' must not be NULL. */
int fd_reopen_condition(
                int fd,
                int flags,
                int mask,
                int *ret_new_fd) {

        int r, new_fd;

        assert(fd >= 0);
        assert(!FLAGS_SET(flags, O_CREAT));
        assert(ret_new_fd); /* Written to unconditionally on success, hence catch NULL early */

        r = fcntl(fd, F_GETFL);
        if (r < 0)
                return -errno;

        if ((r & mask) == (flags & mask)) {
                /* Nothing to reopen: signal that no new fd was allocated */
                *ret_new_fd = -EBADF;
                return fd;
        }

        new_fd = fd_reopen(fd, flags);
        if (new_fd < 0)
                return new_fd;

        *ret_new_fd = new_fd;
        return new_fd;
}
949
ea61e2e9
YW
950int fd_is_opath(int fd) {
951 int r;
952
953 assert(fd >= 0);
954
955 r = fcntl(fd, F_GETFL);
956 if (r < 0)
957 return -errno;
958
959 return FLAGS_SET(r, O_PATH);
960}
961
9f65355b 962int fd_verify_safe_flags_full(int fd, int extra_flags) {
14f38d17
MY
963 int flags, unexpected_flags;
964
965 /* Check if an extrinsic fd is safe to work on (by a privileged service). This ensures that clients
966 * can't trick a privileged service into giving access to a file the client doesn't already have
967 * access to (especially via something like O_PATH).
968 *
9f65355b 969 * O_NOFOLLOW: For some reason the kernel will return this flag from fcntl(); it doesn't go away
14f38d17
MY
970 * immediately after open(). It should have no effect whatsoever to an already-opened FD,
971 * and since we refuse O_PATH it should be safe.
972 *
973 * RAW_O_LARGEFILE: glibc secretly sets this and neglects to hide it from us if we call fcntl.
974 * See comment in missing_fcntl.h for more details about this.
4a5aa684 975 *
9f65355b 976 * If 'extra_flags' is specified as non-zero the included flags are also allowed.
14f38d17
MY
977 */
978
979 assert(fd >= 0);
980
981 flags = fcntl(fd, F_GETFL);
982 if (flags < 0)
983 return -errno;
984
b1236ce3 985 unexpected_flags = flags & ~(O_ACCMODE_STRICT|O_NOFOLLOW|RAW_O_LARGEFILE|extra_flags);
14f38d17
MY
986 if (unexpected_flags != 0)
987 return log_debug_errno(SYNTHETIC_ERRNO(EREMOTEIO),
988 "Unexpected flags set for extrinsic fd: 0%o",
989 (unsigned) unexpected_flags);
990
b1236ce3 991 return flags & (O_ACCMODE_STRICT | extra_flags); /* return the flags variable, but remove the noise */
14f38d17
MY
992}
993
9264cc39
LP
994int read_nr_open(void) {
995 _cleanup_free_ char *nr_open = NULL;
996 int r;
997
998 /* Returns the kernel's current fd limit, either by reading it of /proc/sys if that works, or using the
999 * hard-coded default compiled-in value of current kernels (1M) if not. This call will never fail. */
1000
1001 r = read_one_line_file("/proc/sys/fs/nr_open", &nr_open);
1002 if (r < 0)
1003 log_debug_errno(r, "Failed to read /proc/sys/fs/nr_open, ignoring: %m");
1004 else {
1005 int v;
1006
1007 r = safe_atoi(nr_open, &v);
1008 if (r < 0)
1009 log_debug_errno(r, "Failed to parse /proc/sys/fs/nr_open value '%s', ignoring: %m", nr_open);
1010 else
1011 return v;
1012 }
1013
2aed63f4 1014 /* If we fail, fall back to the hard-coded kernel limit of 1024 * 1024. */
9264cc39
LP
1015 return 1024 * 1024;
1016}
65ddc2c5 1017
7e93a658
YW
1018int fd_get_diskseq(int fd, uint64_t *ret) {
1019 uint64_t diskseq;
1020
1021 assert(fd >= 0);
1022 assert(ret);
1023
1024 if (ioctl(fd, BLKGETDISKSEQ, &diskseq) < 0) {
1025 /* Note that the kernel is weird: non-existing ioctls currently return EINVAL
1026 * rather than ENOTTY on loopback block devices. They should fix that in the kernel,
1027 * but in the meantime we accept both here. */
1028 if (!ERRNO_IS_NOT_SUPPORTED(errno) && errno != EINVAL)
1029 return -errno;
1030
1031 return -EOPNOTSUPP;
1032 }
1033
1034 *ret = diskseq;
1035
1036 return 0;
1037}
af423b4b 1038
8a65b0b2 1039int path_is_root_at(int dir_fd, const char *path) {
5134e546 1040 _cleanup_close_ int fd = -EBADF, pfd = -EBADF;
af423b4b 1041
8a65b0b2
DDM
1042 assert(dir_fd >= 0 || dir_fd == AT_FDCWD);
1043
1044 if (!isempty(path)) {
549a9a67 1045 fd = openat(dir_fd, path, O_PATH|O_DIRECTORY|O_CLOEXEC);
8a65b0b2 1046 if (fd < 0)
549a9a67 1047 return errno == ENOTDIR ? false : -errno;
8a65b0b2
DDM
1048
1049 dir_fd = fd;
1050 }
af423b4b 1051
5134e546
LB
1052 pfd = openat(dir_fd, "..", O_PATH|O_DIRECTORY|O_CLOEXEC);
1053 if (pfd < 0)
1054 return errno == ENOTDIR ? false : -errno;
1055
1056 /* Even if the parent directory has the same inode, the fd may not point to the root directory "/",
1057 * and we also need to check that the mount ids are the same. Otherwise, a construct like the
1058 * following could be used to trick us:
1059 *
1060 * $ mkdir /tmp/x /tmp/x/y
1061 * $ mount --bind /tmp/x /tmp/x/y
1062 */
1063
1064 return fds_are_same_mount(dir_fd, pfd);
1065}
1066
1067int fds_are_same_mount(int fd1, int fd2) {
4424e6c8 1068 struct statx sx1 = {}, sx2 = {}; /* explicitly initialize the struct to make msan silent. */
5134e546
LB
1069 int r;
1070
1071 assert(fd1 >= 0);
1072 assert(fd2 >= 0);
1073
d5ddc0e0
YW
1074 if (statx(fd1, "", AT_EMPTY_PATH, STATX_TYPE|STATX_INO|STATX_MNT_ID, &sx1) < 0)
1075 return -errno;
af423b4b 1076
d5ddc0e0
YW
1077 if (statx(fd2, "", AT_EMPTY_PATH, STATX_TYPE|STATX_INO|STATX_MNT_ID, &sx2) < 0)
1078 return -errno;
b4cb4c5c
YW
1079
1080 /* First, compare inode. If these are different, the fd does not point to the root directory "/". */
4424e6c8 1081 if (!statx_inode_same(&sx1, &sx2))
b4cb4c5c
YW
1082 return false;
1083
5134e546 1084 /* Note, statx() does not provide the mount ID and path_get_mnt_id_at() does not work when an old
bd96111d
YW
1085 * kernel is used. In that case, let's assume that we do not have such spurious mount points in an
1086 * early boot stage, and silently skip the following check. */
8d3c49b1 1087
4424e6c8 1088 if (!FLAGS_SET(sx1.stx_mask, STATX_MNT_ID)) {
af423b4b
DDM
1089 int mntid;
1090
5134e546 1091 r = path_get_mnt_id_at_fallback(fd1, "", &mntid);
bb44fd07 1092 if (r < 0)
af423b4b
DDM
1093 return r;
1094 assert(mntid >= 0);
1095
4424e6c8
YW
1096 sx1.stx_mnt_id = mntid;
1097 sx1.stx_mask |= STATX_MNT_ID;
af423b4b
DDM
1098 }
1099
4424e6c8 1100 if (!FLAGS_SET(sx2.stx_mask, STATX_MNT_ID)) {
af423b4b
DDM
1101 int mntid;
1102
5134e546 1103 r = path_get_mnt_id_at_fallback(fd2, "", &mntid);
bb44fd07 1104 if (r < 0)
af423b4b
DDM
1105 return r;
1106 assert(mntid >= 0);
1107
4424e6c8
YW
1108 sx2.stx_mnt_id = mntid;
1109 sx2.stx_mask |= STATX_MNT_ID;
af423b4b
DDM
1110 }
1111
4424e6c8 1112 return statx_mount_same(&sx1, &sx2);
af423b4b 1113}
b2b84f4b 1114
0c15577a
DDM
1115char* format_proc_fd_path(char buf[static PROC_FD_PATH_MAX], int fd) {
1116 assert(buf);
1117 assert(fd >= 0);
1118 assert_se(snprintf_ok(buf, PROC_FD_PATH_MAX, "/proc/self/fd/%i", fd));
1119 return buf;
1120}
1121
bfd5a068 1122const char* accmode_to_string(int flags) {
b1236ce3 1123 switch (flags & O_ACCMODE_STRICT) {
b2b84f4b
LP
1124 case O_RDONLY:
1125 return "ro";
1126 case O_WRONLY:
1127 return "wo";
1128 case O_RDWR:
1129 return "rw";
1130 default:
1131 return NULL;
1132 }
1133}
61c062f8 1134
ff3f2953 1135char* format_proc_pid_fd_path(char buf[static PROC_PID_FD_PATH_MAX], pid_t pid, int fd) {
61c062f8
LP
1136 assert(buf);
1137 assert(fd >= 0);
1138 assert(pid >= 0);
1139 assert_se(snprintf_ok(buf, PROC_PID_FD_PATH_MAX, "/proc/" PID_FMT "/fd/%i", pid == 0 ? getpid_cached() : pid, fd));
1140 return buf;
1141}
d19b3c5d
MY
1142
int proc_fd_enoent_errno(void) {
        int mounted;

        /* When ENOENT comes back while using FORMAT_PROC_FD_PATH it can mean two things: either the fd
         * does not exist, or /proc/ is not mounted. To keep things debuggable, figure out the most
         * appropriate errno to report. */

        mounted = proc_mounted();
        if (mounted < 0)
                return -ENOENT; /* We could not find out, hence propagate the original ENOENT. */

        return mounted > 0 ?
                -EBADF :        /* /proc/ is definitely around, hence the fd itself is not valid. */
                -ENOSYS;        /* /proc/ is not available or not set up properly, we're most likely
                                 * in some chroot environment. */
}