src/basic/socket-util.c

   1 /* SPDX-License-Identifier: LGPL-2.1+ */
   2
   3 #include <arpa/inet.h>
   4 #include <errno.h>
   5 #include <limits.h>
   6 #include <net/if.h>
   7 #include <netdb.h>
   8 #include <netinet/ip.h>
   9 #include <poll.h>
  10 #include <stddef.h>
  11 #include <stdint.h>
  12 #include <stdio.h>
  13 #include <stdlib.h>
  14 #include <sys/ioctl.h>
  15 #include <unistd.h>
  16 #include <linux/if.h>
  17
  18 #include "alloc-util.h"
  19 #include "errno-util.h"
  20 #include "escape.h"
  21 #include "fd-util.h"
  22 #include "fileio.h"
  23 #include "format-util.h"
  24 #include "log.h"
  25 #include "macro.h"
  26 #include "memory-util.h"
  27 #include "missing_socket.h"
  28 #include "parse-util.h"
  29 #include "path-util.h"
  30 #include "process-util.h"
  31 #include "socket-util.h"
  32 #include "string-table.h"
  33 #include "string-util.h"
  34 #include "strv.h"
  35 #include "user-util.h"
  36 #include "utf8.h"
  37
  38 #if ENABLE_IDN
  39 #  define IDN_FLAGS NI_IDN
  40 #else
  41 #  define IDN_FLAGS 0
  42 #endif
  43
  44 static const char* const socket_address_type_table[] = {
  45         [SOCK_STREAM] =    "Stream",
  46         [SOCK_DGRAM] =     "Datagram",
  47         [SOCK_RAW] =       "Raw",
  48         [SOCK_RDM] =       "ReliableDatagram",
  49         [SOCK_SEQPACKET] = "SequentialPacket",
  50         [SOCK_DCCP] =      "DatagramCongestionControl",
  51 };
  52
  53 DEFINE_STRING_TABLE_LOOKUP(socket_address_type, int);
  54
  55 int socket_address_verify(const SocketAddress *a, bool strict) {
  56         assert(a);
  57
  58         /* With 'strict' we enforce additional sanity constraints which are not set by the standard,
  59          * but should only apply to sockets we create ourselves. */
  60
  61         switch (socket_address_family(a)) {
  62
  63         case AF_INET:
  64                 if (a->size != sizeof(struct sockaddr_in))
  65                         return -EINVAL;
  66
  67                 if (a->sockaddr.in.sin_port == 0)
  68                         return -EINVAL;
  69
  70                 if (!IN_SET(a->type, SOCK_STREAM, SOCK_DGRAM))
  71                         return -EINVAL;
  72
  73                 return 0;
  74
  75         case AF_INET6:
  76                 if (a->size != sizeof(struct sockaddr_in6))
  77                         return -EINVAL;
  78
  79                 if (a->sockaddr.in6.sin6_port == 0)
  80                         return -EINVAL;
  81
  82                 if (!IN_SET(a->type, SOCK_STREAM, SOCK_DGRAM))
  83                         return -EINVAL;
  84
  85                 return 0;
  86
  87         case AF_UNIX:
  88                 if (a->size < offsetof(struct sockaddr_un, sun_path))
  89                         return -EINVAL;
  90                 if (a->size > sizeof(struct sockaddr_un) + !strict)
  91                         /* If !strict, allow one extra byte, since getsockname() on Linux will append
  92                          * a NUL byte if we have path sockets that are above sun_path's full size. */
  93                         return -EINVAL;
  94
  95                 if (a->size > offsetof(struct sockaddr_un, sun_path) &&
  96                     a->sockaddr.un.sun_path[0] != 0 &&
  97                     strict) {
  98                         /* Only validate file system sockets here, and only in strict mode */
  99                         const char *e;
 100
 101                         e = memchr(a->sockaddr.un.sun_path, 0, sizeof(a->sockaddr.un.sun_path));
 102                         if (e) {
 103                                 /* If there's an embedded NUL byte, make sure the size of the socket address matches it */
 104                                 if (a->size != offsetof(struct sockaddr_un, sun_path) + (e - a->sockaddr.un.sun_path) + 1)
 105                                         return -EINVAL;
 106                         } else {
 107                                 /* If there's no embedded NUL byte, then then the size needs to match the whole
 108                                  * structure or the structure with one extra NUL byte suffixed. (Yeah, Linux is awful,
 109                                  * and considers both equivalent: getsockname() even extends sockaddr_un beyond its
 110                                  * size if the path is non NUL terminated.)*/
 111                                 if (!IN_SET(a->size, sizeof(a->sockaddr.un.sun_path), sizeof(a->sockaddr.un.sun_path)+1))
 112                                         return -EINVAL;
 113                         }
 114                 }
 115
 116                 if (!IN_SET(a->type, SOCK_STREAM, SOCK_DGRAM, SOCK_SEQPACKET))
 117                         return -EINVAL;
 118
 119                 return 0;
 120
 121         case AF_NETLINK:
 122
 123                 if (a->size != sizeof(struct sockaddr_nl))
 124                         return -EINVAL;
 125
 126                 if (!IN_SET(a->type, SOCK_RAW, SOCK_DGRAM))
 127                         return -EINVAL;
 128
 129                 return 0;
 130
 131         case AF_VSOCK:
 132                 if (a->size != sizeof(struct sockaddr_vm))
 133                         return -EINVAL;
 134
 135                 if (!IN_SET(a->type, SOCK_STREAM, SOCK_DGRAM))
 136                         return -EINVAL;
 137
 138                 return 0;
 139
 140         default:
 141                 return -EAFNOSUPPORT;
 142         }
 143 }
 144
 145 int socket_address_print(const SocketAddress *a, char **ret) {
 146         int r;
 147
 148         assert(a);
 149         assert(ret);
 150
 151         r = socket_address_verify(a, false); /* We do non-strict validation, because we want to be
 152                                               * able to pretty-print any socket the kernel considers
 153                                               * valid. We still need to do validation to know if we
 154                                               * can meaningfully print the address. */
 155         if (r < 0)
 156                 return r;
 157
 158         if (socket_address_family(a) == AF_NETLINK) {
 159                 _cleanup_free_ char *sfamily = NULL;
 160
 161                 r = netlink_family_to_string_alloc(a->protocol, &sfamily);
 162                 if (r < 0)
 163                         return r;
 164
 165                 r = asprintf(ret, "%s %u", sfamily, a->sockaddr.nl.nl_groups);
 166                 if (r < 0)
 167                         return -ENOMEM;
 168
 169                 return 0;
 170         }
 171
 172         return sockaddr_pretty(&a->sockaddr.sa, a->size, false, true, ret);
 173 }
 174
 175 bool socket_address_can_accept(const SocketAddress *a) {
 176         assert(a);
 177
 178         return
 179                 IN_SET(a->type, SOCK_STREAM, SOCK_SEQPACKET);
 180 }
 181
 182 bool socket_address_equal(const SocketAddress *a, const SocketAddress *b) {
 183         assert(a);
 184         assert(b);
 185
 186         /* Invalid addresses are unequal to all */
 187         if (socket_address_verify(a, false) < 0 ||
 188             socket_address_verify(b, false) < 0)
 189                 return false;
 190
 191         if (a->type != b->type)
 192                 return false;
 193
 194         if (socket_address_family(a) != socket_address_family(b))
 195                 return false;
 196
 197         switch (socket_address_family(a)) {
 198
 199         case AF_INET:
 200                 if (a->sockaddr.in.sin_addr.s_addr != b->sockaddr.in.sin_addr.s_addr)
 201                         return false;
 202
 203                 if (a->sockaddr.in.sin_port != b->sockaddr.in.sin_port)
 204                         return false;
 205
 206                 break;
 207
 208         case AF_INET6:
 209                 if (memcmp(&a->sockaddr.in6.sin6_addr, &b->sockaddr.in6.sin6_addr, sizeof(a->sockaddr.in6.sin6_addr)) != 0)
 210                         return false;
 211
 212                 if (a->sockaddr.in6.sin6_port != b->sockaddr.in6.sin6_port)
 213                         return false;
 214
 215                 break;
 216
 217         case AF_UNIX:
 218                 if (a->size <= offsetof(struct sockaddr_un, sun_path) ||
 219                     b->size <= offsetof(struct sockaddr_un, sun_path))
 220                         return false;
 221
 222                 if ((a->sockaddr.un.sun_path[0] == 0) != (b->sockaddr.un.sun_path[0] == 0))
 223                         return false;
 224
 225                 if (a->sockaddr.un.sun_path[0]) {
 226                         if (!path_equal_or_files_same(a->sockaddr.un.sun_path, b->sockaddr.un.sun_path, 0))
 227                                 return false;
 228                 } else {
 229                         if (a->size != b->size)
 230                                 return false;
 231
 232                         if (memcmp(a->sockaddr.un.sun_path, b->sockaddr.un.sun_path, a->size) != 0)
 233                                 return false;
 234                 }
 235
 236                 break;
 237
 238         case AF_NETLINK:
 239                 if (a->protocol != b->protocol)
 240                         return false;
 241
 242                 if (a->sockaddr.nl.nl_groups != b->sockaddr.nl.nl_groups)
 243                         return false;
 244
 245                 break;
 246
 247         case AF_VSOCK:
 248                 if (a->sockaddr.vm.svm_cid != b->sockaddr.vm.svm_cid)
 249                         return false;
 250
 251                 if (a->sockaddr.vm.svm_port != b->sockaddr.vm.svm_port)
 252                         return false;
 253
 254                 break;
 255
 256         default:
 257                 /* Cannot compare, so we assume the addresses are different */
 258                 return false;
 259         }
 260
 261         return true;
 262 }
 263
 264 const char* socket_address_get_path(const SocketAddress *a) {
 265         assert(a);
 266
 267         if (socket_address_family(a) != AF_UNIX)
 268                 return NULL;
 269
 270         if (a->sockaddr.un.sun_path[0] == 0)
 271                 return NULL;
 272
 273         /* Note that this is only safe because we know that there's an extra NUL byte after the sockaddr_un
 274          * structure. On Linux AF_UNIX file system socket addresses don't have to be NUL terminated if they take up the
 275          * full sun_path space. */
 276         assert_cc(sizeof(union sockaddr_union) >= sizeof(struct sockaddr_un)+1);
 277         return a->sockaddr.un.sun_path;
 278 }
 279
 280 bool socket_ipv6_is_supported(void) {
 281         if (access("/proc/net/if_inet6", F_OK) != 0)
 282                 return false;
 283
 284         return true;
 285 }
 286
 287 bool socket_address_matches_fd(const SocketAddress *a, int fd) {
 288         SocketAddress b;
 289         socklen_t solen;
 290
 291         assert(a);
 292         assert(fd >= 0);
 293
 294         b.size = sizeof(b.sockaddr);
 295         if (getsockname(fd, &b.sockaddr.sa, &b.size) < 0)
 296                 return false;
 297
 298         if (b.sockaddr.sa.sa_family != a->sockaddr.sa.sa_family)
 299                 return false;
 300
 301         solen = sizeof(b.type);
 302         if (getsockopt(fd, SOL_SOCKET, SO_TYPE, &b.type, &solen) < 0)
 303                 return false;
 304
 305         if (b.type != a->type)
 306                 return false;
 307
 308         if (a->protocol != 0)  {
 309                 solen = sizeof(b.protocol);
 310                 if (getsockopt(fd, SOL_SOCKET, SO_PROTOCOL, &b.protocol, &solen) < 0)
 311                         return false;
 312
 313                 if (b.protocol != a->protocol)
 314                         return false;
 315         }
 316
 317         return socket_address_equal(a, &b);
 318 }
 319
 320 int sockaddr_port(const struct sockaddr *_sa, unsigned *ret_port) {
 321         union sockaddr_union *sa = (union sockaddr_union*) _sa;
 322
 323         /* Note, this returns the port as 'unsigned' rather than 'uint16_t', as AF_VSOCK knows larger ports */
 324
 325         assert(sa);
 326
 327         switch (sa->sa.sa_family) {
 328
 329         case AF_INET:
 330                 *ret_port = be16toh(sa->in.sin_port);
 331                 return 0;
 332
 333         case AF_INET6:
 334                 *ret_port = be16toh(sa->in6.sin6_port);
 335                 return 0;
 336
 337         case AF_VSOCK:
 338                 *ret_port = sa->vm.svm_port;
 339                 return 0;
 340
 341         default:
 342                 return -EAFNOSUPPORT;
 343         }
 344 }
 345
 346 int sockaddr_pretty(
 347                 const struct sockaddr *_sa,
 348                 socklen_t salen,
 349                 bool translate_ipv6,
 350                 bool include_port,
 351                 char **ret) {
 352
 353         union sockaddr_union *sa = (union sockaddr_union*) _sa;
 354         char *p;
 355         int r;
 356
 357         assert(sa);
 358         assert(salen >= sizeof(sa->sa.sa_family));
 359
 360         switch (sa->sa.sa_family) {
 361
 362         case AF_INET: {
 363                 uint32_t a;
 364
 365                 a = be32toh(sa->in.sin_addr.s_addr);
 366
 367                 if (include_port)
 368                         r = asprintf(&p,
 369                                      "%u.%u.%u.%u:%u",
 370                                      a >> 24, (a >> 16) & 0xFF, (a >> 8) & 0xFF, a & 0xFF,
 371                                      be16toh(sa->in.sin_port));
 372                 else
 373                         r = asprintf(&p,
 374                                      "%u.%u.%u.%u",
 375                                      a >> 24, (a >> 16) & 0xFF, (a >> 8) & 0xFF, a & 0xFF);
 376                 if (r < 0)
 377                         return -ENOMEM;
 378                 break;
 379         }
 380
 381         case AF_INET6: {
 382                 static const unsigned char ipv4_prefix[] = {
 383                         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xFF, 0xFF
 384                 };
 385
 386                 if (translate_ipv6 &&
 387                     memcmp(&sa->in6.sin6_addr, ipv4_prefix, sizeof(ipv4_prefix)) == 0) {
 388                         const uint8_t *a = sa->in6.sin6_addr.s6_addr+12;
 389                         if (include_port)
 390                                 r = asprintf(&p,
 391                                              "%u.%u.%u.%u:%u",
 392                                              a[0], a[1], a[2], a[3],
 393                                              be16toh(sa->in6.sin6_port));
 394                         else
 395                                 r = asprintf(&p,
 396                                              "%u.%u.%u.%u",
 397                                              a[0], a[1], a[2], a[3]);
 398                         if (r < 0)
 399                                 return -ENOMEM;
 400                 } else {
 401                         char a[INET6_ADDRSTRLEN];
 402
 403                         inet_ntop(AF_INET6, &sa->in6.sin6_addr, a, sizeof(a));
 404
 405                         if (include_port) {
 406                                 r = asprintf(&p,
 407                                              "[%s]:%u",
 408                                              a,
 409                                              be16toh(sa->in6.sin6_port));
 410                                 if (r < 0)
 411                                         return -ENOMEM;
 412                         } else {
 413                                 p = strdup(a);
 414                                 if (!p)
 415                                         return -ENOMEM;
 416                         }
 417                 }
 418
 419                 break;
 420         }
 421
 422         case AF_UNIX:
 423                 if (salen <= offsetof(struct sockaddr_un, sun_path) ||
 424                     (sa->un.sun_path[0] == 0 && salen == offsetof(struct sockaddr_un, sun_path) + 1))
 425                         /* The name must have at least one character (and the leading NUL does not count) */
 426                         p = strdup("<unnamed>");
 427                 else {
 428                         /* Note that we calculate the path pointer here through the .un_buffer[] field, in order to
 429                          * outtrick bounds checking tools such as ubsan, which are too smart for their own good: on
 430                          * Linux the kernel may return sun_path[] data one byte longer than the declared size of the
 431                          * field. */
 432                         char *path = (char*) sa->un_buffer + offsetof(struct sockaddr_un, sun_path);
 433                         size_t path_len = salen - offsetof(struct sockaddr_un, sun_path);
 434
 435                         if (path[0] == 0) {
 436                                 /* Abstract socket. When parsing address information from, we
 437                                  * explicitly reject overly long paths and paths with embedded NULs.
 438                                  * But we might get such a socket from the outside. Let's return
 439                                  * something meaningful and printable in this case. */
 440
 441                                 _cleanup_free_ char *e = NULL;
 442
 443                                 e = cescape_length(path + 1, path_len - 1);
 444                                 if (!e)
 445                                         return -ENOMEM;
 446
 447                                 p = strjoin("@", e);
 448                         } else {
 449                                 if (path[path_len - 1] == '\0')
 450                                         /* We expect a terminating NUL and don't print it */
 451                                         path_len --;
 452
 453                                 p = cescape_length(path, path_len);
 454                         }
 455                 }
 456                 if (!p)
 457                         return -ENOMEM;
 458
 459                 break;
 460
 461         case AF_VSOCK:
 462                 if (include_port) {
 463                         if (sa->vm.svm_cid == VMADDR_CID_ANY)
 464                                 r = asprintf(&p, "vsock::%u", sa->vm.svm_port);
 465                         else
 466                                 r = asprintf(&p, "vsock:%u:%u", sa->vm.svm_cid, sa->vm.svm_port);
 467                 } else
 468                         r = asprintf(&p, "vsock:%u", sa->vm.svm_cid);
 469                 if (r < 0)
 470                         return -ENOMEM;
 471                 break;
 472
 473         default:
 474                 return -EOPNOTSUPP;
 475         }
 476
 477         *ret = p;
 478         return 0;
 479 }
 480
 481 int getpeername_pretty(int fd, bool include_port, char **ret) {
 482         union sockaddr_union sa;
 483         socklen_t salen = sizeof(sa);
 484         int r;
 485
 486         assert(fd >= 0);
 487         assert(ret);
 488
 489         if (getpeername(fd, &sa.sa, &salen) < 0)
 490                 return -errno;
 491
 492         if (sa.sa.sa_family == AF_UNIX) {
 493                 struct ucred ucred = {};
 494
 495                 /* UNIX connection sockets are anonymous, so let's use
 496                  * PID/UID as pretty credentials instead */
 497
 498                 r = getpeercred(fd, &ucred);
 499                 if (r < 0)
 500                         return r;
 501
 502                 if (asprintf(ret, "PID "PID_FMT"/UID "UID_FMT, ucred.pid, ucred.uid) < 0)
 503                         return -ENOMEM;
 504
 505                 return 0;
 506         }
 507
 508         /* For remote sockets we translate IPv6 addresses back to IPv4
 509          * if applicable, since that's nicer. */
 510
 511         return sockaddr_pretty(&sa.sa, salen, true, include_port, ret);
 512 }
 513
 514 int getsockname_pretty(int fd, char **ret) {
 515         union sockaddr_union sa;
 516         socklen_t salen = sizeof(sa);
 517
 518         assert(fd >= 0);
 519         assert(ret);
 520
 521         if (getsockname(fd, &sa.sa, &salen) < 0)
 522                 return -errno;
 523
 524         /* For local sockets we do not translate IPv6 addresses back
 525          * to IPv6 if applicable, since this is usually used for
 526          * listening sockets where the difference between IPv4 and
 527          * IPv6 matters. */
 528
 529         return sockaddr_pretty(&sa.sa, salen, false, true, ret);
 530 }
 531
 532 int socknameinfo_pretty(union sockaddr_union *sa, socklen_t salen, char **_ret) {
 533         int r;
 534         char host[NI_MAXHOST], *ret;
 535
 536         assert(_ret);
 537
 538         r = getnameinfo(&sa->sa, salen, host, sizeof(host), NULL, 0, IDN_FLAGS);
 539         if (r != 0) {
 540                 int saved_errno = errno;
 541
 542                 r = sockaddr_pretty(&sa->sa, salen, true, true, &ret);
 543                 if (r < 0)
 544                         return r;
 545
 546                 log_debug_errno(saved_errno, "getnameinfo(%s) failed: %m", ret);
 547         } else {
 548                 ret = strdup(host);
 549                 if (!ret)
 550                         return -ENOMEM;
 551         }
 552
 553         *_ret = ret;
 554         return 0;
 555 }
 556
 557 static const char* const netlink_family_table[] = {
 558         [NETLINK_ROUTE] = "route",
 559         [NETLINK_FIREWALL] = "firewall",
 560         [NETLINK_INET_DIAG] = "inet-diag",
 561         [NETLINK_NFLOG] = "nflog",
 562         [NETLINK_XFRM] = "xfrm",
 563         [NETLINK_SELINUX] = "selinux",
 564         [NETLINK_ISCSI] = "iscsi",
 565         [NETLINK_AUDIT] = "audit",
 566         [NETLINK_FIB_LOOKUP] = "fib-lookup",
 567         [NETLINK_CONNECTOR] = "connector",
 568         [NETLINK_NETFILTER] = "netfilter",
 569         [NETLINK_IP6_FW] = "ip6-fw",
 570         [NETLINK_DNRTMSG] = "dnrtmsg",
 571         [NETLINK_KOBJECT_UEVENT] = "kobject-uevent",
 572         [NETLINK_GENERIC] = "generic",
 573         [NETLINK_SCSITRANSPORT] = "scsitransport",
 574         [NETLINK_ECRYPTFS] = "ecryptfs",
 575         [NETLINK_RDMA] = "rdma",
 576 };
 577
 578 DEFINE_STRING_TABLE_LOOKUP_WITH_FALLBACK(netlink_family, int, INT_MAX);
 579
 580 static const char* const socket_address_bind_ipv6_only_table[_SOCKET_ADDRESS_BIND_IPV6_ONLY_MAX] = {
 581         [SOCKET_ADDRESS_DEFAULT] = "default",
 582         [SOCKET_ADDRESS_BOTH] = "both",
 583         [SOCKET_ADDRESS_IPV6_ONLY] = "ipv6-only"
 584 };
 585
 586 DEFINE_STRING_TABLE_LOOKUP(socket_address_bind_ipv6_only, SocketAddressBindIPv6Only);
 587
 588 SocketAddressBindIPv6Only socket_address_bind_ipv6_only_or_bool_from_string(const char *n) {
 589         int r;
 590
 591         r = parse_boolean(n);
 592         if (r > 0)
 593                 return SOCKET_ADDRESS_IPV6_ONLY;
 594         if (r == 0)
 595                 return SOCKET_ADDRESS_BOTH;
 596
 597         return socket_address_bind_ipv6_only_from_string(n);
 598 }
 599
 600 bool sockaddr_equal(const union sockaddr_union *a, const union sockaddr_union *b) {
 601         assert(a);
 602         assert(b);
 603
 604         if (a->sa.sa_family != b->sa.sa_family)
 605                 return false;
 606
 607         if (a->sa.sa_family == AF_INET)
 608                 return a->in.sin_addr.s_addr == b->in.sin_addr.s_addr;
 609
 610         if (a->sa.sa_family == AF_INET6)
 611                 return memcmp(&a->in6.sin6_addr, &b->in6.sin6_addr, sizeof(a->in6.sin6_addr)) == 0;
 612
 613         if (a->sa.sa_family == AF_VSOCK)
 614                 return a->vm.svm_cid == b->vm.svm_cid;
 615
 616         return false;
 617 }
 618
 619 int fd_inc_sndbuf(int fd, size_t n) {
 620         int r, value;
 621         socklen_t l = sizeof(value);
 622
 623         r = getsockopt(fd, SOL_SOCKET, SO_SNDBUF, &value, &l);
 624         if (r >= 0 && l == sizeof(value) && (size_t) value >= n*2)
 625                 return 0;
 626
 627         /* If we have the privileges we will ignore the kernel limit. */
 628
 629         if (setsockopt_int(fd, SOL_SOCKET, SO_SNDBUF, n) < 0) {
 630                 r = setsockopt_int(fd, SOL_SOCKET, SO_SNDBUFFORCE, n);
 631                 if (r < 0)
 632                         return r;
 633         }
 634
 635         return 1;
 636 }
 637
 638 int fd_inc_rcvbuf(int fd, size_t n) {
 639         int r, value;
 640         socklen_t l = sizeof(value);
 641
 642         r = getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &value, &l);
 643         if (r >= 0 && l == sizeof(value) && (size_t) value >= n*2)
 644                 return 0;
 645
 646         /* If we have the privileges we will ignore the kernel limit. */
 647
 648         if (setsockopt_int(fd, SOL_SOCKET, SO_RCVBUF, n) < 0) {
 649                 r = setsockopt_int(fd, SOL_SOCKET, SO_RCVBUFFORCE, n);
 650                 if (r < 0)
 651                         return r;
 652         }
 653
 654         return 1;
 655 }
 656
 657 static const char* const ip_tos_table[] = {
 658         [IPTOS_LOWDELAY] = "low-delay",
 659         [IPTOS_THROUGHPUT] = "throughput",
 660         [IPTOS_RELIABILITY] = "reliability",
 661         [IPTOS_LOWCOST] = "low-cost",
 662 };
 663
 664 DEFINE_STRING_TABLE_LOOKUP_WITH_FALLBACK(ip_tos, int, 0xff);
 665
 666 bool ifname_valid_full(const char *p, bool alternative) {
 667         bool numeric = true;
 668
 669         /* Checks whether a network interface name is valid. This is inspired by dev_valid_name() in the kernel sources
 670          * but slightly stricter, as we only allow non-control, non-space ASCII characters in the interface name. We
 671          * also don't permit names that only container numbers, to avoid confusion with numeric interface indexes. */
 672
 673         if (isempty(p))
 674                 return false;
 675
 676         if (alternative) {
 677                 if (strlen(p) >= ALTIFNAMSIZ)
 678                         return false;
 679         } else {
 680                 if (strlen(p) >= IFNAMSIZ)
 681                         return false;
 682         }
 683
 684         if (dot_or_dot_dot(p))
 685                 return false;
 686
 687         while (*p) {
 688                 if ((unsigned char) *p >= 127U)
 689                         return false;
 690
 691                 if ((unsigned char) *p <= 32U)
 692                         return false;
 693
 694                 if (IN_SET(*p, ':', '/'))
 695                         return false;
 696
 697                 numeric = numeric && (*p >= '0' && *p <= '9');
 698                 p++;
 699         }
 700
 701         if (numeric)
 702                 return false;
 703
 704         return true;
 705 }
 706
 707 bool address_label_valid(const char *p) {
 708
 709         if (isempty(p))
 710                 return false;
 711
 712         if (strlen(p) >= IFNAMSIZ)
 713                 return false;
 714
 715         while (*p) {
 716                 if ((uint8_t) *p >= 127U)
 717                         return false;
 718
 719                 if ((uint8_t) *p <= 31U)
 720                         return false;
 721                 p++;
 722         }
 723
 724         return true;
 725 }
 726
 727 int getpeercred(int fd, struct ucred *ucred) {
 728         socklen_t n = sizeof(struct ucred);
 729         struct ucred u;
 730         int r;
 731
 732         assert(fd >= 0);
 733         assert(ucred);
 734
 735         r = getsockopt(fd, SOL_SOCKET, SO_PEERCRED, &u, &n);
 736         if (r < 0)
 737                 return -errno;
 738
 739         if (n != sizeof(struct ucred))
 740                 return -EIO;
 741
 742         /* Check if the data is actually useful and not suppressed due to namespacing issues */
 743         if (!pid_is_valid(u.pid))
 744                 return -ENODATA;
 745
 746         /* Note that we don't check UID/GID here, as namespace translation works differently there: instead of
 747          * receiving in "invalid" user/group we get the overflow UID/GID. */
 748
 749         *ucred = u;
 750         return 0;
 751 }
 752
 753 int getpeersec(int fd, char **ret) {
 754         _cleanup_free_ char *s = NULL;
 755         socklen_t n = 64;
 756
 757         assert(fd >= 0);
 758         assert(ret);
 759
 760         for (;;) {
 761                 s = new0(char, n+1);
 762                 if (!s)
 763                         return -ENOMEM;
 764
 765                 if (getsockopt(fd, SOL_SOCKET, SO_PEERSEC, s, &n) >= 0)
 766                         break;
 767
 768                 if (errno != ERANGE)
 769                         return -errno;
 770
 771                 s = mfree(s);
 772         }
 773
 774         if (isempty(s))
 775                 return -EOPNOTSUPP;
 776
 777         *ret = TAKE_PTR(s);
 778
 779         return 0;
 780 }
 781
 782 int getpeergroups(int fd, gid_t **ret) {
 783         socklen_t n = sizeof(gid_t) * 64;
 784         _cleanup_free_ gid_t *d = NULL;
 785
 786         assert(fd >= 0);
 787         assert(ret);
 788
 789         for (;;) {
 790                 d = malloc(n);
 791                 if (!d)
 792                         return -ENOMEM;
 793
 794                 if (getsockopt(fd, SOL_SOCKET, SO_PEERGROUPS, d, &n) >= 0)
 795                         break;
 796
 797                 if (errno != ERANGE)
 798                         return -errno;
 799
 800                 d = mfree(d);
 801         }
 802
 803         assert_se(n % sizeof(gid_t) == 0);
 804         n /= sizeof(gid_t);
 805
 806         if ((socklen_t) (int) n != n)
 807                 return -E2BIG;
 808
 809         *ret = TAKE_PTR(d);
 810
 811         return (int) n;
 812 }
 813
 814 ssize_t send_one_fd_iov_sa(
 815                 int transport_fd,
 816                 int fd,
 817                 struct iovec *iov, size_t iovlen,
 818                 const struct sockaddr *sa, socklen_t len,
 819                 int flags) {
 820
 821         CMSG_BUFFER_TYPE(CMSG_SPACE(sizeof(int))) control = {};
 822         struct msghdr mh = {
 823                 .msg_name = (struct sockaddr*) sa,
 824                 .msg_namelen = len,
 825                 .msg_iov = iov,
 826                 .msg_iovlen = iovlen,
 827         };
 828         ssize_t k;
 829
 830         assert(transport_fd >= 0);
 831
 832         /*
 833          * We need either an FD or data to send.
 834          * If there's nothing, return an error.
 835          */
 836         if (fd < 0 && !iov)
 837                 return -EINVAL;
 838
 839         if (fd >= 0) {
 840                 struct cmsghdr *cmsg;
 841
 842                 mh.msg_control = &control;
 843                 mh.msg_controllen = sizeof(control);
 844
 845                 cmsg = CMSG_FIRSTHDR(&mh);
 846                 cmsg->cmsg_level = SOL_SOCKET;
 847                 cmsg->cmsg_type = SCM_RIGHTS;
 848                 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
 849                 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
 850         }
 851         k = sendmsg(transport_fd, &mh, MSG_NOSIGNAL | flags);
 852         if (k < 0)
 853                 return (ssize_t) -errno;
 854
 855         return k;
 856 }
 857
 858 int send_one_fd_sa(
 859                 int transport_fd,
 860                 int fd,
 861                 const struct sockaddr *sa, socklen_t len,
 862                 int flags) {
 863
 864         assert(fd >= 0);
 865
 866         return (int) send_one_fd_iov_sa(transport_fd, fd, NULL, 0, sa, len, flags);
 867 }
 868
 869 ssize_t receive_one_fd_iov(
 870                 int transport_fd,
 871                 struct iovec *iov, size_t iovlen,
 872                 int flags,
 873                 int *ret_fd) {
 874
 875         CMSG_BUFFER_TYPE(CMSG_SPACE(sizeof(int))) control;
 876         struct msghdr mh = {
 877                 .msg_control = &control,
 878                 .msg_controllen = sizeof(control),
 879                 .msg_iov = iov,
 880                 .msg_iovlen = iovlen,
 881         };
 882         struct cmsghdr *found;
 883         ssize_t k;
 884
 885         assert(transport_fd >= 0);
 886         assert(ret_fd);
 887
 888         /*
 889          * Receive a single FD via @transport_fd. We don't care for
 890          * the transport-type. We retrieve a single FD at most, so for
 891          * packet-based transports, the caller must ensure to send
 892          * only a single FD per packet.  This is best used in
 893          * combination with send_one_fd().
 894          */
 895
 896         k = recvmsg_safe(transport_fd, &mh, MSG_CMSG_CLOEXEC | flags);
 897         if (k < 0)
 898                 return k;
 899
 900         found = cmsg_find(&mh, SOL_SOCKET, SCM_RIGHTS, CMSG_LEN(sizeof(int)));
 901         if (!found) {
 902                 cmsg_close_all(&mh);
 903
 904                 /* If didn't receive an FD or any data, return an error. */
 905                 if (k == 0)
 906                         return -EIO;
 907         }
 908
 909         if (found)
 910                 *ret_fd = *(int*) CMSG_DATA(found);
 911         else
 912                 *ret_fd = -1;
 913
 914         return k;
 915 }
 916
 917 int receive_one_fd(int transport_fd, int flags) {
 918         int fd;
 919         ssize_t k;
 920
 921         k = receive_one_fd_iov(transport_fd, NULL, 0, flags, &fd);
 922         if (k == 0)
 923                 return fd;
 924
 925         /* k must be negative, since receive_one_fd_iov() only returns
 926          * a positive value if data was received through the iov. */
 927         assert(k < 0);
 928         return (int) k;
 929 }
 930
 931 ssize_t next_datagram_size_fd(int fd) {
 932         ssize_t l;
 933         int k;
 934
 935         /* This is a bit like FIONREAD/SIOCINQ, however a bit more powerful. The difference being: recv(MSG_PEEK) will
 936          * actually cause the next datagram in the queue to be validated regarding checksums, which FIONREAD doesn't
 937          * do. This difference is actually of major importance as we need to be sure that the size returned here
 938          * actually matches what we will read with recvmsg() next, as otherwise we might end up allocating a buffer of
 939          * the wrong size. */
 940
 941         l = recv(fd, NULL, 0, MSG_PEEK|MSG_TRUNC);
 942         if (l < 0) {
 943                 if (IN_SET(errno, EOPNOTSUPP, EFAULT))
 944                         goto fallback;
 945
 946                 return -errno;
 947         }
 948         if (l == 0)
 949                 goto fallback;
 950
 951         return l;
 952
 953 fallback:
 954         k = 0;
 955
 956         /* Some sockets (AF_PACKET) do not support null-sized recv() with MSG_TRUNC set, let's fall back to FIONREAD
 957          * for them. Checksums don't matter for raw sockets anyway, hence this should be fine. */
 958
 959         if (ioctl(fd, FIONREAD, &k) < 0)
 960                 return -errno;
 961
 962         return (ssize_t) k;
 963 }
 964
/* Put a limit on how many times we will attempt to call accept4(). We loop
 * only on "transient" errors, but let's make sure we don't loop forever. */
 967 #define MAX_FLUSH_ITERATIONS 1024
 968
 969 int flush_accept(int fd) {
 970
 971         struct pollfd pollfd = {
 972                 .fd = fd,
 973                 .events = POLLIN,
 974         };
 975         int r, b;
 976         socklen_t l = sizeof(b);
 977
 978         /* Similar to flush_fd() but flushes all incoming connections by accepting and immediately closing
 979          * them. */
 980
 981         if (getsockopt(fd, SOL_SOCKET, SO_ACCEPTCONN, &b, &l) < 0)
 982                 return -errno;
 983
 984         assert(l == sizeof(b));
 985         if (!b) /* Let's check if this socket accepts connections before calling accept(). accept4() can
 986                  * return EOPNOTSUPP if the fd is not a listening socket, which we should treat as a fatal
 987                  * error, or in case the incoming TCP connection triggered a network issue, which we want to
 988                  * treat as a transient error. Thus, let's rule out the first reason for EOPNOTSUPP early, so
 989                  * we can loop safely on transient errors below. */
 990                 return -ENOTTY;
 991
 992         for (unsigned iteration = 0;; iteration++) {
 993                 int cfd;
 994
 995                 r = poll(&pollfd, 1, 0);
 996                 if (r < 0) {
 997                         if (errno == EINTR)
 998                                 continue;
 999
1000                         return -errno;
1001                 }
1002                 if (r == 0)
1003                         return 0;
1004
1005                 if (iteration >= MAX_FLUSH_ITERATIONS)
1006                         return log_debug_errno(SYNTHETIC_ERRNO(EBUSY),
1007                                                "Failed to flush connections within " STRINGIFY(MAX_FLUSH_ITERATIONS) " iterations.");
1008
1009                 cfd = accept4(fd, NULL, NULL, SOCK_NONBLOCK|SOCK_CLOEXEC);
1010                 if (cfd < 0) {
1011                         if (errno == EAGAIN)
1012                                 return 0;
1013
1014                         if (ERRNO_IS_ACCEPT_AGAIN(errno))
1015                                 continue;
1016
1017                         return -errno;
1018                 }
1019
1020                 safe_close(cfd);
1021         }
1022 }
1023
1024 struct cmsghdr* cmsg_find(struct msghdr *mh, int level, int type, socklen_t length) {
1025         struct cmsghdr *cmsg;
1026
1027         assert(mh);
1028
1029         CMSG_FOREACH(cmsg, mh)
1030                 if (cmsg->cmsg_level == level &&
1031                     cmsg->cmsg_type == type &&
1032                     (length == (socklen_t) -1 || length == cmsg->cmsg_len))
1033                         return cmsg;
1034
1035         return NULL;
1036 }
1037
/* Allocates a socket suitable for issuing network interface ioctl()s on. Returns the new fd
 * (with O_CLOEXEC set), or a negative errno-style error if no socket could be created. */
int socket_ioctl_fd(void) {
        int fd;

        /* Traditionally only AF_INET sockets were good for this, hence that's what we try first. */
        fd = socket(AF_INET, SOCK_DGRAM|SOCK_CLOEXEC, 0);
        if (fd >= 0)
                return fd;

        /* AF_INET might be unavailable (for example because it was turned off via SECCOMP or such).
         * Since kernel 4.6 the more generic AF_NETLINK works for these ioctl()s too, hence fall back
         * to it. */
        fd = socket(AF_NETLINK, SOCK_RAW|SOCK_CLOEXEC, NETLINK_GENERIC);

        return fd < 0 ? -errno : fd;
}
1054
/* Removes the file system node an AF_UNIX socket address refers to. Returns 1 if the node was
 * unlinked, 0 if the address is in the abstract namespace (nothing to remove), -EPROTOTYPE if the
 * address is not an AF_UNIX one, or the negative errno of the failed unlink(). */
int sockaddr_un_unlink(const struct sockaddr_un *sa) {
        const char *path;

        assert(sa);

        if (sa->sun_family != AF_UNIX)
                return -EPROTOTYPE;

        /* A leading NUL byte marks an abstract namespace socket, which has no file system node. */
        if (sa->sun_path[0] == 0)
                return 0;

        /* .sun_path may fill the whole array without a terminating NUL byte. If so, operate on a
         * NUL-terminated copy placed on the stack instead. */
        path = sa->sun_path;
        if (!memchr(sa->sun_path, 0, sizeof(sa->sun_path)))
                path = memdupa_suffix0(sa->sun_path, sizeof(sa->sun_path));

        return unlink(path) < 0 ? -errno : 1;
}
1078
1079 int sockaddr_un_set_path(struct sockaddr_un *ret, const char *path) {
1080         size_t l;
1081
1082         assert(ret);
1083         assert(path);
1084
1085         /* Initialize ret->sun_path from the specified argument. This will interpret paths starting with '@' as
1086          * abstract namespace sockets, and those starting with '/' as regular filesystem sockets. It won't accept
1087          * anything else (i.e. no relative paths), to avoid ambiguities. Note that this function cannot be used to
1088          * reference paths in the abstract namespace that include NUL bytes in the name. */
1089
1090         l = strlen(path);
1091         if (l == 0)
1092                 return -EINVAL;
1093         if (!IN_SET(path[0], '/', '@'))
1094                 return -EINVAL;
1095         if (path[1] == 0)
1096                 return -EINVAL;
1097
1098         /* Don't allow paths larger than the space in sockaddr_un. Note that we are a tiny bit more restrictive than
1099          * the kernel is: we insist on NUL termination (both for abstract namespace and regular file system socket
1100          * addresses!), which the kernel doesn't. We do this to reduce chance of incompatibility with other apps that
1101          * do not expect non-NUL terminated file system path*/
1102         if (l+1 > sizeof(ret->sun_path))
1103                 return -EINVAL;
1104
1105         *ret = (struct sockaddr_un) {
1106                 .sun_family = AF_UNIX,
1107         };
1108
1109         if (path[0] == '@') {
1110                 /* Abstract namespace socket */
1111                 memcpy(ret->sun_path + 1, path + 1, l); /* copy *with* trailing NUL byte */
1112                 return (int) (offsetof(struct sockaddr_un, sun_path) + l); /* 🔥 *don't* 🔥 include trailing NUL in size */
1113
1114         } else {
1115                 assert(path[0] == '/');
1116
1117                 /* File system socket */
1118                 memcpy(ret->sun_path, path, l + 1); /* copy *with* trailing NUL byte */
1119                 return (int) (offsetof(struct sockaddr_un, sun_path) + l + 1); /* include trailing NUL in size */
1120         }
1121 }
1122
1123 int socket_bind_to_ifname(int fd, const char *ifname) {
1124         assert(fd >= 0);
1125
1126         /* Call with NULL to drop binding */
1127
1128         if (setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, ifname, strlen_ptr(ifname)) < 0)
1129                 return -errno;
1130
1131         return 0;
1132 }
1133
1134 int socket_bind_to_ifindex(int fd, int ifindex) {
1135         char ifname[IF_NAMESIZE + 1];
1136
1137         assert(fd >= 0);
1138
1139         if (ifindex <= 0) {
1140                 /* Drop binding */
1141                 if (setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, NULL, 0) < 0)
1142                         return -errno;
1143
1144                 return 0;
1145         }
1146
1147         if (setsockopt(fd, SOL_SOCKET, SO_BINDTOIFINDEX, &ifindex, sizeof(ifindex)) >= 0)
1148                 return 0;
1149         if (errno != ENOPROTOOPT)
1150                 return -errno;
1151
1152         /* Fall back to SO_BINDTODEVICE on kernels < 5.0 which didn't have SO_BINDTOIFINDEX */
1153         if (!format_ifname(ifindex, ifname))
1154                 return -errno;
1155
1156         return socket_bind_to_ifname(fd, ifname);
1157 }
1158
/* A wrapper around recvmsg() that turns truncated control data (MSG_CTRUNC) into an error in a
 * reasonably safe way: any SCM_RIGHTS fds that did arrive are closed before returning.
 *
 * Returns the number of bytes received, -EXFULL (a recognizable error code) if the control data
 * was truncated, or the negative errno of the failed recvmsg().
 *
 * Note that unlike our usual coding style this might modify *msg on failure. */
ssize_t recvmsg_safe(int sockfd, struct msghdr *msg, int flags) {
        ssize_t received;

        received = recvmsg(sockfd, msg, flags);
        if (received < 0)
                return -errno;

        if (!FLAGS_SET(msg->msg_flags, MSG_CTRUNC))
                return received;

        /* The control buffer was too small. Don't leak whatever fds made it through. */
        cmsg_close_all(msg);
        return -EXFULL;
}