+++ /dev/null
-/*
- * Copyright (C) Internet Systems Consortium, Inc. ("ISC")
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, you can obtain one at https://mozilla.org/MPL/2.0/.
- *
- * See the COPYRIGHT file distributed with this work for additional
- * information regarding copyright ownership.
- */
-
-/*! \file */
-
-#include <inttypes.h>
-#include <stdbool.h>
-#include <sys/param.h>
-#include <sys/socket.h>
-#include <sys/stat.h>
-#include <sys/types.h>
-#if defined(HAVE_SYS_SYSCTL_H) && !defined(__linux__)
-#include <sys/sysctl.h>
-#endif /* if defined(HAVE_SYS_SYSCTL_H) && !defined(__linux__) */
-#include <sys/time.h>
-#include <sys/uio.h>
-
-#if defined(HAVE_LINUX_NETLINK_H) && defined(HAVE_LINUX_RTNETLINK_H)
-#include <linux/netlink.h>
-#include <linux/rtnetlink.h>
-#endif /* if defined(HAVE_LINUX_NETLINK_H) && defined(HAVE_LINUX_RTNETLINK_H) \
- */
-
-#include <errno.h>
-#include <fcntl.h>
-#include <stddef.h>
-#include <stdlib.h>
-#include <sys/un.h>
-#include <unistd.h>
-
-#include <isc/app.h>
-#include <isc/buffer.h>
-#include <isc/condition.h>
-#include <isc/formatcheck.h>
-#include <isc/list.h>
-#include <isc/log.h>
-#include <isc/mem.h>
-#include <isc/mutex.h>
-#include <isc/net.h>
-#include <isc/once.h>
-#include <isc/print.h>
-#include <isc/refcount.h>
-#include <isc/region.h>
-#include <isc/resource.h>
-#include <isc/socket.h>
-#include <isc/stats.h>
-#include <isc/strerr.h>
-#include <isc/string.h>
-#include <isc/task.h>
-#include <isc/thread.h>
-#include <isc/util.h>
-
-#ifdef HAVE_KQUEUE
-#include <sys/event.h>
-#endif /* ifdef HAVE_KQUEUE */
-#ifdef HAVE_EPOLL_CREATE1
-#include <sys/epoll.h>
-#endif /* ifdef HAVE_EPOLL_CREATE1 */
-#if defined(HAVE_SYS_DEVPOLL_H)
-#include <sys/devpoll.h>
-#elif defined(HAVE_DEVPOLL_H)
-#include <devpoll.h>
-#endif /* if defined(HAVE_SYS_DEVPOLL_H) */
-
-#include <netinet/tcp.h>
-
-#include "errno2result.h"
-#include "socket_p.h"
-
-#ifdef ENABLE_TCP_FASTOPEN
-#include <netinet/tcp.h>
-#endif /* ifdef ENABLE_TCP_FASTOPEN */
-
-#ifdef HAVE_JSON_C
-#include <json_object.h>
-#endif /* HAVE_JSON_C */
-
-#ifdef HAVE_LIBXML2
-#include <libxml/xmlwriter.h>
-#define ISC_XMLCHAR (const xmlChar *)
-#endif /* HAVE_LIBXML2 */
-
-/*%
- * Choose the most preferable multiplex method.
- *
- * Exactly one of USE_KQUEUE, USE_EPOLL, USE_DEVPOLL or USE_SELECT ends up
- * defined. The HAVE_* feature macros are tested in that order, and
- * USE_SELECT is the fallback when none of them is set.
- *
- * pollinfo_t exists only in the USE_DEVPOLL case: it carries one bit for
- * "read wanted" and one bit for "write wanted".
- */
-#if defined(HAVE_KQUEUE)
-#define USE_KQUEUE
-#elif defined(HAVE_EPOLL_CREATE1)
-#define USE_EPOLL
-#elif defined(HAVE_SYS_DEVPOLL_H) || defined(HAVE_DEVPOLL_H)
-#define USE_DEVPOLL
-typedef struct {
- unsigned int want_read : 1, want_write : 1;
-} pollinfo_t;
-#else /* if defined(HAVE_KQUEUE) */
-#define USE_SELECT
-#endif /* HAVE_KQUEUE */
-
-/*
- * Set by the -T dscp option on the command line. If set to a value
- * other than -1, we check to make sure DSCP values match it, and
- * assert if not.
- */
-int isc_dscp_check_value = -1;
-
-/*%
- * Maximum number of allowable open sockets. This is also the maximum
- * allowable socket file descriptor.
- *
- * Care should be taken before modifying this value for select():
- * The API standard does not guarantee that select() accepts more than (the
- * system default of) FD_SETSIZE descriptors, and the default size should in
- * fact be fine in the vast majority of cases. This constant should therefore
- * be increased only when absolutely necessary and possible, i.e., the server
- * is exhausting all available file descriptors (up to FD_SETSIZE) and the
- * select() function and FD_xxx macros support larger values than FD_SETSIZE
- * (which may not always be true, but we keep using some of them to ensure as
- * much portability as possible). Note also that overall server performance
- * may be noticeably worse with a larger value of this constant due to
- * inherent scalability problems of select().
- *
- * As a special note, this value shouldn't have to be touched if
- * this is a build for an authoritative only DNS server.
- */
-#ifndef ISC_SOCKET_MAXSOCKETS
-#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
-#ifdef TUNE_LARGE
-#define ISC_SOCKET_MAXSOCKETS 21000
-#else /* ifdef TUNE_LARGE */
-#define ISC_SOCKET_MAXSOCKETS 4096
-#endif /* TUNE_LARGE */
-#elif defined(USE_SELECT)
-#define ISC_SOCKET_MAXSOCKETS FD_SETSIZE
-#endif /* USE_KQUEUE... */
-#endif /* ISC_SOCKET_MAXSOCKETS */
-
-#ifdef USE_SELECT
-/*%
- * Mac OS X needs a special definition to support larger values in select().
- * We always define this because a larger value can be specified run-time.
- */
-#ifdef __APPLE__
-#define _DARWIN_UNLIMITED_SELECT
-#endif /* __APPLE__ */
-#endif /* USE_SELECT */
-
-#ifdef ISC_SOCKET_USE_POLLWATCH
-/*%
- * If this macro is defined, enable workaround for a Solaris /dev/poll kernel
- * bug: DP_POLL ioctl could keep sleeping even if socket I/O is possible for
- * some of the specified FD. The idea is based on the observation that it's
- * likely for a busy server to keep receiving packets. It specifically works
- * as follows: the socket watcher is first initialized with the state of
- * "poll_idle". While it's in the idle state it keeps sleeping until a socket
- * event occurs. When it wakes up for a socket I/O event, it moves to the
- * poll_active state, and sets the poll timeout to a short period
- * (ISC_SOCKET_POLLWATCH_TIMEOUT msec). If timeout occurs in this state, the
- * watcher goes to the poll_checking state with the same timeout period.
- * In this state, the watcher tries to detect whether this is a break
- * during intermittent events or the kernel bug is triggered. If the next
- * polling reports an event within the short period, the previous timeout is
- * likely to be a kernel bug, and so the watcher goes back to the active state.
- * Otherwise, it moves to the idle state again.
- *
- * It's not clear whether this is a thread-related bug, but since we've only
- * seen this with threads, this workaround is used only when enabling threads.
- */
-
-typedef enum { poll_idle, poll_active, poll_checking } pollstate_t;
-
-#ifndef ISC_SOCKET_POLLWATCH_TIMEOUT
-#define ISC_SOCKET_POLLWATCH_TIMEOUT 10
-#endif /* ISC_SOCKET_POLLWATCH_TIMEOUT */
-#endif /* ISC_SOCKET_USE_POLLWATCH */
-
-/*%
- * Per-FD lock buckets, we shuffle them around a bit as FDs come in herds.
- *
- * FDLOCK_ID(fd) maps a descriptor to one of FDLOCK_COUNT (1 << 10) buckets.
- * Because '%' binds tighter than '>>', the left operand of '|' is
- * ((fd % FDLOCK_COUNT) >> 5) and the right operand is
- * ((fd << 5) % FDLOCK_COUNT): the two 5-bit halves of the low 10 bits of
- * fd are swapped, so fd 0, 1, 2, ... map to buckets 0, 32, 64, ...
- * Note that the macro evaluates its argument twice.
- */
-#define FDLOCK_BITS 10
-#define FDLOCK_COUNT (1 << FDLOCK_BITS)
-#define FDLOCK_ID(fd) \
- (((fd) % (FDLOCK_COUNT) >> (FDLOCK_BITS / 2)) | \
- (((fd) << (FDLOCK_BITS / 2)) % (FDLOCK_COUNT)))
-
-/*%
- * Maximum number of events communicated with the kernel. There should normally
- * be no need for having a large number.
- */
-#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
-#ifndef ISC_SOCKET_MAXEVENTS
-#ifdef TUNE_LARGE
-#define ISC_SOCKET_MAXEVENTS 2048
-#else /* ifdef TUNE_LARGE */
-#define ISC_SOCKET_MAXEVENTS 64
-#endif /* TUNE_LARGE */
-#endif /* ifndef ISC_SOCKET_MAXEVENTS */
-#endif /* if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL) \
- * */
-
-/*%
- * Some systems define the socket length argument as an int, some as size_t,
- * some as socklen_t. This is here so it can be easily changed if needed.
- */
-#ifndef socklen_t
-#define socklen_t unsigned int
-#endif /* ifndef socklen_t */
-
-/*%
- * Define what the possible "soft" errors can be. These are non-fatal returns
- * of various network related functions, like recv() and so on.
- *
- * For some reason, BSDI (and perhaps others) will sometimes return <0
- * from recv() but will have errno==0. This is broken, but we have to
- * work around it here.
- */
-#define SOFT_ERROR(e) \
- ((e) == EAGAIN || (e) == EWOULDBLOCK || (e) == ENOBUFS || \
- (e) == EINTR || (e) == 0)
-
-#define DLVL(x) ISC_LOGCATEGORY_GENERAL, ISC_LOGMODULE_SOCKET, ISC_LOG_DEBUG(x)
-
-/*!<
- * DLVL(90) -- Function entry/exit and other tracing.
- * DLVL(70) -- Socket "correctness" -- including returning of events, etc.
- * DLVL(60) -- Socket data send/receive
- * DLVL(50) -- Event tracing, including receiving/sending completion events.
- * DLVL(20) -- Socket creation/destruction.
- */
-#define TRACE_LEVEL 90
-#define CORRECTNESS_LEVEL 70
-#define IOEVENT_LEVEL 60
-#define EVENT_LEVEL 50
-#define CREATION_LEVEL 20
-
-#define TRACE DLVL(TRACE_LEVEL)
-#define CORRECTNESS DLVL(CORRECTNESS_LEVEL)
-#define IOEVENT DLVL(IOEVENT_LEVEL)
-#define EVENT DLVL(EVENT_LEVEL)
-#define CREATION DLVL(CREATION_LEVEL)
-
-typedef isc_event_t intev_t;
-
-#define SOCKET_MAGIC ISC_MAGIC('I', 'O', 'i', 'o')
-#define VALID_SOCKET(s) ISC_MAGIC_VALID(s, SOCKET_MAGIC)
-
-/*!
- * IPv6 control information. If the socket is an IPv6 socket we want
- * to collect the destination address and interface so the client can
- * set them on outgoing packets.
- */
-#ifndef USE_CMSG
-#define USE_CMSG 1
-#endif /* ifndef USE_CMSG */
-
-/*%
- * NetBSD and FreeBSD can timestamp packets. XXXMLG Should we have
- * a setsockopt() like interface to request timestamps, and if the OS
- * doesn't do it for us, call gettimeofday() on every UDP receive?
- */
-#ifdef SO_TIMESTAMP
-#ifndef USE_CMSG
-#define USE_CMSG 1
-#endif /* ifndef USE_CMSG */
-#endif /* ifdef SO_TIMESTAMP */
-
-#if defined(SO_RCVBUF) && defined(ISC_RECV_BUFFER_SIZE)
-#define SET_RCVBUF
-#endif
-
-#if defined(SO_SNDBUF) && defined(ISC_SEND_BUFFER_SIZE)
-#define SET_SNDBUF
-#endif
-
-/*%
- * Instead of calculating the cmsgbuf lengths every time we take
- * a rule of thumb approach - sizes are taken from x86_64 linux,
- * multiplied by 2, everything should fit. Those sizes are not
- * large enough to cause any concern.
- */
-#if defined(USE_CMSG)
-#define CMSG_SP_IN6PKT 40
-#else /* if defined(USE_CMSG) */
-#define CMSG_SP_IN6PKT 0
-#endif /* if defined(USE_CMSG) */
-
-#if defined(USE_CMSG) && defined(SO_TIMESTAMP)
-#define CMSG_SP_TIMESTAMP 32
-#else /* if defined(USE_CMSG) && defined(SO_TIMESTAMP) */
-#define CMSG_SP_TIMESTAMP 0
-#endif /* if defined(USE_CMSG) && defined(SO_TIMESTAMP) */
-
-#if defined(USE_CMSG) && (defined(IPV6_TCLASS) || defined(IP_TOS))
-#define CMSG_SP_TCTOS 24
-#else /* if defined(USE_CMSG) && (defined(IPV6_TCLASS) || defined(IP_TOS)) */
-#define CMSG_SP_TCTOS 0
-#endif /* if defined(USE_CMSG) && (defined(IPV6_TCLASS) || defined(IP_TOS)) */
-
-#define CMSG_SP_INT 24
-
-/* Align cmsg buffers to be safe on SPARC etc. */
-#define RECVCMSGBUFLEN \
- ISC_ALIGN(2 * (CMSG_SP_IN6PKT + CMSG_SP_TIMESTAMP + CMSG_SP_TCTOS) + \
- 1, \
- sizeof(void *))
-#define SENDCMSGBUFLEN \
- ISC_ALIGN(2 * (CMSG_SP_IN6PKT + CMSG_SP_INT + CMSG_SP_TCTOS) + 1, \
- sizeof(void *))
-
-/*%
- * The number of times a send operation is repeated if the result is EINTR.
- */
-#define NRETRIES 10
-
-typedef struct isc__socketthread isc__socketthread_t;
-
-#define NEWCONNSOCK(ev) ((ev)->newsocket)
-/*
- * Per-socket state. The fields are grouped by the lock that protects them,
- * as stated by the section comments inside the structure.
- */
-struct isc_socket {
- /* Not locked. */
- unsigned int magic; /* compared with SOCKET_MAGIC by VALID_SOCKET() */
- isc_socketmgr_t *manager; /* owning manager; read by gen_threadid() */
- isc_mutex_t lock; /* the "socket lock" named in the section below */
- isc_sockettype_t type;
- const isc_statscounter_t *statsindex; /* presumably one of the *statsindex[] tables, indexed by STATID_* -- TODO confirm */
- isc_refcount_t references;
-
- /* Locked by socket lock. */
- ISC_LINK(isc_socket_t) link;
- int fd; /* descriptor; gen_threadid() derives the thread id from it */
- int pf;
- int threadid; /* NOTE(review): presumably the value computed by gen_threadid() -- confirm */
- char name[16];
- void *tag;
-
- ISC_LIST(isc_socketevent_t) send_list;
- ISC_LIST(isc_socketevent_t) recv_list;
- ISC_LIST(isc_socket_newconnev_t) accept_list;
- ISC_LIST(isc_socket_connev_t) connect_list;
-
- isc_sockaddr_t peer_address; /* remote address */
-
- unsigned int listener : 1, /* listener socket */
- connected : 1, connecting : 1, /* connect pending
- * */
- bound : 1, /* bound to local addr */
- active : 1, /* currently active */
- pktdscp : 1; /* per packet dscp */
-
-#ifdef ISC_PLATFORM_RECVOVERFLOW
- unsigned char overflow; /* used for MSG_TRUNC fake */
-#endif /* ifdef ISC_PLATFORM_RECVOVERFLOW */
-
- unsigned int dscp;
-};
-
-#define SOCKET_MANAGER_MAGIC ISC_MAGIC('I', 'O', 'm', 'g')
-#define VALID_MANAGER(m) ISC_MAGIC_VALID(m, SOCKET_MANAGER_MAGIC)
-/*
- * Socket manager state. Every isc__socketthread refers back to its manager
- * through its 'manager' pointer.
- */
-struct isc_socketmgr {
- /* Not locked. */
- unsigned int magic; /* compared with SOCKET_MANAGER_MAGIC by VALID_MANAGER() */
- isc_mem_t *mctx;
- isc_mutex_t lock; /* the "manager lock" named below */
- isc_stats_t *stats;
- int nthreads; /* divisor used by gen_threadid() */
- isc__socketthread_t *threads; /* indexed by thread id (see select_poke()) */
- unsigned int maxsocks; /* wakeup_socket() insists that fd < maxsocks */
- /* Locked by manager lock. */
- ISC_LIST(isc_socket_t) socklist;
- int reserved; /* unlocked */
- isc_condition_t shutdown_ok;
- size_t maxudp;
-};
-/*
- * State of one watcher thread. Only the members belonging to the
- * multiplexing backend selected at compile time (USE_KQUEUE, USE_EPOLL,
- * USE_DEVPOLL or USE_SELECT) are present.
- */
-struct isc__socketthread {
- isc_socketmgr_t *manager;
- int threadid;
- isc_thread_t thread;
- int pipe_fds[2]; /* [0] is read by select_readmsg(), [1] is written by select_poke() */
- isc_mutex_t *fdlock; /* lock buckets, indexed by FDLOCK_ID(fd) */
- /* Locked by fdlock. */
- isc_socket_t **fds; /* indexed by fd */
- int *fdstate; /* indexed by fd: CLOSED, MANAGED or CLOSE_PENDING */
-#ifdef USE_KQUEUE
- int kqueue_fd;
- int nevents;
- struct kevent *events;
-#endif /* USE_KQUEUE */
-#ifdef USE_EPOLL
- int epoll_fd;
- int nevents;
- struct epoll_event *events;
- uint32_t *epoll_events; /* indexed by fd: event mask maintained by watch_fd()/unwatch_fd() */
-#endif /* USE_EPOLL */
-#ifdef USE_DEVPOLL
- int devpoll_fd;
- isc_resourcevalue_t open_max;
- unsigned int calls;
- int nevents;
- struct pollfd *events;
- pollinfo_t *fdpollinfo; /* indexed by fd: directions currently being polled */
-#endif /* USE_DEVPOLL */
-#ifdef USE_SELECT
- int fd_bufsize;
- fd_set *read_fds; /* bits set/cleared by watch_fd()/unwatch_fd() under the manager lock */
- fd_set *read_fds_copy;
- fd_set *write_fds; /* bits set/cleared by watch_fd()/unwatch_fd() under the manager lock */
- fd_set *write_fds_copy;
- int maxfd;
-#endif /* USE_SELECT */
-};
-/* Values stored in isc__socketthread.fdstate[], one entry per descriptor. */
-#define CLOSED 0 /* this one must be zero */
-#define MANAGED 1 /* the only state in which wakeup_socket() calls watch_fd() */
-#define CLOSE_PENDING 2 /* wakeup_socket() unwatches the fd; on SELECT_POKE_CLOSE it also closes it */
-
-/*
- * send() and recv() iovec counts
- */
-#define MAXSCATTERGATHER_SEND (ISC_SOCKET_MAXSCATTERGATHER)
-#ifdef ISC_PLATFORM_RECVOVERFLOW
-#define MAXSCATTERGATHER_RECV (ISC_SOCKET_MAXSCATTERGATHER + 1)
-#else /* ifdef ISC_PLATFORM_RECVOVERFLOW */
-#define MAXSCATTERGATHER_RECV (ISC_SOCKET_MAXSCATTERGATHER)
-#endif /* ifdef ISC_PLATFORM_RECVOVERFLOW */
-
-static isc_result_t
-socket_create(isc_socketmgr_t *manager0, int pf, isc_sockettype_t type,
- isc_socket_t **socketp);
-static void
-send_recvdone_event(isc_socket_t *, isc_socketevent_t **);
-static void
-send_senddone_event(isc_socket_t *, isc_socketevent_t **);
-static void
-send_connectdone_event(isc_socket_t *, isc_socket_connev_t **);
-static void
-free_socket(isc_socket_t **);
-static isc_result_t
-allocate_socket(isc_socketmgr_t *, isc_sockettype_t, isc_socket_t **);
-static void
-destroy(isc_socket_t **);
-static void
-internal_accept(isc_socket_t *);
-static void
-internal_connect(isc_socket_t *);
-static void
-internal_recv(isc_socket_t *);
-static void
-internal_send(isc_socket_t *);
-static void
-process_cmsg(isc_socket_t *, struct msghdr *, isc_socketevent_t *);
-static void
-build_msghdr_send(isc_socket_t *, char *, isc_socketevent_t *, struct msghdr *,
- struct iovec *, size_t *);
-static void
-build_msghdr_recv(isc_socket_t *, char *, isc_socketevent_t *, struct msghdr *,
- struct iovec *, size_t *);
-static bool
-process_ctlfd(isc__socketthread_t *thread);
-static void
-setdscp(isc_socket_t *sock, isc_dscp_t dscp);
-/*
- * Message codes carried as the second int of the {fd, msg} pair that
- * select_poke() writes to a watcher thread's pipe. ACCEPT shares the value
- * of READ and CONNECT shares the value of WRITE, so watch_fd() and
- * unwatch_fd() only have to tell "read" from "write".
- */
-#define SELECT_POKE_SHUTDOWN (-1)
-#define SELECT_POKE_NOTHING (-2) /* reported by select_readmsg() after a soft read error */
-#define SELECT_POKE_READ (-3)
-#define SELECT_POKE_ACCEPT (-3) /*%< Same as _READ */
-#define SELECT_POKE_WRITE (-4)
-#define SELECT_POKE_CONNECT (-4) /*%< Same as _WRITE */
-#define SELECT_POKE_CLOSE (-5) /* handled by wakeup_socket(): unwatch and close the fd */
-
-/*%
- * Shortcut index arrays to get access to statistics counters.
- *
- * Each STATID_* value is a position within the per-socket-type tables
- * that follow; every table lists its entries in STATID_* order. An entry
- * of -1 marks a slot for which that socket type has no counter; such a
- * value must never reach inc_stats() or dec_stats(), which both REQUIRE
- * that the counter id is not -1.
- */
-enum {
- STATID_OPEN = 0,
- STATID_OPENFAIL = 1,
- STATID_CLOSE = 2,
- STATID_BINDFAIL = 3,
- STATID_CONNECTFAIL = 4,
- STATID_CONNECT = 5,
- STATID_ACCEPTFAIL = 6,
- STATID_ACCEPT = 7,
- STATID_SENDFAIL = 8,
- STATID_RECVFAIL = 9,
- STATID_ACTIVE = 10
-};
-static const isc_statscounter_t udp4statsindex[] = {
- isc_sockstatscounter_udp4open,
- isc_sockstatscounter_udp4openfail,
- isc_sockstatscounter_udp4close,
- isc_sockstatscounter_udp4bindfail,
- isc_sockstatscounter_udp4connectfail,
- isc_sockstatscounter_udp4connect,
- -1, /* STATID_ACCEPTFAIL: no counter */
- -1, /* STATID_ACCEPT: no counter */
- isc_sockstatscounter_udp4sendfail,
- isc_sockstatscounter_udp4recvfail,
- isc_sockstatscounter_udp4active
-};
-static const isc_statscounter_t udp6statsindex[] = {
- isc_sockstatscounter_udp6open,
- isc_sockstatscounter_udp6openfail,
- isc_sockstatscounter_udp6close,
- isc_sockstatscounter_udp6bindfail,
- isc_sockstatscounter_udp6connectfail,
- isc_sockstatscounter_udp6connect,
- -1, /* STATID_ACCEPTFAIL: no counter */
- -1, /* STATID_ACCEPT: no counter */
- isc_sockstatscounter_udp6sendfail,
- isc_sockstatscounter_udp6recvfail,
- isc_sockstatscounter_udp6active
-};
-static const isc_statscounter_t tcp4statsindex[] = {
- isc_sockstatscounter_tcp4open, isc_sockstatscounter_tcp4openfail,
- isc_sockstatscounter_tcp4close, isc_sockstatscounter_tcp4bindfail,
- isc_sockstatscounter_tcp4connectfail, isc_sockstatscounter_tcp4connect,
- isc_sockstatscounter_tcp4acceptfail, isc_sockstatscounter_tcp4accept,
- isc_sockstatscounter_tcp4sendfail, isc_sockstatscounter_tcp4recvfail,
- isc_sockstatscounter_tcp4active
-};
-static const isc_statscounter_t tcp6statsindex[] = {
- isc_sockstatscounter_tcp6open, isc_sockstatscounter_tcp6openfail,
- isc_sockstatscounter_tcp6close, isc_sockstatscounter_tcp6bindfail,
- isc_sockstatscounter_tcp6connectfail, isc_sockstatscounter_tcp6connect,
- isc_sockstatscounter_tcp6acceptfail, isc_sockstatscounter_tcp6accept,
- isc_sockstatscounter_tcp6sendfail, isc_sockstatscounter_tcp6recvfail,
- isc_sockstatscounter_tcp6active
-};
-static const isc_statscounter_t unixstatsindex[] = {
- isc_sockstatscounter_unixopen, isc_sockstatscounter_unixopenfail,
- isc_sockstatscounter_unixclose, isc_sockstatscounter_unixbindfail,
- isc_sockstatscounter_unixconnectfail, isc_sockstatscounter_unixconnect,
- isc_sockstatscounter_unixacceptfail, isc_sockstatscounter_unixaccept,
- isc_sockstatscounter_unixsendfail, isc_sockstatscounter_unixrecvfail,
- isc_sockstatscounter_unixactive
-};
-static const isc_statscounter_t rawstatsindex[] = {
- isc_sockstatscounter_rawopen,
- isc_sockstatscounter_rawopenfail,
- isc_sockstatscounter_rawclose,
- -1, /* STATID_BINDFAIL: no counter */
- -1, /* STATID_CONNECTFAIL: no counter */
- -1, /* STATID_CONNECT: no counter */
- -1, /* STATID_ACCEPTFAIL: no counter */
- -1, /* STATID_ACCEPT: no counter */
- -1, /* STATID_SENDFAIL: no counter */
- isc_sockstatscounter_rawrecvfail,
- isc_sockstatscounter_rawactive
-};
-
-static int
-gen_threadid(isc_socket_t *sock);
-
-/*
- * Compute the watcher thread index for 'sock': the socket's descriptor
- * modulo the number of threads of the owning manager.
- */
-static int
-gen_threadid(isc_socket_t *sock) {
-	int nthreads = sock->manager->nthreads;
-
-	return (sock->fd % nthreads);
-}
-
-static void
-manager_log(isc_socketmgr_t *sockmgr, isc_logcategory_t *category,
-	    isc_logmodule_t *module, int level, const char *fmt, ...)
-	ISC_FORMAT_PRINTF(5, 6);
-
-/*
- * printf-style logging on behalf of a socket manager.
- *
- * Nothing is formatted unless isc_log_wouldlog() reports that 'level' is
- * currently being logged. The caller's message is rendered into a local
- * 2048-byte buffer (vsnprintf() truncates anything longer) and written
- * with the manager pointer as a prefix.
- */
-static void
-manager_log(isc_socketmgr_t *sockmgr, isc_logcategory_t *category,
-	    isc_logmodule_t *module, int level, const char *fmt, ...) {
-	if (isc_log_wouldlog(isc_lctx, level)) {
-		char text[2048];
-		va_list args;
-
-		va_start(args, fmt);
-		vsnprintf(text, sizeof(text), fmt, args);
-		va_end(args);
-
-		isc_log_write(isc_lctx, category, module, level,
-			      "sockmgr %p: %s", sockmgr, text);
-	}
-}
-
-static void
-thread_log(isc__socketthread_t *thread, isc_logcategory_t *category,
-	   isc_logmodule_t *module, int level, const char *fmt, ...)
-	ISC_FORMAT_PRINTF(5, 6);
-
-/*
- * printf-style logging on behalf of a watcher thread.
- *
- * Nothing is formatted unless isc_log_wouldlog() reports that 'level' is
- * currently being logged. The caller's message is rendered into a local
- * 2048-byte buffer (vsnprintf() truncates anything longer) and written
- * with the thread's manager pointer and thread id as a prefix.
- */
-static void
-thread_log(isc__socketthread_t *thread, isc_logcategory_t *category,
-	   isc_logmodule_t *module, int level, const char *fmt, ...) {
-	if (isc_log_wouldlog(isc_lctx, level)) {
-		char text[2048];
-		va_list args;
-
-		va_start(args, fmt);
-		vsnprintf(text, sizeof(text), fmt, args);
-		va_end(args);
-
-		isc_log_write(isc_lctx, category, module, level,
-			      "sockmgr %p thread %d: %s", thread->manager,
-			      thread->threadid, text);
-	}
-}
-
-static void
-socket_log(isc_socket_t *sock, const isc_sockaddr_t *address,
-	   isc_logcategory_t *category, isc_logmodule_t *module, int level,
-	   const char *fmt, ...) ISC_FORMAT_PRINTF(6, 7);
-
-/*
- * printf-style logging on behalf of a socket.
- *
- * Nothing is formatted unless isc_log_wouldlog() reports that 'level' is
- * currently being logged. The caller's message is rendered into a local
- * 2048-byte buffer (vsnprintf() truncates anything longer). When 'address'
- * is not NULL its textual form is included between the socket pointer and
- * the message; otherwise only the socket pointer is used as a prefix.
- */
-static void
-socket_log(isc_socket_t *sock, const isc_sockaddr_t *address,
-	   isc_logcategory_t *category, isc_logmodule_t *module, int level,
-	   const char *fmt, ...) {
-	if (isc_log_wouldlog(isc_lctx, level)) {
-		char text[2048];
-		va_list args;
-
-		va_start(args, fmt);
-		vsnprintf(text, sizeof(text), fmt, args);
-		va_end(args);
-
-		if (address != NULL) {
-			char peer[ISC_SOCKADDR_FORMATSIZE];
-
-			isc_sockaddr_format(address, peer, sizeof(peer));
-			isc_log_write(isc_lctx, category, module, level,
-				      "socket %p %s: %s", sock, peer, text);
-		} else {
-			isc_log_write(isc_lctx, category, module, level,
-				      "socket %p: %s", sock, text);
-		}
-	}
-}
-
-/*%
- * Add one to the socket statistics counter 'counterid'.
- *
- * 'counterid' must not be -1 (the "no such counter" marker). A NULL
- * 'stats' is accepted and makes the call a no-op.
- */
-static inline void
-inc_stats(isc_stats_t *stats, isc_statscounter_t counterid) {
-	REQUIRE(counterid != -1);
-
-	if (stats == NULL) {
-		return;
-	}
-
-	isc_stats_increment(stats, counterid);
-}
-
-/*%
- * Subtract one from the socket statistics counter 'counterid'.
- *
- * 'counterid' must not be -1 (the "no such counter" marker). A NULL
- * 'stats' is accepted and makes the call a no-op.
- */
-static inline void
-dec_stats(isc_stats_t *stats, isc_statscounter_t counterid) {
-	REQUIRE(counterid != -1);
-
-	if (stats == NULL) {
-		return;
-	}
-
-	isc_stats_decrement(stats, counterid);
-}
-/*
- * Start watching 'fd' for the kind of event named by 'msg', using the
- * multiplexing backend selected at compile time.
- *
- * In the kqueue, epoll and /dev/poll variants SELECT_POKE_READ requests
- * read events and every other value of 'msg' requests write events. The
- * select() variant acts only on SELECT_POKE_READ and SELECT_POKE_WRITE
- * and does nothing for any other value.
- *
- * Returns ISC_R_SUCCESS, or the isc__errno2result() translation of errno
- * when the kernel call fails. The select() variant always returns
- * ISC_R_SUCCESS.
- */
-static inline isc_result_t
-watch_fd(isc__socketthread_t *thread, int fd, int msg) {
- isc_result_t result = ISC_R_SUCCESS;
-
-#ifdef USE_KQUEUE
- struct kevent evchange;
-
- memset(&evchange, 0, sizeof(evchange));
- if (msg == SELECT_POKE_READ) {
- evchange.filter = EVFILT_READ;
- } else {
- evchange.filter = EVFILT_WRITE;
- }
- evchange.flags = EV_ADD;
- evchange.ident = fd;
- if (kevent(thread->kqueue_fd, &evchange, 1, NULL, 0, NULL) != 0) {
- result = isc__errno2result(errno);
- }
-
- return (result);
-#elif defined(USE_EPOLL)
- struct epoll_event event;
- uint32_t oldevents;
- int ret;
- int op;
-
- oldevents = thread->epoll_events[fd]; /* mask before this request */
- if (msg == SELECT_POKE_READ) {
- thread->epoll_events[fd] |= EPOLLIN;
- } else {
- thread->epoll_events[fd] |= EPOLLOUT;
- }
-
- event.events = thread->epoll_events[fd];
- memset(&event.data, 0, sizeof(event.data));
- event.data.fd = fd;
- /* An empty previous mask means the fd is added, otherwise modified. */
- op = (oldevents == 0U) ? EPOLL_CTL_ADD : EPOLL_CTL_MOD;
- if (thread->fds[fd] != NULL) { /* hold the socket's own lock across epoll_ctl() */
- LOCK(&thread->fds[fd]->lock);
- }
- ret = epoll_ctl(thread->epoll_fd, op, fd, &event);
- if (thread->fds[fd] != NULL) {
- UNLOCK(&thread->fds[fd]->lock);
- }
- if (ret == -1) {
- if (errno == EEXIST) {
- UNEXPECTED_ERROR(__FILE__, __LINE__,
- "epoll_ctl(ADD/MOD) returned "
- "EEXIST for fd %d",
- fd);
- }
- result = isc__errno2result(errno); /* NOTE(review): errno is read after the logging call above, which may have changed it -- confirm */
- }
-
- return (result);
-#elif defined(USE_DEVPOLL)
- struct pollfd pfd;
-
- memset(&pfd, 0, sizeof(pfd));
- if (msg == SELECT_POKE_READ) {
- pfd.events = POLLIN;
- } else {
- pfd.events = POLLOUT;
- }
- pfd.fd = fd;
- pfd.revents = 0;
- if (write(thread->devpoll_fd, &pfd, sizeof(pfd)) == -1) {
- result = isc__errno2result(errno);
- } else { /* record the direction now polled; unwatch_fd() consults these bits */
- if (msg == SELECT_POKE_READ) {
- thread->fdpollinfo[fd].want_read = 1;
- } else {
- thread->fdpollinfo[fd].want_write = 1;
- }
- }
-
- return (result);
-#elif defined(USE_SELECT)
- LOCK(&thread->manager->lock);
- if (msg == SELECT_POKE_READ) {
- FD_SET(fd, thread->read_fds);
- }
- if (msg == SELECT_POKE_WRITE) {
- FD_SET(fd, thread->write_fds);
- }
- UNLOCK(&thread->manager->lock);
-
- return (result);
-#endif /* ifdef USE_KQUEUE */
-}
-/*
- * Stop watching 'fd' for the kind of event named by 'msg', using the
- * multiplexing backend selected at compile time.
- *
- * In the kqueue and epoll variants SELECT_POKE_READ cancels read events
- * and every other value of 'msg' cancels write events. The select()
- * variant acts only on SELECT_POKE_READ and SELECT_POKE_WRITE.
- *
- * Returns ISC_R_SUCCESS on success. On failure the kqueue and /dev/poll
- * variants return the isc__errno2result() translation of errno, and the
- * epoll variant logs the error and returns ISC_R_UNEXPECTED, except that
- * ENOENT from epoll_ctl() is not treated as a failure. The select()
- * variant always returns ISC_R_SUCCESS.
- */
-static inline isc_result_t
-unwatch_fd(isc__socketthread_t *thread, int fd, int msg) {
- isc_result_t result = ISC_R_SUCCESS;
-
-#ifdef USE_KQUEUE
- struct kevent evchange;
-
- memset(&evchange, 0, sizeof(evchange));
- if (msg == SELECT_POKE_READ) {
- evchange.filter = EVFILT_READ;
- } else {
- evchange.filter = EVFILT_WRITE;
- }
- evchange.flags = EV_DELETE;
- evchange.ident = fd;
- if (kevent(thread->kqueue_fd, &evchange, 1, NULL, 0, NULL) != 0) {
- result = isc__errno2result(errno);
- }
-
- return (result);
-#elif defined(USE_EPOLL)
- struct epoll_event event;
- int ret;
- int op;
-
- if (msg == SELECT_POKE_READ) {
- thread->epoll_events[fd] &= ~(EPOLLIN);
- } else {
- thread->epoll_events[fd] &= ~(EPOLLOUT);
- }
-
- event.events = thread->epoll_events[fd];
- memset(&event.data, 0, sizeof(event.data));
- event.data.fd = fd;
- /* Delete the fd once no direction is left, otherwise modify its mask. */
- op = (event.events == 0U) ? EPOLL_CTL_DEL : EPOLL_CTL_MOD;
- ret = epoll_ctl(thread->epoll_fd, op, fd, &event);
- if (ret == -1 && errno != ENOENT) {
- char strbuf[ISC_STRERRORSIZE];
- strerror_r(errno, strbuf, sizeof(strbuf));
- UNEXPECTED_ERROR(__FILE__, __LINE__, "epoll_ctl(DEL), %d: %s",
- fd, strbuf);
- result = ISC_R_UNEXPECTED;
- }
- return (result);
-#elif defined(USE_DEVPOLL)
- struct pollfd pfds[2];
- size_t writelen = sizeof(pfds[0]); /* grows to two entries when the other direction must be re-armed */
-
- memset(pfds, 0, sizeof(pfds));
- pfds[0].events = POLLREMOVE;
- pfds[0].fd = fd;
-
- /*
- * Canceling read or write polling via /dev/poll is tricky. Since it
- * only provides a way of canceling per FD, we may need to re-poll the
- * socket for the other operation.
- */
- if (msg == SELECT_POKE_READ && thread->fdpollinfo[fd].want_write == 1) {
- pfds[1].events = POLLOUT;
- pfds[1].fd = fd;
- writelen += sizeof(pfds[1]);
- }
- if (msg == SELECT_POKE_WRITE && thread->fdpollinfo[fd].want_read == 1) {
- pfds[1].events = POLLIN;
- pfds[1].fd = fd;
- writelen += sizeof(pfds[1]);
- }
-
- if (write(thread->devpoll_fd, pfds, writelen) == -1) {
- result = isc__errno2result(errno);
- } else { /* the bookkeeping bits change only when the write succeeded */
- if (msg == SELECT_POKE_READ) {
- thread->fdpollinfo[fd].want_read = 0;
- } else {
- thread->fdpollinfo[fd].want_write = 0;
- }
- }
-
- return (result);
-#elif defined(USE_SELECT)
- LOCK(&thread->manager->lock);
- if (msg == SELECT_POKE_READ) {
- FD_CLR(fd, thread->read_fds);
- } else if (msg == SELECT_POKE_WRITE) {
- FD_CLR(fd, thread->write_fds);
- }
- UNLOCK(&thread->manager->lock);
-
- return (result);
-#endif /* ifdef USE_KQUEUE */
-}
-
-/*
- * A poke message was received, perform a proper watch/unwatch
- * on a fd provided
- *
- * 'fd' must satisfy 0 <= fd < manager->maxsocks.
- *
- * SELECT_POKE_CLOSE: the fd must be in state CLOSE_PENDING; it is marked
- * CLOSED, unwatched in both directions and close()d.
- * Any other message: a CLOSE_PENDING fd is only unwatched in both
- * directions, an fd that is not MANAGED is ignored, and a MANAGED fd is
- * handed to watch_fd() with 'msg'. A watch_fd() failure is logged at
- * ISC_LOG_ERROR and not reported to the caller.
- *
- * All of the above runs under the fdlock bucket chosen by FDLOCK_ID(fd).
- */
-static void
-wakeup_socket(isc__socketthread_t *thread, int fd, int msg) {
- isc_result_t result;
- int lockid = FDLOCK_ID(fd);
-
- /*
- * This is a wakeup on a socket. If the socket is not in the
- * process of being closed, start watching it for either reads
- * or writes.
- */
-
- INSIST(fd >= 0 && fd < (int)thread->manager->maxsocks);
-
- if (msg == SELECT_POKE_CLOSE) {
- LOCK(&thread->fdlock[lockid]);
- INSIST(thread->fdstate[fd] == CLOSE_PENDING);
- thread->fdstate[fd] = CLOSED;
- (void)unwatch_fd(thread, fd, SELECT_POKE_READ);
- (void)unwatch_fd(thread, fd, SELECT_POKE_WRITE);
- (void)close(fd);
- UNLOCK(&thread->fdlock[lockid]);
- return;
- }
-
- LOCK(&thread->fdlock[lockid]);
- if (thread->fdstate[fd] == CLOSE_PENDING) {
- /*
- * We accept (and ignore) any error from unwatch_fd() as we are
- * closing the socket, hoping it doesn't leave dangling state in
- * the kernel.
- * Note that unwatch_fd() must be called after releasing the
- * fdlock; otherwise it could cause deadlock due to a lock order
- * reversal.
- * NOTE(review): the two calls below are made while the fdlock
- * is still held, which does not match the note above -- confirm
- * which of the two is intended.
- */
- (void)unwatch_fd(thread, fd, SELECT_POKE_READ);
- (void)unwatch_fd(thread, fd, SELECT_POKE_WRITE);
- UNLOCK(&thread->fdlock[lockid]);
- return;
- }
- if (thread->fdstate[fd] != MANAGED) {
- UNLOCK(&thread->fdlock[lockid]);
- return;
- }
-
- /*
- * Set requested bit.
- */
- result = watch_fd(thread, fd, msg);
- if (result != ISC_R_SUCCESS) {
- /*
- * XXXJT: what should we do? Ignoring the failure of watching
- * a socket will make the application dysfunctional, but there
- * seems to be no reasonable recovery process.
- */
- isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
- ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
- "failed to start watching FD (%d): %s", fd,
- isc_result_totext(result));
- }
- UNLOCK(&thread->fdlock[lockid]);
-}
-
-/*
- * Poke the select loop when there is something for us to do.
- *
- * The pair {fd, msg} is written to the write end of the pipe belonging
- * to watcher thread 'threadid'. The write is required (by POSIX) to
- * complete, that is, we will not get partial writes. It is retried for
- * as long as it fails with a soft error (see SOFT_ERROR); any other
- * failure is fatal.
- */
-static void
-select_poke(isc_socketmgr_t *mgr, int threadid, int fd, int msg) {
-	int pokemsg[2] = { fd, msg };
-	int nwritten;
-
-	for (;;) {
-		nwritten = write(mgr->threads[threadid].pipe_fds[1], pokemsg,
-				 sizeof(pokemsg));
-		if (nwritten >= 0) {
-			break;
-		}
-#ifdef ENOSR
-		/*
-		 * ENOSR is handled like EAGAIN, but with a pause before
-		 * the retry because the condition is unlikely to clear
-		 * quickly.
-		 */
-		if (errno == ENOSR) {
-			sleep(1);
-			errno = EAGAIN;
-		}
-#endif /* ifdef ENOSR */
-		if (!SOFT_ERROR(errno)) {
-			break;
-		}
-	}
-
-	if (nwritten < 0) {
-		char strbuf[ISC_STRERRORSIZE];
-
-		strerror_r(errno, strbuf, sizeof(strbuf));
-		FATAL_ERROR(__FILE__, __LINE__,
-			    "write() failed during watcher poke: %s", strbuf);
-	}
-
-	INSIST(nwritten == sizeof(pokemsg));
-}
-
-/*
- * Read one {fd, msg} pair from the read end of the thread's internal pipe.
- *
- * When read() fails, *fd is set to -1 and *msg to SELECT_POKE_NOTHING; a
- * soft error (see SOFT_ERROR) then simply returns, any other error is
- * fatal. A successful read must deliver the complete pair.
- */
-static void
-select_readmsg(isc__socketthread_t *thread, int *fd, int *msg) {
-	int pokemsg[2];
-	int nread = read(thread->pipe_fds[0], pokemsg, sizeof(pokemsg));
-
-	if (nread < 0) {
-		char strbuf[ISC_STRERRORSIZE];
-
-		*fd = -1; /* Silence compiler. */
-		*msg = SELECT_POKE_NOTHING;
-		if (SOFT_ERROR(errno)) {
-			return;
-		}
-
-		strerror_r(errno, strbuf, sizeof(strbuf));
-		FATAL_ERROR(__FILE__, __LINE__,
-			    "read() failed during watcher poke: %s", strbuf);
-	}
-	INSIST(nread == sizeof(pokemsg));
-
-	*fd = pokemsg[0];
-	*msg = pokemsg[1];
-}
-
-/*
- * Make a fd non-blocking.
- *
- * With USE_FIONBIO_IOCTL the FIONBIO ioctl is used. Otherwise the current
- * file status flags are fetched with fcntl(F_GETFL), O_NONBLOCK is added
- * and the result is stored back with fcntl(F_SETFL).
- *
- * Returns ISC_R_SUCCESS, or ISC_R_UNEXPECTED (after logging the failing
- * call and the system error text) when any of the system calls fails.
- */
-static isc_result_t
-make_nonblock(int fd) {
-	char strbuf[ISC_STRERRORSIZE];
-#ifdef USE_FIONBIO_IOCTL
-	int on = 1;
-
-	if (ioctl(fd, FIONBIO, (char *)&on) == -1) {
-		strerror_r(errno, strbuf, sizeof(strbuf));
-		UNEXPECTED_ERROR(__FILE__, __LINE__,
-				 "ioctl(%d, FIONBIO, &on): %s", fd, strbuf);
-		return (ISC_R_UNEXPECTED);
-	}
-#else /* ifdef USE_FIONBIO_IOCTL */
-	int flags = fcntl(fd, F_GETFL, 0);
-
-	/*
-	 * F_GETFL reports failure as -1. Without this check, -1 with
-	 * O_NONBLOCK or-ed in (every flag bit set) would be handed to
-	 * F_SETFL, and a failure would be blamed on the wrong call.
-	 */
-	if (flags == -1) {
-		strerror_r(errno, strbuf, sizeof(strbuf));
-		UNEXPECTED_ERROR(__FILE__, __LINE__,
-				 "fcntl(%d, F_GETFL, 0): %s", fd, strbuf);
-		return (ISC_R_UNEXPECTED);
-	}
-
-	flags |= O_NONBLOCK;
-	if (fcntl(fd, F_SETFL, flags) == -1) {
-		strerror_r(errno, strbuf, sizeof(strbuf));
-		UNEXPECTED_ERROR(__FILE__, __LINE__,
-				 "fcntl(%d, F_SETFL, %d): %s", fd, flags,
-				 strbuf);
-		return (ISC_R_UNEXPECTED);
-	}
-#endif /* ifdef USE_FIONBIO_IOCTL */
-
-	return (ISC_R_SUCCESS);
-}
-
-#ifdef USE_CMSG
-/*
- * Not all OSes support advanced CMSG macros: CMSG_LEN and CMSG_SPACE.
- * In order to ensure as much portability as possible, we provide wrapper
- * functions of these macros.
- * Note that cmsg_space() could run slow on OSes that do not have
- * CMSG_SPACE.
- *
- * cmsg_len(len) returns CMSG_LEN(len) where that macro exists. Otherwise
- * it returns 'len' plus the offset of the data area within a control
- * message header, which it obtains by applying CMSG_DATA to a null
- * struct cmsghdr pointer.
- */
-static inline socklen_t
-cmsg_len(socklen_t len) {
-#ifdef CMSG_LEN
- return (CMSG_LEN(len));
-#else /* ifdef CMSG_LEN */
- socklen_t hdrlen;
-
- /*
- * Cast NULL so that any pointer arithmetic performed by CMSG_DATA
- * is correct.
- * NOTE(review): the result is a pointer converted directly to
- * socklen_t; where pointers are wider than socklen_t this is a
- * narrowing pointer-to-integer cast -- confirm it is acceptable.
- */
- hdrlen = (socklen_t)CMSG_DATA(((struct cmsghdr *)NULL));
- return (hdrlen + len);
-#endif /* ifdef CMSG_LEN */
-}
-/*
- * cmsg_space(len) returns CMSG_SPACE(len) where that macro exists.
- * Otherwise it places a single control message header of length
- * cmsg_len(len) at the start of a scratch buffer, asks CMSG_NXTHDR for
- * the header that would follow it, and returns that header's offset from
- * the start of the buffer. It returns 0 when CMSG_NXTHDR yields NULL.
- */
-static inline socklen_t
-cmsg_space(socklen_t len) {
-#ifdef CMSG_SPACE
- return (CMSG_SPACE(len));
-#else /* ifdef CMSG_SPACE */
- struct msghdr msg;
- struct cmsghdr *cmsgp;
- /*
- * XXX: The buffer length is an ad-hoc value, but should be enough
- * in a practical sense.
- */
- char dummybuf[sizeof(struct cmsghdr) + 1024];
-
- memset(&msg, 0, sizeof(msg));
- msg.msg_control = dummybuf;
- msg.msg_controllen = sizeof(dummybuf);
-
- cmsgp = (struct cmsghdr *)dummybuf;
- cmsgp->cmsg_len = cmsg_len(len);
-
- cmsgp = CMSG_NXTHDR(&msg, cmsgp);
- if (cmsgp != NULL) {
- return ((char *)cmsgp - (char *)msg.msg_control);
- } else {
- return (0);
- }
-#endif /* ifdef CMSG_SPACE */
-}
-#endif /* USE_CMSG */
-
-/*
- * Process control messages received on a socket.
- *
- * The MSG_TRUNC and MSG_CTRUNC bits of msg->msg_flags are first mirrored
- * into dev->attributes. Then, when USE_CMSG is defined and the message
- * carries control data, every control message header is visited and the
- * recognised ones are copied into 'dev':
- *   IPPROTO_IPV6 / IPV6_PKTINFO -> dev->pktinfo, plus the PKTINFO
- *                                  attribute and, for a multicast
- *                                  address, the MULTICAST attribute;
- *   SOL_SOCKET / SCM_TIMESTAMP  -> dev->timestamp (microseconds are
- *                                  converted to nanoseconds);
- *   IPPROTO_IPV6 / IPV6_TCLASS  -> dev->dscp;
- *   IPPROTO_IP / IP_TOS (or IP_RECVTOS) -> dev->dscp.
- * Control messages of any other kind are skipped.
- */
-static void
-process_cmsg(isc_socket_t *sock, struct msghdr *msg, isc_socketevent_t *dev) {
-#ifdef USE_CMSG
- struct cmsghdr *cmsgp;
- struct in6_pktinfo *pktinfop;
-#ifdef SO_TIMESTAMP
- void *timevalp;
-#endif /* ifdef SO_TIMESTAMP */
-#endif /* ifdef USE_CMSG */
-
- /*
- * msg and dev are referenced only under MSG_TRUNC, MSG_CTRUNC or
- * USE_CMSG, and sock only under USE_CMSG (for logging); the
- * UNUSED() markers presumably keep the compiler quiet when the
- * preprocessor removes every use of a parameter.
- * They are all here, outside of the CPP tests, because it is
- * more consistent with the usual ISC coding style.
- */
- UNUSED(sock);
- UNUSED(msg);
- UNUSED(dev);
-
-#ifdef MSG_TRUNC
- if ((msg->msg_flags & MSG_TRUNC) != 0) {
- dev->attributes |= ISC_SOCKEVENTATTR_TRUNC;
- }
-#endif /* ifdef MSG_TRUNC */
-
-#ifdef MSG_CTRUNC
- if ((msg->msg_flags & MSG_CTRUNC) != 0) {
- dev->attributes |= ISC_SOCKEVENTATTR_CTRUNC;
- }
-#endif /* ifdef MSG_CTRUNC */
-
-#ifndef USE_CMSG
- return;
-#else /* ifndef USE_CMSG */
- if (msg->msg_controllen == 0U || msg->msg_control == NULL) {
- return;
- }
-
-#ifdef SO_TIMESTAMP
- timevalp = NULL;
-#endif /* ifdef SO_TIMESTAMP */
- pktinfop = NULL;
-
- cmsgp = CMSG_FIRSTHDR(msg);
- while (cmsgp != NULL) {
- socket_log(sock, NULL, TRACE, "processing cmsg %p", cmsgp);
-
- if (cmsgp->cmsg_level == IPPROTO_IPV6 &&
- cmsgp->cmsg_type == IPV6_PKTINFO) {
- pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp);
- memmove(&dev->pktinfo, pktinfop,
- sizeof(struct in6_pktinfo));
- dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO;
- socket_log(sock, NULL, TRACE,
- "interface received on ifindex %u",
- dev->pktinfo.ipi6_ifindex);
- if (IN6_IS_ADDR_MULTICAST(&pktinfop->ipi6_addr)) {
- dev->attributes |= ISC_SOCKEVENTATTR_MULTICAST;
- }
- goto next;
- }
-
-#ifdef SO_TIMESTAMP
- if (cmsgp->cmsg_level == SOL_SOCKET &&
- cmsgp->cmsg_type == SCM_TIMESTAMP) {
- struct timeval tv;
- timevalp = CMSG_DATA(cmsgp);
- memmove(&tv, timevalp, sizeof(tv));
- dev->timestamp.seconds = tv.tv_sec;
- dev->timestamp.nanoseconds = tv.tv_usec * 1000;
- dev->attributes |= ISC_SOCKEVENTATTR_TIMESTAMP;
- goto next;
- }
-#endif /* ifdef SO_TIMESTAMP */
-
-#ifdef IPV6_TCLASS
- if (cmsgp->cmsg_level == IPPROTO_IPV6 &&
- cmsgp->cmsg_type == IPV6_TCLASS) {
- dev->dscp = *(int *)CMSG_DATA(cmsgp); /* read in place as an int, unlike the memmove() copies above */
- dev->dscp >>= 2; /* discard the two low-order bits */
- dev->attributes |= ISC_SOCKEVENTATTR_DSCP;
- goto next;
- }
-#endif /* ifdef IPV6_TCLASS */
-
-#ifdef IP_TOS
- if (cmsgp->cmsg_level == IPPROTO_IP &&
- (cmsgp->cmsg_type == IP_TOS
-#ifdef IP_RECVTOS
- || cmsgp->cmsg_type == IP_RECVTOS
-#endif /* ifdef IP_RECVTOS */
- ))
- {
- dev->dscp = (int)*(unsigned char *)CMSG_DATA(cmsgp); /* only the first data byte is used */
- dev->dscp >>= 2; /* discard the two low-order bits */
- dev->attributes |= ISC_SOCKEVENTATTR_DSCP;
- goto next;
- }
-#endif /* ifdef IP_TOS */
- next:
- cmsgp = CMSG_NXTHDR(msg, cmsgp);
- }
-#endif /* USE_CMSG */
-}
-
-/*
- * Construct an iov array and attach it to the msghdr passed in.  This is
- * the SEND constructor: the single iovec covers the part of dev->region
- * that has not been sent yet (from offset dev->n to dev->region.length).
- *
- * For sockets that are not connected the destination is taken from
- * dev->address.  When USE_CMSG is defined, 'cmsgbuf' (SENDCMSGBUFLEN
- * bytes) is filled with whichever control messages apply to a UDP send:
- * IPV6_PKTINFO, IPV6_USE_MIN_MTU, and the DSCP value as IP_TOS or
- * IPV6_TCLASS.  If the socket does not send DSCP per packet
- * (sock->pktdscp is false) the value is applied with setsockopt() instead
- * and remembered in sock->dscp so the call is not repeated while the
- * value stays the same.
- *
- * Nothing except write_countp can be NULL.
- *
- * If write_countp != NULL, *write_countp will hold the number of bytes
- * this transaction can send.
- */
-static void
-build_msghdr_send(isc_socket_t *sock, char *cmsgbuf, isc_socketevent_t *dev,
-		  struct msghdr *msg, struct iovec *iov, size_t *write_countp) {
-	unsigned int iovcount;
-	size_t write_count;
-	struct cmsghdr *cmsgp;
-
-	memset(msg, 0, sizeof(*msg));
-
-	if (!sock->connected) {
-		msg->msg_name = (void *)&dev->address.type.sa;
-		msg->msg_namelen = dev->address.length;
-	} else {
-		msg->msg_name = NULL;
-		msg->msg_namelen = 0;
-	}
-
-	write_count = dev->region.length - dev->n;
-	iov[0].iov_base = (void *)(dev->region.base + dev->n);
-	iov[0].iov_len = write_count;
-	iovcount = 1;
-
-	msg->msg_iov = iov;
-	msg->msg_iovlen = iovcount;
-	msg->msg_control = NULL;
-	msg->msg_controllen = 0;
-	msg->msg_flags = 0;
-#if defined(USE_CMSG)
-
-	if ((sock->type == isc_sockettype_udp) &&
-	    ((dev->attributes & ISC_SOCKEVENTATTR_PKTINFO) != 0))
-	{
-		struct in6_pktinfo *pktinfop;
-
-		socket_log(sock, NULL, TRACE, "sendto pktinfo data, ifindex %u",
-			   dev->pktinfo.ipi6_ifindex);
-
-		msg->msg_control = (void *)cmsgbuf;
-		msg->msg_controllen = cmsg_space(sizeof(struct in6_pktinfo));
-		INSIST(msg->msg_controllen <= SENDCMSGBUFLEN);
-
-		cmsgp = (struct cmsghdr *)cmsgbuf;
-		cmsgp->cmsg_level = IPPROTO_IPV6;
-		cmsgp->cmsg_type = IPV6_PKTINFO;
-		cmsgp->cmsg_len = cmsg_len(sizeof(struct in6_pktinfo));
-		pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp);
-		memmove(pktinfop, &dev->pktinfo, sizeof(struct in6_pktinfo));
-	}
-
-#if defined(IPV6_USE_MIN_MTU)
-	if ((sock->type == isc_sockettype_udp) && (sock->pf == AF_INET6) &&
-	    ((dev->attributes & ISC_SOCKEVENTATTR_USEMINMTU) != 0))
-	{
-		int use_min_mtu = 1; /* -1, 0, 1 */
-
-		/* Append after whatever control data is already present. */
-		cmsgp = (struct cmsghdr *)(cmsgbuf + msg->msg_controllen);
-		msg->msg_control = (void *)cmsgbuf;
-		msg->msg_controllen += cmsg_space(sizeof(use_min_mtu));
-		INSIST(msg->msg_controllen <= SENDCMSGBUFLEN);
-
-		cmsgp->cmsg_level = IPPROTO_IPV6;
-		cmsgp->cmsg_type = IPV6_USE_MIN_MTU;
-		cmsgp->cmsg_len = cmsg_len(sizeof(use_min_mtu));
-		memmove(CMSG_DATA(cmsgp), &use_min_mtu, sizeof(use_min_mtu));
-	}
-#endif /* if defined(IPV6_USE_MIN_MTU) */
-
-	if (isc_dscp_check_value > -1) {
-		if (sock->type == isc_sockettype_udp) {
-			INSIST((int)dev->dscp == isc_dscp_check_value);
-		} else if (sock->type == isc_sockettype_tcp) {
-			INSIST((int)sock->dscp == isc_dscp_check_value);
-		}
-	}
-
-#if defined(IP_TOS) || (defined(IPPROTO_IPV6) && defined(IPV6_TCLASS))
-	if ((sock->type == isc_sockettype_udp) &&
-	    ((dev->attributes & ISC_SOCKEVENTATTR_DSCP) != 0))
-	{
-		/* 'dscp' is the on-the-wire TOS/TCLASS octet (DSCP << 2). */
-		int dscp = (dev->dscp << 2) & 0xff;
-
-		INSIST(dev->dscp < 0x40);
-
-#ifdef IP_TOS
-		if (sock->pf == AF_INET && sock->pktdscp) {
-			cmsgp = (struct cmsghdr *)(cmsgbuf +
-						   msg->msg_controllen);
-			msg->msg_control = (void *)cmsgbuf;
-			msg->msg_controllen += cmsg_space(sizeof(dscp));
-			INSIST(msg->msg_controllen <= SENDCMSGBUFLEN);
-
-			cmsgp->cmsg_level = IPPROTO_IP;
-			cmsgp->cmsg_type = IP_TOS;
-			cmsgp->cmsg_len = cmsg_len(sizeof(char));
-			*(unsigned char *)CMSG_DATA(cmsgp) = dscp;
-		} else if (sock->pf == AF_INET && sock->dscp != dev->dscp) {
-			if (setsockopt(sock->fd, IPPROTO_IP, IP_TOS,
-				       (void *)&dscp, sizeof(int)) < 0) {
-				char strbuf[ISC_STRERRORSIZE];
-				strerror_r(errno, strbuf, sizeof(strbuf));
-				UNEXPECTED_ERROR(__FILE__, __LINE__,
-						 "setsockopt(%d, IP_TOS, %.02x)"
-						 " failed: %s",
-						 sock->fd, dscp >> 2, strbuf);
-			} else {
-				/*
-				 * Cache the unshifted value: it is what the
-				 * guard above compares against.
-				 */
-				sock->dscp = dev->dscp;
-			}
-		}
-#endif /* ifdef IP_TOS */
-#if defined(IPPROTO_IPV6) && defined(IPV6_TCLASS)
-		if (sock->pf == AF_INET6 && sock->pktdscp) {
-			cmsgp = (struct cmsghdr *)(cmsgbuf +
-						   msg->msg_controllen);
-			msg->msg_control = (void *)cmsgbuf;
-			msg->msg_controllen += cmsg_space(sizeof(dscp));
-			INSIST(msg->msg_controllen <= SENDCMSGBUFLEN);
-
-			cmsgp->cmsg_level = IPPROTO_IPV6;
-			cmsgp->cmsg_type = IPV6_TCLASS;
-			cmsgp->cmsg_len = cmsg_len(sizeof(dscp));
-			memmove(CMSG_DATA(cmsgp), &dscp, sizeof(dscp));
-		} else if (sock->pf == AF_INET6 && sock->dscp != dev->dscp) {
-			if (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_TCLASS,
-				       (void *)&dscp, sizeof(int)) < 0)
-			{
-				char strbuf[ISC_STRERRORSIZE];
-				strerror_r(errno, strbuf, sizeof(strbuf));
-				UNEXPECTED_ERROR(__FILE__, __LINE__,
-						 "setsockopt(%d, IPV6_TCLASS, "
-						 "%.02x) failed: %s",
-						 sock->fd, dscp >> 2, strbuf);
-			} else {
-				/* See the IP_TOS case: cache the raw DSCP. */
-				sock->dscp = dev->dscp;
-			}
-		}
-#endif /* if defined(IPPROTO_IPV6) && defined(IPV6_TCLASS) */
-		/* Clear the unused tail of the control buffer. */
-		if (msg->msg_controllen != 0 &&
-		    msg->msg_controllen < SENDCMSGBUFLEN) {
-			memset(cmsgbuf + msg->msg_controllen, 0,
-			       SENDCMSGBUFLEN - msg->msg_controllen);
-		}
-	}
-#endif /* if defined(IP_TOS) || (defined(IPPROTO_IPV6) && \
-	* defined(IPV6_TCLASS)) \
-	* */
-#endif /* USE_CMSG */
-
-	if (write_countp != NULL) {
-		*write_countp = write_count;
-	}
-}
-
-/*
- * RECV constructor: prepare 'msg' and 'iov' so that a recvmsg() call
- * stores incoming data in the unused tail of dev->region, i.e. from
- * offset dev->n up to dev->region.length.
- *
- * On UDP sockets dev->address is cleared and handed to the kernel so the
- * sender's address can be filled in; on every other socket type
- * dev->address is set to the socket's peer address.  When USE_CMSG is
- * defined, 'cmsgbuf' (RECVCMSGBUFLEN bytes) is attached for ancillary data.
- *
- * If read_countp != NULL, *read_countp will hold the number of bytes
- * this transaction can receive.
- */
-static void
-build_msghdr_recv(isc_socket_t *sock, char *cmsgbuf, isc_socketevent_t *dev,
-		  struct msghdr *msg, struct iovec *iov, size_t *read_countp) {
-	const bool is_udp = (sock->type == isc_sockettype_udp);
-	const size_t room = dev->region.length - dev->n;
-	unsigned int nvec = 0;
-
-	memset(msg, 0, sizeof(*msg));
-
-	if (!is_udp) {
-		/* Not a datagram socket: the peer is already known. */
-		dev->address = sock->peer_address;
-		msg->msg_name = NULL;
-		msg->msg_namelen = 0;
-	} else {
-		memset(&dev->address, 0, sizeof(dev->address));
-		msg->msg_name = (void *)&dev->address.type.sa;
-		msg->msg_namelen = sizeof(dev->address.type);
-	}
-
-	/* First (and normally only) vector: the free part of the region. */
-	iov[nvec].iov_base = (void *)(dev->region.base + dev->n);
-	iov[nvec].iov_len = room;
-	nvec++;
-
-#ifdef ISC_PLATFORM_RECVOVERFLOW
-	/*
-	 * If needed, set up to receive that one extra byte.
-	 */
-	if (is_udp) {
-		INSIST(nvec < MAXSCATTERGATHER_RECV);
-		iov[nvec].iov_base = (void *)(&sock->overflow);
-		iov[nvec].iov_len = 1;
-		nvec++;
-	}
-#endif /* ifdef ISC_PLATFORM_RECVOVERFLOW */
-
-	msg->msg_iov = iov;
-	msg->msg_iovlen = nvec;
-	msg->msg_flags = 0;
-
-#if defined(USE_CMSG)
-	msg->msg_control = cmsgbuf;
-	msg->msg_controllen = RECVCMSGBUFLEN;
-#else  /* if defined(USE_CMSG) */
-	msg->msg_control = NULL;
-	msg->msg_controllen = 0;
-#endif /* USE_CMSG */
-
-	if (read_countp != NULL) {
-		*read_countp = room;
-	}
-}
-
-/*
- * Choose the address stored in dev->address for a send request.
- *
- * TCP sockets always use the socket's peer address and 'address' must be
- * NULL.  UDP sockets use 'address' when one is supplied and fall back to
- * the peer address otherwise.  Other socket types leave 'dev' untouched.
- */
-static void
-set_dev_address(const isc_sockaddr_t *address, isc_socket_t *sock,
-		isc_socketevent_t *dev) {
-	if (sock->type == isc_sockettype_udp) {
-		dev->address = (address != NULL) ? *address
-						 : sock->peer_address;
-		return;
-	}
-
-	if (sock->type != isc_sockettype_tcp) {
-		return;
-	}
-
-	INSIST(address == NULL);
-	dev->address = sock->peer_address;
-}
-
-/*
- * Event destructor installed by allocate_socketevent(): forwards to the
- * destructor that was saved in the socket event's 'destroy' member.
- */
-static void
-destroy_socketevent(isc_event_t *event) {
-	isc_socketevent_t *sev = (isc_socketevent_t *)event;
-
-	sev->destroy(event);
-}
-
-/*
- * Allocate a socket event of type 'eventtype' through isc_event_allocate()
- * and reset its socket-specific members.  The destructor that came with the
- * event is kept in 'destroy' and replaced by destroy_socketevent(), which
- * chains to it.
- */
-static isc_socketevent_t *
-allocate_socketevent(isc_mem_t *mctx, void *sender, isc_eventtype_t eventtype,
-		     isc_taskaction_t action, void *arg) {
-	isc_socketevent_t *sev = (isc_socketevent_t *)isc_event_allocate(
-		mctx, sender, eventtype, action, arg, sizeof(*sev));
-
-	/* Chain the destructors: ours first, then the saved one. */
-	sev->destroy = sev->ev_destroy;
-	sev->ev_destroy = destroy_socketevent;
-
-	ISC_LINK_INIT(sev, ev_link);
-	sev->result = ISC_R_UNSET;
-	sev->region.base = NULL;
-	sev->n = 0;
-	sev->offset = 0;
-	sev->attributes = 0;
-	sev->dscp = 0;
-
-	return (sev);
-}
-
-#if defined(ISC_SOCKET_DEBUG)
-/*
- * Debugging aid: print the address, name, iovec array and control buffer
- * fields of 'msg' on stdout.
- */
-static void
-dump_msg(struct msghdr *msg) {
-	const unsigned int nvec = (unsigned int)msg->msg_iovlen;
-	unsigned int idx = 0;
-
-	printf("MSGHDR %p\n", msg);
-	printf("\tname %p, namelen %ld\n", msg->msg_name,
-	       (long)msg->msg_namelen);
-	printf("\tiov %p, iovlen %ld\n", msg->msg_iov, (long)msg->msg_iovlen);
-
-	while (idx < nvec) {
-		const struct iovec *vec = &msg->msg_iov[idx];
-
-		printf("\t\t%u\tbase %p, len %ld\n", idx, vec->iov_base,
-		       (long)vec->iov_len);
-		idx++;
-	}
-
-	printf("\tcontrol %p, controllen %ld\n", msg->msg_control,
-	       (long)msg->msg_controllen);
-}
-#endif /* if defined(ISC_SOCKET_DEBUG) */
-
-#define DOIO_SUCCESS 0 /* i/o ok, event sent; dev->result is ISC_R_SUCCESS */
-#define DOIO_SOFT 1 /* i/o ok, soft error or partial i/o, no event sent */
-#define DOIO_HARD 2 /* i/o error, event sent; dev->result holds the error */
-#define DOIO_EOF 3 /* EOF (zero-length read on TCP/UNIX), no event sent */
-/*
- * Read from 'sock' into the unused part of dev->region with a single
- * recvmsg() call and record the outcome in 'dev'.
- *
- * Returns:
- * DOIO_SUCCESS The region was filled, or dev->n reached dev->minimum.
- * dev->result contains ISC_R_SUCCESS.
- *
- * DOIO_HARD recvmsg() failed with an error that is not treated as
- * soft. dev->result contains the error and the RECVFAIL
- * statistic was incremented.
- *
- * DOIO_SOFT Nothing final happened: a soft errno, one of the
- * SOFT_OR_HARD errors on a socket that is not connected,
- * a UDP packet with source port zero, a UDP packet
- * larger than the manager's 'maxudp', or a short read
- * that left dev->n below dev->minimum.
- *
- * DOIO_EOF recvmsg() returned zero on a TCP or UNIX socket.
- */
-static int
-doio_recv(isc_socket_t *sock, isc_socketevent_t *dev) {
- int cc;
- struct iovec iov[MAXSCATTERGATHER_RECV];
- size_t read_count;
- struct msghdr msghdr;
- int recv_errno;
- char strbuf[ISC_STRERRORSIZE];
- char cmsgbuf[RECVCMSGBUFLEN] = { 0 };
-
- build_msghdr_recv(sock, cmsgbuf, dev, &msghdr, iov, &read_count);
-
-#if defined(ISC_SOCKET_DEBUG)
- dump_msg(&msghdr);
-#endif /* if defined(ISC_SOCKET_DEBUG) */
-
- cc = recvmsg(sock->fd, &msghdr, 0);
- recv_errno = errno; /* saved before any later call can overwrite errno */
-
-#if defined(ISC_SOCKET_DEBUG)
- dump_msg(&msghdr);
-#endif /* if defined(ISC_SOCKET_DEBUG) */
-
- if (cc < 0) {
- if (SOFT_ERROR(recv_errno)) {
- return (DOIO_SOFT);
- }
-
- if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
- strerror_r(recv_errno, strbuf, sizeof(strbuf));
- socket_log(sock, NULL, IOEVENT,
- "doio_recv: recvmsg(%d) %d bytes, err %d/%s",
- sock->fd, cc, recv_errno, strbuf);
- }
-
-#define SOFT_OR_HARD(_system, _isc) \
- if (recv_errno == _system) { \
- if (sock->connected) { \
- dev->result = _isc; \
- inc_stats(sock->manager->stats, \
- sock->statsindex[STATID_RECVFAIL]); \
- return (DOIO_HARD); \
- } \
- return (DOIO_SOFT); \
- }
-#define ALWAYS_HARD(_system, _isc) \
- if (recv_errno == _system) { \
- dev->result = _isc; \
- inc_stats(sock->manager->stats, \
- sock->statsindex[STATID_RECVFAIL]); \
- return (DOIO_HARD); \
- }
-
- /* Each of these is hard only when the socket is connected. */
- SOFT_OR_HARD(ECONNREFUSED, ISC_R_CONNREFUSED);
- SOFT_OR_HARD(ENETUNREACH, ISC_R_NETUNREACH);
- SOFT_OR_HARD(EHOSTUNREACH, ISC_R_HOSTUNREACH);
- SOFT_OR_HARD(EHOSTDOWN, ISC_R_HOSTDOWN);
- SOFT_OR_HARD(ENOBUFS, ISC_R_NORESOURCES);
- /*
- * Older operating systems may still return EPROTO in some
- * situations, for example when receiving ICMP/ICMPv6 errors.
- * A real life scenario is when ICMPv6 returns code 5 or 6.
- * These codes are introduced in RFC 4443 from March 2006,
- * and the document obsoletes RFC 1885. But unfortunately not
- * all operating systems have caught up with the new standard
- * (in 2020) and thus a generic protocol error is returned.
- */
- SOFT_OR_HARD(EPROTO, ISC_R_HOSTUNREACH);
- /* Should never get this one but it was seen. */
-#ifdef ENOPROTOOPT
- SOFT_OR_HARD(ENOPROTOOPT, ISC_R_HOSTUNREACH);
-#endif /* ifdef ENOPROTOOPT */
- SOFT_OR_HARD(EINVAL, ISC_R_HOSTUNREACH);
-
-#undef SOFT_OR_HARD
-#undef ALWAYS_HARD /* ALWAYS_HARD is never invoked in this function */
-
- /* Anything else is a hard error mapped by isc__errno2result(). */
- dev->result = isc__errno2result(recv_errno);
- inc_stats(sock->manager->stats,
- sock->statsindex[STATID_RECVFAIL]);
- return (DOIO_HARD);
- }
-
- /*
- * On TCP and UNIX sockets, zero length reads indicate EOF,
- * while on UDP sockets, zero length reads are perfectly valid,
- * although strange.
- */
- switch (sock->type) {
- case isc_sockettype_tcp:
- case isc_sockettype_unix:
- if (cc == 0) {
- return (DOIO_EOF);
- }
- break;
- case isc_sockettype_udp:
- case isc_sockettype_raw:
- break;
- default:
- INSIST(0);
- ISC_UNREACHABLE();
- }
-
- if (sock->type == isc_sockettype_udp) {
- dev->address.length = msghdr.msg_namelen;
- if (isc_sockaddr_getport(&dev->address) == 0) {
- if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
- socket_log(sock, &dev->address, IOEVENT,
- "dropping source port zero packet");
- }
- return (DOIO_SOFT);
- }
- /*
- * Simulate a firewall blocking UDP responses bigger than
- * 'maxudp' bytes.
- */
- if (sock->manager->maxudp != 0 &&
- cc > (int)sock->manager->maxudp) {
- return (DOIO_SOFT);
- }
- }
-
- socket_log(sock, &dev->address, IOEVENT, "packet received correctly");
-
- /*
- * Overflow bit detection. If we received MORE bytes than we should,
- * this indicates an overflow situation. Set the flag in the
- * dev entry and adjust how much we read by one.
- */
-#ifdef ISC_PLATFORM_RECVOVERFLOW
- if ((sock->type == isc_sockettype_udp) && ((size_t)cc > read_count)) {
- dev->attributes |= ISC_SOCKEVENTATTR_TRUNC;
- cc--; /* do not count the byte stored in sock->overflow */
- }
-#endif /* ifdef ISC_PLATFORM_RECVOVERFLOW */
-
- /*
- * If there are control messages attached, run through them and pull
- * out the interesting bits.
- */
- process_cmsg(sock, &msghdr, dev);
-
- /*
- * update the buffers (if any) and the i/o count
- */
- dev->n += cc;
-
- /*
- * If we read less than we expected, update counters,
- * and let the upper layer poke the descriptor.
- */
- if (((size_t)cc != read_count) && (dev->n < dev->minimum)) {
- return (DOIO_SOFT);
- }
-
- /*
- * Full reads are posted, or partials if partials are ok.
- */
- dev->result = ISC_R_SUCCESS;
- return (DOIO_SUCCESS);
-}
-
-/*
- * Send the unsent part of dev->region on 'sock' with sendmsg().
- *
- * A send interrupted by EINTR is retried until NRETRIES attempts have
- * been made.  When the manager's 'maxudp' limit is set and a UDP payload
- * exceeds it, the packet is not handed to the kernel at all but is
- * reported as if it had been sent in full.
- *
- * Returns:
- *	DOIO_SUCCESS	The operation succeeded.  dev->result contains
- *			ISC_R_SUCCESS.
- *
- *	DOIO_HARD	A hard or unexpected I/O error was encountered.
- *			dev->result contains the appropriate error.
- *
- *	DOIO_SOFT	A soft I/O error was encountered, or only part of
- *			the data was written.  No senddone event was sent.
- *			The operation should be retried.  dev->result is
- *			set to ISC_R_WOULDBLOCK when the error was
- *			EWOULDBLOCK or EAGAIN.
- *
- *	No other return values are possible.
- */
-static int
-doio_send(isc_socket_t *sock, isc_socketevent_t *dev) {
-	int cc;
-	struct iovec iov[MAXSCATTERGATHER_SEND];
-	size_t write_count;
-	struct msghdr msghdr;
-	char addrbuf[ISC_SOCKADDR_FORMATSIZE];
-	int attempts = 0;
-	int send_errno;
-	char strbuf[ISC_STRERRORSIZE];
-	char cmsgbuf[SENDCMSGBUFLEN] = { 0 };
-
-	build_msghdr_send(sock, cmsgbuf, dev, &msghdr, iov, &write_count);
-
-resend:
-	if (sock->type == isc_sockettype_udp && sock->manager->maxudp != 0 &&
-	    write_count > sock->manager->maxudp)
-	{
-		/* Pretend the oversized datagram was sent. */
-		cc = write_count;
-	} else {
-		cc = sendmsg(sock->fd, &msghdr, 0);
-	}
-	/* Saved at once; every later test uses this copy, not errno. */
-	send_errno = errno;
-
-	/*
-	 * Check for error or block condition.
-	 */
-	if (cc < 0) {
-		if (send_errno == EINTR && ++attempts < NRETRIES) {
-			goto resend;
-		}
-
-		if (SOFT_ERROR(send_errno)) {
-			if (send_errno == EWOULDBLOCK || send_errno == EAGAIN) {
-				dev->result = ISC_R_WOULDBLOCK;
-			}
-			return (DOIO_SOFT);
-		}
-
-#define SOFT_OR_HARD(_system, _isc)                                   \
-	if (send_errno == _system) {                                  \
-		if (sock->connected) {                                \
-			dev->result = _isc;                           \
-			inc_stats(sock->manager->stats,               \
-				  sock->statsindex[STATID_SENDFAIL]); \
-			return (DOIO_HARD);                           \
-		}                                                     \
-		return (DOIO_SOFT);                                   \
-	}
-#define ALWAYS_HARD(_system, _isc)                            \
-	if (send_errno == _system) {                          \
-		dev->result = _isc;                           \
-		inc_stats(sock->manager->stats,               \
-			  sock->statsindex[STATID_SENDFAIL]); \
-		return (DOIO_HARD);                           \
-	}
-
-		SOFT_OR_HARD(ECONNREFUSED, ISC_R_CONNREFUSED);
-		ALWAYS_HARD(EACCES, ISC_R_NOPERM);
-		ALWAYS_HARD(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
-		ALWAYS_HARD(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
-		ALWAYS_HARD(EHOSTUNREACH, ISC_R_HOSTUNREACH);
-#ifdef EHOSTDOWN
-		ALWAYS_HARD(EHOSTDOWN, ISC_R_HOSTUNREACH);
-#endif /* ifdef EHOSTDOWN */
-		ALWAYS_HARD(ENETUNREACH, ISC_R_NETUNREACH);
-		SOFT_OR_HARD(ENOBUFS, ISC_R_NORESOURCES);
-		ALWAYS_HARD(EPERM, ISC_R_HOSTUNREACH);
-		ALWAYS_HARD(EPIPE, ISC_R_NOTCONNECTED);
-		ALWAYS_HARD(ECONNRESET, ISC_R_CONNECTIONRESET);
-
-#undef SOFT_OR_HARD
-#undef ALWAYS_HARD
-
-		/*
-		 * The other error types depend on whether or not the
-		 * socket is UDP or TCP.  If it is UDP, some errors
-		 * that we expect to be fatal under TCP are merely
-		 * annoying, and are really soft errors.
-		 *
-		 * However, these soft errors are still returned as
-		 * a status.
-		 */
-		isc_sockaddr_format(&dev->address, addrbuf, sizeof(addrbuf));
-		strerror_r(send_errno, strbuf, sizeof(strbuf));
-		UNEXPECTED_ERROR(__FILE__, __LINE__, "internal_send: %s: %s",
-				 addrbuf, strbuf);
-		dev->result = isc__errno2result(send_errno);
-		inc_stats(sock->manager->stats,
-			  sock->statsindex[STATID_SENDFAIL]);
-		return (DOIO_HARD);
-	}
-
-	if (cc == 0) {
-		inc_stats(sock->manager->stats,
-			  sock->statsindex[STATID_SENDFAIL]);
-		UNEXPECTED_ERROR(__FILE__, __LINE__,
-				 "doio_send: send() returned 0");
-	}
-
-	/*
-	 * If we write less than we expected, update counters, poke.
-	 */
-	dev->n += cc;
-	if ((size_t)cc != write_count) {
-		return (DOIO_SOFT);
-	}
-
-	/*
-	 * Exactly what we wanted to write.  We're done with this
-	 * entry.  Post its completion event.
-	 */
-	dev->result = ISC_R_SUCCESS;
-	return (DOIO_SUCCESS);
-}
-
-/*
- * Kill.
- *
- * Caller must ensure that the socket is not locked and no external
- * references exist.
- *
- * Detaches descriptor 'fd' from 'thread' (its fds[] slot is cleared and
- * its state becomes CLOSE_PENDING), sends a SELECT_POKE_CLOSE poke for it,
- * updates the CLOSE and ACTIVE statistics of 'sock', and, when USE_SELECT
- * is defined, recomputes thread->maxfd if 'fd' was the highest descriptor.
- * This function does not call close() itself.
- */
-static void
-socketclose(isc__socketthread_t *thread, isc_socket_t *sock, int fd) {
- int lockid = FDLOCK_ID(fd);
- /*
- * Mark the descriptor as closing under its fd lock, then poke the
- * watcher with SELECT_POKE_CLOSE.
- * NOTE(review): the earlier wording here said that the watcher need
- * not be poked and the socket need not be locked; the code below does
- * both.
- */
- LOCK(&thread->fdlock[lockid]);
- thread->fds[fd] = NULL;
- thread->fdstate[fd] = CLOSE_PENDING;
- UNLOCK(&thread->fdlock[lockid]);
- select_poke(thread->manager, thread->threadid, fd, SELECT_POKE_CLOSE);
-
- inc_stats(thread->manager->stats, sock->statsindex[STATID_CLOSE]);
-
- LOCK(&sock->lock);
- if (sock->active == 1) {
- dec_stats(thread->manager->stats,
- sock->statsindex[STATID_ACTIVE]);
- sock->active = 0;
- }
- UNLOCK(&sock->lock);
-
- /*
- * update thread->maxfd here (XXX: this should be implemented more
- * efficiently): scan downwards from 'fd' for the highest descriptor
- * that is still MANAGED, never going below the read end of the
- * thread's pipe.
- */
-#ifdef USE_SELECT
- LOCK(&thread->manager->lock);
- if (thread->maxfd == fd) {
- int i;
-
- thread->maxfd = 0;
- for (i = fd - 1; i >= 0; i--) {
- lockid = FDLOCK_ID(i);
-
- LOCK(&thread->fdlock[lockid]);
- if (thread->fdstate[i] == MANAGED) {
- thread->maxfd = i;
- UNLOCK(&thread->fdlock[lockid]);
- break;
- }
- UNLOCK(&thread->fdlock[lockid]);
- }
- if (thread->maxfd < thread->pipe_fds[0]) {
- thread->maxfd = thread->pipe_fds[0];
- }
- }
-
- UNLOCK(&thread->manager->lock);
-#endif /* USE_SELECT */
-}
-
-/*
- * Tear down the socket '*sockp': check that no requests are queued,
- * release its file descriptor through socketclose() if it still has one,
- * unlink it from the manager's socket list (signalling 'shutdown_ok' when
- * that list becomes empty) and free it.  *sockp is set to NULL by
- * free_socket().
- *
- * Any descriptor >= 0 is released; -1 is the only "no descriptor" value,
- * so a socket that happens to own descriptor 0 is closed like any other.
- */
-static void
-destroy(isc_socket_t **sockp) {
-	int fd = -1;
-	isc_socket_t *sock = *sockp;
-	isc_socketmgr_t *manager = sock->manager;
-	isc__socketthread_t *thread = NULL;
-
-	socket_log(sock, NULL, CREATION, "destroying");
-
-	isc_refcount_destroy(&sock->references);
-
-	LOCK(&sock->lock);
-	INSIST(ISC_LIST_EMPTY(sock->connect_list));
-	INSIST(ISC_LIST_EMPTY(sock->accept_list));
-	INSIST(ISC_LIST_EMPTY(sock->recv_list));
-	INSIST(ISC_LIST_EMPTY(sock->send_list));
-	INSIST(sock->fd >= -1 && sock->fd < (int)manager->maxsocks);
-
-	if (sock->fd >= 0) {
-		/* Take the descriptor away from the socket under its lock. */
-		fd = sock->fd;
-		thread = &manager->threads[sock->threadid];
-		sock->fd = -1;
-		sock->threadid = -1;
-	}
-	UNLOCK(&sock->lock);
-
-	/* 'thread' is non-NULL exactly when 'fd' was taken over above. */
-	if (fd >= 0) {
-		socketclose(thread, sock, fd);
-	}
-
-	LOCK(&manager->lock);
-
-	ISC_LIST_UNLINK(manager->socklist, sock, link);
-
-	if (ISC_LIST_EMPTY(manager->socklist)) {
-		SIGNAL(&manager->shutdown_ok);
-	}
-
-	/* can't unlock manager as its memory context is still used */
-	free_socket(sockp);
-
-	UNLOCK(&manager->lock);
-}
-
-/*
- * Allocate a socket object of the given 'type' from the manager's memory
- * context and put it into its initial state: no file descriptor, no
- * thread, empty request lists, all state flags cleared and zero
- * references.  The new object is returned through 'socketp'.
- *
- * Always returns ISC_R_SUCCESS.
- */
-static isc_result_t
-allocate_socket(isc_socketmgr_t *manager, isc_sockettype_t type,
-		isc_socket_t **socketp) {
-	isc_socket_t *fresh = isc_mem_get(manager->mctx, sizeof(*fresh));
-
-	/* The object is not valid until everything below has been set up. */
-	fresh->magic = 0;
-	isc_refcount_init(&fresh->references, 0);
-
-	/* Identity. */
-	fresh->manager = manager;
-	fresh->type = type;
-	fresh->tag = NULL;
-	memset(fresh->name, 0, sizeof(fresh->name));
-	ISC_LINK_INIT(fresh, link);
-
-	/* No descriptor, thread or statistics attached yet. */
-	fresh->fd = -1;
-	fresh->threadid = -1;
-	fresh->statsindex = NULL;
-	fresh->active = 0;
-	fresh->dscp = 0; /* TOS/TCLASS is zero until set. */
-
-	/*
-	 * The reader, writer, accept and connect queues start out empty.
-	 */
-	ISC_LIST_INIT(fresh->recv_list);
-	ISC_LIST_INIT(fresh->send_list);
-	ISC_LIST_INIT(fresh->accept_list);
-	ISC_LIST_INIT(fresh->connect_list);
-
-	/* State flags. */
-	fresh->listener = 0;
-	fresh->connected = 0;
-	fresh->connecting = 0;
-	fresh->bound = 0;
-	fresh->pktdscp = 0;
-
-	isc_mutex_init(&fresh->lock);
-
-	fresh->magic = SOCKET_MAGIC;
-	*socketp = fresh;
-
-	return (ISC_R_SUCCESS);
-}
-
-/*
- * Release the memory of the socket '*socketp' and set '*socketp' to NULL.
- *
- * The socket must carry a valid magic number, must not be connecting, must
- * have empty recv/send/accept/connect lists and must no longer be linked
- * into the manager's socket list; each of these is checked with INSIST.
- * The reference counter is torn down with isc_refcount_destroy() and the
- * lock is destroyed.  This routine does not touch sock->fd.
- */
-static void
-free_socket(isc_socket_t **socketp) {
-	isc_socket_t *victim = *socketp;
-
-	*socketp = NULL;
-
-	INSIST(VALID_SOCKET(victim));
-	isc_refcount_destroy(&victim->references);
-
-	/* Verify under the lock that nothing is left pending. */
-	LOCK(&victim->lock);
-	INSIST(!victim->connecting);
-	INSIST(ISC_LIST_EMPTY(victim->recv_list));
-	INSIST(ISC_LIST_EMPTY(victim->send_list));
-	INSIST(ISC_LIST_EMPTY(victim->accept_list));
-	INSIST(ISC_LIST_EMPTY(victim->connect_list));
-	INSIST(!ISC_LINK_LINKED(victim, link));
-	UNLOCK(&victim->lock);
-
-	/* Invalidate the object before its pieces are released. */
-	victim->magic = 0;
-	isc_mutex_destroy(&victim->lock);
-	isc_mem_put(victim->manager->mctx, victim, sizeof(*victim));
-}
-
-#if defined(SET_RCVBUF)
-static isc_once_t rcvbuf_once = ISC_ONCE_INIT;
-static int rcvbuf = ISC_RECV_BUFFER_SIZE;
-/*
- * Work out, on a scratch UDP socket, a receive buffer size that
- * setsockopt(SO_RCVBUF) accepts and store it in 'rcvbuf'.
- *
- * 'rcvbuf' starts at ISC_RECV_BUFFER_SIZE. If the scratch socket's current
- * SO_RCVBUF value is already at least that large nothing is changed.
- * Otherwise the range between the current value ('min', known to work) and
- * the wanted value ('max') is searched: an ENOBUFS failure lowers 'max'
- * and retries half-way, a success raises 'min', and the search stops once
- * a successful value equals 'max'. Any other failure falls back to 'min'.
- *
- * If no scratch socket can be created, 'rcvbuf' is left unchanged.
- */
-static void
-set_rcvbuf(void) {
- int fd;
- int max = rcvbuf, min;
- socklen_t len;
-
- fd = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
- if (fd == -1) {
- switch (errno) {
- case EPROTONOSUPPORT:
- case EPFNOSUPPORT:
- case EAFNOSUPPORT:
- /*
- * Linux 2.2 (and maybe others) return EINVAL instead of
- * EAFNOSUPPORT.
- */
- case EINVAL:
- fd = socket(AF_INET6, SOCK_DGRAM, IPPROTO_UDP); /* no IPv4: try IPv6 */
- break;
- }
- }
- if (fd == -1) {
- return;
- }
-
- len = sizeof(min);
- if (getsockopt(fd, SOL_SOCKET, SO_RCVBUF, (void *)&min, &len) == 0 &&
- min < rcvbuf)
- {
- again:
- if (setsockopt(fd, SOL_SOCKET, SO_RCVBUF, (void *)&rcvbuf,
- sizeof(rcvbuf)) == -1)
- {
- if (errno == ENOBUFS && rcvbuf > min) {
- max = rcvbuf - 1; /* this size is too big */
- rcvbuf = (rcvbuf + min) / 2; /* retry half-way down */
- goto again;
- } else {
- rcvbuf = min;
- goto cleanup;
- }
- } else {
- min = rcvbuf; /* this size works */
- }
- if (min != max) {
- rcvbuf = max; /* try the largest size not yet ruled out */
- goto again;
- }
- }
-cleanup:
- close(fd);
-}
-#endif /* if defined(SET_RCVBUF) */
-
-#if defined(SET_SNDBUF)
-static isc_once_t sndbuf_once = ISC_ONCE_INIT;
-static int sndbuf = ISC_SEND_BUFFER_SIZE;
-/*
- * Work out, on a scratch UDP socket, a send buffer size that
- * setsockopt(SO_SNDBUF) accepts and store it in 'sndbuf'.
- *
- * 'sndbuf' starts at ISC_SEND_BUFFER_SIZE. If the scratch socket's current
- * SO_SNDBUF value is already at least that large nothing is changed.
- * Otherwise the range between the current value ('min', known to work) and
- * the wanted value ('max') is searched: an ENOBUFS failure lowers 'max'
- * and retries half-way, a success raises 'min', and the search stops once
- * a successful value equals 'max'. Any other failure falls back to 'min'.
- *
- * If no scratch socket can be created, 'sndbuf' is left unchanged.
- */
-static void
-set_sndbuf(void) {
- int fd;
- int max = sndbuf, min;
- socklen_t len;
-
- fd = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
- if (fd == -1) {
- switch (errno) {
- case EPROTONOSUPPORT:
- case EPFNOSUPPORT:
- case EAFNOSUPPORT:
- /*
- * Linux 2.2 (and maybe others) return EINVAL instead of
- * EAFNOSUPPORT.
- */
- case EINVAL:
- fd = socket(AF_INET6, SOCK_DGRAM, IPPROTO_UDP); /* no IPv4: try IPv6 */
- break;
- }
- }
- if (fd == -1) {
- return;
- }
-
- len = sizeof(min);
- if (getsockopt(fd, SOL_SOCKET, SO_SNDBUF, (void *)&min, &len) == 0 &&
- min < sndbuf)
- {
- again:
- if (setsockopt(fd, SOL_SOCKET, SO_SNDBUF, (void *)&sndbuf,
- sizeof(sndbuf)) == -1)
- {
- if (errno == ENOBUFS && sndbuf > min) {
- max = sndbuf - 1; /* this size is too big */
- sndbuf = (sndbuf + min) / 2; /* retry half-way down */
- goto again;
- } else {
- sndbuf = min;
- goto cleanup;
- }
- } else {
- min = sndbuf; /* this size works */
- }
- if (min != max) {
- sndbuf = max; /* try the largest size not yet ruled out */
- goto again;
- }
- }
-cleanup:
- close(fd);
-}
-#endif /* if defined(SET_SNDBUF) */
-
-/*
- * On IPv6 sockets, request the minimum MTU: IPV6_USE_MIN_MTU is switched
- * on and/or IPV6_MTU is set to 1280, depending on which of the two options
- * the platform defines.  Failures of setsockopt() are ignored.  Sockets of
- * other protocol families are left alone.
- */
-static void
-use_min_mtu(isc_socket_t *sock) {
-#if defined(IPV6_USE_MIN_MTU) || defined(IPV6_MTU)
-	if (sock->pf != AF_INET6) {
-		return;
-	}
-
-#ifdef IPV6_USE_MIN_MTU
-	{
-		/* use minimum MTU */
-		int enable = 1;
-
-		(void)setsockopt(sock->fd, IPPROTO_IPV6, IPV6_USE_MIN_MTU,
-				 (void *)&enable, sizeof(enable));
-	}
-#endif /* ifdef IPV6_USE_MIN_MTU */
-
-#if defined(IPV6_MTU)
-	{
-		/* Use minimum MTU on IPv6 sockets. */
-		int min_mtu = 1280;
-
-		(void)setsockopt(sock->fd, IPPROTO_IPV6, IPV6_MTU, &min_mtu,
-				 sizeof(min_mtu));
-	}
-#endif /* if defined(IPV6_MTU) */
-#else  /* if defined(IPV6_USE_MIN_MTU) || defined(IPV6_MTU) */
-	UNUSED(sock);
-#endif /* if defined(IPV6_USE_MIN_MTU) || defined(IPV6_MTU) */
-}
-
-/*
- * Set the TCP_MAXSEG option of a TCP socket to 'size'.  Does nothing for
- * other socket types or when the platform has no TCP_MAXSEG; a failing
- * setsockopt() is ignored.
- */
-static void
-set_tcp_maxseg(isc_socket_t *sock, int size) {
-#ifdef TCP_MAXSEG
-	if (sock->type != isc_sockettype_tcp) {
-		return;
-	}
-
-	(void)setsockopt(sock->fd, IPPROTO_TCP, TCP_MAXSEG, (void *)&size,
-			 sizeof(size));
-#endif /* ifdef TCP_MAXSEG */
-}
-
-/*
- * Disable Path MTU Discovery on IP packets sent from 'sock'.
- *
- * Depending on what the platform defines, the "don't fragment" option is
- * switched off and the MTU discovery mode is set to IP_PMTUDISC_OMIT, using
- * the IPv6 or the IPv4 variant of each option to match sock->pf.  Results
- * of setsockopt() are ignored; other protocol families are left alone.
- */
-static void
-set_ip_disable_pmtud(isc_socket_t *sock) {
-	switch (sock->pf) {
-	case AF_INET6:
-#if defined(IPV6_DONTFRAG)
-		(void)setsockopt(sock->fd, IPPROTO_IPV6, IPV6_DONTFRAG,
-				 &(int){ 0 }, sizeof(int));
-#endif
-#if defined(IPV6_MTU_DISCOVER) && defined(IP_PMTUDISC_OMIT)
-		(void)setsockopt(sock->fd, IPPROTO_IPV6, IPV6_MTU_DISCOVER,
-				 &(int){ IP_PMTUDISC_OMIT }, sizeof(int));
-#endif
-		break;
-
-	case AF_INET:
-#if defined(IP_DONTFRAG)
-		(void)setsockopt(sock->fd, IPPROTO_IP, IP_DONTFRAG,
-				 &(int){ 0 }, sizeof(int));
-#endif
-#if defined(IP_MTU_DISCOVER) && defined(IP_PMTUDISC_OMIT)
-		(void)setsockopt(sock->fd, IPPROTO_IP, IP_MTU_DISCOVER,
-				 &(int){ IP_PMTUDISC_OMIT }, sizeof(int));
-#endif
-		break;
-
-	default:
-		break;
-	}
-}
-/*
- * Create the file descriptor for 'sock' according to sock->type and
- * sock->pf, and apply the socket options used by this module.
- *
- * After socket() succeeds, low-numbered descriptors are moved out of the
- * way with fcntl(F_DUPFD) (where available), the descriptor is made
- * non-blocking, and options are applied: SO_NOSIGPIPE, minimum MTU and
- * TCP_MAXSEG for IPv6 TCP sockets, SO_TIMESTAMP / IPv6 packet info and
- * receive/send buffer sizes for UDP sockets, IPV6_RECVTCLASS / IP_RECVTOS,
- * and finally the path MTU discovery settings. Failures to set an option
- * are reported but do not fail the call.
- *
- * Returns:
- * ISC_R_SUCCESS the descriptor is open; the OPEN statistic (and
- * ACTIVE, if the socket was not yet active) is bumped.
- * ISC_R_NORESOURCES EMFILE, ENFILE or ENOBUFS, or the descriptor is not
- * below manager->maxsocks.
- * ISC_R_FAMILYNOSUPPORT the protocol or address family is not supported.
- * ISC_R_UNEXPECTED any other socket()/fcntl() failure.
- * The result of make_nonblock() if that call fails.
- * Every failure increments the OPENFAIL statistic.
- */
-static isc_result_t
-opensocket(isc_socketmgr_t *manager, isc_socket_t *sock) {
- isc_result_t result;
- char strbuf[ISC_STRERRORSIZE];
- const char *err = "socket";
- int tries = 0;
-#if defined(USE_CMSG) || defined(SO_NOSIGPIPE)
- int on = 1;
-#endif /* if defined(USE_CMSG) || defined(SO_NOSIGPIPE) */
-#if defined(SET_RCVBUF) || defined(SET_SNDBUF)
- socklen_t optlen;
- int size = 0;
-#endif
-
-again:
- switch (sock->type) {
- case isc_sockettype_udp:
- sock->fd = socket(sock->pf, SOCK_DGRAM, IPPROTO_UDP);
- break;
- case isc_sockettype_tcp:
- sock->fd = socket(sock->pf, SOCK_STREAM, IPPROTO_TCP);
- break;
- case isc_sockettype_unix:
- sock->fd = socket(sock->pf, SOCK_STREAM, 0);
- break;
- case isc_sockettype_raw:
- errno = EPFNOSUPPORT;
- /*
- * PF_ROUTE is an alias for PF_NETLINK on linux.
- */
-#if defined(PF_ROUTE)
- if (sock->fd == -1 && sock->pf == PF_ROUTE) {
-#ifdef NETLINK_ROUTE
- sock->fd = socket(sock->pf, SOCK_RAW, NETLINK_ROUTE);
-#else /* ifdef NETLINK_ROUTE */
- sock->fd = socket(sock->pf, SOCK_RAW, 0);
-#endif /* ifdef NETLINK_ROUTE */
- if (sock->fd != -1) {
-#ifdef NETLINK_ROUTE
- struct sockaddr_nl sa;
- int n;
-
- /*
- * Do an implicit bind.
- */
- memset(&sa, 0, sizeof(sa));
- sa.nl_family = AF_NETLINK;
- sa.nl_groups = RTMGRP_IPV4_IFADDR |
- RTMGRP_IPV6_IFADDR;
- n = bind(sock->fd, (struct sockaddr *)&sa,
- sizeof(sa));
- if (n < 0) {
- close(sock->fd);
- sock->fd = -1;
- }
-#endif /* ifdef NETLINK_ROUTE */
- sock->bound = 1;
- }
- }
-#endif /* if defined(PF_ROUTE) */
- break;
- }
- if (sock->fd == -1 && errno == EINTR && tries++ < 42) { /* retry if interrupted */
- goto again;
- }
-
-#ifdef F_DUPFD
- /*
- * Leave a space for stdio and TCP to work in.
- */
- if (manager->reserved != 0 && sock->type == isc_sockettype_udp &&
- sock->fd >= 0 && sock->fd < manager->reserved)
- {
- int newfd, tmp;
- newfd = fcntl(sock->fd, F_DUPFD, manager->reserved);
- tmp = errno; /* keep fcntl()'s errno across close() */
- (void)close(sock->fd);
- errno = tmp;
- sock->fd = newfd;
- err = "isc_socket_create: fcntl/reserved";
- } else if (sock->fd >= 0 && sock->fd < 20) {
- int newfd, tmp;
- newfd = fcntl(sock->fd, F_DUPFD, 20);
- tmp = errno; /* keep fcntl()'s errno across close() */
- (void)close(sock->fd);
- errno = tmp;
- sock->fd = newfd;
- err = "isc_socket_create: fcntl";
- }
-#endif /* ifdef F_DUPFD */
-
- if (sock->fd >= (int)manager->maxsocks) {
- (void)close(sock->fd); /* NOTE(review): sock->fd is not reset to -1 here */
- isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
- ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
- "socket: file descriptor exceeds limit (%d/%u)",
- sock->fd, manager->maxsocks);
- inc_stats(manager->stats, sock->statsindex[STATID_OPENFAIL]);
- return (ISC_R_NORESOURCES);
- }
-
- if (sock->fd < 0) {
- switch (errno) {
- case EMFILE:
- case ENFILE:
- strerror_r(errno, strbuf, sizeof(strbuf));
- isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
- ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
- "%s: %s", err, strbuf);
- /* fallthrough */
- case ENOBUFS:
- inc_stats(manager->stats,
- sock->statsindex[STATID_OPENFAIL]);
- return (ISC_R_NORESOURCES);
-
- case EPROTONOSUPPORT:
- case EPFNOSUPPORT:
- case EAFNOSUPPORT:
- /*
- * Linux 2.2 (and maybe others) return EINVAL instead of
- * EAFNOSUPPORT.
- */
- case EINVAL:
- inc_stats(manager->stats,
- sock->statsindex[STATID_OPENFAIL]);
- return (ISC_R_FAMILYNOSUPPORT);
-
- default:
- strerror_r(errno, strbuf, sizeof(strbuf));
- UNEXPECTED_ERROR(__FILE__, __LINE__, "%s() failed: %s",
- err, strbuf);
- inc_stats(manager->stats,
- sock->statsindex[STATID_OPENFAIL]);
- return (ISC_R_UNEXPECTED);
- }
- }
-
- result = make_nonblock(sock->fd);
- if (result != ISC_R_SUCCESS) {
- (void)close(sock->fd); /* NOTE(review): sock->fd is not reset to -1 here */
- inc_stats(manager->stats, sock->statsindex[STATID_OPENFAIL]);
- return (result);
- }
-
-#ifdef SO_NOSIGPIPE
- if (setsockopt(sock->fd, SOL_SOCKET, SO_NOSIGPIPE, (void *)&on,
- sizeof(on)) < 0) {
- strerror_r(errno, strbuf, sizeof(strbuf));
- UNEXPECTED_ERROR(__FILE__, __LINE__,
- "setsockopt(%d, SO_NOSIGPIPE) failed: %s",
- sock->fd, strbuf);
- /* Press on... */
- }
-#endif /* ifdef SO_NOSIGPIPE */
-
- /*
- * Use minimum mtu if possible.
- */
- if (sock->type == isc_sockettype_tcp && sock->pf == AF_INET6) {
- use_min_mtu(sock);
- set_tcp_maxseg(sock, 1280 - 20 - 40); /* 1280 - TCP - IPV6 */
- }
-
-#if defined(USE_CMSG) || defined(SET_RCVBUF) || defined(SET_SNDBUF)
- if (sock->type == isc_sockettype_udp) {
-#if defined(USE_CMSG)
-#if defined(SO_TIMESTAMP)
- if (setsockopt(sock->fd, SOL_SOCKET, SO_TIMESTAMP, (void *)&on,
- sizeof(on)) < 0 &&
- errno != ENOPROTOOPT)
- {
- strerror_r(errno, strbuf, sizeof(strbuf));
- UNEXPECTED_ERROR(__FILE__, __LINE__,
- "setsockopt(%d, SO_TIMESTAMP) failed: "
- "%s",
- sock->fd, strbuf);
- /* Press on... */
- }
-#endif /* SO_TIMESTAMP */
-
-#ifdef IPV6_RECVPKTINFO
- /* RFC 3542 */
- if ((sock->pf == AF_INET6) &&
- (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_RECVPKTINFO,
- (void *)&on, sizeof(on)) < 0))
- {
- strerror_r(errno, strbuf, sizeof(strbuf));
- UNEXPECTED_ERROR(__FILE__, __LINE__,
- "setsockopt(%d, IPV6_RECVPKTINFO) "
- "failed: %s",
- sock->fd, strbuf);
- }
-#else /* ifdef IPV6_RECVPKTINFO */
- /* RFC 2292 */
- if ((sock->pf == AF_INET6) &&
- (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_PKTINFO,
- (void *)&on, sizeof(on)) < 0))
- {
- strerror_r(errno, strbuf, sizeof(strbuf));
- UNEXPECTED_ERROR(__FILE__, __LINE__,
- "setsockopt(%d, IPV6_PKTINFO) failed: "
- "%s",
- sock->fd, strbuf);
- }
-#endif /* IPV6_RECVPKTINFO */
-#endif /* defined(USE_CMSG) */
-
-#if defined(SET_RCVBUF)
- optlen = sizeof(size);
- if (getsockopt(sock->fd, SOL_SOCKET, SO_RCVBUF, (void *)&size,
- &optlen) == 0 &&
- size < rcvbuf)
- {
- RUNTIME_CHECK(isc_once_do(&rcvbuf_once, set_rcvbuf) ==
- ISC_R_SUCCESS);
- if (setsockopt(sock->fd, SOL_SOCKET, SO_RCVBUF,
- (void *)&rcvbuf, sizeof(rcvbuf)) == -1)
- {
- strerror_r(errno, strbuf, sizeof(strbuf));
- UNEXPECTED_ERROR(__FILE__, __LINE__,
- "setsockopt(%d, SO_RCVBUF, "
- "%d) failed: %s",
- sock->fd, rcvbuf, strbuf);
- }
- }
-#endif /* if defined(SET_RCVBUF) */
-
-#if defined(SET_SNDBUF)
- optlen = sizeof(size);
- if (getsockopt(sock->fd, SOL_SOCKET, SO_SNDBUF, (void *)&size,
- &optlen) == 0 &&
- size < sndbuf)
- {
- RUNTIME_CHECK(isc_once_do(&sndbuf_once, set_sndbuf) ==
- ISC_R_SUCCESS);
- if (setsockopt(sock->fd, SOL_SOCKET, SO_SNDBUF,
- (void *)&sndbuf, sizeof(sndbuf)) == -1)
- {
- strerror_r(errno, strbuf, sizeof(strbuf));
- UNEXPECTED_ERROR(__FILE__, __LINE__,
- "setsockopt(%d, SO_SNDBUF, "
- "%d) failed: %s",
- sock->fd, sndbuf, strbuf);
- }
- }
-#endif /* if defined(SET_SNDBUF) */
- } /* end of the UDP-only options; the two below apply to every type */
-#ifdef IPV6_RECVTCLASS
- if ((sock->pf == AF_INET6) &&
- (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_RECVTCLASS, (void *)&on,
- sizeof(on)) < 0))
- {
- strerror_r(errno, strbuf, sizeof(strbuf));
- UNEXPECTED_ERROR(__FILE__, __LINE__,
- "setsockopt(%d, IPV6_RECVTCLASS) "
- "failed: %s",
- sock->fd, strbuf);
- }
-#endif /* ifdef IPV6_RECVTCLASS */
-#ifdef IP_RECVTOS
- if ((sock->pf == AF_INET) &&
- (setsockopt(sock->fd, IPPROTO_IP, IP_RECVTOS, (void *)&on,
- sizeof(on)) < 0))
- {
- strerror_r(errno, strbuf, sizeof(strbuf));
- UNEXPECTED_ERROR(__FILE__, __LINE__,
- "setsockopt(%d, IP_RECVTOS) "
- "failed: %s",
- sock->fd, strbuf);
- }
-#endif /* ifdef IP_RECVTOS */
-#endif /* defined(USE_CMSG) || defined(SET_RCVBUF) || defined(SET_SNDBUF) */
-
- set_ip_disable_pmtud(sock);
-
- inc_stats(manager->stats, sock->statsindex[STATID_OPEN]);
- if (sock->active == 0) {
- inc_stats(manager->stats, sock->statsindex[STATID_ACTIVE]);
- sock->active = 1;
- }
-
- return (ISC_R_SUCCESS);
-}
-
-/*
- * Allocate a socket of kind 'type' for protocol family 'pf', open its
- * file descriptor and register it with 'manager'.
- *
- * On success a reference is taken on the new socket, it is entered in the
- * fd tables of the watcher thread chosen by gen_threadid() and in the
- * manager's socket list, and it is returned through '*socketp'.
- *
- * Requires:
- *	'manager' is a valid manager.
- *	'socketp' is non-NULL and '*socketp' is NULL.
- *
- * Returns:
- *	ISC_R_SUCCESS, or the failure code of allocate_socket() or
- *	opensocket().
- */
-static isc_result_t
-socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type,
-	      isc_socket_t **socketp) {
-	isc_socket_t *newsock = NULL;
-	isc__socketthread_t *owner;
-	isc_result_t res;
-	int bucket;
-
-	REQUIRE(VALID_MANAGER(manager));
-	REQUIRE(socketp != NULL && *socketp == NULL);
-
-	res = allocate_socket(manager, type, &newsock);
-	if (res != ISC_R_SUCCESS) {
-		return (res);
-	}
-
-	newsock->pf = pf;
-
-	/*
-	 * Select the statistics counter table for this socket type (and,
-	 * for UDP and TCP, for the address family).
-	 */
-	switch (newsock->type) {
-	case isc_sockettype_udp:
-		if (pf == AF_INET) {
-			newsock->statsindex = udp4statsindex;
-		} else {
-			newsock->statsindex = udp6statsindex;
-		}
-#define DCSPPKT(pf) ((pf == AF_INET) ? ISC_NET_DSCPPKTV4 : ISC_NET_DSCPPKTV6)
-		newsock->pktdscp = (isc_net_probedscp() & DCSPPKT(pf)) != 0;
-		break;
-	case isc_sockettype_tcp:
-		if (pf == AF_INET) {
-			newsock->statsindex = tcp4statsindex;
-		} else {
-			newsock->statsindex = tcp6statsindex;
-		}
-		break;
-	case isc_sockettype_unix:
-		newsock->statsindex = unixstatsindex;
-		break;
-	case isc_sockettype_raw:
-		newsock->statsindex = rawstatsindex;
-		break;
-	default:
-		INSIST(0);
-		ISC_UNREACHABLE();
-	}
-
-	res = opensocket(manager, newsock);
-	if (res != ISC_R_SUCCESS) {
-		free_socket(&newsock);
-		return (res);
-	}
-
-	/* A successful opensocket() must have produced a descriptor. */
-	if (newsock->fd == -1) {
-		abort();
-	}
-
-	newsock->threadid = gen_threadid(newsock);
-	isc_refcount_increment0(&newsock->references);
-	owner = &manager->threads[newsock->threadid];
-	*socketp = newsock;
-
-	/*
-	 * The socket itself is not locked here: nothing outside this
-	 * function can reference it yet.  Only the per-fd lock of the
-	 * owning thread is needed to publish it in the fd tables.
-	 */
-	bucket = FDLOCK_ID(newsock->fd);
-	LOCK(&owner->fdlock[bucket]);
-	owner->fds[newsock->fd] = newsock;
-	owner->fdstate[newsock->fd] = MANAGED;
-#if defined(USE_EPOLL)
-	owner->epoll_events[newsock->fd] = 0;
-#endif /* if defined(USE_EPOLL) */
-#ifdef USE_DEVPOLL
-	INSIST(owner->fdpollinfo[newsock->fd].want_read == 0 &&
-	       owner->fdpollinfo[newsock->fd].want_write == 0);
-#endif /* ifdef USE_DEVPOLL */
-	UNLOCK(&owner->fdlock[bucket]);
-
-	LOCK(&manager->lock);
-	ISC_LIST_APPEND(manager->socklist, newsock, link);
-#ifdef USE_SELECT
-	if (owner->maxfd < newsock->fd) {
-		owner->maxfd = newsock->fd;
-	}
-#endif /* ifdef USE_SELECT */
-	UNLOCK(&manager->lock);
-
-	socket_log(newsock, NULL, CREATION, "created");
-
-	return (ISC_R_SUCCESS);
-}
-
-/*%
- * Public entry point for creating a socket: create a new 'type' socket
- * for protocol family 'pf', managed by 'manager', and return it in
- * 'socketp'.  All of the work is delegated to socket_create(), whose
- * result is returned unchanged.
- */
-isc_result_t
-isc_socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type,
-		  isc_socket_t **socketp) {
-	isc_result_t result;
-
-	result = socket_create(manager, pf, type, socketp);
-
-	return (result);
-}
-
-/*
- * Open a file descriptor for an existing socket object that currently
- * has none, and register it with a watcher thread.
- *
- * Requires:
- *	'sock' is a valid socket with at least one reference, and both
- *	its fd and its threadid are -1.
- *
- * Returns:
- *	The result of opensocket().  On failure sock->fd is set to -1.
- */
-isc_result_t
-isc_socket_open(isc_socket_t *sock) {
-	isc__socketthread_t *owner;
-	isc_result_t result;
-	int bucket;
-
-	REQUIRE(VALID_SOCKET(sock));
-
-	LOCK(&sock->lock);
-
-	REQUIRE(isc_refcount_current(&sock->references) >= 1);
-	REQUIRE(sock->fd == -1);
-	REQUIRE(sock->threadid == -1);
-
-	result = opensocket(sock->manager, sock);
-
-	UNLOCK(&sock->lock);
-
-	if (result != ISC_R_SUCCESS) {
-		sock->fd = -1;
-		return (result);
-	}
-
-	/* Choose the watcher thread and publish the fd in its tables. */
-	sock->threadid = gen_threadid(sock);
-	owner = &sock->manager->threads[sock->threadid];
-	bucket = FDLOCK_ID(sock->fd);
-
-	LOCK(&owner->fdlock[bucket]);
-	owner->fds[sock->fd] = sock;
-	owner->fdstate[sock->fd] = MANAGED;
-#if defined(USE_EPOLL)
-	owner->epoll_events[sock->fd] = 0;
-#endif /* if defined(USE_EPOLL) */
-#ifdef USE_DEVPOLL
-	INSIST(owner->fdpollinfo[sock->fd].want_read == 0 &&
-	       owner->fdpollinfo[sock->fd].want_write == 0);
-#endif /* ifdef USE_DEVPOLL */
-	UNLOCK(&owner->fdlock[bucket]);
-
-#ifdef USE_SELECT
-	/* Keep the highest watched descriptor up to date. */
-	LOCK(&sock->manager->lock);
-	if (owner->maxfd < sock->fd) {
-		owner->maxfd = sock->fd;
-	}
-	UNLOCK(&sock->manager->lock);
-#endif /* ifdef USE_SELECT */
-
-	return (result);
-}
-
-/*
- * Take an additional reference on 'sock' and store the socket in
- * '*socketp'.  The reference count must already have been non-zero before
- * the increment.  The caller is responsible for detaching when done.
- */
-void
-isc_socket_attach(isc_socket_t *sock, isc_socket_t **socketp) {
-	int prev_refs;
-
-	REQUIRE(VALID_SOCKET(sock));
-	REQUIRE(socketp != NULL && *socketp == NULL);
-
-	prev_refs = isc_refcount_increment(&sock->references);
-	REQUIRE(prev_refs > 0);
-
-	*socketp = sock;
-}
-
-/*
- * Drop the reference held through '*socketp' and clear the caller's
- * pointer.  When the reference being dropped was the last one, the socket
- * is destroyed.
- */
-void
-isc_socket_detach(isc_socket_t **socketp) {
-	isc_socket_t *sock;
-	bool was_last;
-
-	REQUIRE(socketp != NULL);
-	sock = *socketp;
-	REQUIRE(VALID_SOCKET(sock));
-
-	/* isc_refcount_decrement() yields the count before the decrement. */
-	was_last = (isc_refcount_decrement(&sock->references) == 1);
-	if (was_last) {
-		destroy(&sock);
-	}
-
-	*socketp = NULL;
-}
-
-/*
- * Close the file descriptor of an idle socket while keeping the socket
- * object itself.  The socket is reset to the state isc_socket_open()
- * requires (fd and threadid both -1), its name, tag, state flags and peer
- * address are cleared, and the old descriptor is handed to socketclose().
- *
- * Requires:
- *	'sock' is a valid socket with an open fd below the manager's
- *	maxsocks.
- *
- * The socket must not be connecting and must have no queued receive,
- * send, accept or connect requests (enforced with INSIST).
- *
- * Returns:
- *	ISC_R_SUCCESS always.
- */
-isc_result_t
-isc_socket_close(isc_socket_t *sock) {
-	isc__socketthread_t *owner;
-	isc_socketmgr_t *mgr;
-	int oldfd;
-
-	REQUIRE(VALID_SOCKET(sock));
-
-	LOCK(&sock->lock);
-
-	REQUIRE(sock->fd >= 0 && sock->fd < (int)sock->manager->maxsocks);
-
-	/* Nothing may be pending on the socket. */
-	INSIST(!sock->connecting);
-	INSIST(ISC_LIST_EMPTY(sock->recv_list));
-	INSIST(ISC_LIST_EMPTY(sock->send_list));
-	INSIST(ISC_LIST_EMPTY(sock->accept_list));
-	INSIST(ISC_LIST_EMPTY(sock->connect_list));
-
-	/* Remember what is needed to close the fd after unlocking. */
-	mgr = sock->manager;
-	owner = &mgr->threads[sock->threadid];
-	oldfd = sock->fd;
-
-	/* Reset the socket to its unopened state. */
-	sock->fd = -1;
-	sock->threadid = -1;
-
-	memset(sock->name, 0, sizeof(sock->name));
-	sock->tag = NULL;
-	sock->listener = 0;
-	sock->connected = 0;
-	sock->connecting = 0;
-	sock->bound = 0;
-	isc_sockaddr_any(&sock->peer_address);
-
-	UNLOCK(&sock->lock);
-
-	socketclose(owner, sock, oldfd);
-
-	return (ISC_R_SUCCESS);
-}
-
-/*
- * Deliver a receive-done event for 'sock'.
- *
- * The task stored in the event's ev_sender field is the destination;
- * ev_sender is then replaced with 'sock' so the receiver sees the socket
- * as the sender.  If the event is still linked on the socket's recv_list
- * it is removed first.  When the event carries
- * ISC_SOCKEVENTATTR_ATTACHED, the task is detached as part of the send.
- * The event is posted with the socket's threadid.
- *
- * 'dev' and '*dev' must be non-NULL; this is checked up front, as in
- * send_senddone_event() and send_connectdone_event().
- *
- * Caller must have the socket locked if the event is attached to the socket.
- */
-static void
-send_recvdone_event(isc_socket_t *sock, isc_socketevent_t **dev) {
-	isc_task_t *task;
-
-	INSIST(dev != NULL && *dev != NULL);
-
-	task = (*dev)->ev_sender;
-
-	(*dev)->ev_sender = sock;
-
-	if (ISC_LINK_LINKED(*dev, ev_link)) {
-		ISC_LIST_DEQUEUE(sock->recv_list, *dev, ev_link);
-	}
-
-	if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED) != 0) {
-		isc_task_sendtoanddetach(&task, (isc_event_t **)dev,
-					 sock->threadid);
-	} else {
-		isc_task_sendto(task, (isc_event_t **)dev, sock->threadid);
-	}
-}
-
-/*
- * Deliver a send-done event for 'sock'.
- *
- * The destination task is taken from the event's ev_sender field, which
- * is then overwritten with 'sock'.  An event still linked on the socket's
- * send_list is removed before it is posted.  If the event carries
- * ISC_SOCKEVENTATTR_ATTACHED the task is detached together with the send.
- *
- * Caller must have the socket locked if the event is attached to the socket.
- */
-static void
-send_senddone_event(isc_socket_t *sock, isc_socketevent_t **dev) {
-	isc_socketevent_t *ev;
-	isc_task_t *dest;
-	bool attached;
-
-	INSIST(dev != NULL && *dev != NULL);
-
-	ev = *dev;
-	dest = ev->ev_sender;
-	ev->ev_sender = sock;
-
-	if (ISC_LINK_LINKED(ev, ev_link)) {
-		ISC_LIST_DEQUEUE(sock->send_list, ev, ev_link);
-	}
-
-	attached = ((ev->attributes & ISC_SOCKEVENTATTR_ATTACHED) != 0);
-	if (!attached) {
-		isc_task_sendto(dest, (isc_event_t **)dev, sock->threadid);
-	} else {
-		isc_task_sendtoanddetach(&dest, (isc_event_t **)dev,
-					 sock->threadid);
-	}
-}
-
-/*
- * Deliver a connect-done event for 'sock'.
- *
- * The destination task is taken from the event's ev_sender field, which
- * is then overwritten with 'sock'.  An event still linked on the socket's
- * connect_list is removed first.  The task is always detached as part of
- * the send.
- *
- * Caller must have the socket locked if the event is attached to the socket.
- */
-static void
-send_connectdone_event(isc_socket_t *sock, isc_socket_connev_t **dev) {
-	isc_socket_connev_t *ev;
-	isc_task_t *dest;
-
-	INSIST(dev != NULL && *dev != NULL);
-
-	ev = *dev;
-	dest = ev->ev_sender;
-	ev->ev_sender = sock;
-
-	if (ISC_LINK_LINKED(ev, ev_link)) {
-		ISC_LIST_DEQUEUE(sock->connect_list, ev, ev_link);
-	}
-
-	isc_task_sendtoanddetach(&dest, (isc_event_t **)dev, sock->threadid);
-}
-
-/*
- * Call accept() on a socket, to get the new file descriptor. The listen
- * socket is used as a prototype to create a new isc_socket_t. The new
- * socket has one outstanding reference. The task receiving the event
- * will be detached from just after the event is delivered.
- *
- * On entry to this function, the event delivered is the internal
- * readable event, and the first item on the accept_list should be
- * the done event we want to send. If the list is empty, this is a no-op,
- * so just unlock and return.
- *
- * Locking: sock->lock is released by this function on every return path
- * (empty accept list, normal completion, and soft_error), so the caller
- * must not unlock it again.
- */
-static void
-internal_accept(isc_socket_t *sock) {
- isc_socketmgr_t *manager;
- isc__socketthread_t *thread, *nthread;
- isc_socket_newconnev_t *dev;
- isc_task_t *task;
- socklen_t addrlen;
- int fd;
- isc_result_t result = ISC_R_SUCCESS;
- char strbuf[ISC_STRERRORSIZE];
- /* Name of the failing call used in log messages; see F_DUPFD below. */
- const char *err = "accept";
-
- INSIST(VALID_SOCKET(sock));
- REQUIRE(sock->fd >= 0);
-
- socket_log(sock, NULL, TRACE, "internal_accept called, locked socket");
-
- manager = sock->manager;
- INSIST(VALID_MANAGER(manager));
- thread = &manager->threads[sock->threadid];
-
- INSIST(sock->listener);
-
- /*
- * Get the first item off the accept list.
- * If it is empty, unlock the socket and return.
- */
- dev = ISC_LIST_HEAD(sock->accept_list);
- if (dev == NULL) {
- unwatch_fd(thread, sock->fd, SELECT_POKE_ACCEPT);
- UNLOCK(&sock->lock);
- return;
- }
-
- /*
- * Try to accept the new connection. If the accept fails with
- * EAGAIN or EINTR, simply poke the watcher to watch this socket
- * again. Also ignore ECONNRESET, which has been reported to
- * be spuriously returned on Linux 2.2.19 although it is not
- * a documented error for accept(). ECONNABORTED has been
- * reported for Solaris 8. The rest are thrown in not because
- * we have seen them but because they are ignored by other
- * daemons such as BIND 8 and Apache.
- */
-
- addrlen = sizeof(NEWCONNSOCK(dev)->peer_address.type);
- memset(&NEWCONNSOCK(dev)->peer_address.type, 0, addrlen);
- fd = accept(sock->fd, &NEWCONNSOCK(dev)->peer_address.type.sa,
- (void *)&addrlen);
-
-#ifdef F_DUPFD
- /*
- * Leave a space for stdio to work in.
- */
- if (fd >= 0 && fd < 20) {
- int newfd, tmp;
- newfd = fcntl(fd, F_DUPFD, 20);
- /* Preserve fcntl()'s errno across close(). */
- tmp = errno;
- (void)close(fd);
- errno = tmp;
- fd = newfd;
- err = "accept/fcntl";
- }
-#endif /* ifdef F_DUPFD */
-
- /*
- * Classify a failure: the errno values listed below take the
- * soft_error path (the request stays queued and the watcher is
- * re-armed); anything else is logged and reported to the requester
- * as ISC_R_UNEXPECTED.
- */
- if (fd < 0) {
- if (SOFT_ERROR(errno)) {
- goto soft_error;
- }
- switch (errno) {
- case ENFILE:
- case EMFILE:
- isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
- ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
- "%s: too many open file descriptors",
- err);
- goto soft_error;
-
- case ENOBUFS:
- case ENOMEM:
- case ECONNRESET:
- case ECONNABORTED:
- case EHOSTUNREACH:
- case EHOSTDOWN:
- case ENETUNREACH:
- case ENETDOWN:
- case ECONNREFUSED:
-#ifdef EPROTO
- case EPROTO:
-#endif /* ifdef EPROTO */
-#ifdef ENONET
- case ENONET:
-#endif /* ifdef ENONET */
- goto soft_error;
- default:
- break;
- }
- strerror_r(errno, strbuf, sizeof(strbuf));
- UNEXPECTED_ERROR(__FILE__, __LINE__,
- "internal_accept: %s() failed: %s", err,
- strbuf);
- fd = -1;
- result = ISC_R_UNEXPECTED;
- } else {
- /*
- * accept() succeeded: reject the connection (close the fd and
- * take the soft_error path) if no peer address was returned,
- * the address family differs from the listener's, or the fd
- * is not below the manager's maxsocks.
- */
- if (addrlen == 0U) {
- UNEXPECTED_ERROR(__FILE__, __LINE__,
- "internal_accept(): "
- "accept() failed to return "
- "remote address");
-
- (void)close(fd);
- goto soft_error;
- } else if (NEWCONNSOCK(dev)->peer_address.type.sa.sa_family !=
- sock->pf) {
- UNEXPECTED_ERROR(
- __FILE__, __LINE__,
- "internal_accept(): "
- "accept() returned peer address "
- "family %u (expected %u)",
- NEWCONNSOCK(dev)->peer_address.type.sa.sa_family,
- sock->pf);
- (void)close(fd);
- goto soft_error;
- } else if (fd >= (int)manager->maxsocks) {
- isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
- ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
- "accept: file descriptor exceeds limit "
- "(%d/%u)",
- fd, manager->maxsocks);
- (void)close(fd);
- goto soft_error;
- }
- }
-
- if (fd != -1) {
- NEWCONNSOCK(dev)->peer_address.length = addrlen;
- NEWCONNSOCK(dev)->pf = sock->pf;
- }
-
- /*
- * Pull off the done event.
- */
- ISC_LIST_UNLINK(sock->accept_list, dev, ev_link);
-
- /*
- * Poke watcher if there are more pending accepts.
- */
- if (ISC_LIST_EMPTY(sock->accept_list)) {
- unwatch_fd(thread, sock->fd, SELECT_POKE_ACCEPT);
- }
-
- /* A descriptor that cannot be made non-blocking is discarded. */
- if (fd != -1) {
- result = make_nonblock(fd);
- if (result != ISC_R_SUCCESS) {
- (void)close(fd);
- fd = -1;
- }
- }
-
- /*
- * We need to unlock sock->lock now to be able to lock manager->lock
- * without risking a deadlock with xmlstats.
- */
- UNLOCK(&sock->lock);
-
- /*
- * -1 means the new socket didn't happen.
- */
- if (fd != -1) {
- int lockid = FDLOCK_ID(fd);
-
- NEWCONNSOCK(dev)->fd = fd;
- NEWCONNSOCK(dev)->threadid = gen_threadid(NEWCONNSOCK(dev));
- NEWCONNSOCK(dev)->bound = 1;
- NEWCONNSOCK(dev)->connected = 1;
- nthread = &manager->threads[NEWCONNSOCK(dev)->threadid];
-
- /*
- * We already hold a lock on one fdlock in accepting thread,
- * we need to make sure that we don't double lock.
- */
- bool same_bucket = (sock->threadid ==
- NEWCONNSOCK(dev)->threadid) &&
- (FDLOCK_ID(sock->fd) == lockid);
-
- /*
- * Use minimum mtu if possible.
- */
- use_min_mtu(NEWCONNSOCK(dev));
- set_tcp_maxseg(NEWCONNSOCK(dev), 1280 - 20 - 40);
-
- /*
- * Ensure DSCP settings are inherited across accept.
- */
- setdscp(NEWCONNSOCK(dev), sock->dscp);
-
- /*
- * Save away the remote address
- */
- dev->address = NEWCONNSOCK(dev)->peer_address;
-
- if (NEWCONNSOCK(dev)->active == 0) {
- inc_stats(manager->stats,
- NEWCONNSOCK(dev)->statsindex[STATID_ACTIVE]);
- NEWCONNSOCK(dev)->active = 1;
- }
-
- /* Publish the new socket in the fd tables of its thread. */
- if (!same_bucket) {
- LOCK(&nthread->fdlock[lockid]);
- }
- nthread->fds[fd] = NEWCONNSOCK(dev);
- nthread->fdstate[fd] = MANAGED;
-#if defined(USE_EPOLL)
- nthread->epoll_events[fd] = 0;
-#endif /* if defined(USE_EPOLL) */
- if (!same_bucket) {
- UNLOCK(&nthread->fdlock[lockid]);
- }
-
- LOCK(&manager->lock);
-
-#ifdef USE_SELECT
- if (nthread->maxfd < fd) {
- nthread->maxfd = fd;
- }
-#endif /* ifdef USE_SELECT */
-
- socket_log(sock, &NEWCONNSOCK(dev)->peer_address, CREATION,
- "accepted connection, new socket %p",
- dev->newsocket);
-
- ISC_LIST_APPEND(manager->socklist, NEWCONNSOCK(dev), link);
-
- UNLOCK(&manager->lock);
-
- inc_stats(manager->stats, sock->statsindex[STATID_ACCEPT]);
- } else {
- /*
- * No usable descriptor: count the failure, drop the reference
- * on the socket carried by the event and free that socket.
- */
- inc_stats(manager->stats, sock->statsindex[STATID_ACCEPTFAIL]);
- isc_refcount_decrementz(&NEWCONNSOCK(dev)->references);
- free_socket((isc_socket_t **)&dev->newsocket);
- }
-
- /*
- * Fill in the done event details and send it off.
- */
- dev->result = result;
- task = dev->ev_sender;
- dev->ev_sender = sock;
-
- isc_task_sendtoanddetach(&task, ISC_EVENT_PTR(&dev), sock->threadid);
- return;
-
-soft_error:
- /*
- * Reached only before the done event is unlinked, so the request
- * stays on accept_list: re-arm the watcher, unlock, and count the
- * failed attempt.
- */
- watch_fd(thread, sock->fd, SELECT_POKE_ACCEPT);
- UNLOCK(&sock->lock);
-
- inc_stats(manager->stats, sock->statsindex[STATID_ACCEPTFAIL]);
- return;
-}
-
-/*
- * Service the receive queue of 'sock' after its descriptor became
- * readable.
- *
- * Requests are taken from the head of recv_list and passed to doio_recv()
- * one after another:
- *	DOIO_SOFT		stop; the request stays queued.
- *	DOIO_EOF		complete every queued request with ISC_R_EOF,
- *				then stop.
- *	DOIO_SUCCESS/HARD	post the done event and go on to the next
- *				request.
- *
- * When the queue ends up empty the descriptor is no longer watched for
- * reading.
- */
-static void
-internal_recv(isc_socket_t *sock) {
-	isc_socketevent_t *ev;
-	bool stop = false;
-
-	INSIST(VALID_SOCKET(sock));
-	REQUIRE(sock->fd >= 0);
-
-	ev = ISC_LIST_HEAD(sock->recv_list);
-	if (ev != NULL) {
-		socket_log(sock, NULL, IOEVENT,
-			   "internal_recv: event %p -> task %p", ev,
-			   ev->ev_sender);
-	}
-
-	/*
-	 * Do as much I/O as possible on this socket; no limit is applied
-	 * here at present.
-	 */
-	while (ev != NULL && !stop) {
-		int status = doio_recv(sock, ev);
-
-		if (status == DOIO_SOFT) {
-			stop = true;
-		} else if (status == DOIO_EOF) {
-			/*
-			 * A read of 0 means the remote end closed the
-			 * connection: flush the whole queue with EOF.
-			 */
-			while (ev != NULL) {
-				ev->result = ISC_R_EOF;
-				send_recvdone_event(sock, &ev);
-				ev = ISC_LIST_HEAD(sock->recv_list);
-			}
-			stop = true;
-		} else {
-			if (status == DOIO_SUCCESS || status == DOIO_HARD) {
-				send_recvdone_event(sock, &ev);
-			}
-			ev = ISC_LIST_HEAD(sock->recv_list);
-		}
-	}
-
-	if (ISC_LIST_EMPTY(sock->recv_list)) {
-		unwatch_fd(&sock->manager->threads[sock->threadid], sock->fd,
-			   SELECT_POKE_READ);
-	}
-}
-
-/*
- * Service the send queue of 'sock' after its descriptor became writable.
- *
- * Requests are taken from the head of send_list and passed to doio_send()
- * one after another.  DOIO_SOFT stops processing and leaves the request
- * queued; DOIO_HARD and DOIO_SUCCESS post the done event and move on to
- * the next request.
- *
- * When the queue ends up empty the descriptor is no longer watched for
- * writing.
- */
-static void
-internal_send(isc_socket_t *sock) {
-	isc_socketevent_t *ev;
-	bool stop = false;
-
-	INSIST(VALID_SOCKET(sock));
-	REQUIRE(sock->fd >= 0);
-
-	ev = ISC_LIST_HEAD(sock->send_list);
-	if (ev != NULL) {
-		socket_log(sock, NULL, EVENT,
-			   "internal_send: event %p -> task %p", ev,
-			   ev->ev_sender);
-	}
-
-	/*
-	 * Do as much I/O as possible on this socket; no limit is applied
-	 * here at present.
-	 */
-	while (ev != NULL && !stop) {
-		int status = doio_send(sock, ev);
-
-		if (status == DOIO_SOFT) {
-			stop = true;
-		} else {
-			if (status == DOIO_HARD || status == DOIO_SUCCESS) {
-				send_senddone_event(sock, &ev);
-			}
-			ev = ISC_LIST_HEAD(sock->send_list);
-		}
-	}
-
-	if (ISC_LIST_EMPTY(sock->send_list)) {
-		unwatch_fd(&sock->manager->threads[sock->threadid], sock->fd,
-			   SELECT_POKE_WRITE);
-	}
-}
-
-/*
- * Handle readiness of one descriptor.  Reads and writes are both handled
- * under a single acquisition of the fd lock and the socket lock, so the
- * locks are not taken twice when both directions are ready.
- *
- * Writable: a connecting socket goes to internal_connect(), any other to
- * internal_send().  Readable: a listener goes to internal_accept(), any
- * other to internal_recv().  At least one of 'readable' and 'writeable'
- * must be set.
- */
-static void
-process_fd(isc__socketthread_t *thread, int fd, bool readable, bool writeable) {
-	isc_mutex_t *fdlock = &thread->fdlock[FDLOCK_ID(fd)];
-	isc_socket_t *sock;
-
-	LOCK(fdlock);
-
-	/*
-	 * If the socket is going to be closed, don't do more I/O.
-	 */
-	if (thread->fdstate[fd] == CLOSE_PENDING) {
-		UNLOCK(fdlock);
-
-		(void)unwatch_fd(thread, fd, SELECT_POKE_READ);
-		(void)unwatch_fd(thread, fd, SELECT_POKE_WRITE);
-		return;
-	}
-
-	sock = thread->fds[fd];
-	if (sock == NULL) {
-		UNLOCK(fdlock);
-		return;
-	}
-
-	LOCK(&sock->lock);
-
-	if (sock->fd < 0) {
-		/*
-		 * The socket is being closed: its last external reference
-		 * is gone, but it has not yet been taken out of the event
-		 * loop and of fdstate[]/fds[], because destroy() is waiting
-		 * for the fd lock or the socket lock held here.  Release
-		 * both and leave.
-		 */
-		UNLOCK(&sock->lock);
-		UNLOCK(fdlock);
-		return;
-	}
-
-	REQUIRE(readable || writeable);
-
-	if (writeable && sock->connecting) {
-		internal_connect(sock);
-	} else if (writeable) {
-		internal_send(sock);
-	}
-
-	if (readable && sock->listener) {
-		/* internal_accept() releases sock->lock itself. */
-		internal_accept(sock);
-	} else {
-		if (readable) {
-			internal_recv(sock);
-		}
-		UNLOCK(&sock->lock);
-	}
-
-	UNLOCK(fdlock);
-
-	/*
-	 * Socket destruction might be pending, it will resume
-	 * after releasing fdlock and sock->lock.
-	 */
-}
-
-/*
- * process_fds is different for different event loops
- * it takes the events from event loops and for each FD
- * launches process_fd
- *
- * The kqueue, epoll and /dev/poll variants take the array of events
- * returned by the wait call and its length.  Events on the read end of
- * the control pipe (thread->pipe_fds[0]) are only noted during the scan
- * and handled afterwards through process_ctlfd(), whose result is
- * returned.  The select variant returns nothing and skips both ends of
- * the control pipe.
- */
-#ifdef USE_KQUEUE
-static bool
-process_fds(isc__socketthread_t *thread, struct kevent *events, int nevents) {
- int i;
- bool readable, writable;
- bool done = false;
- bool have_ctlevent = false;
- if (nevents == thread->nevents) {
- /*
- * This is not an error, but something unexpected. If this
- * happens, it may indicate the need for increasing
- * ISC_SOCKET_MAXEVENTS.
- */
- thread_log(thread, ISC_LOGCATEGORY_GENERAL,
- ISC_LOGMODULE_SOCKET, ISC_LOG_INFO,
- "maximum number of FD events (%d) received",
- nevents);
- }
-
- for (i = 0; i < nevents; i++) {
- REQUIRE(events[i].ident < thread->manager->maxsocks);
- if (events[i].ident == (uintptr_t)thread->pipe_fds[0]) {
- have_ctlevent = true;
- continue;
- }
- /* The event's filter decides whether it is a read or a write. */
- readable = (events[i].filter == EVFILT_READ);
- writable = (events[i].filter == EVFILT_WRITE);
- process_fd(thread, events[i].ident, readable, writable);
- }
-
- if (have_ctlevent) {
- done = process_ctlfd(thread);
- }
-
- return (done);
-}
-#elif defined(USE_EPOLL)
-static bool
-process_fds(isc__socketthread_t *thread, struct epoll_event *events,
- int nevents) {
- int i;
- bool done = false;
- bool have_ctlevent = false;
-
- /* A full event array is logged for information only. */
- if (nevents == thread->nevents) {
- thread_log(thread, ISC_LOGCATEGORY_GENERAL,
- ISC_LOGMODULE_SOCKET, ISC_LOG_INFO,
- "maximum number of FD events (%d) received",
- nevents);
- }
-
- for (i = 0; i < nevents; i++) {
- REQUIRE(events[i].data.fd < (int)thread->manager->maxsocks);
- if (events[i].data.fd == thread->pipe_fds[0]) {
- have_ctlevent = true;
- continue;
- }
- if ((events[i].events & EPOLLERR) != 0 ||
- (events[i].events & EPOLLHUP) != 0) {
- /*
- * epoll does not set IN/OUT bits on an erroneous
- * condition, so we need to try both anyway. This is a
- * bit inefficient, but should be okay for such rare
- * events. Note also that the read or write attempt
- * won't block because we use non-blocking sockets.
- */
- int fd = events[i].data.fd;
- events[i].events |= thread->epoll_events[fd];
- }
- process_fd(thread, events[i].data.fd,
- (events[i].events & EPOLLIN) != 0,
- (events[i].events & EPOLLOUT) != 0);
- }
-
- if (have_ctlevent) {
- done = process_ctlfd(thread);
- }
-
- return (done);
-}
-#elif defined(USE_DEVPOLL)
-static bool
-process_fds(isc__socketthread_t *thread, struct pollfd *events, int nevents) {
- int i;
- bool done = false;
- bool have_ctlevent = false;
-
- /* A full event array is logged for information only. */
- if (nevents == thread->nevents) {
- thread_log(thread, ISC_LOGCATEGORY_GENERAL,
- ISC_LOGMODULE_SOCKET, ISC_LOG_INFO,
- "maximum number of FD events (%d) received",
- nevents);
- }
-
- for (i = 0; i < nevents; i++) {
- REQUIRE(events[i].fd < (int)thread->manager->maxsocks);
- if (events[i].fd == thread->pipe_fds[0]) {
- have_ctlevent = true;
- continue;
- }
- process_fd(thread, events[i].fd,
- (events[i].events & POLLIN) != 0,
- (events[i].events & POLLOUT) != 0);
- }
-
- if (have_ctlevent) {
- done = process_ctlfd(thread);
- }
-
- return (done);
-}
-#elif defined(USE_SELECT)
-static void
-process_fds(isc__socketthread_t *thread, int maxfd, fd_set *readfds,
- fd_set *writefds) {
- int i;
-
- REQUIRE(maxfd <= (int)thread->manager->maxsocks);
-
- /*
- * Every descriptor below 'maxfd' other than the two control pipe
- * ends is passed to process_fd() with its membership in 'readfds'
- * and 'writefds'.
- */
- for (i = 0; i < maxfd; i++) {
- if (i == thread->pipe_fds[0] || i == thread->pipe_fds[1]) {
- continue;
- }
- process_fd(thread, i, FD_ISSET(i, readfds),
- FD_ISSET(i, writefds));
- }
-}
-#endif /* ifdef USE_KQUEUE */
-
-/*
- * Drain the control messages queued for 'thread'.
- *
- * Messages are read with select_readmsg() until either nothing is left
- * (SELECT_POKE_NOTHING) or a shutdown request (SELECT_POKE_SHUTDOWN) is
- * seen.  Every other message is a wakeup for a socket and is passed to
- * wakeup_socket(), which looks at the socket's read and write queues to
- * decide whether the descriptor needs watching.
- *
- * Returns true if a shutdown message was received, false otherwise.
- */
-static bool
-process_ctlfd(isc__socketthread_t *thread) {
-	int msg, fd;
-	bool drained = false;
-	bool stop = false;
-
-	do {
-		select_readmsg(thread, &fd, &msg);
-
-		thread_log(thread, IOEVENT,
-			   "watcher got message %d for socket %d", msg, fd);
-
-		if (msg == SELECT_POKE_NOTHING) {
-			/* Nothing more to read. */
-			drained = true;
-		} else if (msg == SELECT_POKE_SHUTDOWN) {
-			/* Shutdown ends processing at once. */
-			stop = true;
-		} else {
-			wakeup_socket(thread, fd, msg);
-		}
-	} while (!drained && !stop);
-
-	return (stop);
-}
-
-/*
- * This is the thread that will loop forever, always in a select or poll
- * call.
- *
- * When select returns something to do, do whatever's necessary and post
- * an event to the task that was requesting the action.
- *
- * 'uap' is the isc__socketthread_t this thread serves.  The loop ends
- * once control message processing reports that it is done
- * (process_ctlfd() returned true); the thread then returns 0.
- */
-static isc_threadresult_t
-netthread(void *uap) {
- isc__socketthread_t *thread = uap;
- isc_socketmgr_t *manager = thread->manager;
- /* Within this function 'manager' is only used by the USE_SELECT code. */
- (void)manager;
- bool done;
- int cc;
-#ifdef USE_KQUEUE
- const char *fnname = "kevent()";
-#elif defined(USE_EPOLL)
- const char *fnname = "epoll_wait()";
-#elif defined(USE_DEVPOLL)
- isc_result_t result;
- const char *fnname = "ioctl(DP_POLL)";
- struct dvpoll dvp;
- int pass;
-#if defined(ISC_SOCKET_USE_POLLWATCH)
- pollstate_t pollstate = poll_idle;
-#endif /* if defined(ISC_SOCKET_USE_POLLWATCH) */
-#elif defined(USE_SELECT)
- const char *fnname = "select()";
- int maxfd;
- int ctlfd;
-#endif /* ifdef USE_KQUEUE */
- char strbuf[ISC_STRERRORSIZE];
-
-#if defined(USE_SELECT)
- /*
- * Get the control fd here. This will never change.
- */
- ctlfd = thread->pipe_fds[0];
-#endif /* if defined(USE_SELECT) */
- done = false;
- /*
- * Each iteration waits for events, repeating the wait for as long as
- * it fails with a soft error (any other failure is fatal), and then
- * dispatches whatever was returned.
- */
- while (!done) {
- do {
-#ifdef USE_KQUEUE
- cc = kevent(thread->kqueue_fd, NULL, 0, thread->events,
- thread->nevents, NULL);
-#elif defined(USE_EPOLL)
- cc = epoll_wait(thread->epoll_fd, thread->events,
- thread->nevents, -1);
-#elif defined(USE_DEVPOLL)
- /*
- * Re-probe every thousand calls.
- */
- if (thread->calls++ > 1000U) {
- result = isc_resource_getcurlimit(
- isc_resource_openfiles,
- &thread->open_max);
- if (result != ISC_R_SUCCESS) {
- thread->open_max = 64;
- }
- thread->calls = 0;
- }
- /*
- * At most two attempts: DP_POLL is only repeated after an
- * EINVAL failure caused open_max to be looked up again.
- */
- for (pass = 0; pass < 2; pass++) {
- dvp.dp_fds = thread->events;
- dvp.dp_nfds = thread->nevents;
- if (dvp.dp_nfds >= thread->open_max) {
- dvp.dp_nfds = thread->open_max - 1;
- }
-#ifndef ISC_SOCKET_USE_POLLWATCH
- dvp.dp_timeout = -1;
-#else /* ifndef ISC_SOCKET_USE_POLLWATCH */
- if (pollstate == poll_idle) {
- dvp.dp_timeout = -1;
- } else {
- dvp.dp_timeout =
- ISC_SOCKET_POLLWATCH_TIMEOUT;
- }
-#endif /* ISC_SOCKET_USE_POLLWATCH */
- cc = ioctl(thread->devpoll_fd, DP_POLL, &dvp);
- if (cc == -1 && errno == EINVAL) {
- /*
- * {OPEN_MAX} may have dropped. Look
- * up the current value and try again.
- */
- result = isc_resource_getcurlimit(
- isc_resource_openfiles,
- &thread->open_max);
- if (result != ISC_R_SUCCESS) {
- thread->open_max = 64;
- }
- } else {
- break;
- }
- }
-#elif defined(USE_SELECT)
- /*
- * We will have only one thread anyway, we can lock
- * manager lock and don't care
- */
- LOCK(&manager->lock);
- /* select() is given copies of the watched fd sets. */
- memmove(thread->read_fds_copy, thread->read_fds,
- thread->fd_bufsize);
- memmove(thread->write_fds_copy, thread->write_fds,
- thread->fd_bufsize);
- maxfd = thread->maxfd + 1;
- UNLOCK(&manager->lock);
-
- cc = select(maxfd, thread->read_fds_copy,
- thread->write_fds_copy, NULL, NULL);
-#endif /* USE_KQUEUE */
-
- if (cc < 0 && !SOFT_ERROR(errno)) {
- strerror_r(errno, strbuf, sizeof(strbuf));
- FATAL_ERROR(__FILE__, __LINE__, "%s failed: %s",
- fnname, strbuf);
- }
-
-#if defined(USE_DEVPOLL) && defined(ISC_SOCKET_USE_POLLWATCH)
- /*
- * Poll-watch state: a poll that returns no events moves
- * active -> checking -> idle; a poll that returns events
- * moves to active, logging at debug level when that
- * happens from the checking state.
- */
- if (cc == 0) {
- if (pollstate == poll_active) {
- pollstate = poll_checking;
- } else if (pollstate == poll_checking) {
- pollstate = poll_idle;
- }
- } else if (cc > 0) {
- if (pollstate == poll_checking) {
- /*
- * XXX: We'd like to use a more
- * verbose log level as it's actually an
- * unexpected event, but the kernel bug
- * reportedly happens pretty frequently
- * (and it can also be a false positive)
- * so it would be just too noisy.
- */
- thread_log(thread,
- ISC_LOGCATEGORY_GENERAL,
- ISC_LOGMODULE_SOCKET,
- ISC_LOG_DEBUG(1),
- "unexpected POLL timeout");
- }
- pollstate = poll_active;
- }
-#endif /* if defined(USE_DEVPOLL) && defined(ISC_SOCKET_USE_POLLWATCH) */
- } while (cc < 0);
-
-#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
- done = process_fds(thread, thread->events, cc);
-#elif defined(USE_SELECT)
- process_fds(thread, maxfd, thread->read_fds_copy,
- thread->write_fds_copy);
-
- /*
- * Process reads on internal, control fd.
- */
- if (FD_ISSET(ctlfd, thread->read_fds_copy)) {
- done = process_ctlfd(thread);
- }
-#endif /* if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL) \
- * */
- }
-
- thread_log(thread, TRACE, "watcher exiting");
- return ((isc_threadresult_t)0);
-}
-
-/*
- * Store 'reserved' in the manager's 'reserved' field.
- *
- * Requires:
- *	'manager' is a valid manager.
- */
-void
-isc_socketmgr_setreserved(isc_socketmgr_t *manager, uint32_t reserved) {
-	REQUIRE(VALID_MANAGER(manager));
-
-	manager->reserved = reserved;
-
-	return;
-}
-
-/*
- * Store 'maxudp' in the manager's 'maxudp' field.
- *
- * Requires:
- *	'manager' is a valid manager.
- */
-void
-isc_socketmgr_maxudp(isc_socketmgr_t *manager, unsigned int maxudp) {
-	REQUIRE(VALID_MANAGER(manager));
-
-	manager->maxudp = maxudp;
-
-	return;
-}
-
-/*
- * Setup socket thread, thread->manager and thread->threadid must be filled.
- *
- * Allocates and zeroes the per-thread fd tables (fds, fdstate), creates
- * the fd locks and the control pipe (read end made non-blocking), then
- * initializes the state of the event backend selected at build time
- * (kqueue, epoll, /dev/poll or select) and starts watching the read end
- * of the control pipe.
- *
- * Returns:
- *	ISC_R_SUCCESS on success.
- *	ISC_R_UNEXPECTED if pipe() fails.
- *	Otherwise the failure of creating the backend descriptor (mapped
- *	from errno) or of watch_fd().  When backend setup fails, the
- *	backend-specific resources acquired here (event arrays, backend
- *	descriptor) are released before returning; this is done the same
- *	way for the kqueue, epoll and /dev/poll backends.
- *
- * NOTE(review): the generic resources (fds, fdstate, fdlock, control
- * pipe) are not released on any failure path; presumably the caller
- * treats a failure here as fatal -- confirm against the caller.
- */
-
-static isc_result_t
-setup_thread(isc__socketthread_t *thread) {
-	isc_socketmgr_t *manager;
-	isc_result_t result = ISC_R_SUCCESS;
-	int i;
-	char strbuf[ISC_STRERRORSIZE];
-
-	REQUIRE(thread != NULL);
-	REQUIRE(VALID_MANAGER(thread->manager));
-	REQUIRE(thread->threadid >= 0 &&
-		thread->threadid < thread->manager->nthreads);
-
-	manager = thread->manager;
-
-	thread->fds = isc_mem_get(manager->mctx,
-				  manager->maxsocks * sizeof(isc_socket_t *));
-
-	memset(thread->fds, 0, manager->maxsocks * sizeof(isc_socket_t *));
-
-	thread->fdstate = isc_mem_get(manager->mctx,
-				      manager->maxsocks * sizeof(int));
-
-	memset(thread->fdstate, 0, manager->maxsocks * sizeof(int));
-
-	thread->fdlock = isc_mem_get(manager->mctx,
-				     FDLOCK_COUNT * sizeof(isc_mutex_t));
-
-	for (i = 0; i < FDLOCK_COUNT; i++) {
-		isc_mutex_init(&thread->fdlock[i]);
-	}
-
-	/* Control pipe over which messages are sent to this thread. */
-	if (pipe(thread->pipe_fds) != 0) {
-		strerror_r(errno, strbuf, sizeof(strbuf));
-		UNEXPECTED_ERROR(__FILE__, __LINE__, "pipe() failed: %s",
-				 strbuf);
-		return (ISC_R_UNEXPECTED);
-	}
-	RUNTIME_CHECK(make_nonblock(thread->pipe_fds[0]) == ISC_R_SUCCESS);
-
-#ifdef USE_KQUEUE
-	thread->nevents = ISC_SOCKET_MAXEVENTS;
-	thread->events = isc_mem_get(manager->mctx,
-				     sizeof(struct kevent) * thread->nevents);
-
-	thread->kqueue_fd = kqueue();
-	if (thread->kqueue_fd == -1) {
-		result = isc__errno2result(errno);
-		strerror_r(errno, strbuf, sizeof(strbuf));
-		UNEXPECTED_ERROR(__FILE__, __LINE__, "kqueue failed: %s",
-				 strbuf);
-		isc_mem_put(manager->mctx, thread->events,
-			    sizeof(struct kevent) * thread->nevents);
-		return (result);
-	}
-
-	result = watch_fd(thread, thread->pipe_fds[0], SELECT_POKE_READ);
-	if (result != ISC_R_SUCCESS) {
-		close(thread->kqueue_fd);
-		isc_mem_put(manager->mctx, thread->events,
-			    sizeof(struct kevent) * thread->nevents);
-	}
-	return (result);
-
-#elif defined(USE_EPOLL)
-	thread->nevents = ISC_SOCKET_MAXEVENTS;
-	thread->epoll_events = isc_mem_get(
-		manager->mctx, (manager->maxsocks * sizeof(uint32_t)));
-
-	memset(thread->epoll_events, 0, manager->maxsocks * sizeof(uint32_t));
-
-	thread->events = isc_mem_get(
-		manager->mctx, sizeof(struct epoll_event) * thread->nevents);
-
-	thread->epoll_fd = epoll_create(thread->nevents);
-	if (thread->epoll_fd == -1) {
-		result = isc__errno2result(errno);
-		strerror_r(errno, strbuf, sizeof(strbuf));
-		UNEXPECTED_ERROR(__FILE__, __LINE__, "epoll_create failed: %s",
-				 strbuf);
-		/* Release the epoll-specific arrays, as the other backends do. */
-		isc_mem_put(manager->mctx, thread->events,
-			    sizeof(struct epoll_event) * thread->nevents);
-		isc_mem_put(manager->mctx, thread->epoll_events,
-			    manager->maxsocks * sizeof(uint32_t));
-		return (result);
-	}
-
-	result = watch_fd(thread, thread->pipe_fds[0], SELECT_POKE_READ);
-	if (result != ISC_R_SUCCESS) {
-		/* Undo the epoll setup instead of leaking it. */
-		close(thread->epoll_fd);
-		isc_mem_put(manager->mctx, thread->events,
-			    sizeof(struct epoll_event) * thread->nevents);
-		isc_mem_put(manager->mctx, thread->epoll_events,
-			    manager->maxsocks * sizeof(uint32_t));
-	}
-	return (result);
-
-#elif defined(USE_DEVPOLL)
-	thread->nevents = ISC_SOCKET_MAXEVENTS;
-	result = isc_resource_getcurlimit(isc_resource_openfiles,
-					  &thread->open_max);
-	if (result != ISC_R_SUCCESS) {
-		/* Fall back to a fixed limit if the lookup fails. */
-		thread->open_max = 64;
-	}
-	thread->calls = 0;
-	thread->events = isc_mem_get(manager->mctx,
-				     sizeof(struct pollfd) * thread->nevents);
-
-	/*
-	 * Note: fdpollinfo should be able to support all possible FDs, so
-	 * it must have maxsocks entries (not nevents).
-	 */
-	thread->fdpollinfo = isc_mem_get(
-		manager->mctx, sizeof(pollinfo_t) * manager->maxsocks);
-	memset(thread->fdpollinfo, 0, sizeof(pollinfo_t) * manager->maxsocks);
-	thread->devpoll_fd = open("/dev/poll", O_RDWR);
-	if (thread->devpoll_fd == -1) {
-		result = isc__errno2result(errno);
-		strerror_r(errno, strbuf, sizeof(strbuf));
-		UNEXPECTED_ERROR(__FILE__, __LINE__,
-				 "open(/dev/poll) failed: %s", strbuf);
-		isc_mem_put(manager->mctx, thread->events,
-			    sizeof(struct pollfd) * thread->nevents);
-		isc_mem_put(manager->mctx, thread->fdpollinfo,
-			    sizeof(pollinfo_t) * manager->maxsocks);
-		return (result);
-	}
-	result = watch_fd(thread, thread->pipe_fds[0], SELECT_POKE_READ);
-	if (result != ISC_R_SUCCESS) {
-		close(thread->devpoll_fd);
-		isc_mem_put(manager->mctx, thread->events,
-			    sizeof(struct pollfd) * thread->nevents);
-		isc_mem_put(manager->mctx, thread->fdpollinfo,
-			    sizeof(pollinfo_t) * manager->maxsocks);
-		return (result);
-	}
-
-	return (ISC_R_SUCCESS);
-#elif defined(USE_SELECT)
-	UNUSED(result);
-
-#if ISC_SOCKET_MAXSOCKETS > FD_SETSIZE
-	/*
-	 * Note: this code should also cover the case of MAXSOCKETS <=
-	 * FD_SETSIZE, but we separate the cases to avoid possible portability
-	 * issues regarding howmany() and the actual representation of fd_set.
-	 */
-	thread->fd_bufsize = howmany(manager->maxsocks, NFDBITS) *
-			     sizeof(fd_mask);
-#else  /* if ISC_SOCKET_MAXSOCKETS > FD_SETSIZE */
-	thread->fd_bufsize = sizeof(fd_set);
-#endif /* if ISC_SOCKET_MAXSOCKETS > FD_SETSIZE */
-
-	thread->read_fds = isc_mem_get(manager->mctx, thread->fd_bufsize);
-	thread->read_fds_copy = isc_mem_get(manager->mctx, thread->fd_bufsize);
-	thread->write_fds = isc_mem_get(manager->mctx, thread->fd_bufsize);
-	thread->write_fds_copy = isc_mem_get(manager->mctx,
-					     thread->fd_bufsize);
-	memset(thread->read_fds, 0, thread->fd_bufsize);
-	memset(thread->write_fds, 0, thread->fd_bufsize);
-
-	(void)watch_fd(thread, thread->pipe_fds[0], SELECT_POKE_READ);
-	thread->maxfd = thread->pipe_fds[0];
-
-	return (ISC_R_SUCCESS);
-#endif /* USE_KQUEUE */
-}
-
-/*
- * Tear down the per-thread watcher state: stop watching the read end of
- * the thread's pipe, close the backend descriptor and free the backend
- * buffers, close every descriptor still marked CLOSE_PENDING, and then
- * release the fd tables and the fd locks.
- *
- * 'mctx' receives the backend buffers; the fd tables and locks are
- * returned to the manager's own memory context.
- */
-static void
-cleanup_thread(isc_mem_t *mctx, isc__socketthread_t *thread) {
-	isc_result_t result;
-
-	result = unwatch_fd(thread, thread->pipe_fds[0], SELECT_POKE_READ);
-	if (result != ISC_R_SUCCESS) {
-		/*
-		 * Keep the wording backend-neutral: this code is also
-		 * compiled for kqueue, /dev/poll and select.
-		 */
-		UNEXPECTED_ERROR(__FILE__, __LINE__,
-				 "unwatch_fd(pipe) failed");
-	}
-#ifdef USE_KQUEUE
-	close(thread->kqueue_fd);
-	isc_mem_put(mctx, thread->events,
-		    sizeof(struct kevent) * thread->nevents);
-#elif defined(USE_EPOLL)
-	close(thread->epoll_fd);
-
-	isc_mem_put(mctx, thread->events,
-		    sizeof(struct epoll_event) * thread->nevents);
-#elif defined(USE_DEVPOLL)
-	close(thread->devpoll_fd);
-	isc_mem_put(mctx, thread->events,
-		    sizeof(struct pollfd) * thread->nevents);
-	isc_mem_put(mctx, thread->fdpollinfo,
-		    sizeof(pollinfo_t) * thread->manager->maxsocks);
-#elif defined(USE_SELECT)
-	if (thread->read_fds != NULL) {
-		isc_mem_put(mctx, thread->read_fds, thread->fd_bufsize);
-	}
-	if (thread->read_fds_copy != NULL) {
-		isc_mem_put(mctx, thread->read_fds_copy, thread->fd_bufsize);
-	}
-	if (thread->write_fds != NULL) {
-		isc_mem_put(mctx, thread->write_fds, thread->fd_bufsize);
-	}
-	if (thread->write_fds_copy != NULL) {
-		isc_mem_put(mctx, thread->write_fds_copy, thread->fd_bufsize);
-	}
-#endif /* USE_KQUEUE */
-	for (int i = 0; i < (int)thread->manager->maxsocks; i++) {
-		if (thread->fdstate[i] == CLOSE_PENDING) {
-			/* no need to lock */
-			(void)close(i);
-		}
-	}
-
-#if defined(USE_EPOLL)
-	isc_mem_put(thread->manager->mctx, thread->epoll_events,
-		    thread->manager->maxsocks * sizeof(uint32_t));
-#endif /* if defined(USE_EPOLL) */
-	isc_mem_put(thread->manager->mctx, thread->fds,
-		    thread->manager->maxsocks * sizeof(isc_socket_t *));
-	isc_mem_put(thread->manager->mctx, thread->fdstate,
-		    thread->manager->maxsocks * sizeof(int));
-
-	for (int i = 0; i < FDLOCK_COUNT; i++) {
-		isc_mutex_destroy(&thread->fdlock[i]);
-	}
-	isc_mem_put(thread->manager->mctx, thread->fdlock,
-		    FDLOCK_COUNT * sizeof(isc_mutex_t));
-}
-
-/*
- * Create a socket manager with 'nthreads' watcher threads and room for
- * 'maxsocks' descriptors (0 selects ISC_SOCKET_MAXSOCKETS).  On success
- * the new manager is stored in *managerp and ISC_R_SUCCESS is returned.
- *
- * Requires:
- *	'managerp' is non-NULL and '*managerp' is NULL.
- */
-isc_result_t
-isc__socketmgr_create(isc_mem_t *mctx, isc_socketmgr_t **managerp,
-		      unsigned int maxsocks, int nthreads) {
-	isc_socketmgr_t *manager;
-
-	REQUIRE(managerp != NULL && *managerp == NULL);
-
-	if (maxsocks == 0) {
-		maxsocks = ISC_SOCKET_MAXSOCKETS;
-	}
-
-	manager = isc_mem_get(mctx, sizeof(*manager));
-
-	/* zero-clear so that necessary cleanup on failure will be easy */
-	memset(manager, 0, sizeof(*manager));
-	manager->maxsocks = maxsocks;
-	manager->reserved = 0;
-	manager->maxudp = 0;
-	manager->nthreads = nthreads;
-	manager->stats = NULL;
-
-	manager->magic = SOCKET_MANAGER_MAGIC;
-	manager->mctx = NULL;
-	ISC_LIST_INIT(manager->socklist);
-	isc_mutex_init(&manager->lock);
-	isc_condition_init(&manager->shutdown_ok);
-
-	/*
-	 * Start up the select/poll threads.
-	 */
-	manager->threads = isc_mem_get(mctx, sizeof(isc__socketthread_t) *
-						     manager->nthreads);
-	isc_mem_attach(mctx, &manager->mctx);
-
-	for (int i = 0; i < manager->nthreads; i++) {
-		isc__socketthread_t *thread = &manager->threads[i];
-		isc_result_t result;
-		/* "isc-socket-" plus any int plus NUL fits comfortably. */
-		char tname[32];
-
-		thread->manager = manager;
-		thread->threadid = i;
-
-		/*
-		 * A watcher thread must never be started on top of a
-		 * backend that failed to initialize, so treat a setup
-		 * failure as fatal instead of silently ignoring it.
-		 */
-		result = setup_thread(thread);
-		RUNTIME_CHECK(result == ISC_R_SUCCESS);
-
-		isc_thread_create(netthread, thread, &thread->thread);
-		snprintf(tname, sizeof(tname), "isc-socket-%d", i);
-		isc_thread_setname(thread->thread, tname);
-	}
-
-	*managerp = manager;
-
-	return (ISC_R_SUCCESS);
-}
-
-/*
- * Copy the manager's socket limit into '*nsockp'.  Always returns
- * ISC_R_SUCCESS once the arguments have passed validation.
- */
-isc_result_t
-isc_socketmgr_getmaxsockets(isc_socketmgr_t *manager, unsigned int *nsockp) {
-	unsigned int limit;
-
-	REQUIRE(VALID_MANAGER(manager));
-	REQUIRE(nsockp != NULL);
-
-	limit = manager->maxsocks;
-	*nsockp = limit;
-
-	return (ISC_R_SUCCESS);
-}
-
-/*
- * Attach the statistics counter set 'stats' to 'manager'.
- *
- * Requires:
- *	'manager' is valid, owns no sockets yet and has no statistics
- *	attached; 'stats' holds exactly isc_sockstatscounter_max counters.
- */
-void
-isc_socketmgr_setstats(isc_socketmgr_t *manager, isc_stats_t *stats) {
-	REQUIRE(VALID_MANAGER(manager));
-	REQUIRE(ISC_LIST_EMPTY(manager->socklist));
-	REQUIRE(manager->stats == NULL);
-	REQUIRE(isc_stats_ncounters(stats) == isc_sockstatscounter_max);
-
-	/* The manager keeps its own reference to the counters. */
-	isc_stats_attach(stats, &manager->stats);
-}
-
-/*
- * Destroy the socket manager referenced by '*managerp' and set
- * '*managerp' to NULL.  Blocks until every socket of the manager has
- * been destroyed, then stops and reaps the watcher threads.
- */
-void
-isc__socketmgr_destroy(isc_socketmgr_t **managerp) {
-	isc_socketmgr_t *manager;
-	int t;
-
-	REQUIRE(managerp != NULL);
-	manager = *managerp;
-	REQUIRE(VALID_MANAGER(manager));
-
-	/* Sleep on 'shutdown_ok' for as long as sockets still exist. */
-	LOCK(&manager->lock);
-	while (!ISC_LIST_EMPTY(manager->socklist)) {
-		manager_log(manager, CREATION, "sockets exist");
-		WAIT(&manager->shutdown_ok, &manager->lock);
-	}
-	UNLOCK(&manager->lock);
-
-	/* First tell every watcher thread to shut down ... */
-	for (t = 0; t < manager->nthreads; t++) {
-		select_poke(manager, t, 0, SELECT_POKE_SHUTDOWN);
-	}
-
-	/* ... then wait for each one and release its resources. */
-	t = 0;
-	while (t < manager->nthreads) {
-		isc__socketthread_t *thread = &manager->threads[t];
-
-		isc_thread_join(thread->thread, NULL);
-		cleanup_thread(manager->mctx, thread);
-		t++;
-	}
-
-	/* Finally dismantle the manager object itself. */
-	isc_mem_put(manager->mctx, manager->threads,
-		    sizeof(isc__socketthread_t) * manager->nthreads);
-	(void)isc_condition_destroy(&manager->shutdown_ok);
-
-	if (manager->stats != NULL) {
-		isc_stats_detach(&manager->stats);
-	}
-	isc_mutex_destroy(&manager->lock);
-	manager->magic = 0;
-	isc_mem_putanddetach(&manager->mctx, manager, sizeof(*manager));
-
-	*managerp = NULL;
-}
-
-/*
- * Common receive path.  Try the read right away when possible; if it
- * cannot complete (DOIO_SOFT) queue 'dev' on the socket's recv_list and
- * poke the watcher.  With ISC_SOCKFLAG_IMMEDIATE no done event is posted
- * for a request that finished here, and a queued request is reported as
- * ISC_R_INPROGRESS; otherwise the result is ISC_R_SUCCESS.
- *
- * UDP sockets attempt the read without taking the socket lock; other
- * socket types take the lock first and only read when nothing is queued
- * ahead of this request.
- */
-static isc_result_t
-socket_recv(isc_socket_t *sock, isc_socketevent_t *dev, isc_task_t *task,
-	    unsigned int flags) {
-	isc_result_t result = ISC_R_SUCCESS;
-	isc_task_t *attached = NULL;
-	bool immediate = ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0);
-	bool locked = false;
-	int state;
-
-	dev->ev_sender = task;
-
-	if (sock->type != isc_sockettype_udp) {
-		LOCK(&sock->lock);
-		locked = true;
-	}
-
-	if (!locked || ISC_LIST_EMPTY(sock->recv_list)) {
-		state = doio_recv(sock, dev);
-	} else {
-		state = DOIO_SOFT;
-	}
-
-	switch (state) {
-	case DOIO_SOFT:
-		/*
-		 * Nothing (or not enough) could be read now: hold a task
-		 * reference and park the request on the queue.
-		 */
-		isc_task_attach(task, &attached);
-		dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED;
-
-		if (!locked) {
-			LOCK(&sock->lock);
-			locked = true;
-		}
-
-		/*
-		 * Only the first queued request needs to wake the
-		 * watcher; later ones ride along.
-		 */
-		bool was_idle = ISC_LIST_EMPTY(sock->recv_list);
-		ISC_LIST_ENQUEUE(sock->recv_list, dev, ev_link);
-		if (was_idle) {
-			select_poke(sock->manager, sock->threadid, sock->fd,
-				    SELECT_POKE_READ);
-		}
-
-		socket_log(sock, NULL, EVENT,
-			   "socket_recv: event %p -> task %p", dev, attached);
-
-		if (immediate) {
-			result = ISC_R_INPROGRESS;
-		}
-		break;
-
-	case DOIO_EOF:
-		dev->result = ISC_R_EOF;
-		/* fallthrough */
-
-	case DOIO_HARD:
-	case DOIO_SUCCESS:
-		if (!immediate) {
-			send_recvdone_event(sock, &dev);
-		}
-		break;
-	}
-
-	if (locked) {
-		UNLOCK(&sock->lock);
-	}
-
-	return (result);
-}
-
-/*
- * Start a receive into 'region' on 'sock'.  A RECVDONE event carrying
- * 'action' and 'arg' is allocated and handed to isc_socket_recv2() with
- * no flags set.
- *
- * Requires:
- *	'sock' is a valid, bound socket; 'region', 'task' and 'action' are
- *	non-NULL (the same contract isc_socket_sendto() enforces).
- *
- * Returns:
- *	ISC_R_NOMEMORY if the event cannot be allocated, otherwise the
- *	result of isc_socket_recv2().
- */
-isc_result_t
-isc_socket_recv(isc_socket_t *sock, isc_region_t *region, unsigned int minimum,
-		isc_task_t *task, isc_taskaction_t action, void *arg) {
-	isc_socketevent_t *dev;
-	isc_socketmgr_t *manager;
-
-	REQUIRE(VALID_SOCKET(sock));
-	/* 'region' is dereferenced unconditionally further down. */
-	REQUIRE(region != NULL);
-	REQUIRE(task != NULL);
-	REQUIRE(action != NULL);
-
-	manager = sock->manager;
-	REQUIRE(VALID_MANAGER(manager));
-
-	INSIST(sock->bound);
-
-	dev = allocate_socketevent(manager->mctx, sock, ISC_SOCKEVENT_RECVDONE,
-				   action, arg);
-	if (dev == NULL) {
-		return (ISC_R_NOMEMORY);
-	}
-
-	return (isc_socket_recv2(sock, region, minimum, task, dev, 0));
-}
-
-/*
- * Receive using a caller-supplied event.  The event is reset, pointed at
- * 'region', given its minimum byte count and passed to socket_recv().
- */
-isc_result_t
-isc_socket_recv2(isc_socket_t *sock, isc_region_t *region, unsigned int minimum,
-		 isc_task_t *task, isc_socketevent_t *event,
-		 unsigned int flags) {
-	event->ev_sender = sock;
-	event->result = ISC_R_UNSET;
-	event->region = *region;
-	event->n = 0;
-	event->offset = 0;
-	event->attributes = 0;
-
-	if (sock->type == isc_sockettype_udp) {
-		/* UDP sockets are always partial read. */
-		event->minimum = 1;
-	} else if (minimum != 0) {
-		event->minimum = minimum;
-	} else {
-		/* A minimum of zero means "fill the whole region". */
-		event->minimum = region->length;
-	}
-
-	return (socket_recv(sock, event, task, flags));
-}
-
-/*
- * Common send path.  Record the destination (and optional pktinfo) in
- * 'dev', try the write right away when possible, and queue the request
- * on the socket's send_list if it cannot complete, unless
- * ISC_SOCKFLAG_NORETRY is set, in which case a soft failure is finished
- * like any other completed request.  With ISC_SOCKFLAG_IMMEDIATE no done
- * event is posted for a request that finished here, and a queued request
- * is reported as ISC_R_INPROGRESS; otherwise the result is ISC_R_SUCCESS.
- */
-static isc_result_t
-socket_send(isc_socket_t *sock, isc_socketevent_t *dev, isc_task_t *task,
-	    const isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
-	    unsigned int flags) {
-	isc_result_t result = ISC_R_SUCCESS;
-	isc_task_t *attached = NULL;
-	bool immediate = ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0);
-	bool locked = false;
-	int state;
-
-	dev->ev_sender = task;
-
-	set_dev_address(address, sock, dev);
-	if (pktinfo != NULL) {
-		dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO;
-		dev->pktinfo = *pktinfo;
-
-		if (!isc_sockaddr_issitelocal(&dev->address) &&
-		    !isc_sockaddr_islinklocal(&dev->address))
-		{
-			socket_log(sock, NULL, TRACE,
-				   "pktinfo structure provided, ifindex %u "
-				   "(set to 0)",
-				   pktinfo->ipi6_ifindex);
-
-			/*
-			 * Clearing the interface index lets the kernel
-			 * choose the outgoing interface.
-			 */
-			dev->pktinfo.ipi6_ifindex = 0;
-		}
-	}
-
-	if (sock->type != isc_sockettype_udp) {
-		LOCK(&sock->lock);
-		locked = true;
-	}
-
-	if (!locked || ISC_LIST_EMPTY(sock->send_list)) {
-		state = doio_send(sock, dev);
-	} else {
-		state = DOIO_SOFT;
-	}
-
-	switch (state) {
-	case DOIO_SOFT:
-		/*
-		 * The data could not be written (completely) right now.
-		 * Queue the request unless the caller forbade retries.
-		 */
-		if ((flags & ISC_SOCKFLAG_NORETRY) == 0) {
-			isc_task_attach(task, &attached);
-			dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED;
-
-			if (!locked) {
-				LOCK(&sock->lock);
-				locked = true;
-			}
-
-			/*
-			 * Only the first queued request needs to wake
-			 * the watcher.
-			 */
-			bool was_idle = ISC_LIST_EMPTY(sock->send_list);
-			ISC_LIST_ENQUEUE(sock->send_list, dev, ev_link);
-			if (was_idle) {
-				select_poke(sock->manager, sock->threadid,
-					    sock->fd, SELECT_POKE_WRITE);
-			}
-			socket_log(sock, NULL, EVENT,
-				   "socket_send: event %p -> task %p", dev,
-				   attached);
-
-			if (immediate) {
-				result = ISC_R_INPROGRESS;
-			}
-			break;
-		}
-
-		/* FALLTHROUGH */
-
-	case DOIO_HARD:
-	case DOIO_SUCCESS:
-		if (!locked) {
-			LOCK(&sock->lock);
-			locked = true;
-		}
-		if (!immediate) {
-			send_senddone_event(sock, &dev);
-		}
-		break;
-	}
-
-	if (locked) {
-		UNLOCK(&sock->lock);
-	}
-
-	return (result);
-}
-
-/*
- * Send 'region' on 'sock' without an explicit destination address or
- * pktinfo.  Argument validation is left to isc_socket_sendto().
- */
-isc_result_t
-isc_socket_send(isc_socket_t *sock, isc_region_t *region, isc_task_t *task,
-		isc_taskaction_t action, void *arg) {
-	const isc_sockaddr_t *no_address = NULL;
-	struct in6_pktinfo *no_pktinfo = NULL;
-
-	return (isc_socket_sendto(sock, region, task, action, arg, no_address,
-				  no_pktinfo));
-}
-
-/*
- * Send 'region' on 'sock', optionally to 'address' and with 'pktinfo'.
- * A SENDDONE event carrying 'action' and 'arg' is allocated and passed to
- * socket_send() with no flags set.
- *
- * Returns:
- *	ISC_R_NOMEMORY if the event cannot be allocated, otherwise the
- *	result of socket_send().
- */
-isc_result_t
-isc_socket_sendto(isc_socket_t *sock, isc_region_t *region, isc_task_t *task,
-		  isc_taskaction_t action, void *arg,
-		  const isc_sockaddr_t *address, struct in6_pktinfo *pktinfo) {
-	isc_socketmgr_t *manager;
-	isc_socketevent_t *ev;
-
-	REQUIRE(VALID_SOCKET(sock));
-	REQUIRE(region != NULL);
-	REQUIRE(task != NULL);
-	REQUIRE(action != NULL);
-
-	manager = sock->manager;
-	REQUIRE(VALID_MANAGER(manager));
-
-	INSIST(sock->bound);
-
-	ev = allocate_socketevent(manager->mctx, sock, ISC_SOCKEVENT_SENDDONE,
-				  action, arg);
-	if (ev != NULL) {
-		ev->region = *region;
-		return (socket_send(sock, ev, task, address, pktinfo, 0));
-	}
-
-	return (ISC_R_NOMEMORY);
-}
-
-/*
- * Send using a caller-supplied event.  Only ISC_SOCKFLAG_IMMEDIATE and
- * ISC_SOCKFLAG_NORETRY may be set in 'flags', and NORETRY is accepted for
- * UDP sockets only.  The event is reset (its ATTACHED attribute cleared,
- * other attributes kept) and passed to socket_send().
- */
-isc_result_t
-isc_socket_sendto2(isc_socket_t *sock, isc_region_t *region, isc_task_t *task,
-		   const isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
-		   isc_socketevent_t *event, unsigned int flags) {
-	REQUIRE(VALID_SOCKET(sock));
-	REQUIRE((flags & ~(ISC_SOCKFLAG_IMMEDIATE | ISC_SOCKFLAG_NORETRY)) ==
-		0);
-	if ((flags & ISC_SOCKFLAG_NORETRY) != 0) {
-		REQUIRE(sock->type == isc_sockettype_udp);
-	}
-
-	/* Progress counters start from scratch for every send. */
-	event->n = 0;
-	event->offset = 0;
-	event->result = ISC_R_UNSET;
-	event->attributes &= ~ISC_SOCKEVENTATTR_ATTACHED;
-
-	event->ev_sender = sock;
-	event->region = *region;
-
-	return (socket_send(sock, event, task, address, pktinfo, flags));
-}
-
-/*
- * Remove the filesystem entry of the AF_UNIX address 'sockaddr' when that
- * is considered safe.  Addresses of any other family are ignored.
- *
- * The path must be a socket or FIFO, or, when 'active' is true, may be
- * absent.  With 'active' set the path is unlinked directly.  Otherwise a
- * probe connection is attempted and the path is unlinked only if the
- * connection is refused or reset.  Failures are logged, never returned.
- */
-void
-isc_socket_cleanunix(const isc_sockaddr_t *sockaddr, bool active) {
-	char strbuf[ISC_STRERRORSIZE];
-	struct stat st;
-	const char *path;
-	int probe;
-
-	if (sockaddr->type.sa.sa_family != AF_UNIX) {
-		return;
-	}
-	path = sockaddr->type.sunix.sun_path;
-
-#ifndef S_ISSOCK
-#if defined(S_IFMT) && defined(S_IFSOCK)
-#define S_ISSOCK(mode) ((mode & S_IFMT) == S_IFSOCK)
-#elif defined(_S_IFMT) && defined(S_IFSOCK)
-#define S_ISSOCK(mode) ((mode & _S_IFMT) == S_IFSOCK)
-#endif /* if defined(S_IFMT) && defined(S_IFSOCK) */
-#endif /* ifndef S_ISSOCK */
-
-#ifndef S_ISFIFO
-#if defined(S_IFMT) && defined(S_IFIFO)
-#define S_ISFIFO(mode) ((mode & S_IFMT) == S_IFIFO)
-#elif defined(_S_IFMT) && defined(S_IFIFO)
-#define S_ISFIFO(mode) ((mode & _S_IFMT) == S_IFIFO)
-#endif /* if defined(S_IFMT) && defined(S_IFIFO) */
-#endif /* ifndef S_ISFIFO */
-
-#if !defined(S_ISFIFO) && !defined(S_ISSOCK)
-/* cppcheck-suppress preprocessorErrorDirective */
-#error \
-	You need to define S_ISFIFO and S_ISSOCK as appropriate for your platform. See <sys/stat.h>.
-#endif /* if !defined(S_ISFIFO) && !defined(S_ISSOCK) */
-
-#ifndef S_ISFIFO
-#define S_ISFIFO(mode) 0
-#endif /* ifndef S_ISFIFO */
-
-#ifndef S_ISSOCK
-#define S_ISSOCK(mode) 0
-#endif /* ifndef S_ISSOCK */
-
-	if (stat(path, &st) < 0) {
-		/*
-		 * A missing path is tolerated only when 'active' is set;
-		 * every other stat() failure is logged and ends the
-		 * cleanup.
-		 */
-		if (errno != ENOENT || !active) {
-			strerror_r(errno, strbuf, sizeof(strbuf));
-			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
-				      ISC_LOGMODULE_SOCKET,
-				      active ? ISC_LOG_ERROR : ISC_LOG_WARNING,
-				      "isc_socket_cleanunix: stat(%s): %s",
-				      path, strbuf);
-			return;
-		}
-	} else if (!S_ISSOCK(st.st_mode) && !S_ISFIFO(st.st_mode)) {
-		isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
-			      ISC_LOGMODULE_SOCKET,
-			      active ? ISC_LOG_ERROR : ISC_LOG_WARNING,
-			      "isc_socket_cleanunix: %s: not a socket", path);
-		return;
-	}
-
-	if (active) {
-		if (unlink(path) < 0) {
-			strerror_r(errno, strbuf, sizeof(strbuf));
-			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
-				      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
-				      "isc_socket_cleanunix: unlink(%s): %s",
-				      path, strbuf);
-		}
-		return;
-	}
-
-	probe = socket(AF_UNIX, SOCK_STREAM, 0);
-	if (probe < 0) {
-		strerror_r(errno, strbuf, sizeof(strbuf));
-		isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
-			      ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING,
-			      "isc_socket_cleanunix: socket(%s): %s", path,
-			      strbuf);
-		return;
-	}
-
-	if (connect(probe, (const struct sockaddr *)&sockaddr->type.sunix,
-		    sizeof(sockaddr->type.sunix)) < 0)
-	{
-		if (errno == ECONNREFUSED || errno == ECONNRESET) {
-			/* Nobody accepted the connection: remove the path. */
-			if (unlink(path) < 0) {
-				strerror_r(errno, strbuf, sizeof(strbuf));
-				isc_log_write(
-					isc_lctx, ISC_LOGCATEGORY_GENERAL,
-					ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING,
-					"isc_socket_cleanunix: "
-					"unlink(%s): %s",
-					path, strbuf);
-			}
-		} else {
-			strerror_r(errno, strbuf, sizeof(strbuf));
-			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
-				      ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING,
-				      "isc_socket_cleanunix: connect(%s): %s",
-				      path, strbuf);
-		}
-	}
-	close(probe);
-}
-
-/*
- * Apply mode 'perm' and ownership 'owner':'group' to the path of the
- * AF_UNIX address 'sockaddr'.  When NEED_SECURE_DIRECTORY is defined the
- * changes are applied to the containing directory instead ("/" or "."
- * when the path has no other directory part).
- *
- * Both chmod() and chown() are always attempted.
- *
- * Returns:
- *	ISC_R_SUCCESS if both calls succeeded, ISC_R_FAILURE otherwise
- *	(each failure is logged).
- */
-isc_result_t
-isc_socket_permunix(const isc_sockaddr_t *sockaddr, uint32_t perm,
-		    uint32_t owner, uint32_t group) {
-	isc_result_t result = ISC_R_SUCCESS;
-	char strbuf[ISC_STRERRORSIZE];
-	char path[sizeof(sockaddr->type.sunix.sun_path)];
-#ifdef NEED_SECURE_DIRECTORY
-	char *slash;
-#endif /* ifdef NEED_SECURE_DIRECTORY */
-
-	REQUIRE(sockaddr->type.sa.sa_family == AF_UNIX);
-	INSIST(strlen(sockaddr->type.sunix.sun_path) < sizeof(path));
-	strlcpy(path, sockaddr->type.sunix.sun_path, sizeof(path));
-
-#ifdef NEED_SECURE_DIRECTORY
-	slash = strrchr(path, '/');
-	if (slash != NULL) {
-		if (slash != path) {
-			*slash = '\0';
-		} else {
-			strlcpy(path, "/", sizeof(path));
-		}
-	} else {
-		strlcpy(path, ".", sizeof(path));
-	}
-#endif /* ifdef NEED_SECURE_DIRECTORY */
-
-	/*
-	 * The numeric arguments are uint32_t, so they are printed with the
-	 * matching PRIu32 conversion rather than "%d".
-	 */
-	if (chmod(path, perm) < 0) {
-		strerror_r(errno, strbuf, sizeof(strbuf));
-		isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
-			      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
-			      "isc_socket_permunix: chmod(%s, %" PRIu32
-			      "): %s",
-			      path, perm, strbuf);
-		result = ISC_R_FAILURE;
-	}
-	if (chown(path, owner, group) < 0) {
-		strerror_r(errno, strbuf, sizeof(strbuf));
-		isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
-			      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
-			      "isc_socket_permunix: chown(%s, %" PRIu32
-			      ", %" PRIu32 "): %s",
-			      path, owner, group, strbuf);
-		result = ISC_R_FAILURE;
-	}
-	return (result);
-}
-
-/*
- * Bind 'sock' to 'sockaddr'.
- *
- * For non-AF_UNIX sockets, when ISC_SOCKET_REUSEADDRESS is set in
- * 'options' and the address carries a non-zero port, SO_REUSEADDR (and,
- * on the platforms selected below, SO_REUSEPORT / SO_REUSEPORT_LB) is
- * enabled first; failures to set those options are logged and ignored.
- *
- * Requires:
- *	'sock' is valid and not yet bound.
- *
- * Returns:
- *	ISC_R_SUCCESS, ISC_R_FAMILYMISMATCH, ISC_R_NOPERM,
- *	ISC_R_ADDRNOTAVAIL, ISC_R_ADDRINUSE, ISC_R_BOUND or
- *	ISC_R_UNEXPECTED.
- */
-isc_result_t
-isc_socket_bind(isc_socket_t *sock, const isc_sockaddr_t *sockaddr,
-		isc_socket_options_t options) {
-	char strbuf[ISC_STRERRORSIZE];
-	int bind_errno;
-	int on = 1;
-
-	REQUIRE(VALID_SOCKET(sock));
-
-	LOCK(&sock->lock);
-
-	INSIST(!sock->bound);
-
-	if (sock->pf != sockaddr->type.sa.sa_family) {
-		UNLOCK(&sock->lock);
-		return (ISC_R_FAMILYMISMATCH);
-	}
-
-	/*
-	 * Only set SO_REUSEADDR when we want a specific port.
-	 */
-#ifdef AF_UNIX
-	if (sock->pf == AF_UNIX) {
-		goto bind_socket;
-	}
-#endif /* ifdef AF_UNIX */
-	if ((options & ISC_SOCKET_REUSEADDRESS) != 0 &&
-	    isc_sockaddr_getport(sockaddr) != (in_port_t)0)
-	{
-		if (setsockopt(sock->fd, SOL_SOCKET, SO_REUSEADDR, (void *)&on,
-			       sizeof(on)) < 0) {
-			UNEXPECTED_ERROR(__FILE__, __LINE__,
-					 "setsockopt(%d) failed", sock->fd);
-		}
-#if defined(__FreeBSD_kernel__) && defined(SO_REUSEPORT_LB)
-		if (setsockopt(sock->fd, SOL_SOCKET, SO_REUSEPORT_LB,
-			       (void *)&on, sizeof(on)) < 0)
-		{
-			UNEXPECTED_ERROR(__FILE__, __LINE__,
-					 "setsockopt(%d) failed", sock->fd);
-		}
-#elif defined(__linux__) && defined(SO_REUSEPORT)
-		if (setsockopt(sock->fd, SOL_SOCKET, SO_REUSEPORT, (void *)&on,
-			       sizeof(on)) < 0) {
-			UNEXPECTED_ERROR(__FILE__, __LINE__,
-					 "setsockopt(%d) failed", sock->fd);
-		}
-#endif /* if defined(__FreeBSD_kernel__) && defined(SO_REUSEPORT_LB) */
-		/* Press on... */
-	}
-#ifdef AF_UNIX
-bind_socket:
-#endif /* ifdef AF_UNIX */
-	if (bind(sock->fd, &sockaddr->type.sa, sockaddr->length) < 0) {
-		/*
-		 * Capture errno before anything else runs: the statistics
-		 * update and the unlock below are free to overwrite it.
-		 */
-		bind_errno = errno;
-
-		inc_stats(sock->manager->stats,
-			  sock->statsindex[STATID_BINDFAIL]);
-
-		UNLOCK(&sock->lock);
-		switch (bind_errno) {
-		case EACCES:
-			return (ISC_R_NOPERM);
-		case EADDRNOTAVAIL:
-			return (ISC_R_ADDRNOTAVAIL);
-		case EADDRINUSE:
-			return (ISC_R_ADDRINUSE);
-		case EINVAL:
-			return (ISC_R_BOUND);
-		default:
-			strerror_r(bind_errno, strbuf, sizeof(strbuf));
-			UNEXPECTED_ERROR(__FILE__, __LINE__, "bind: %s",
-					 strbuf);
-			return (ISC_R_UNEXPECTED);
-		}
-	}
-
-	socket_log(sock, sockaddr, TRACE, "bound");
-	sock->bound = 1;
-
-	UNLOCK(&sock->lock);
-	return (ISC_R_SUCCESS);
-}
-
-/*
- * Enable this only for specific OS versions, and only when they have repaired
- * their problems with it. Until then, this is broken and needs to be
- * disabled by default. See RT22589 for details.
- */
-#undef ENABLE_ACCEPTFILTER
-
-/*
- * Install the accept filter named 'filter' on 'sock'.
- *
- * The implementation is compiled only when both SO_ACCEPTFILTER and
- * ENABLE_ACCEPTFILTER are defined; otherwise ISC_R_NOTIMPLEMENTED is
- * returned.
- *
- * Returns:
- *	ISC_R_SUCCESS, ISC_R_FAILURE (setsockopt() failed; logged) or
- *	ISC_R_NOTIMPLEMENTED.
- */
-isc_result_t
-isc_socket_filter(isc_socket_t *sock, const char *filter) {
-#if defined(SO_ACCEPTFILTER) && defined(ENABLE_ACCEPTFILTER)
-	char strbuf[ISC_STRERRORSIZE];
-	struct accept_filter_arg afa;
-#else  /* if defined(SO_ACCEPTFILTER) && defined(ENABLE_ACCEPTFILTER) */
-	/* 'sock' is still validated below, so only 'filter' is unused. */
-	UNUSED(filter);
-#endif /* if defined(SO_ACCEPTFILTER) && defined(ENABLE_ACCEPTFILTER) */
-
-	REQUIRE(VALID_SOCKET(sock));
-
-#if defined(SO_ACCEPTFILTER) && defined(ENABLE_ACCEPTFILTER)
-	/* memset() replaces the legacy bzero(). */
-	memset(&afa, 0, sizeof(afa));
-	strlcpy(afa.af_name, filter, sizeof(afa.af_name));
-	if (setsockopt(sock->fd, SOL_SOCKET, SO_ACCEPTFILTER, &afa,
-		       sizeof(afa)) == -1) {
-		strerror_r(errno, strbuf, sizeof(strbuf));
-		socket_log(sock, NULL, CREATION,
-			   "setsockopt(SO_ACCEPTFILTER): %s", strbuf);
-		return (ISC_R_FAILURE);
-	}
-	return (ISC_R_SUCCESS);
-#else  /* if defined(SO_ACCEPTFILTER) && defined(ENABLE_ACCEPTFILTER) */
-	return (ISC_R_NOTIMPLEMENTED);
-#endif /* if defined(SO_ACCEPTFILTER) && defined(ENABLE_ACCEPTFILTER) */
-}
-
-/*
- * Best-effort attempt to switch on TCP Fast Open for 'sock'.  The body is
- * compiled only when both ENABLE_TCP_FASTOPEN and TCP_FASTOPEN are
- * defined; otherwise the function does nothing.  The queue length passed
- * to the kernel is 1 on Apple systems and half of 'backlog' (at least 1)
- * elsewhere.  A setsockopt() failure is logged and otherwise ignored.
- */
-static void
-set_tcp_fastopen(isc_socket_t *sock, unsigned int backlog) {
-#if defined(ENABLE_TCP_FASTOPEN) && defined(TCP_FASTOPEN)
-	char strbuf[ISC_STRERRORSIZE];
-
-/*
- * FreeBSD (as of 10.3 and 11.0) defines TCP_FASTOPEN although its default
- * kernel is built without TFO support.  Query the sysctl at run time so
- * that a setsockopt() failure is not logged on such kernels.
- */
-#if defined(__FreeBSD__) && defined(HAVE_SYSCTLBYNAME)
-#define SYSCTL_TFO "net.inet.tcp.fastopen.enabled"
-	static bool tfo_notice_logged = false;
-	unsigned int tfo_enabled;
-	size_t tfo_enabledlen = sizeof(tfo_enabled);
-
-	if (sysctlbyname(SYSCTL_TFO, &tfo_enabled, &tfo_enabledlen, NULL, 0) <
-	    0) {
-		/* The kernel has no TFO support at all; nothing to do. */
-		return;
-	}
-	if (tfo_enabled == 0) {
-		/*
-		 * TFO is supported but switched off by sysctl.  Mention
-		 * it once and stay quiet afterwards.
-		 */
-		if (!tfo_notice_logged) {
-			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
-				      ISC_LOGMODULE_SOCKET, ISC_LOG_NOTICE,
-				      "TCP_FASTOPEN support is disabled by "
-				      "sysctl (" SYSCTL_TFO " = 0)");
-			tfo_notice_logged = true;
-		}
-		return;
-	}
-#endif /* if defined(__FreeBSD__) && defined(HAVE_SYSCTLBYNAME) */
-
-#ifdef __APPLE__
-	backlog = 1;
-#else  /* ifdef __APPLE__ */
-	backlog /= 2;
-	if (backlog == 0) {
-		backlog = 1;
-	}
-#endif /* ifdef __APPLE__ */
-	if (setsockopt(sock->fd, IPPROTO_TCP, TCP_FASTOPEN, (void *)&backlog,
-		       sizeof(backlog)) < 0)
-	{
-		/* TCP_FASTOPEN is experimental so ignore failures */
-		strerror_r(errno, strbuf, sizeof(strbuf));
-		UNEXPECTED_ERROR(__FILE__, __LINE__,
-				 "setsockopt(%d, TCP_FASTOPEN) failed with %s",
-				 sock->fd, strbuf);
-	}
-#else  /* if defined(ENABLE_TCP_FASTOPEN) && defined(TCP_FASTOPEN) */
-	UNUSED(sock);
-	UNUSED(backlog);
-#endif /* if defined(ENABLE_TCP_FASTOPEN) && defined(TCP_FASTOPEN) */
-}
-
-/*
- * Put 'sock' into the listening state with the given 'backlog' (0 selects
- * SOMAXCONN), then try to enable TCP Fast Open on it.
- *
- * Requires:
- *	'sock' is a valid, bound TCP or UNIX socket that is not already
- *	listening.
- *
- * Returns:
- *	ISC_R_SUCCESS, or ISC_R_UNEXPECTED if listen() fails (logged).
- */
-isc_result_t
-isc_socket_listen(isc_socket_t *sock, unsigned int backlog) {
-	char strbuf[ISC_STRERRORSIZE];
-
-	REQUIRE(VALID_SOCKET(sock));
-
-	LOCK(&sock->lock);
-
-	REQUIRE(!sock->listener);
-	REQUIRE(sock->bound);
-	REQUIRE(sock->type == isc_sockettype_tcp ||
-		sock->type == isc_sockettype_unix);
-
-	if (backlog == 0) {
-		backlog = SOMAXCONN;
-	}
-
-	if (listen(sock->fd, (int)backlog) < 0) {
-		/*
-		 * Render errno before unlocking: the unlock is free to
-		 * overwrite it.
-		 */
-		strerror_r(errno, strbuf, sizeof(strbuf));
-		UNLOCK(&sock->lock);
-
-		UNEXPECTED_ERROR(__FILE__, __LINE__, "listen: %s", strbuf);
-
-		return (ISC_R_UNEXPECTED);
-	}
-
-	set_tcp_fastopen(sock, backlog);
-
-	sock->listener = 1;
-
-	UNLOCK(&sock->lock);
-	return (ISC_R_SUCCESS);
-}
-
-/*
- * Queue an accept request on the listening socket 'sock'.  A NEWCONN
- * event (carrying 'action' and 'arg') and a fresh socket object for the
- * future connection are allocated and appended to the accept list; the
- * watcher is poked when the list was previously empty.
- *
- * Returns:
- *	ISC_R_SUCCESS, ISC_R_SHUTTINGDOWN if 'task' is exiting, or the
- *	failure code of allocate_socket().
- *
- * This should try to do aggressive accept() XXXMLG
- */
-isc_result_t
-isc_socket_accept(isc_socket_t *sock, isc_task_t *task, isc_taskaction_t action,
-		  void *arg) {
-	isc_socketmgr_t *manager;
-	isc_socket_newconnev_t *ev;
-	isc_socket_t *newsock;
-	isc_task_t *attached = NULL;
-	isc_result_t result;
-
-	REQUIRE(VALID_SOCKET(sock));
-	manager = sock->manager;
-	REQUIRE(VALID_MANAGER(manager));
-
-	LOCK(&sock->lock);
-
-	REQUIRE(sock->listener);
-
-	/*
-	 * The sender field temporarily carries the destination task; it
-	 * is replaced by the socket just before the event is delivered.
-	 */
-	ev = (isc_socket_newconnev_t *)isc_event_allocate(
-		manager->mctx, task, ISC_SOCKEVENT_NEWCONN, action, arg,
-		sizeof(*ev));
-	ISC_LINK_INIT(ev, ev_link);
-
-	result = allocate_socket(manager, sock->type, &newsock);
-	if (result != ISC_R_SUCCESS) {
-		isc_event_free(ISC_EVENT_PTR(&ev));
-		UNLOCK(&sock->lock);
-		return (result);
-	}
-
-	/* Hold the task; give up if it is already on its way out. */
-	isc_task_attach(task, &attached);
-	if (isc_task_exiting(attached)) {
-		free_socket(&newsock);
-		isc_task_detach(&attached);
-		isc_event_free(ISC_EVENT_PTR(&ev));
-		UNLOCK(&sock->lock);
-		return (ISC_R_SHUTTINGDOWN);
-	}
-	isc_refcount_increment0(&newsock->references);
-	newsock->statsindex = sock->statsindex;
-
-	ev->ev_sender = attached;
-	ev->newsocket = newsock;
-
-	/*
-	 * The socket lock is still held, so checking the list, queueing
-	 * and poking the watcher form one atomic step.
-	 */
-	if (ISC_LIST_EMPTY(sock->accept_list)) {
-		ISC_LIST_ENQUEUE(sock->accept_list, ev, ev_link);
-		select_poke(manager, sock->threadid, sock->fd,
-			    SELECT_POKE_ACCEPT);
-	} else {
-		ISC_LIST_ENQUEUE(sock->accept_list, ev, ev_link);
-	}
-
-	UNLOCK(&sock->lock);
-	return (ISC_R_SUCCESS);
-}
-
-/*
- * Start connecting 'sock' to 'addr'.  A CONNECT event carrying 'action'
- * and 'arg' is delivered to 'task' once the outcome is known:
- * immediately if the socket is already connected or connect() finishes
- * (or fails with a recognised error) right away, later via the watcher
- * otherwise.
- *
- * Requires:
- *	'sock' is valid; 'addr', 'task' and 'action' are non-NULL.
- *
- * Returns:
- *	ISC_R_MULTICAST if 'addr' is a multicast address,
- *	ISC_R_UNEXPECTED if connect() fails with an unrecognised error (no
- *	event is sent), ISC_R_SUCCESS in every other case.  Recognised
- *	connect() failures are reported through the event's result field.
- */
-isc_result_t
-isc_socket_connect(isc_socket_t *sock, const isc_sockaddr_t *addr,
-		   isc_task_t *task, isc_taskaction_t action, void *arg) {
-	isc_socket_connev_t *dev;
-	isc_task_t *ntask = NULL;
-	isc_socketmgr_t *manager;
-	int cc;
-	int connect_errno;
-	char strbuf[ISC_STRERRORSIZE];
-	char addrbuf[ISC_SOCKADDR_FORMATSIZE];
-
-	REQUIRE(VALID_SOCKET(sock));
-	REQUIRE(addr != NULL);
-	REQUIRE(task != NULL);
-	REQUIRE(action != NULL);
-
-	manager = sock->manager;
-	REQUIRE(VALID_MANAGER(manager));
-
-	if (isc_sockaddr_ismulticast(addr)) {
-		return (ISC_R_MULTICAST);
-	}
-
-	LOCK(&sock->lock);
-
-	dev = (isc_socket_connev_t *)isc_event_allocate(
-		manager->mctx, sock, ISC_SOCKEVENT_CONNECT, action, arg,
-		sizeof(*dev));
-	ISC_LINK_INIT(dev, ev_link);
-
-	/* A connect is already pending: just join the queue. */
-	if (sock->connecting) {
-		INSIST(isc_sockaddr_equal(&sock->peer_address, addr));
-		goto queue;
-	}
-
-	/* Already connected: report success straight away. */
-	if (sock->connected) {
-		INSIST(isc_sockaddr_equal(&sock->peer_address, addr));
-		dev->result = ISC_R_SUCCESS;
-		isc_task_sendto(task, ISC_EVENT_PTR(&dev), sock->threadid);
-
-		UNLOCK(&sock->lock);
-
-		return (ISC_R_SUCCESS);
-	}
-
-	/*
-	 * Try to do the connect right away, as there can be only one
-	 * outstanding, and it might happen to complete.
-	 */
-	sock->peer_address = *addr;
-	cc = connect(sock->fd, &addr->type.sa, addr->length);
-	if (cc < 0) {
-		/*
-		 * Keep a private copy of errno: the formatting and
-		 * logging calls further down may overwrite it.
-		 */
-		connect_errno = errno;
-
-		/*
-		 * The socket is nonblocking and the connection cannot be
-		 * completed immediately.  It is possible to select(2) or
-		 * poll(2) for completion by selecting the socket for writing.
-		 * After select(2) indicates writability, use getsockopt(2) to
-		 * read the SO_ERROR option at level SOL_SOCKET to determine
-		 * whether connect() completed successfully (SO_ERROR is zero)
-		 * or unsuccessfully (SO_ERROR is one of the usual error codes
-		 * listed here, explaining the reason for the failure).
-		 */
-		if (sock->type == isc_sockettype_udp &&
-		    connect_errno == EINPROGRESS) {
-			cc = 0;
-			goto success;
-		}
-		if (SOFT_ERROR(connect_errno) || connect_errno == EINPROGRESS) {
-			goto queue;
-		}
-
-		switch (connect_errno) {
-#define ERROR_MATCH(a, b)        \
-	case a:                  \
-		dev->result = b; \
-		goto err_exit;
-			ERROR_MATCH(EACCES, ISC_R_NOPERM);
-			ERROR_MATCH(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
-			ERROR_MATCH(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
-			ERROR_MATCH(ECONNREFUSED, ISC_R_CONNREFUSED);
-			ERROR_MATCH(EHOSTUNREACH, ISC_R_HOSTUNREACH);
-#ifdef EHOSTDOWN
-			ERROR_MATCH(EHOSTDOWN, ISC_R_HOSTUNREACH);
-#endif /* ifdef EHOSTDOWN */
-			ERROR_MATCH(ENETUNREACH, ISC_R_NETUNREACH);
-			ERROR_MATCH(ENOBUFS, ISC_R_NORESOURCES);
-			ERROR_MATCH(EPERM, ISC_R_HOSTUNREACH);
-			ERROR_MATCH(EPIPE, ISC_R_NOTCONNECTED);
-			ERROR_MATCH(ETIMEDOUT, ISC_R_TIMEDOUT);
-			ERROR_MATCH(ECONNRESET, ISC_R_CONNECTIONRESET);
-#undef ERROR_MATCH
-		default:
-			break;
-		}
-
-		/* Unrecognised error: log it and fail the call itself. */
-		sock->connected = 0;
-
-		strerror_r(connect_errno, strbuf, sizeof(strbuf));
-		isc_sockaddr_format(addr, addrbuf, sizeof(addrbuf));
-		UNEXPECTED_ERROR(__FILE__, __LINE__, "connect(%s) %d/%s",
-				 addrbuf, connect_errno, strbuf);
-
-		UNLOCK(&sock->lock);
-		inc_stats(sock->manager->stats,
-			  sock->statsindex[STATID_CONNECTFAIL]);
-		isc_event_free(ISC_EVENT_PTR(&dev));
-		return (ISC_R_UNEXPECTED);
-
-	err_exit:
-		/* Recognised error: deliver it through the event. */
-		sock->connected = 0;
-		isc_task_sendto(task, ISC_EVENT_PTR(&dev), sock->threadid);
-
-		UNLOCK(&sock->lock);
-		inc_stats(sock->manager->stats,
-			  sock->statsindex[STATID_CONNECTFAIL]);
-		return (ISC_R_SUCCESS);
-	}
-
-	/*
-	 * If connect completed, fire off the done event.
-	 */
-success:
-	if (cc == 0) {
-		sock->connected = 1;
-		sock->bound = 1;
-		dev->result = ISC_R_SUCCESS;
-		isc_task_sendto(task, ISC_EVENT_PTR(&dev), sock->threadid);
-
-		UNLOCK(&sock->lock);
-
-		inc_stats(sock->manager->stats,
-			  sock->statsindex[STATID_CONNECT]);
-
-		return (ISC_R_SUCCESS);
-	}
-
-queue:
-
-	/*
-	 * Attach to task.
-	 */
-	isc_task_attach(task, &ntask);
-
-	dev->ev_sender = ntask;
-
-	/*
-	 * Poke watcher here.  We still have the socket locked, so there
-	 * is no race condition.  We will keep the lock for such a short
-	 * bit of time waking it up now or later won't matter all that much.
-	 */
-	bool do_poke = ISC_LIST_EMPTY(sock->connect_list);
-	ISC_LIST_ENQUEUE(sock->connect_list, dev, ev_link);
-	if (do_poke && !sock->connecting) {
-		sock->connecting = 1;
-		select_poke(manager, sock->threadid, sock->fd,
-			    SELECT_POKE_CONNECT);
-	}
-
-	UNLOCK(&sock->lock);
-	return (ISC_R_SUCCESS);
-}
-
-/*
- * Called when a socket with a pending connect() finishes.
- *
- * The outcome is read from SO_ERROR.  A soft error or EINPROGRESS re-arms
- * 'connecting' and returns so the socket stays watched.  Otherwise the
- * outcome is translated to an ISC_R_* code and delivered to every event
- * on the connect list, after which the connect watch is removed.
- */
-static void
-internal_connect(isc_socket_t *sock) {
-	isc_socket_connev_t *dev;
-	int cc;
-	int err;
-	isc_result_t result;
-	socklen_t optlen;
-	char strbuf[ISC_STRERRORSIZE];
-	char peerbuf[ISC_SOCKADDR_FORMATSIZE];
-
-	INSIST(VALID_SOCKET(sock));
-	REQUIRE(sock->fd >= 0);
-
-	/*
-	 * Get the first item off the connect list.
-	 * If it is empty, just stop watching the socket.
-	 */
-	dev = ISC_LIST_HEAD(sock->connect_list);
-	if (dev == NULL) {
-		INSIST(!sock->connecting);
-		goto finish;
-	}
-
-	INSIST(sock->connecting);
-	sock->connecting = 0;
-
-	/*
-	 * Get any possible error status here.  The code is kept in a
-	 * local instead of errno so that the calls made below cannot
-	 * disturb it.
-	 */
-	optlen = sizeof(cc);
-	if (getsockopt(sock->fd, SOL_SOCKET, SO_ERROR, (void *)&cc,
-		       (void *)&optlen) != 0)
-	{
-		err = errno;
-	} else {
-		err = cc;
-	}
-
-	if (err != 0) {
-		/*
-		 * If the error is EAGAIN, just re-select on this
-		 * fd and pretend nothing strange happened.
-		 */
-		if (SOFT_ERROR(err) || err == EINPROGRESS) {
-			sock->connecting = 1;
-			return;
-		}
-
-		inc_stats(sock->manager->stats,
-			  sock->statsindex[STATID_CONNECTFAIL]);
-
-		/*
-		 * Translate other errors into ISC_R_* flavors.
-		 */
-		switch (err) {
-#define ERROR_MATCH(a, b)   \
-	case a:             \
-		result = b; \
-		break;
-			ERROR_MATCH(EACCES, ISC_R_NOPERM);
-			ERROR_MATCH(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
-			ERROR_MATCH(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
-			ERROR_MATCH(ECONNREFUSED, ISC_R_CONNREFUSED);
-			ERROR_MATCH(EHOSTUNREACH, ISC_R_HOSTUNREACH);
-#ifdef EHOSTDOWN
-			ERROR_MATCH(EHOSTDOWN, ISC_R_HOSTUNREACH);
-#endif /* ifdef EHOSTDOWN */
-			ERROR_MATCH(ENETUNREACH, ISC_R_NETUNREACH);
-			ERROR_MATCH(ENOBUFS, ISC_R_NORESOURCES);
-			ERROR_MATCH(EPERM, ISC_R_HOSTUNREACH);
-			ERROR_MATCH(EPIPE, ISC_R_NOTCONNECTED);
-			ERROR_MATCH(ETIMEDOUT, ISC_R_TIMEDOUT);
-			ERROR_MATCH(ECONNRESET, ISC_R_CONNECTIONRESET);
-#undef ERROR_MATCH
-		default:
-			result = ISC_R_UNEXPECTED;
-			isc_sockaddr_format(&sock->peer_address, peerbuf,
-					    sizeof(peerbuf));
-			strerror_r(err, strbuf, sizeof(strbuf));
-			UNEXPECTED_ERROR(__FILE__, __LINE__,
-					 "internal_connect: connect(%s) %s",
-					 peerbuf, strbuf);
-		}
-	} else {
-		inc_stats(sock->manager->stats,
-			  sock->statsindex[STATID_CONNECT]);
-		result = ISC_R_SUCCESS;
-		sock->connected = 1;
-		sock->bound = 1;
-	}
-
-	/* Hand the same outcome to every queued connect request. */
-	do {
-		dev->result = result;
-		send_connectdone_event(sock, &dev);
-		dev = ISC_LIST_HEAD(sock->connect_list);
-	} while (dev != NULL);
-
-finish:
-	(void)unwatch_fd(&sock->manager->threads[sock->threadid], sock->fd,
-			 SELECT_POKE_CONNECT);
-}
-
-/*
- * Copy the peer address stored on 'sock' into '*addressp'.
- *
- * Returns ISC_R_SUCCESS when the socket is marked connected; otherwise
- * returns ISC_R_NOTCONNECTED and leaves '*addressp' untouched.
- */
-isc_result_t
-isc_socket_getpeername(isc_socket_t *sock, isc_sockaddr_t *addressp) {
-	isc_result_t result = ISC_R_NOTCONNECTED;
-
-	REQUIRE(VALID_SOCKET(sock));
-	REQUIRE(addressp != NULL);
-
-	LOCK(&sock->lock);
-	if (sock->connected) {
-		*addressp = sock->peer_address;
-		result = ISC_R_SUCCESS;
-	}
-	UNLOCK(&sock->lock);
-
-	return (result);
-}
-
-/*
- * Ask the kernel for the local address of 'sock' and store it in
- * '*addressp'.
- *
- * Returns ISC_R_NOTBOUND if the socket is not marked bound,
- * ISC_R_UNEXPECTED (after logging) if getsockname() fails, and
- * ISC_R_SUCCESS otherwise, in which case addressp->length is set to the
- * length reported by getsockname().
- */
-isc_result_t
-isc_socket_getsockname(isc_socket_t *sock, isc_sockaddr_t *addressp) {
-	isc_result_t result = ISC_R_NOTBOUND;
-
-	REQUIRE(VALID_SOCKET(sock));
-	REQUIRE(addressp != NULL);
-
-	LOCK(&sock->lock);
-
-	if (sock->bound) {
-		socklen_t len = sizeof(addressp->type);
-
-		if (getsockname(sock->fd, &addressp->type.sa, (void *)&len) < 0)
-		{
-			char strbuf[ISC_STRERRORSIZE];
-
-			strerror_r(errno, strbuf, sizeof(strbuf));
-			UNEXPECTED_ERROR(__FILE__, __LINE__, "getsockname: %s",
-					 strbuf);
-			result = ISC_R_UNEXPECTED;
-		} else {
-			addressp->length = (unsigned int)len;
-			result = ISC_R_SUCCESS;
-		}
-	}
-
-	UNLOCK(&sock->lock);
-
-	return (result);
-}
-
-/*
- * Run through the list of events on this socket, and cancel the ones
- * queued for task "task" of type "how". "how" is a bitmask of
- * ISC_SOCKCANCEL_RECV, ISC_SOCKCANCEL_SEND, ISC_SOCKCANCEL_ACCEPT and
- * ISC_SOCKCANCEL_CONNECT.  A NULL "task" matches events queued for any
- * task.
- */
-void
-isc_socket_cancel(isc_socket_t *sock, isc_task_t *task, unsigned int how) {
-	REQUIRE(VALID_SOCKET(sock));
-
-	/*
-	 * Quick exit if there is nothing to do.  Don't even bother locking
-	 * in this case.
-	 */
-	if (how == 0) {
-		return;
-	}
-
-	LOCK(&sock->lock);
-
-	/*
-	 * Each section below walks one pending-request list and, for every
-	 * request that belongs to the selected task, posts its done event
-	 * with a result of ISC_R_CANCELED.  The successor is always fetched
-	 * before the event is handed off, because the event is no longer
-	 * ours to inspect once it has been sent.
-	 */
-	if (((how & ISC_SOCKCANCEL_RECV) != 0) &&
-	    !ISC_LIST_EMPTY(sock->recv_list))
-	{
-		isc_socketevent_t *dev;
-		isc_socketevent_t *next;
-
-		for (dev = ISC_LIST_HEAD(sock->recv_list); dev != NULL;
-		     dev = next)
-		{
-			isc_task_t *current_task = dev->ev_sender;
-
-			next = ISC_LIST_NEXT(dev, ev_link);
-
-			if ((task == NULL) || (task == current_task)) {
-				dev->result = ISC_R_CANCELED;
-				send_recvdone_event(sock, &dev);
-			}
-		}
-	}
-
-	if (((how & ISC_SOCKCANCEL_SEND) != 0) &&
-	    !ISC_LIST_EMPTY(sock->send_list))
-	{
-		isc_socketevent_t *dev;
-		isc_socketevent_t *next;
-
-		for (dev = ISC_LIST_HEAD(sock->send_list); dev != NULL;
-		     dev = next)
-		{
-			isc_task_t *current_task = dev->ev_sender;
-
-			next = ISC_LIST_NEXT(dev, ev_link);
-
-			if ((task == NULL) || (task == current_task)) {
-				dev->result = ISC_R_CANCELED;
-				send_senddone_event(sock, &dev);
-			}
-		}
-	}
-
-	if (((how & ISC_SOCKCANCEL_ACCEPT) != 0) &&
-	    !ISC_LIST_EMPTY(sock->accept_list))
-	{
-		isc_socket_newconnev_t *dev;
-		isc_socket_newconnev_t *next;
-
-		for (dev = ISC_LIST_HEAD(sock->accept_list); dev != NULL;
-		     dev = next)
-		{
-			isc_task_t *current_task = dev->ev_sender;
-
-			next = ISC_LIST_NEXT(dev, ev_link);
-
-			if ((task == NULL) || (task == current_task)) {
-				ISC_LIST_UNLINK(sock->accept_list, dev,
-						ev_link);
-
-				/*
-				 * Drop the reference on the socket attached
-				 * to this accept request and free it.
-				 */
-				isc_refcount_decrementz(
-					&NEWCONNSOCK(dev)->references);
-				free_socket((isc_socket_t **)&dev->newsocket);
-
-				dev->result = ISC_R_CANCELED;
-				dev->ev_sender = sock;
-				isc_task_sendtoanddetach(&current_task,
-							 ISC_EVENT_PTR(&dev),
-							 sock->threadid);
-			}
-		}
-	}
-
-	if (((how & ISC_SOCKCANCEL_CONNECT) != 0) &&
-	    !ISC_LIST_EMPTY(sock->connect_list))
-	{
-		isc_socket_connev_t *dev;
-		isc_socket_connev_t *next;
-
-		INSIST(sock->connecting);
-		sock->connecting = 0;
-
-		for (dev = ISC_LIST_HEAD(sock->connect_list); dev != NULL;
-		     dev = next)
-		{
-			isc_task_t *current_task = dev->ev_sender;
-
-			next = ISC_LIST_NEXT(dev, ev_link);
-
-			if ((task == NULL) || (task == current_task)) {
-				dev->result = ISC_R_CANCELED;
-				send_connectdone_event(sock, &dev);
-			}
-		}
-	}
-
-	UNLOCK(&sock->lock);
-}
-
-/*
- * Return the type recorded on 'sock'.
- */
-isc_sockettype_t
-isc_socket_gettype(isc_socket_t *sock) {
-	isc_sockettype_t type;
-
-	REQUIRE(VALID_SOCKET(sock));
-
-	type = sock->type;
-	return (type);
-}
-
-/*
- * Turn the IPV6_V6ONLY option on or off for an AF_INET6 socket.
- *
- * Sockets of any other family are left alone, and where IPV6_V6ONLY is
- * not defined the call only validates 'sock'.  A setsockopt() failure
- * is logged, not returned.
- */
-void
-isc_socket_ipv6only(isc_socket_t *sock, bool yes) {
-	REQUIRE(VALID_SOCKET(sock));
-
-#ifdef IPV6_V6ONLY
-	int onoff = yes ? 1 : 0;
-	char strbuf[ISC_STRERRORSIZE];
-
-	if (sock->pf != AF_INET6) {
-		return;
-	}
-
-	if (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_V6ONLY, (void *)&onoff,
-		       sizeof(onoff)) >= 0)
-	{
-		return;
-	}
-
-	strerror_r(errno, strbuf, sizeof(strbuf));
-	UNEXPECTED_ERROR(__FILE__, __LINE__,
-			 "setsockopt(%d, IPV6_V6ONLY) failed: %s", sock->fd,
-			 strbuf);
-#else  /* ifdef IPV6_V6ONLY */
-	UNUSED(yes);
-	UNUSED(sock);
-#endif /* ifdef IPV6_V6ONLY */
-}
-
-/*
- * Record 'dscp' on the socket and, where the platform defines the
- * option, apply it to the descriptor: IP_TOS for AF_INET sockets and
- * IPV6_TCLASS for AF_INET6 sockets.  The option value is the DSCP
- * shifted left by two bits.  A setsockopt() failure is logged, not
- * returned.
- */
-static void
-setdscp(isc_socket_t *sock, isc_dscp_t dscp) {
-#if defined(IP_TOS) || defined(IPV6_TCLASS)
-	int value = dscp << 2;
-#endif /* if defined(IP_TOS) || defined(IPV6_TCLASS) */
-
-	sock->dscp = dscp;
-
-#ifdef IP_TOS
-	if (sock->pf == AF_INET) {
-		if (setsockopt(sock->fd, IPPROTO_IP, IP_TOS, (void *)&value,
-			       sizeof(value)) < 0)
-		{
-			char strbuf[ISC_STRERRORSIZE];
-			strerror_r(errno, strbuf, sizeof(strbuf));
-			UNEXPECTED_ERROR(__FILE__, __LINE__,
-					 "setsockopt(%d, IP_TOS, %.02x) "
-					 "failed: %s",
-					 sock->fd, value >> 2, strbuf);
-		}
-	}
-#endif /* ifdef IP_TOS */
-#ifdef IPV6_TCLASS
-	if (sock->pf == AF_INET6) {
-		if (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_TCLASS,
-			       (void *)&value, sizeof(value)) < 0)
-		{
-			char strbuf[ISC_STRERRORSIZE];
-			strerror_r(errno, strbuf, sizeof(strbuf));
-			/*
-			 * Report the DSCP itself (value >> 2), matching the
-			 * IP_TOS message above.
-			 */
-			UNEXPECTED_ERROR(__FILE__, __LINE__,
-					 "setsockopt(%d, IPV6_TCLASS, %.02x) "
-					 "failed: %s",
-					 sock->fd, value >> 2, strbuf);
-		}
-	}
-#endif /* ifdef IPV6_TCLASS */
-}
-
-/*
- * Public entry point for setting the DSCP on 'sock'.
- *
- * 'dscp' must be below 0x40.  Where IP_TOS or IPV6_TCLASS is defined, a
- * negative 'dscp' makes the call a no-op, and when isc_dscp_check_value
- * is not -1 the requested value must equal it.
- */
-void
-isc_socket_dscp(isc_socket_t *sock, isc_dscp_t dscp) {
-	REQUIRE(VALID_SOCKET(sock));
-	REQUIRE(dscp < 0x40);
-
-#if defined(IP_TOS) || defined(IPV6_TCLASS)
-	if (dscp < 0) {
-		return;
-	}
-
-	/* The DSCP value must not be changed once it has been set. */
-	if (isc_dscp_check_value != -1) {
-		INSIST(dscp == isc_dscp_check_value);
-	}
-#else  /* if defined(IP_TOS) || defined(IPV6_TCLASS) */
-	UNUSED(dscp);
-#endif /* if defined(IP_TOS) || defined(IPV6_TCLASS) */
-
-	setdscp(sock, dscp);
-}
-
-/*
- * Public wrapper around allocate_socketevent(): forwards every argument
- * unchanged and returns whatever it returns.
- */
-isc_socketevent_t *
-isc_socket_socketevent(isc_mem_t *mctx, void *sender, isc_eventtype_t eventtype,
-		       isc_taskaction_t action, void *arg) {
-	isc_socketevent_t *ev;
-
-	ev = allocate_socketevent(mctx, sender, eventtype, action, arg);
-
-	return (ev);
-}
-
-/*
- * Name 'sock': store 'tag' and copy 'name' into the socket's name
- * buffer (bounded by the buffer size), both under the socket lock.
- */
-void
-isc_socket_setname(isc_socket_t *sock, const char *name, void *tag) {
-	REQUIRE(VALID_SOCKET(sock));
-
-	LOCK(&sock->lock);
-	sock->tag = tag;
-	strlcpy(sock->name, name, sizeof(sock->name));
-	UNLOCK(&sock->lock);
-}
-
-/*
- * Return a pointer to the name buffer stored inside 'sock'.
- *
- * The pointer refers to the socket's own storage and is read without
- * taking the socket lock.
- */
-const char *
-isc_socket_getname(isc_socket_t *sock) {
-	REQUIRE(VALID_SOCKET(sock));
-
-	return (sock->name);
-}
-
-/*
- * Return the opaque tag stored on 'sock' (read without taking the
- * socket lock).
- */
-void *
-isc_socket_gettag(isc_socket_t *sock) {
-	REQUIRE(VALID_SOCKET(sock));
-
-	return (sock->tag);
-}
-
-/*
- * Return the file descriptor stored on 'sock'.
- *
- * The descriptor is returned at full 'int' width; narrowing it to
- * 'short' would corrupt any descriptor above 32767.
- */
-int
-isc_socket_getfd(isc_socket_t *sock) {
-	REQUIRE(VALID_SOCKET(sock));
-
-	return (sock->fd);
-}
-
-static isc_once_t hasreuseport_once = ISC_ONCE_INIT;
-static bool hasreuseport = false;
-
-/*
- * Probe for a usable port-sharing socket option.
- *
- * Opens a scratch UDP socket (AF_INET, falling back to AF_INET6) and
- * sets 'hasreuseport' to true only if both SO_REUSEADDR and the
- * platform's port-sharing option can be set on it.  When the platform
- * test below is not met, the function does nothing and 'hasreuseport'
- * stays false.
- */
-static void
-init_hasreuseport(void) {
-/*
- * SO_REUSEPORT works very differently on *BSD and on Linux, so the probe
- * is compiled only for Linux with SO_REUSEPORT and for FreeBSD kernels
- * with SO_REUSEPORT_LB.
- */
-#if (defined(SO_REUSEPORT) && defined(__linux__)) || \
-	(defined(SO_REUSEPORT_LB) && defined(__FreeBSD_kernel__))
-#if defined(__FreeBSD_kernel__)
-	const int reuseopt = SO_REUSEPORT_LB;
-#else  /* if defined(__FreeBSD_kernel__) */
-	const int reuseopt = SO_REUSEPORT;
-#endif /* if defined(__FreeBSD_kernel__) */
-	int yes = 1;
-	int sock = socket(AF_INET, SOCK_DGRAM, 0);
-
-	if (sock < 0) {
-		sock = socket(AF_INET6, SOCK_DGRAM, 0);
-	}
-	if (sock < 0) {
-		return;
-	}
-
-	if (setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, (void *)&yes,
-		       sizeof(yes)) >= 0 &&
-	    setsockopt(sock, SOL_SOCKET, reuseopt, (void *)&yes,
-		       sizeof(yes)) >= 0)
-	{
-		hasreuseport = true;
-	}
-
-	close(sock);
-#endif /* if (defined(SO_REUSEPORT) && defined(__linux__)) || \
-	* (defined(SO_REUSEPORT_LB) && defined(__FreeBSD_kernel__)) */
-}
-
-/*
- * Report whether the port-sharing probe succeeded.  The probe itself is
- * run through isc_once_do() before the cached answer is returned.
- */
-bool
-isc_socket_hasreuseport(void) {
-	isc_result_t result;
-
-	result = isc_once_do(&hasreuseport_once, init_hasreuseport);
-	RUNTIME_CHECK(result == ISC_R_SUCCESS);
-
-	return (hasreuseport);
-}
-
-#if defined(HAVE_LIBXML2) || defined(HAVE_JSON_C)
-/*
- * Map a socket type to the label used in the statistics output.  Any
- * type other than udp, tcp or unix is reported as "not-initialized".
- */
-static const char *
-_socktype(isc_sockettype_t type) {
-	const char *label = "not-initialized";
-
-	if (type == isc_sockettype_udp) {
-		label = "udp";
-	} else if (type == isc_sockettype_tcp) {
-		label = "tcp";
-	} else if (type == isc_sockettype_unix) {
-		label = "unix";
-	}
-
-	return (label);
-}
-#endif /* if defined(HAVE_LIBXML2) || defined(HAVE_JSON_C) */
-
-#ifdef HAVE_LIBXML2
-/*
- * Evaluate a libxml2 writer call; on a negative return code jump to the
- * function's 'error' label with the code left in 'xmlrc'.
- */
-#define TRY0(a)                     \
-	do {                        \
-		xmlrc = (a);        \
-		if (xmlrc < 0)      \
-			goto error; \
-	} while (0)
-
-/*
- * Write a <sockets> element describing every socket on mgr->socklist to
- * the libxml2 text writer 'writer0'.
- *
- * For each socket the output holds its id (pointer value), optional
- * name, reference count, type, peer address (when connected), local
- * address (when getsockname() succeeds) and a list of state flags.
- *
- * The manager lock is held for the whole walk and each socket's lock is
- * held while its element is written.  Returns the return code of the
- * last writer call made; it is negative if that call failed, in which
- * case the output is left incomplete.
- */
-int
-isc_socketmgr_renderxml(isc_socketmgr_t *mgr, void *writer0) {
-	isc_socket_t *sock = NULL;
-	char peerbuf[ISC_SOCKADDR_FORMATSIZE];
-	isc_sockaddr_t addr;
-	socklen_t len;
-	int xmlrc;
-	xmlTextWriterPtr writer = (xmlTextWriterPtr)writer0;
-
-	LOCK(&mgr->lock);
-
-	TRY0(xmlTextWriterStartElement(writer, ISC_XMLCHAR "sockets"));
-
-	/*
-	 * 'sock' is non-NULL exactly while its lock is held, which is what
-	 * the error path relies on to decide whether to unlock it.
-	 */
-	for (sock = ISC_LIST_HEAD(mgr->socklist); sock != NULL;
-	     sock = ISC_LIST_NEXT(sock, link))
-	{
-		LOCK(&sock->lock);
-		TRY0(xmlTextWriterStartElement(writer, ISC_XMLCHAR "socket"));
-
-		TRY0(xmlTextWriterStartElement(writer, ISC_XMLCHAR "id"));
-		TRY0(xmlTextWriterWriteFormatString(writer, "%p", sock));
-		TRY0(xmlTextWriterEndElement(writer));
-
-		if (sock->name[0] != 0) {
-			TRY0(xmlTextWriterStartElement(writer,
-						       ISC_XMLCHAR "name"));
-			TRY0(xmlTextWriterWriteFormatString(writer, "%s",
-							    sock->name));
-			TRY0(xmlTextWriterEndElement(writer)); /* name */
-		}
-
-		TRY0(xmlTextWriterStartElement(writer,
-					       ISC_XMLCHAR "references"));
-		TRY0(xmlTextWriterWriteFormatString(
-			writer, "%d",
-			(int)isc_refcount_current(&sock->references)));
-		TRY0(xmlTextWriterEndElement(writer));
-
-		TRY0(xmlTextWriterWriteElement(
-			writer, ISC_XMLCHAR "type",
-			ISC_XMLCHAR _socktype(sock->type)));
-
-		if (sock->connected) {
-			isc_sockaddr_format(&sock->peer_address, peerbuf,
-					    sizeof(peerbuf));
-			TRY0(xmlTextWriterWriteElement(
-				writer, ISC_XMLCHAR "peer-address",
-				ISC_XMLCHAR peerbuf));
-		}
-
-		/*
-		 * The buffer handed to getsockname() is addr.type, so that
-		 * is the size it must be told about.
-		 */
-		len = sizeof(addr.type);
-		if (getsockname(sock->fd, &addr.type.sa, (void *)&len) == 0) {
-			isc_sockaddr_format(&addr, peerbuf, sizeof(peerbuf));
-			TRY0(xmlTextWriterWriteElement(
-				writer, ISC_XMLCHAR "local-address",
-				ISC_XMLCHAR peerbuf));
-		}
-
-		TRY0(xmlTextWriterStartElement(writer, ISC_XMLCHAR "states"));
-		if (sock->listener) {
-			TRY0(xmlTextWriterWriteElement(writer,
-						       ISC_XMLCHAR "state",
-						       ISC_XMLCHAR "listener"));
-		}
-		if (sock->connected) {
-			TRY0(xmlTextWriterWriteElement(
-				writer, ISC_XMLCHAR "state",
-				ISC_XMLCHAR "connected"));
-		}
-		if (sock->connecting) {
-			TRY0(xmlTextWriterWriteElement(
-				writer, ISC_XMLCHAR "state",
-				ISC_XMLCHAR "connecting"));
-		}
-		if (sock->bound) {
-			TRY0(xmlTextWriterWriteElement(writer,
-						       ISC_XMLCHAR "state",
-						       ISC_XMLCHAR "bound"));
-		}
-
-		TRY0(xmlTextWriterEndElement(writer)); /* states */
-
-		TRY0(xmlTextWriterEndElement(writer)); /* socket */
-
-		UNLOCK(&sock->lock);
-	}
-	TRY0(xmlTextWriterEndElement(writer)); /* sockets */
-
-error:
-	if (sock != NULL) {
-		UNLOCK(&sock->lock);
-	}
-
-	UNLOCK(&mgr->lock);
-
-	return (xmlrc);
-}
-#endif /* HAVE_LIBXML2 */
-
-#ifdef HAVE_JSON_C
-/*
- * Treat a NULL json-c object as an allocation failure: set 'result' to
- * ISC_R_NOMEMORY and jump to the function's 'error' label.
- */
-#define CHECKMEM(m)                              \
-	do {                                     \
-		if (m == NULL) {                 \
-			result = ISC_R_NOMEMORY; \
-			goto error;              \
-		}                                \
-	} while (0)
-
-/*
- * Add a "sockets" array describing every socket on mgr->socklist to the
- * json-c object 'stats0'.
- *
- * Each array entry holds the socket's id (pointer value), optional
- * name, reference count, type, peer address (when connected), local
- * address (when getsockname() succeeds) and an array of state flags.
- *
- * The manager lock is held for the whole walk and each socket's lock is
- * held while its entry is built.  Returns ISC_R_SUCCESS, or
- * ISC_R_NOMEMORY if a json-c object could not be created; in the latter
- * case the partly built array is released and 'stats0' is not modified.
- */
-isc_result_t
-isc_socketmgr_renderjson(isc_socketmgr_t *mgr, void *stats0) {
-	isc_result_t result = ISC_R_SUCCESS;
-	isc_socket_t *sock = NULL;
-	char peerbuf[ISC_SOCKADDR_FORMATSIZE];
-	isc_sockaddr_t addr;
-	socklen_t len;
-	json_object *obj, *array = json_object_new_array();
-	json_object *stats = (json_object *)stats0;
-
-	/*
-	 * No lock is held yet, so do not go through the 'error' label
-	 * (which unlocks the manager) for this failure.
-	 */
-	if (array == NULL) {
-		return (ISC_R_NOMEMORY);
-	}
-
-	LOCK(&mgr->lock);
-
-	/*
-	 * 'sock' is non-NULL exactly while its lock is held, which is what
-	 * the error path relies on to decide whether to unlock it.  The
-	 * socket is therefore locked before anything that can fail.
-	 */
-	for (sock = ISC_LIST_HEAD(mgr->socklist); sock != NULL;
-	     sock = ISC_LIST_NEXT(sock, link))
-	{
-		json_object *states, *entry;
-		char buf[255];
-
-		LOCK(&sock->lock);
-
-		entry = json_object_new_object();
-		CHECKMEM(entry);
-		/* Once added, 'entry' is owned by (and freed with) 'array'. */
-		json_object_array_add(array, entry);
-
-		snprintf(buf, sizeof(buf), "%p", sock);
-		obj = json_object_new_string(buf);
-		CHECKMEM(obj);
-		json_object_object_add(entry, "id", obj);
-
-		if (sock->name[0] != 0) {
-			obj = json_object_new_string(sock->name);
-			CHECKMEM(obj);
-			json_object_object_add(entry, "name", obj);
-		}
-
-		obj = json_object_new_int(
-			(int)isc_refcount_current(&sock->references));
-		CHECKMEM(obj);
-		json_object_object_add(entry, "references", obj);
-
-		obj = json_object_new_string(_socktype(sock->type));
-		CHECKMEM(obj);
-		json_object_object_add(entry, "type", obj);
-
-		if (sock->connected) {
-			isc_sockaddr_format(&sock->peer_address, peerbuf,
-					    sizeof(peerbuf));
-			obj = json_object_new_string(peerbuf);
-			CHECKMEM(obj);
-			json_object_object_add(entry, "peer-address", obj);
-		}
-
-		/*
-		 * The buffer handed to getsockname() is addr.type, so that
-		 * is the size it must be told about.
-		 */
-		len = sizeof(addr.type);
-		if (getsockname(sock->fd, &addr.type.sa, (void *)&len) == 0) {
-			isc_sockaddr_format(&addr, peerbuf, sizeof(peerbuf));
-			obj = json_object_new_string(peerbuf);
-			CHECKMEM(obj);
-			json_object_object_add(entry, "local-address", obj);
-		}
-
-		states = json_object_new_array();
-		CHECKMEM(states);
-		json_object_object_add(entry, "states", states);
-
-		if (sock->listener) {
-			obj = json_object_new_string("listener");
-			CHECKMEM(obj);
-			json_object_array_add(states, obj);
-		}
-
-		if (sock->connected) {
-			obj = json_object_new_string("connected");
-			CHECKMEM(obj);
-			json_object_array_add(states, obj);
-		}
-
-		if (sock->connecting) {
-			obj = json_object_new_string("connecting");
-			CHECKMEM(obj);
-			json_object_array_add(states, obj);
-		}
-
-		if (sock->bound) {
-			obj = json_object_new_string("bound");
-			CHECKMEM(obj);
-			json_object_array_add(states, obj);
-		}
-
-		UNLOCK(&sock->lock);
-	}
-
-	/* Hand the finished array over to 'stats'. */
-	json_object_object_add(stats, "sockets", array);
-	array = NULL;
-	result = ISC_R_SUCCESS;
-
-error:
-	if (array != NULL) {
-		json_object_put(array);
-	}
-
-	if (sock != NULL) {
-		UNLOCK(&sock->lock);
-	}
-
-	UNLOCK(&mgr->lock);
-
-	return (result);
-}
-#endif /* HAVE_JSON_C */