From: Stefan Metzmacher Date: Wed, 12 Oct 2022 15:26:16 +0000 (+0200) Subject: lib/tsocket: avoid endless cpu-spinning in tstream_bsd_fde_handler() X-Git-Tag: talloc-2.4.0~706 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=e232ba946f00aac39d67197d9939bc923814479c;p=thirdparty%2Fsamba.git lib/tsocket: avoid endless cpu-spinning in tstream_bsd_fde_handler() There were some reports that strace output an LDAP server socket is in CLOSE_WAIT state, returning EAGAIN for writev over and over (after a call to epoll() each time). In the tstream_bsd code the problem happens when we have a pending writev_send, while there's no readv_send pending. In that case we still ask for TEVENT_FD_READ in order to notice connection errors early, so we try to call writev even if the socket doesn't report TEVENT_FD_WRITE. And there are situations where we do that over and over again. It happens like this with a Linux kernel: tcp_fin() has this: struct tcp_sock *tp = tcp_sk(sk); inet_csk_schedule_ack(sk); sk->sk_shutdown |= RCV_SHUTDOWN; sock_set_flag(sk, SOCK_DONE); switch (sk->sk_state) { case TCP_SYN_RECV: case TCP_ESTABLISHED: /* Move to CLOSE_WAIT */ tcp_set_state(sk, TCP_CLOSE_WAIT); inet_csk_enter_pingpong_mode(sk); break; It means RCV_SHUTDOWN gets set as well as TCP_CLOSE_WAIT, but sk->sk_err is not changed to indicate an error. tcp_sendmsg_locked has this: ... err = -EPIPE; if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) goto do_error; while (msg_data_left(msg)) { int copy = 0; skb = tcp_write_queue_tail(sk); if (skb) copy = size_goal - skb->len; if (copy <= 0 || !tcp_skb_can_collapse_to(skb)) { bool first_skb; new_segment: if (!sk_stream_memory_free(sk)) goto wait_for_space; ... wait_for_space: set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); if (copied) tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH, size_goal); err = sk_stream_wait_memory(sk, &timeo); if (err != 0) goto do_error; It means if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) doesn't hit as we only have RCV_SHUTDOWN and sk_stream_wait_memory returns -EAGAIN. tcp_poll has this: if (sk->sk_shutdown & RCV_SHUTDOWN) mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP; So we'll get EPOLLIN | EPOLLRDNORM | EPOLLRDHUP triggering TEVENT_FD_READ and writev/sendmsg keeps getting EAGAIN. So we need to always clear TEVENT_FD_READ if we don't have readable handler in order to avoid burning cpu. But we turn it on again after a timeout of 1 second in order to monitor the error state of the connection. And now that our tsocket_bsd_error() helper checks for POLLRDHUP, we can check if the socket is in an error state before calling the writable handler when TEVENT_FD_READ was reported. Only on error we'll call the writable handler, which will pick the error without calling writev(). BUG: https://bugzilla.samba.org/show_bug.cgi?id=15202 Signed-off-by: Stefan Metzmacher Reviewed-by: Ralph Boehme --- diff --git a/lib/tsocket/tsocket_bsd.c b/lib/tsocket/tsocket_bsd.c index 72499561f2d..daba33eb9d7 100644 --- a/lib/tsocket/tsocket_bsd.c +++ b/lib/tsocket/tsocket_bsd.c @@ -1754,6 +1754,9 @@ struct tstream_bsd { void (*readable_handler)(void *private_data); void *writeable_private; void (*writeable_handler)(void *private_data); + + struct tevent_context *error_ctx; + struct tevent_timer *error_timer; }; bool tstream_bsd_optimize_readv(struct tstream_context *stream, @@ -1775,6 +1778,28 @@ bool tstream_bsd_optimize_readv(struct tstream_context *stream, return old; } +static void tstream_bsd_error_timer(struct tevent_context *ev, + struct tevent_timer *te, + struct timeval current_time, + void *private_data) +{ + struct tstream_bsd *bsds = + talloc_get_type(private_data, + struct tstream_bsd); + + TALLOC_FREE(bsds->error_timer); + + /* + * Turn on TEVENT_FD_READABLE() again + * if we have a writeable_handler that + * wants to monitor the connection + * for errors. + */ + if (bsds->writeable_handler != NULL) { + TEVENT_FD_READABLE(bsds->fde); + } +} + static void tstream_bsd_fde_handler(struct tevent_context *ev, struct tevent_fd *fde, uint16_t flags, @@ -1789,11 +1814,74 @@ static void tstream_bsd_fde_handler(struct tevent_context *ev, } if (flags & TEVENT_FD_READ) { if (!bsds->readable_handler) { - if (bsds->writeable_handler) { + struct timeval recheck_time; + + /* + * In order to avoid cpu-spinning + * we no longer want to get TEVENT_FD_READ + */ + TEVENT_FD_NOT_READABLE(bsds->fde); + + if (!bsds->writeable_handler) { + return; + } + + /* + * If we have a writeable handler we + * want that to report connection errors + * early. + * + * So we check if the socket is in an + * error state. + */ + if (bsds->error == 0) { + int ret = tsocket_bsd_error(bsds->fd); + + if (ret == -1) { + bsds->error = errno; + } + } + + if (bsds->error != 0) { + /* + * Let the writeable handler report the error + */ + bsds->writeable_handler(bsds->writeable_private); + return; + } + + /* + * Here we called TEVENT_FD_NOT_READABLE() without + * calling into the writeable handler. + * + * So we may have to wait for the kernels tcp stack + * to report TEVENT_FD_WRITE in order to let + * make progress and turn on TEVENT_FD_READABLE() + * again. + * + * As a fallback we use a timer that turns on + * TEVENT_FD_READABLE() again after a timeout of + * 1 second. + */ + + if (bsds->error_timer != NULL) { + return; + } + + recheck_time = timeval_current_ofs(1, 0); + bsds->error_timer = tevent_add_timer(bsds->error_ctx, + bsds, + recheck_time, + tstream_bsd_error_timer, + bsds); + if (bsds->error_timer == NULL) { + bsds->error = ENOMEM; + /* + * Let the writeable handler report the error + */ bsds->writeable_handler(bsds->writeable_private); return; } - TEVENT_FD_NOT_READABLE(bsds->fde); return; } bsds->readable_handler(bsds->readable_private); @@ -1848,6 +1936,8 @@ static int tstream_bsd_set_readable_handler(struct tstream_bsd *bsds, TEVENT_FD_READABLE(bsds->fde); } + TALLOC_FREE(bsds->error_timer); + bsds->readable_handler = handler; bsds->readable_private = private_data; @@ -1870,7 +1960,8 @@ static int tstream_bsd_set_writeable_handler(struct tstream_bsd *bsds, bsds->writeable_handler = NULL; bsds->writeable_private = NULL; TEVENT_FD_NOT_WRITEABLE(bsds->fde); - + TALLOC_FREE(bsds->error_timer); + bsds->error_ctx = NULL; return 0; } @@ -1882,6 +1973,8 @@ static int tstream_bsd_set_writeable_handler(struct tstream_bsd *bsds, } bsds->event_ptr = NULL; TALLOC_FREE(bsds->fde); + TALLOC_FREE(bsds->error_timer); + bsds->error_ctx = NULL; } if (tevent_fd_get_flags(bsds->fde) == 0) { @@ -1907,6 +2000,7 @@ static int tstream_bsd_set_writeable_handler(struct tstream_bsd *bsds, bsds->writeable_handler = handler; bsds->writeable_private = private_data; + bsds->error_ctx = ev; return 0; } @@ -2212,7 +2306,14 @@ static void tstream_bsd_writev_handler(void *private_data) } err = tsocket_bsd_error_from_errno(ret, errno, &retry); if (retry) { - /* retry later */ + /* + * retry later... + * + * make sure we also wait readable again + * in order to notice errors early + */ + TEVENT_FD_READABLE(bsds->fde); + TALLOC_FREE(bsds->error_timer); return; } if (err != 0) { @@ -2238,7 +2339,13 @@ static void tstream_bsd_writev_handler(void *private_data) } if (state->count > 0) { - /* we have more to read */ + /* + * we have more to write + * + * make sure we also wait readable again + * in order to notice errors early + */ + TEVENT_FD_READABLE(bsds->fde); return; } @@ -2286,6 +2393,8 @@ static struct tevent_req *tstream_bsd_disconnect_send(TALLOC_CTX *mem_ctx, goto post; } + TALLOC_FREE(bsds->error_timer); + bsds->error_ctx = NULL; TALLOC_FREE(bsds->fde); ret = close(bsds->fd); bsds->fd = -1; @@ -2328,6 +2437,8 @@ static const struct tstream_context_ops tstream_bsd_ops = { static int tstream_bsd_destructor(struct tstream_bsd *bsds) { + TALLOC_FREE(bsds->error_timer); + bsds->error_ctx = NULL; TALLOC_FREE(bsds->fde); if (bsds->fd != -1) { close(bsds->fd); diff --git a/selftest/knownfail.d/tstream b/selftest/knownfail.d/tstream deleted file mode 100644 index 4d67b2c8b61..00000000000 --- a/selftest/knownfail.d/tstream +++ /dev/null @@ -1,6 +0,0 @@ -^samba.unittests.tsocket_tstream.test_tstream_disconnected_tcp_client_spin -^samba.unittests.tsocket_tstream.test_tstream_disconnected_unix_client_spin -^samba.unittests.tsocket_tstream.test_tstream_more_tcp_client_spin -^samba.unittests.tsocket_tstream.test_tstream_more_unix_client_spin -^samba.unittests.tsocket_tstream.test_tstream_more_disconnect_tcp_client_spin -^samba.unittests.tsocket_tstream.test_tstream_more_disconnect_unix_client_spin