From: Olivier Houchard Date: Fri, 3 Jul 2020 12:04:37 +0000 (+0200) Subject: BUG/MEDIUM: connections: Set the tid for the old tasklet on takeover. X-Git-Tag: v2.2-dev12~6 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=1662cdb0c6523213cf179889e5935aaceb917c3a;p=thirdparty%2Fhaproxy.git BUG/MEDIUM: connections: Set the tid for the old tasklet on takeover. In the various takeover() methods, make sure we schedule the old tasklet on the old thread, as we don't want it to run on our own thread! This was causing a very rare crash when building with DEBUG_STRICT, seeing that either an FD's thread mask didn't match the thread ID in h1_io_cb(), or that stream_int_notify() would try to queue a task with the wrong tid_bit. In order to reproduce this, it is necessary to maintain many connections (typically 30k) at a high request rate flowing over H1+SSL between two proxies, the second of which would randomly reject ~1% of the incoming connection and randomly killing some idle ones using a very short client timeout. The request rate must be adjusted so that the CPUs are nearly saturated, but never reach 100%. It's easier to reproduce this by skipping local connections and always picking from other threads. The issue should happen in less than 20s otherwise it's necessary to restart to reset the idle connections lists. No backport is needed, takeover() is 2.2 only. --- diff --git a/include/haproxy/connection-t.h b/include/haproxy/connection-t.h index 8b48c9815d..e7e9c0d632 100644 --- a/include/haproxy/connection-t.h +++ b/include/haproxy/connection-t.h @@ -390,7 +390,7 @@ struct mux_ops { void (*reset)(struct connection *conn); /* Reset the mux, because we're re-trying to connect */ const struct cs_info *(*get_cs_info)(struct conn_stream *cs); /* Return info on the specified conn_stream or NULL if not defined */ int (*ctl)(struct connection *conn, enum mux_ctl_type mux_ctl, void *arg); /* Provides information about the mux */ - int (*takeover)(struct connection *conn); /* Attempts to migrate the connection to the current thread */ + int (*takeover)(struct connection *conn, int orig_tid); /* Attempts to migrate the connection to the current thread */ unsigned int flags; /* some flags characterizing the mux's capabilities (MX_FL_*) */ char name[8]; /* mux layer name, zero-terminated */ }; diff --git a/src/backend.c b/src/backend.c index f87940be8d..7027c54cda 100644 --- a/src/backend.c +++ b/src/backend.c @@ -1135,7 +1135,7 @@ static struct connection *conn_backend_get(struct server *srv, int is_safe) HA_SPIN_LOCK(OTHER_LOCK, &idle_conns[i].takeover_lock); mt_list_for_each_entry_safe(conn, &mt_list[i], list, elt1, elt2) { - if (conn->mux->takeover && conn->mux->takeover(conn) == 0) { + if (conn->mux->takeover && conn->mux->takeover(conn, i) == 0) { MT_LIST_DEL_SAFE(elt1); _HA_ATOMIC_ADD(&activity[tid].fd_takeover, 1); found = 1; @@ -1145,7 +1145,7 @@ static struct connection *conn_backend_get(struct server *srv, int is_safe) if (!found && !is_safe && srv->curr_safe_nb > 0) { mt_list_for_each_entry_safe(conn, &srv->safe_conns[i], list, elt1, elt2) { - if (conn->mux->takeover && conn->mux->takeover(conn) == 0) { + if (conn->mux->takeover && conn->mux->takeover(conn, i) == 0) { MT_LIST_DEL_SAFE(elt1); _HA_ATOMIC_ADD(&activity[tid].fd_takeover, 1); found = 1; diff --git a/src/mux_fcgi.c b/src/mux_fcgi.c index c4c7ce6421..9e8eb9a4ba 100644 --- a/src/mux_fcgi.c +++ b/src/mux_fcgi.c @@ -4084,7 +4084,7 @@ static void fcgi_show_fd(struct buffer *msg, struct connection *conn) * Return 0 if successful, non-zero otherwise. * Expected to be called with the old thread lock held. */ -static int fcgi_takeover(struct connection *conn) +static int fcgi_takeover(struct connection *conn, int orig_tid) { struct fcgi_conn *fcgi = conn->ctx; struct task *task; @@ -4098,7 +4098,7 @@ static int fcgi_takeover(struct connection *conn) * set its context to NULL; */ fcgi->wait_event.tasklet->context = NULL; - tasklet_wakeup(fcgi->wait_event.tasklet); + tasklet_wakeup_on(fcgi->wait_event.tasklet, orig_tid); task = fcgi->task; if (task) { diff --git a/src/mux_h1.c b/src/mux_h1.c index 89c55b4d46..0111f19e73 100644 --- a/src/mux_h1.c +++ b/src/mux_h1.c @@ -2922,7 +2922,7 @@ static int add_hdr_case_adjust(const char *from, const char *to, char **err) * Return 0 if successful, non-zero otherwise. * Expected to be called with the old thread lock held. */ -static int h1_takeover(struct connection *conn) +static int h1_takeover(struct connection *conn, int orig_tid) { struct h1c *h1c = conn->ctx; struct task *task; @@ -2936,7 +2936,7 @@ static int h1_takeover(struct connection *conn) * set its context to NULL. */ h1c->wait_event.tasklet->context = NULL; - tasklet_wakeup(h1c->wait_event.tasklet); + tasklet_wakeup_on(h1c->wait_event.tasklet, orig_tid); task = h1c->task; if (task) { diff --git a/src/mux_h2.c b/src/mux_h2.c index 827ff8e4f6..22212afe85 100644 --- a/src/mux_h2.c +++ b/src/mux_h2.c @@ -6055,7 +6055,7 @@ static void h2_show_fd(struct buffer *msg, struct connection *conn) * Return 0 if successful, non-zero otherwise. * Expected to be called with the old thread lock held. */ -static int h2_takeover(struct connection *conn) +static int h2_takeover(struct connection *conn, int orig_tid) { struct h2c *h2c = conn->ctx; struct task *task; @@ -6069,7 +6069,7 @@ static int h2_takeover(struct connection *conn) * set its context to NULL. */ h2c->wait_event.tasklet->context = NULL; - tasklet_wakeup(h2c->wait_event.tasklet); + tasklet_wakeup_on(h2c->wait_event.tasklet, orig_tid); task = h2c->task; if (task) {