From: Joshua Elson Date: Mon, 18 Mar 2024 19:14:36 +0000 (-0400) Subject: Implement Configurable TCP Keepalive Settings in PJSIP Transports X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=555eb9d3d2bf9b3afac18a81d1da67fae8651e08;p=thirdparty%2Fasterisk.git Implement Configurable TCP Keepalive Settings in PJSIP Transports This commit introduces configurable TCP keepalive settings for both TCP and TLS transports. The changes allow for finer control over TCP connection keepalives, enhancing stability and reliability in environments prone to connection timeouts or where intermediate devices may prematurely close idle connections. This has already been tested in production in several specialized environments where the underlying transport is unreliable in ways that are not directly visible to the operating system, which is why these keepalive and timeout mechanisms are needed. Fixes #657 --- diff --git a/configs/samples/pjsip.conf.sample b/configs/samples/pjsip.conf.sample index 247b540276..9d454089d3 100644 --- a/configs/samples/pjsip.conf.sample +++ b/configs/samples/pjsip.conf.sample @@ -171,6 +171,32 @@ ;type=transport ;protocol=flow +; Example IPv4 TCP transport with Keepalive options +; +;[transport-tcp] +;type=transport +;protocol=tcp +;bind=0.0.0.0 +;tcp_keepalive_enable=yes ; Enable TCP keepalive (yes/no) +;tcp_keepalive_idle_time=30 ; Time in seconds the connection needs to remain idle before TCP starts sending keepalive probes +;tcp_keepalive_interval_time=10 ; The time in seconds between individual keepalive probes +;tcp_keepalive_probe_count=5 ; The maximum number of keepalive probes TCP should send before dropping the connection + +; Example IPv4 TLS transport with Keepalive options +; +;[transport-tls] +;type=transport +;protocol=tls +;bind=0.0.0.0 +;cert_file=/path/to/mycert.crt +;priv_key_file=/path/to/mykey.key +;cipher=ADH-AES256-SHA,ADH-AES128-SHA +;method=tlsv1 +;tcp_keepalive_enable=yes ; Enable TCP 
keepalive (yes/no) +;tcp_keepalive_idle_time=30 ; Time in seconds the connection needs to remain idle before TCP starts sending keepalive probes +;tcp_keepalive_interval_time=10 ; The time in seconds between individual keepalive probes +;tcp_keepalive_probe_count=5 ; The maximum number of keepalive probes TCP should send before dropping the connection + ;===============OUTBOUND REGISTRATION WITH OUTBOUND AUTHENTICATION============ ; ; This is a simple registration that works with some SIP trunking providers. diff --git a/contrib/ast-db-manage/config/versions/8fce8496f03e_add_tcp_keepalive_settings_to_ps_.py b/contrib/ast-db-manage/config/versions/8fce8496f03e_add_tcp_keepalive_settings_to_ps_.py new file mode 100644 index 0000000000..4844a60ed7 --- /dev/null +++ b/contrib/ast-db-manage/config/versions/8fce8496f03e_add_tcp_keepalive_settings_to_ps_.py @@ -0,0 +1,28 @@ +"""Add TCP keepalive settings to ps_transports + +Revision ID: 8fce8496f03e +Revises: 74dc751dfe8e +Create Date: 2024-03-18 17:00:17.148018 + +""" + +# revision identifiers, used by Alembic. 
+revision = '8fce8496f03e' +down_revision = '74dc751dfe8e' + +from alembic import op +import sqlalchemy as sa + +def upgrade(): + with op.batch_alter_table('ps_transports') as batch_op: + batch_op.add_column(sa.Column('tcp_keepalive_enable', sa.Boolean(), nullable=True)) + batch_op.add_column(sa.Column('tcp_keepalive_idle_time', sa.Integer(), nullable=True)) + batch_op.add_column(sa.Column('tcp_keepalive_interval_time', sa.Integer(), nullable=True)) + batch_op.add_column(sa.Column('tcp_keepalive_probe_count', sa.Integer(), nullable=True)) + +def downgrade(): + with op.batch_alter_table('ps_transports') as batch_op: + batch_op.drop_column('tcp_keepalive_enable') + batch_op.drop_column('tcp_keepalive_idle_time') + batch_op.drop_column('tcp_keepalive_interval_time') + batch_op.drop_column('tcp_keepalive_probe_count') diff --git a/include/asterisk/res_pjsip.h b/include/asterisk/res_pjsip.h index b320cff525..214605209a 100644 --- a/include/asterisk/res_pjsip.h +++ b/include/asterisk/res_pjsip.h @@ -299,6 +299,14 @@ struct ast_sip_transport { int symmetric_transport; /*! This is a flow to another target */ int flow; + /*! Enable TCP keepalive */ + int tcp_keepalive_enable; + /*! Time in seconds the connection needs to remain idle before TCP starts sending keepalive probes */ + int tcp_keepalive_idle_time; + /*! The time in seconds between individual keepalive probes */ + int tcp_keepalive_interval_time; + /*! 
The maximum number of keepalive probes TCP should send before dropping the connection */ + int tcp_keepalive_probe_count; }; #define SIP_SORCERY_DOMAIN_ALIAS_TYPE "domain_alias" diff --git a/res/res_pjsip/config_transport.c b/res/res_pjsip/config_transport.c index 07b60a9a70..f538eaf862 100644 --- a/res/res_pjsip/config_transport.c +++ b/res/res_pjsip/config_transport.c @@ -828,17 +828,55 @@ static int transport_apply(const struct ast_sorcery *sorcery, void *obj) } else if (transport->type == AST_TRANSPORT_TCP) { pjsip_tcp_transport_cfg cfg; static int option = 1; + int sockopt_count = 0; pjsip_tcp_transport_cfg_default(&cfg, temp_state->state->host.addr.sa_family); cfg.bind_addr = temp_state->state->host; cfg.async_cnt = transport->async_operations; set_qos(transport, &cfg.qos_params); + /* sockopt_params.options is copied to each newly connected socket */ - cfg.sockopt_params.options[0].level = pj_SOL_TCP(); - cfg.sockopt_params.options[0].optname = pj_TCP_NODELAY(); - cfg.sockopt_params.options[0].optval = &option; - cfg.sockopt_params.options[0].optlen = sizeof(option); - cfg.sockopt_params.cnt = 1; + cfg.sockopt_params.options[sockopt_count].level = pj_SOL_TCP(); + cfg.sockopt_params.options[sockopt_count].optname = pj_TCP_NODELAY(); + cfg.sockopt_params.options[sockopt_count].optval = &option; + cfg.sockopt_params.options[sockopt_count].optlen = sizeof(option); + sockopt_count++; + + if (transport->tcp_keepalive_enable) { +#if defined(PJ_MAX_SOCKOPT_PARAMS) && PJ_MAX_SOCKOPT_PARAMS >= 5 + ast_log(LOG_DEBUG, "TCP Keepalive enabled for transport '%s'. 
Idle Time: %d, Interval: %d, Count: %d\n", + ast_sorcery_object_get_id(obj), transport->tcp_keepalive_idle_time, transport->tcp_keepalive_interval_time, transport->tcp_keepalive_probe_count); + + cfg.sockopt_params.options[sockopt_count].level = pj_SOL_SOCKET(); + cfg.sockopt_params.options[sockopt_count].optname = SO_KEEPALIVE; + cfg.sockopt_params.options[sockopt_count].optval = &option; + cfg.sockopt_params.options[sockopt_count].optlen = sizeof(option); + sockopt_count++; + + cfg.sockopt_params.options[sockopt_count].level = pj_SOL_TCP(); + cfg.sockopt_params.options[sockopt_count].optname = TCP_KEEPIDLE; + cfg.sockopt_params.options[sockopt_count].optval = &transport->tcp_keepalive_idle_time; + cfg.sockopt_params.options[sockopt_count].optlen = sizeof(transport->tcp_keepalive_idle_time); + sockopt_count++; + + cfg.sockopt_params.options[sockopt_count].level = pj_SOL_TCP(); + cfg.sockopt_params.options[sockopt_count].optname = TCP_KEEPINTVL; + cfg.sockopt_params.options[sockopt_count].optval = &transport->tcp_keepalive_interval_time; + cfg.sockopt_params.options[sockopt_count].optlen = sizeof(transport->tcp_keepalive_interval_time); + sockopt_count++; + + cfg.sockopt_params.options[sockopt_count].level = pj_SOL_TCP(); + cfg.sockopt_params.options[sockopt_count].optname = TCP_KEEPCNT; + cfg.sockopt_params.options[sockopt_count].optval = &transport->tcp_keepalive_probe_count; + cfg.sockopt_params.options[sockopt_count].optlen = sizeof(transport->tcp_keepalive_probe_count); + sockopt_count++; +#else + ast_log(LOG_WARNING, "TCP keepalive settings for '%s' not set due to PJSIP built without support for setting all options. 
Consider using bundled PJSIP.\n", + ast_sorcery_object_get_id(obj)); +#endif + } + + cfg.sockopt_params.cnt = sockopt_count; for (i = 0; i < BIND_TRIES && res != PJ_SUCCESS; i++) { if (perm_state && perm_state->state && perm_state->state->factory @@ -853,6 +891,7 @@ static int transport_apply(const struct ast_sorcery *sorcery, void *obj) } else if (transport->type == AST_TRANSPORT_TLS) { #if defined(PJ_HAS_SSL_SOCK) && PJ_HAS_SSL_SOCK != 0 static int option = 1; + int sockopt_count = 0; if (transport->async_operations > 1 && ast_compare_versions(pj_get_version(), "2.5.0") < 0) { ast_log(LOG_ERROR, "Transport: %s: When protocol=tls and pjproject version < 2.5.0, async_operations can't be > 1\n", @@ -864,11 +903,47 @@ static int transport_apply(const struct ast_sorcery *sorcery, void *obj) set_qos(transport, &temp_state->state->tls.qos_params); /* sockopt_params.options is copied to each newly connected socket */ - temp_state->state->tls.sockopt_params.options[0].level = pj_SOL_TCP(); - temp_state->state->tls.sockopt_params.options[0].optname = pj_TCP_NODELAY(); - temp_state->state->tls.sockopt_params.options[0].optval = &option; - temp_state->state->tls.sockopt_params.options[0].optlen = sizeof(option); - temp_state->state->tls.sockopt_params.cnt = 1; + temp_state->state->tls.sockopt_params.options[sockopt_count].level = pj_SOL_TCP(); + temp_state->state->tls.sockopt_params.options[sockopt_count].optname = pj_TCP_NODELAY(); + temp_state->state->tls.sockopt_params.options[sockopt_count].optval = &option; + temp_state->state->tls.sockopt_params.options[sockopt_count].optlen = sizeof(option); + sockopt_count++; + + if (transport->tcp_keepalive_enable) { +#if defined(PJ_MAX_SOCKOPT_PARAMS) && PJ_MAX_SOCKOPT_PARAMS >= 5 + ast_log(LOG_DEBUG, "TCP Keepalive enabled for transport '%s'. 
Idle Time: %d, Interval: %d, Count: %d\n", + ast_sorcery_object_get_id(obj), transport->tcp_keepalive_idle_time, transport->tcp_keepalive_interval_time, transport->tcp_keepalive_probe_count); + + temp_state->state->tls.sockopt_params.options[sockopt_count].level = pj_SOL_SOCKET(); + temp_state->state->tls.sockopt_params.options[sockopt_count].optname = SO_KEEPALIVE; + temp_state->state->tls.sockopt_params.options[sockopt_count].optval = &option; + temp_state->state->tls.sockopt_params.options[sockopt_count].optlen = sizeof(option); + sockopt_count++; + + temp_state->state->tls.sockopt_params.options[sockopt_count].level = pj_SOL_TCP(); + temp_state->state->tls.sockopt_params.options[sockopt_count].optname = TCP_KEEPIDLE; + temp_state->state->tls.sockopt_params.options[sockopt_count].optval = &transport->tcp_keepalive_idle_time; + temp_state->state->tls.sockopt_params.options[sockopt_count].optlen = sizeof(transport->tcp_keepalive_idle_time); + sockopt_count++; + + temp_state->state->tls.sockopt_params.options[sockopt_count].level = pj_SOL_TCP(); + temp_state->state->tls.sockopt_params.options[sockopt_count].optname = TCP_KEEPINTVL; + temp_state->state->tls.sockopt_params.options[sockopt_count].optval = &transport->tcp_keepalive_interval_time; + temp_state->state->tls.sockopt_params.options[sockopt_count].optlen = sizeof(transport->tcp_keepalive_interval_time); + sockopt_count++; + + temp_state->state->tls.sockopt_params.options[sockopt_count].level = pj_SOL_TCP(); + temp_state->state->tls.sockopt_params.options[sockopt_count].optname = TCP_KEEPCNT; + temp_state->state->tls.sockopt_params.options[sockopt_count].optval = &transport->tcp_keepalive_probe_count; + temp_state->state->tls.sockopt_params.options[sockopt_count].optlen = sizeof(transport->tcp_keepalive_probe_count); + sockopt_count++; +#else + ast_log(LOG_WARNING, "TCP keepalive settings for '%s' not set due to PJSIP built without support for setting all options. 
Consider using bundled PJSIP.\n", + ast_sorcery_object_get_id(obj)); +#endif + } + + temp_state->state->tls.sockopt_params.cnt = sockopt_count; for (i = 0; i < BIND_TRIES && res != PJ_SUCCESS; i++) { if (perm_state && perm_state->state && perm_state->state->factory @@ -1760,6 +1835,10 @@ int ast_sip_initialize_sorcery_transport(void) ast_sorcery_object_field_register_custom(sorcery, "transport", "require_client_cert", "", transport_tls_bool_handler, require_client_cert_to_str, NULL, 0, 0); ast_sorcery_object_field_register_custom(sorcery, "transport", "allow_wildcard_certs", "", transport_tls_bool_handler, allow_wildcard_certs_to_str, NULL, 0, 0); ast_sorcery_object_field_register_custom(sorcery, "transport", "method", "", transport_tls_method_handler, tls_method_to_str, NULL, 0, 0); + ast_sorcery_object_field_register(sorcery, "transport", "tcp_keepalive_enable", "no", OPT_BOOL_T, 0, FLDSET(struct ast_sip_transport, tcp_keepalive_enable)); + ast_sorcery_object_field_register(sorcery, "transport", "tcp_keepalive_idle_time", "30", OPT_INT_T, 0, FLDSET(struct ast_sip_transport, tcp_keepalive_idle_time)); + ast_sorcery_object_field_register(sorcery, "transport", "tcp_keepalive_interval_time", "1", OPT_INT_T, 0, FLDSET(struct ast_sip_transport, tcp_keepalive_interval_time)); + ast_sorcery_object_field_register(sorcery, "transport", "tcp_keepalive_probe_count", "5", OPT_INT_T, 0, FLDSET(struct ast_sip_transport, tcp_keepalive_probe_count)); #if defined(PJ_HAS_SSL_SOCK) && PJ_HAS_SSL_SOCK != 0 ast_sorcery_object_field_register_custom(sorcery, "transport", "cipher", "", transport_tls_cipher_handler, transport_tls_cipher_to_str, NULL, 0, 0); #endif diff --git a/res/res_pjsip/pjsip_config.xml b/res/res_pjsip/pjsip_config.xml index a7ccf741c8..c88ec54813 100644 --- a/res/res_pjsip/pjsip_config.xml +++ b/res/res_pjsip/pjsip_config.xml @@ -1798,6 +1798,30 @@ Require client certificate (TLS ONLY, not WSS) + + Enable TCP keepalive + + When set to 'yes', TCP keepalive messages 
are sent to verify that the endpoint is still reachable. This can help detect dead TCP connections in environments where connections may be silently dropped (e.g., NAT timeouts). + + + + Idle time before the first TCP keepalive probe is sent + + Specifies the amount of time in seconds that the connection must be idle before the first TCP keepalive probe is sent. An idle connection is defined as a connection in which no data has been sent or received by the application. + + + + Interval between TCP keepalive probes + + Specifies the interval in seconds between individual TCP keepalive probes, once the first probe is sent. This interval is used for subsequent probes if the peer does not respond to the previous probe. + + + + Maximum number of TCP keepalive probes + + Specifies the maximum number of TCP keepalive probes to send before considering the connection dead and notifying the application. If the peer does not respond after this many probes, the connection is considered broken. + + Must be of type 'transport'. diff --git a/third-party/pjproject/patches/config_site.h b/third-party/pjproject/patches/config_site.h index bb40c7bcd4..0492b04812 100644 --- a/third-party/pjproject/patches/config_site.h +++ b/third-party/pjproject/patches/config_site.h @@ -35,6 +35,15 @@ #define PJ_IOQUEUE_HAS_SAFE_UNREG 1 #define PJ_IOQUEUE_MAX_EVENTS_IN_SINGLE_POLL (16) +/* + * Increase the number of socket options available. This adjustment is necessary + * to accommodate additional TCP keepalive settings required for optimizing SIP + * transport stability, especially in environments prone to connection timeouts. + * The default limit is insufficient when configuring all desired keepalive + * parameters along with standard socket options. + */ +#define PJ_MAX_SOCKOPT_PARAMS 5 + #define PJ_SCANNER_USE_BITWISE 0 #define PJ_OS_HAS_CHECK_STACK 0