From cd866f5c4ce5a85ce7076f6beb9b32aacb25fb5a Mon Sep 17 00:00:00 2001
From: Stefan Metzmacher
Date: Thu, 3 Aug 2023 15:45:45 +0200
Subject: [PATCH] s3:smbd: fix multichannel connection passing race

If a client opens multiple connections with the same client guid in
parallel, our connection passing is likely to hit a race.

Assume we have 3 processes:

smbdA: This process already handles all connections for a given
       client guid.

smbdB: This just received a new connection with an SMB2 negprot for
       the same client guid.

smbdC: This also received a new connection with an SMB2 negprot for
       the same client guid.

Now both smbdB and smbdC send a MSG_SMBXSRV_CONNECTION_PASS message
to smbdA. These messages contain the socket fd for each connection.

While waiting for a MSG_SMBXSRV_CONNECTION_PASSED message from smbdA,
both smbdB and smbdC watch the smbXcli_client.tdb record for changes
(that also verifies smbdA stays alive).

Once one of them, say smbdB, receives the MSG_SMBXSRV_CONNECTION_PASSED
message, the dbwrap_watch logic will wake up smbdC so that it rechecks
the smbXcli_client.tdb record, in order to handle the case where smbdA
died or deleted its record.

Now smbdC rechecks the smbXcli_client.tdb record, even though it was
not woken because of a problem with smbdA, and as a result it sends a
MSG_SMBXSRV_CONNECTION_PASS message including the socket fd again.

So smbdA receives the socket fd from smbdC twice (or even more often)
and creates two (or more) smbXsrv_connection structures for the same
low-level TCP connection. It also sends more than one SMB2 negprot
response. Depending on the tevent logic, it will use different
smbXsrv_connection structures to process incoming requests, which
almost immediately results in errors. The typical error is:

  smb2_validate_sequence_number: smb2_validate_sequence_number:
  bad message_id 2 (sequence id 2) (granted = 1, low = 1, range = 1)

But other errors are also possible.

The detail that leads to the long delays on the client side is that
our smbd_server_connection_terminate_ex() code closes only the fd of
a single smbXsrv_connection, so the refcount on the socket fd in the
kernel does not drop to 0 and the TCP connection stays alive...

Now we remember the server_id of the process we sent the
MSG_SMBXSRV_CONNECTION_PASS message to, and we keep watching the
smbXcli_client.tdb record as long as that server_id does not change;
we just need more patience while waiting for the
MSG_SMBXSRV_CONNECTION_PASSED message.
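The following self-contained C sketch is an editorial illustration of
the idea, not part of the change: struct server_id, server_id_equal()
and server_id_set_disconnected() are simplified stand-ins for Samba's
helpers of the same names, and pass_connection()/keep_watching() are
hypothetical placeholders for the messaging and dbwrap_watch
machinery.

/*
 * Editorial sketch of the guard added by this patch; simplified,
 * self-contained and compilable, but not the real Samba code.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct server_id {
	uint64_t pid;		/* UINT64_MAX means "disconnected" */
	uint64_t unique_id;
};

static void server_id_set_disconnected(struct server_id *id)
{
	id->pid = UINT64_MAX;
	id->unique_id = 0;
}

static bool server_id_equal(const struct server_id *a,
			    const struct server_id *b)
{
	return a->pid == b->pid && a->unique_id == b->unique_id;
}

struct negprot_state {
	struct server_id sent_server_id; /* who already got our socket fd */
};

/* Hypothetical stand-ins for the messaging and dbwrap_watch calls. */
static void pass_connection(const struct server_id *dst)
{
	printf("pass socket fd to pid %ju\n", (uintmax_t)dst->pid);
}

static void keep_watching(void)
{
	printf("keep watching smbXcli_client.tdb record\n");
}

/*
 * Called on every wakeup of the record watch.  Without the
 * sent_server_id check, a wakeup caused by *another* waiter
 * (smbdB being answered while we are smbdC) would re-send the
 * fd to the same process and duplicate the connection.
 */
static void negprot_next(struct negprot_state *state,
			 const struct server_id *record_owner)
{
	if (server_id_equal(&state->sent_server_id, record_owner)) {
		/* fd already passed to this process: just keep waiting */
		keep_watching();
		return;
	}

	/* reset first; the real code can error out before the send */
	server_id_set_disconnected(&state->sent_server_id);

	pass_connection(record_owner);
	state->sent_server_id = *record_owner;
	keep_watching();
}

int main(void)
{
	struct negprot_state state;
	struct server_id smbdA = { .pid = 4711, .unique_id = 1 };

	server_id_set_disconnected(&state.sent_server_id);

	negprot_next(&state, &smbdA);	/* first wakeup: fd is passed */
	negprot_next(&state, &smbdA);	/* spurious wakeup: only watches */
	return 0;
}

Compiled standalone, this prints one "pass socket fd" line followed by
two "keep watching" lines; without the server_id_equal() guard the
second (spurious) wakeup would pass the fd again, which is exactly the
duplication described above.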
BUG: https://bugzilla.samba.org/show_bug.cgi?id=15346

Signed-off-by: Stefan Metzmacher
Reviewed-by: Andreas Schneider

Autobuild-User(master): Andreas Schneider
Autobuild-Date(master): Tue Aug 8 13:59:58 UTC 2023 on atb-devel-224

(cherry picked from commit f348b84fbcf203ab1ba92840cf7aecd55dbf9aa0)

Autobuild-User(v4-18-test): Jule Anger
Autobuild-Date(v4-18-test): Fri Aug 11 09:49:53 UTC 2023 on atb-devel-224
---
 selftest/knownfail.d/samba3.smb2.multichannel |  1 -
 source3/smbd/smbXsrv_client.c                 | 31 +++++++++++++++++++
 2 files changed, 31 insertions(+), 1 deletion(-)
 delete mode 100644 selftest/knownfail.d/samba3.smb2.multichannel

diff --git a/selftest/knownfail.d/samba3.smb2.multichannel b/selftest/knownfail.d/samba3.smb2.multichannel
deleted file mode 100644
index 26b80a39d5b..00000000000
--- a/selftest/knownfail.d/samba3.smb2.multichannel
+++ /dev/null
@@ -1 +0,0 @@
-^samba3.smb2.multichannel.bugs.bug_15346
diff --git a/source3/smbd/smbXsrv_client.c b/source3/smbd/smbXsrv_client.c
index 577131fc16f..928a9d72caa 100644
--- a/source3/smbd/smbXsrv_client.c
+++ b/source3/smbd/smbXsrv_client.c
@@ -488,6 +488,7 @@ struct smb2srv_client_mc_negprot_state {
 	struct tevent_context *ev;
 	struct smbd_smb2_request *smb2req;
 	struct db_record *db_rec;
+	struct server_id sent_server_id;
 	uint64_t watch_instance;
 	uint32_t last_seqnum;
 	struct tevent_req *filter_subreq;
@@ -530,6 +531,8 @@ struct tevent_req *smb2srv_client_mc_negprot_send(TALLOC_CTX *mem_ctx,
 
 	tevent_req_set_cleanup_fn(req, smb2srv_client_mc_negprot_cleanup);
 
+	server_id_set_disconnected(&state->sent_server_id);
+
 	smb2srv_client_mc_negprot_next(req);
 
 	if (!tevent_req_is_in_progress(req)) {
@@ -625,6 +628,30 @@ verify_again:
 		return;
 	}
 
+	if (server_id_equal(&state->sent_server_id, &global->server_id)) {
+		/*
+		 * We hit a race with other concurrent connections,
+		 * which have woken us.
+		 *
+		 * We already sent the pass or drop message to
+		 * the process, so we need to wait for a
+		 * response and not pass the connection
+		 * again! Otherwise the process would
+		 * receive the same tcp connection via
+		 * more than one file descriptor and
+		 * create more than one smbXsrv_connection
+		 * structure for the same tcp connection,
+		 * which means the client would see more
+		 * than one SMB2 negprot response to its
+		 * single SMB2 negprot request and we
+		 * as server get the session keys and
+		 * message id validation wrong
+		 */
+		goto watch_again;
+	}
+
+	server_id_set_disconnected(&state->sent_server_id);
+
 	/*
 	 * If last_server_id is set, we expect
 	 * smbXsrv_client_global_verify_record()
@@ -660,6 +687,7 @@ verify_again:
 			 */
 			goto verify_again;
 		}
+		state->sent_server_id = global->server_id;
 		if (tevent_req_nterror(req, status)) {
 			return;
 		}
@@ -674,11 +702,14 @@ verify_again:
 			 */
 			goto verify_again;
 		}
+		state->sent_server_id = global->server_id;
 		if (tevent_req_nterror(req, status)) {
 			return;
 		}
 	}
 
+watch_again:
+
 	/*
 	 * If the record changed, but we are not happy with the change yet,
 	 * we better remove ourself from the waiter list
-- 
2.47.3
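The "bad message_id" failure quoted in the commit message can also be
illustrated with a toy model of an SMB2 sequence window. This sketch
is not Samba's smb2_validate_sequence_number(); struct seq_window and
validate_message_id() are invented names, and the credit/window
bookkeeping is heavily simplified:

/*
 * Toy model of SMB2 message_id window validation, to illustrate the
 * reported failure mode; not Samba's smb2_validate_sequence_number().
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct seq_window {
	uint64_t low;	/* lowest message_id still accepted */
	uint64_t range;	/* number of granted credits */
};

static bool validate_message_id(struct seq_window *win, uint64_t message_id)
{
	if (message_id < win->low || message_id >= win->low + win->range) {
		printf("bad message_id %ju (low = %ju, range = %ju)\n",
		       (uintmax_t)message_id,
		       (uintmax_t)win->low, (uintmax_t)win->range);
		return false;
	}
	win->low = message_id + 1;	/* simplified: window just advances */
	return true;
}

int main(void)
{
	/*
	 * Two smbXsrv_connection structures for one TCP connection mean
	 * two independent windows.  The second structure never saw the
	 * earlier message_ids the client already used, so message_id 2
	 * falls outside its window, matching the "bad message_id 2
	 * (granted = 1, low = 1, range = 1)" error from the log.
	 */
	struct seq_window first = { .low = 0, .range = 1 };
	struct seq_window second = { .low = 1, .range = 1 };

	validate_message_id(&first, 0);		/* ok on structure #1 */
	validate_message_id(&second, 2);	/* fails on structure #2 */
	return 0;
}

With two smbXsrv_connection structures accidentally created for one
TCP connection, each keeps an independent window, so a message_id that
is perfectly valid from the client's point of view lands outside the
window of whichever structure happens to process it.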