git.ipfire.org Git - thirdparty/samba.git/commitdiff
smbd: add option "smbd lease break:debug hung procs"
author Ralph Boehme <slow@samba.org>
Thu, 4 Apr 2024 17:18:19 +0000 (19:18 +0200)
committer Jule Anger <janger@samba.org>
Wed, 2 Oct 2024 14:34:13 +0000 (14:34 +0000)
When this option is enabled, a process that sends a lease break message to
another process holding a lease starts watching that process. If the lease
holder has not processed the lease break within 10 seconds (cf.
server_id_watch_waited()), we log a kernel stack backtrace of that process.

BUG: https://bugzilla.samba.org/show_bug.cgi?id=15624

Pair-Programmed-With: Stefan Metzmacher <metze@samba.org>

Signed-off-by: Ralph Boehme <slow@samba.org>
Signed-off-by: Stefan Metzmacher <metze@samba.org>
Reviewed-by: Guenther Deschner <gd@samba.org>
(cherry picked from commit d8613d7ee23c4e990285a387eb9ac2eeefff9749)

source3/smbd/open.c

index 5a0fc4626bddb48e99693069ad75d69aed1001b0..12735303c6b3bd02926afdd495ff042c95804398 100644 (file)
@@ -38,6 +38,7 @@
 #include "serverid.h"
 #include "messages.h"
 #include "source3/lib/dbwrap/dbwrap_watch.h"
+#include "source3/lib/server_id_watch.h"
 #include "locking/leases_db.h"
 #include "librpc/gen_ndr/ndr_leases_db.h"
 #include "lib/util/time_basic.h"
@@ -2479,6 +2480,10 @@ static int map_lease_type_to_oplock(uint32_t lease_type)
        return result;
 }
 
+struct blocker_debug_state { /* debug bookkeeping, only allocated when "smbd lease break:debug hung procs" is set */
+       size_t num_blockers; /* number of server_id watches started by delay_for_oplock_fn() */
+};
+
 struct delay_for_oplock_state {
        struct files_struct *fsp;
        const struct smb2_lease *lease;
@@ -2490,8 +2495,22 @@ struct delay_for_oplock_state {
        bool have_other_lease;
        uint32_t total_lease_types;
        bool delay;
+       struct blocker_debug_state *blocker_debug_state;
 };
 
+static int blocker_debug_state_destructor(struct blocker_debug_state *state)
+{ /*
+   * talloc destructor, installed by delay_for_oplock() through
+   * talloc_set_destructor(). If at least one blocker watch was started,
+   * it emits a DBG_DEBUG message with the state pointer and the blocker
+   * count. It always returns 0.
+   */
+       if (state->num_blockers == 0) { /* no watch was ever started, stay silent */
+               return 0;
+       }
+
+       DBG_DEBUG("blocker_debug_state [%p] num_blockers [%zu]\n",
+                 state, state->num_blockers);
+       return 0;
+}
+
+static void delay_for_oplock_fn_watch_done(struct tevent_req *subreq);
+
 static bool delay_for_oplock_fn(
        struct share_mode_entry *e,
        bool *modified,
@@ -2504,6 +2523,8 @@ static bool delay_for_oplock_fn(
        uint32_t e_lease_type = SMB2_LEASE_NONE;
        uint32_t break_to;
        bool lease_is_breaking = false;
+       struct tevent_req *subreq = NULL;
+       struct server_id_buf idbuf = {};
 
        if (e_is_lease) {
                NTSTATUS status;
@@ -2643,9 +2664,56 @@ static bool delay_for_oplock_fn(
                state->delay = true;
        }
 
+       if (!state->delay) {
+               return false;
+       }
+
+       if (state->blocker_debug_state == NULL) {
+               return false;
+       }
+
+       subreq = server_id_watch_send(state->blocker_debug_state,
+                                     fsp->conn->sconn->ev_ctx,
+                                     e->pid);
+       if (subreq == NULL) {
+               DBG_ERR("server_id_watch_send(%s) returned NULL\n",
+                       server_id_str_buf(e->pid, &idbuf));
+               return false;
+       }
+
+       tevent_req_set_callback(subreq,
+                               delay_for_oplock_fn_watch_done,
+                               state->blocker_debug_state);
+
+       state->blocker_debug_state->num_blockers++;
+
+       DBG_DEBUG("Starting to watch pid [%s] state [%p] num_blockers [%zu]\n",
+                 server_id_str_buf(e->pid, &idbuf),
+                 state->blocker_debug_state,
+                 state->blocker_debug_state->num_blockers);
+
        return false;
 };
 
+static void delay_for_oplock_fn_watch_done(struct tevent_req *subreq)
+{ /*
+   * Completion callback for the server_id_watch_send() request started in
+   * delay_for_oplock_fn(). The callback data is the blocker_debug_state
+   * the watch request was created on. The function collects the result
+   * with server_id_watch_recv() and only logs it: DBG_ERR on failure,
+   * DBG_DEBUG with the returned server id on success.
+   * NOTE(review): subreq is not freed here. It was allocated on
+   * blocker_debug_state, so presumably it is released together with
+   * that state - confirm.
+   */
+       struct blocker_debug_state *blocker_debug_state = tevent_req_callback_data(
+               subreq, struct blocker_debug_state);
+       struct server_id pid = {};
+       struct server_id_buf idbuf = {};
+       int ret;
+
+       ret = server_id_watch_recv(subreq, &pid);
+       if (ret != 0) { /* ret is passed to strerror(), i.e. treated as an errno value */
+               DBG_ERR("server_id_watch_recv failed %s\n", strerror(ret));
+               return;
+       }
+
+       DBG_DEBUG("state [%p] server_id_watch_recv() returned pid [%s] exited\n",
+                 blocker_debug_state,
+                 server_id_str_buf(pid, &idbuf));
+}
+
 static NTSTATUS delay_for_oplock(files_struct *fsp,
                                 int oplock_request,
                                 const struct smb2_lease *lease,
@@ -2654,7 +2722,8 @@ static NTSTATUS delay_for_oplock(files_struct *fsp,
                                 uint32_t create_disposition,
                                 bool first_open_attempt,
                                 int *poplock_type,
-                                uint32_t *pgranted)
+                                uint32_t *pgranted,
+                                struct blocker_debug_state **blocker_debug_state)
 {
        struct delay_for_oplock_state state = {
                .fsp = fsp,
@@ -2700,6 +2769,22 @@ static NTSTATUS delay_for_oplock(files_struct *fsp,
                goto grant;
        }
 
+       if (lp_parm_bool(GLOBAL_SECTION_SNUM,
+                        "smbd lease break",
+                        "debug hung procs",
+                        false))
+       {
+               state.blocker_debug_state = talloc_zero(fsp,
+                                               struct blocker_debug_state);
+               if (state.blocker_debug_state == NULL) {
+                       return NT_STATUS_NO_MEMORY;
+               }
+               talloc_steal(talloc_tos(), state.blocker_debug_state);
+
+               talloc_set_destructor(state.blocker_debug_state,
+                                     blocker_debug_state_destructor);
+       }
+
        state.delay_mask = have_sharing_violation ?
                SMB2_LEASE_HANDLE : SMB2_LEASE_WRITE;
 
@@ -2721,6 +2806,7 @@ static NTSTATUS delay_for_oplock(files_struct *fsp,
        }
 
        if (state.delay) {
+               *blocker_debug_state = state.blocker_debug_state;
                return NT_STATUS_RETRY;
        }
 
@@ -2834,7 +2920,8 @@ static NTSTATUS handle_share_mode_lease(
        const struct smb2_lease *lease,
        bool first_open_attempt,
        int *poplock_type,
-       uint32_t *pgranted)
+       uint32_t *pgranted,
+       struct blocker_debug_state **blocker_debug_state)
 {
        bool sharing_violation = false;
        NTSTATUS status;
@@ -2875,7 +2962,8 @@ static NTSTATUS handle_share_mode_lease(
                create_disposition,
                first_open_attempt,
                poplock_type,
-               pgranted);
+               pgranted,
+               blocker_debug_state);
        if (!NT_STATUS_IS_OK(status)) {
                return status;
        }
@@ -2910,7 +2998,8 @@ static void defer_open_done(struct tevent_req *req);
 static void defer_open(struct share_mode_lock *lck,
                       struct timeval timeout,
                       struct smb_request *req,
-                      struct file_id id)
+                      struct file_id id,
+                      struct blocker_debug_state **blocker_debug_state)
 {
        struct deferred_open_record *open_rec = NULL;
        struct timeval abs_timeout;
@@ -2954,6 +3043,8 @@ static void defer_open(struct share_mode_lock *lck,
        }
        tevent_req_set_callback(watch_req, defer_open_done, watch_state);
 
+       talloc_move(watch_req, blocker_debug_state);
+
        ok = tevent_req_set_endtime(watch_req, req->sconn->ev_ctx, abs_timeout);
        if (!ok) {
                exit_server("tevent_req_set_endtime failed");
@@ -3236,7 +3327,8 @@ static bool open_match_attributes(connection_struct *conn,
 
 static void schedule_defer_open(struct share_mode_lock *lck,
                                struct file_id id,
-                               struct smb_request *req)
+                               struct smb_request *req,
+                               struct blocker_debug_state **blocker_debug_state)
 {
        /* This is a relative time, added to the absolute
           request_time value to get the absolute timeout time.
@@ -3260,7 +3352,7 @@ static void schedule_defer_open(struct share_mode_lock *lck,
                return;
        }
 
-       defer_open(lck, timeout, req, id);
+       defer_open(lck, timeout, req, id, blocker_debug_state);
 }
 
 /****************************************************************************
@@ -3322,6 +3414,7 @@ static NTSTATUS check_and_store_share_mode(
        int oplock_type = NO_OPLOCK;
        uint32_t granted_lease = 0;
        const struct smb2_lease_key *lease_key = NULL;
+       struct blocker_debug_state *blocker_debug_state = NULL;
        bool delete_on_close;
        bool ok;
 
@@ -3344,9 +3437,10 @@ static NTSTATUS check_and_store_share_mode(
                                         lease,
                                         first_open_attempt,
                                         &oplock_type,
-                                        &granted_lease);
+                                        &granted_lease,
+                                        &blocker_debug_state);
        if (NT_STATUS_EQUAL(status, NT_STATUS_RETRY)) {
-               schedule_defer_open(lck, fsp->file_id, req);
+               schedule_defer_open(lck, fsp->file_id, req, &blocker_debug_state);
                return NT_STATUS_SHARING_VIOLATION;
        }
        if (!NT_STATUS_IS_OK(status)) {