From: Oliver Kurth
Date: Fri, 26 Oct 2018 17:44:58 +0000 (-0700)
Subject: Implement tools hang detection logic
X-Git-Tag: stable-11.0.0~365
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=0e4b5b90b268a542984f572909ea49d32bd15b12;p=thirdparty%2Fopen-vm-tools.git

Implement tools hang detection logic

Create a dedicated detector thread. The thread sits in a loop and wakes up periodically to decrement an atomic counter. Also schedule a checkin timer with the main loop to reset the counter periodically.

If the counter ever drops to or below zero, a tools hang is detected and a tools hang event is generated. Otherwise, if there was a hang but the counter has now come back up to a positive value, a tools recovery event is generated.

In order to properly create hang and recovery events, the previous state needs to be tracked.

In order to properly handle shutdown, we need a condition variable so that the detector thread can be woken up while it is sleeping. This is because toolsd calls the thread pool shutdown function, which in turn calls each thread's terminate function and waits for the threads to quit. Therefore, our terminate function must wake up the detector thread and make it quit; otherwise, the toolsd shutdown would hang.

The next change will implement the new RPCI command to send the hang/recovery event to the VMX.
--- diff --git a/open-vm-tools/services/vmtoolsd/Makefile.am b/open-vm-tools/services/vmtoolsd/Makefile.am index c21a8bb3e..90f271717 100644 --- a/open-vm-tools/services/vmtoolsd/Makefile.am +++ b/open-vm-tools/services/vmtoolsd/Makefile.am @@ -42,6 +42,7 @@ vmtoolsd_SOURCES += serviceObj.c vmtoolsd_SOURCES += threadPool.c vmtoolsd_SOURCES += toolsRpc.c vmtoolsd_SOURCES += svcSignals.c +vmtoolsd_SOURCES += toolsHangDetector.c BUILT_SOURCES = BUILT_SOURCES += svcSignals.c diff --git a/open-vm-tools/services/vmtoolsd/mainLoop.c b/open-vm-tools/services/vmtoolsd/mainLoop.c index aabc25cea..bf6dde84a 100644 --- a/open-vm-tools/services/vmtoolsd/mainLoop.c +++ b/open-vm-tools/services/vmtoolsd/mainLoop.c @@ -33,6 +33,7 @@ #include "conf.h" #include "guestApp.h" #include "serviceObj.h" +#include "toolsHangDetector.h" #include "str.h" #include "system.h" #include "util.h" @@ -455,6 +456,12 @@ ToolsCoreRunLoop(ToolsServiceState *state) #if defined(__APPLE__) ToolsCore_CFRunLoop(state); #else + /* + * For now exclude the MAC due to limited testing. + */ + if (state->mainService && ToolsCoreHangDetector_Start(&state->ctx)) { + g_info("Successfully started tools hang detector"); + } g_main_loop_run(state->ctx.mainLoop); #endif } diff --git a/open-vm-tools/services/vmtoolsd/toolsHangDetector.c b/open-vm-tools/services/vmtoolsd/toolsHangDetector.c new file mode 100644 index 000000000..97b68808c --- /dev/null +++ b/open-vm-tools/services/vmtoolsd/toolsHangDetector.c @@ -0,0 +1,404 @@ +/********************************************************* + * Copyright (C) 2018 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation version 2.1 and no later version. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + * or FITNESS FOR A PARTICULAR PURPOSE. See the Lesser GNU General Public + * License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + *********************************************************/ + +/** + * @file toolsHangDetector.c + * + * Implementation of the tools hang detection and reporting + */ + +#include + +#include "vmware.h" +#include "vmware/tools/threadPool.h" +#include "toolsHangDetector.h" + + +typedef enum { + NORMAL, + HUNG +} DetectedMode; + +typedef struct HangDetectorState { + /* 'mutex' and 'cond' protect concurrent accesses to 'terminate' flag */ + GMutex mutex; + GCond cond; + gboolean terminate; + + gint atomic; + DetectedMode mode; + GSource *checkinTimer; +} HangDetectorState; + +static HangDetectorState gDetectorState; + +#define SLEEP_INTERVAL 1 /* approximately 1 second */ +#define CHECKIN_INTERVAL 1 /* approximately 1 second */ +#define COUNTER_RESET_VALUE 5 /* approximately 5 seconds */ + + +/* + ****************************************************************************** + * DetectorInit -- */ /** + * + * Initialization + * + * Note: No need to call g_mutex_init, g_mutex_clear, g_cond_init, + * and g_cond_clear on statically allocated GMutex and GCond. + * Directly use them according to glib documentation. 
+ * + ****************************************************************************** + */ + +static void +DetectorInit(void) +{ + HangDetectorState *state = &gDetectorState; + + state->terminate = FALSE; + state->mode = NORMAL; + g_atomic_int_set(&state->atomic, COUNTER_RESET_VALUE); +} + + +/* + ****************************************************************************** + * DetectorFree -- */ /** + * + * Resets any global state and frees up memory + * + * @param[in] args Currently unused + * + ****************************************************************************** + */ + +static void +DetectorFree(UNUSED_PARAM(gpointer args)) +{ + HangDetectorState *state = &gDetectorState; + + if (state->checkinTimer) { + g_source_destroy(state->checkinTimer); + g_source_unref(state->checkinTimer); + state->checkinTimer = NULL; + } +} + + +/* + ****************************************************************************** + * DetectorTerminate -- */ /** + * + * Signals the detector thread to exit. + * + * @param[in] ctx Application context + * @param[in] args Currently unused + * + ****************************************************************************** + */ + +static void +DetectorTerminate(ToolsAppCtx *ctx, + UNUSED_PARAM(gpointer args)) +{ + HangDetectorState *state = &gDetectorState; + + g_mutex_lock(&state->mutex); + + state->terminate = TRUE; + g_cond_signal(&state->cond); + + g_mutex_unlock(&state->mutex); +} + + +/* + ****************************************************************************** + * UpdateVmx -- */ /** + * + * Notify the VMX about a tools service hang/recover event + * + * @param[in] event event to put into the notification RPCI command. 
+ * + ****************************************************************************** + */ + +static void +UpdateVmx(const char *event) +{ + /* TBD */ +} + + +/* + ****************************************************************************** + * UpdateStateToHung -- */ /** + * + * Update the current state to HUNG, and notify VMX + * + ****************************************************************************** + */ + +static void +UpdateStateToHung(void) +{ + HangDetectorState *state = &gDetectorState; + + state->mode = HUNG; + + g_info("tools service hung."); + + UpdateVmx("hang"); +} + + +/* + ****************************************************************************** + * UpdateStateToNormal -- */ /** + * + * Update the current state to NORMAL, and notify VMX + * + ****************************************************************************** + */ + +static void +UpdateStateToNormal(void) +{ + HangDetectorState *state = &gDetectorState; + + state->mode = NORMAL; + + g_info("tools service recovered from a hang."); + + UpdateVmx("recover"); +} + + +/* + ****************************************************************************** + * DetectorUpdate -- */ /** + * + * Check the counter value and send proper updates to VMX + * + * @param[in] value Counter value + * + ****************************************************************************** + */ + +static void +DetectorUpdate(gint value) +{ + HangDetectorState *state = &gDetectorState; + + if (state->mode == NORMAL) { + if (value <= 0) { + UpdateStateToHung(); + } + } else { + if (value > 0) { + UpdateStateToNormal(); + } + } +} + + +/* + ****************************************************************************** + * SleepToExit -- */ /** + * + * Sleep for the time specified, or return if the caller should terminate. + * + * @param[in] time Time in seconds to sleep + * + * @return TRUE if the caller should terminate, e.g. 
signaled by a terminator + * FALSE otherwise + * + ****************************************************************************** + */ + +static gboolean +SleepToExit(gint time) +{ + HangDetectorState *state = &gDetectorState; + gint64 endTime = g_get_monotonic_time() + time * G_TIME_SPAN_SECOND; + gboolean ret; + + g_mutex_lock(&state->mutex); + + while (!state->terminate) { + if (!g_cond_wait_until(&state->cond, &state->mutex, endTime)) { + /* endTime passed */ + ret = FALSE; + goto exit; + } + } + + ret = TRUE; + +exit: + + g_mutex_unlock(&state->mutex); + + return ret; +} + + +/* + ****************************************************************************** + * DetectorThread -- */ /** + * + * Detector thread entry function + * + * @param[in] ctx Application context + * @param[in] args Currently unused + * + ****************************************************************************** + */ + +static void +DetectorThread(ToolsAppCtx *ctx, + UNUSED_PARAM(gpointer args)) +{ + HangDetectorState *state = &gDetectorState; + + while (1) { + gint old = g_atomic_int_add(&state->atomic, -1); + DetectorUpdate(old); + + if (SleepToExit(SLEEP_INTERVAL)) { + break; + } + } +} + + +/* + ****************************************************************************** + * DetectorCheckin -- */ /** + * + * Check in with the detector by resetting the counter + * + * @param[in] args Currently unused + * + * @return TRUE always, otherwise the event source is removed by glib + * + ****************************************************************************** + */ + +static gboolean +DetectorCheckin(UNUSED_PARAM(gpointer args)) +{ + HangDetectorState *state = &gDetectorState; + + g_atomic_int_set(&state->atomic, COUNTER_RESET_VALUE); + + return TRUE; +} + + +/* + ****************************************************************************** + * ScheduleCheckinTimer -- */ /** + * + * Schedule the periodic checkin timer with the main loop + * + * @param[in] ctx Application Context + * 
+ * @return TRUE iff timer is successfully scheduled
+ *
+ ******************************************************************************
+ */
+
+static gboolean
+ScheduleCheckinTimer(ToolsAppCtx *ctx)
+{
+   GMainContext *loopCtx = g_main_loop_get_context(ctx->mainLoop);
+   GSource *timer;
+
+   ASSERT(NULL == gDetectorState.checkinTimer);
+   ASSERT(NULL != loopCtx);
+
+   /* CHECKIN_INTERVAL is in seconds, the timer is created from milliseconds. */
+   timer = VMTools_CreateTimer(CHECKIN_INTERVAL * 1000);
+
+   if (timer != NULL) {
+      g_source_set_callback(timer, DetectorCheckin, NULL, NULL);
+      g_source_attach(timer, loopCtx);
+
+      /* Remember the source so that DetectorFree can tear it down later. */
+      gDetectorState.checkinTimer = timer;
+   }
+
+   return timer != NULL;
+}
+
+
+/*
+ ******************************************************************************
+ * ToolsCoreHangDetector_Start --                                       */ /**
+ *
+ * Register the checkin function to the tools main loop as a timer handler.
+ * Start the detector thread to watch for the tools hang.
+ *
+ * @param[in]  ctx      Application context.
+ * + * @return TRUE iff the hang detector is successfully started + * + ****************************************************************************** + */ + +gboolean +ToolsCoreHangDetector_Start(ToolsAppCtx *ctx) +{ + gboolean ret; + GKeyFile *cfg = ctx->config; + gboolean disabled; + + ASSERT(NULL != cfg); + disabled = g_key_file_get_boolean(cfg, VMTOOLS_GUEST_SERVICE, + "toolsHangDetectorDisabled", + NULL); + if (disabled) { + g_info("tools hang detector is disabled"); + ret = FALSE; + goto exit; + } + + DetectorInit(); + + ret = ScheduleCheckinTimer(ctx); + if (!ret) { + g_info("Unable to schedule hang detector checkin timer on the main loop"); + goto exit; + } + + ret = ToolsCorePool_StartThread(ctx, + DetectorThread, + DetectorTerminate, + NULL, + DetectorFree); + if (!ret) { + g_info("Unable to start the detector thread"); + DetectorFree(NULL); + } + +exit: + + return ret; +} diff --git a/open-vm-tools/services/vmtoolsd/toolsHangDetector.h b/open-vm-tools/services/vmtoolsd/toolsHangDetector.h new file mode 100644 index 000000000..89150d210 --- /dev/null +++ b/open-vm-tools/services/vmtoolsd/toolsHangDetector.h @@ -0,0 +1,30 @@ +/********************************************************* + * Copyright (C) 2018 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation version 2.1 and no later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + * or FITNESS FOR A PARTICULAR PURPOSE. See the Lesser GNU General Public + * License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 
+ *
+ *********************************************************/
+
+#ifndef _TOOLS_HANG_DETECTOR_H_
+#define _TOOLS_HANG_DETECTOR_H_
+
+/**
+ * @file toolsHangDetector.h
+ *
+ * Interface of the tools hang detection module.
+ */
+
+/*
+ * NOTE(review): this header uses gboolean and ToolsAppCtx but includes
+ * nothing itself; presumably every includer pulls in the defining headers
+ * first -- confirm.
+ */
+
+/**
+ * Starts the tools hang detector: schedules the periodic checkin timer on
+ * the main loop of the given context and starts the detector thread.
+ *
+ * @param[in]  ctx      Application context.
+ *
+ * @return TRUE iff the hang detector is successfully started. FALSE when the
+ *         "toolsHangDetectorDisabled" configuration key is set, or when the
+ *         timer or the thread could not be set up.
+ */
+gboolean ToolsCoreHangDetector_Start(ToolsAppCtx *ctx);
+
+#endif /* _TOOLS_HANG_DETECTOR_H_ */