]> git.ipfire.org Git - thirdparty/bacula.git/commitdiff
use pthread_kill() instead of pthread_cancel() to terminate SD_msg_chan
author Alain Spineux <alain@baculasystems.com>
Tue, 5 Dec 2017 09:53:51 +0000 (10:53 +0100)
committer Kern Sibbald <kern@sibbald.com>
Sat, 14 Jul 2018 09:48:22 +0000 (11:48 +0200)
- fix a deadlock after a protocol error
- bdir_get_msg() uses locks, SQL, and FILE*.
  This is not compatible with pthread_cancel()
  while (!job_canceled(jcr) && (n=bget_dirmsg(sd)) >= 0)
  The msg_thread uses bdir_get_msg() to dispatch Jmsg,
  and to handle catalog information

- moved the code into the new terminate_sd_msg_chan_thread(JCR)

bacula/src/dird/backup.c
bacula/src/dird/mac.c
bacula/src/dird/msgchan.c
bacula/src/dird/protos.h
bacula/src/dird/restore.c
bacula/src/dird/vbackup.c

index 47888eecc37f2939a2bb48d8bae532b5f13fc74a..f31fd447269c0145671a3ade03cd7580a73f3739 100644 (file)
@@ -845,21 +845,11 @@ void backup_cleanup(JCR *jcr, int TermCode)
       case JS_ErrorTerminated:
          Mmsg(term_msg, _("*** Backup Error ***"));
          msg_type = M_ERROR;          /* Generate error message */
-         if (jcr->store_bsock) {
-            jcr->store_bsock->signal(BNET_TERMINATE);
-            if (jcr->SD_msg_chan_started) {
-               pthread_cancel(jcr->SD_msg_chan);
-            }
-         }
+         terminate_sd_msg_chan_thread(jcr);
          break;
       case JS_Canceled:
          Mmsg(term_msg, _("Backup Canceled"));
-         if (jcr->store_bsock) {
-            jcr->store_bsock->signal(BNET_TERMINATE);
-            if (jcr->SD_msg_chan_started) {
-               pthread_cancel(jcr->SD_msg_chan);
-            }
-         }
+         terminate_sd_msg_chan_thread(jcr);
          break;
       default:
          Mmsg(term_msg, _("Inappropriate term code: %c\n"), jcr->JobStatus);
index 9d4b135c4122c4569f04edda1dd0f375a471c901..1ed32c1b3e02252e48016127ef9f450e407dce22 100644 (file)
@@ -793,33 +793,13 @@ void mac_cleanup(JCR *jcr, int TermCode, int writeTermCode)
       case JS_ErrorTerminated:
          Mmsg(term_msg, _("*** %%s Error ***"));
          msg_type = M_ERROR;          /* Generate error message */
-         if (jcr->store_bsock) {
-            jcr->store_bsock->signal(BNET_TERMINATE);
-            if (jcr->SD_msg_chan_started) {
-               pthread_cancel(jcr->SD_msg_chan);
-            }
-         }
-         if (wjcr->store_bsock) {
-            wjcr->store_bsock->signal(BNET_TERMINATE);
-            if (wjcr->SD_msg_chan_started) {
-               pthread_cancel(wjcr->SD_msg_chan);
-            }
-         }
+         terminate_sd_msg_chan_thread(jcr);
+         terminate_sd_msg_chan_thread(wjcr);
          break;
       case JS_Canceled:
          Mmsg(term_msg, _("%%s Canceled"));
-         if (jcr->store_bsock) {
-            jcr->store_bsock->signal(BNET_TERMINATE);
-            if (jcr->SD_msg_chan_started) {
-               pthread_cancel(jcr->SD_msg_chan);
-            }
-         }
-         if (wjcr->store_bsock) {
-            wjcr->store_bsock->signal(BNET_TERMINATE);
-            if (wjcr->SD_msg_chan_started) {
-               pthread_cancel(wjcr->SD_msg_chan);
-            }
-         }
+         terminate_sd_msg_chan_thread(jcr);
+         terminate_sd_msg_chan_thread(wjcr);
          break;
       default:
          Mmsg(term_msg, _("Inappropriate %s term code"));
index 6ead5516ff504164703340062e4b796efa5e6662..247b09d17c8c2bc74a714b0e39f7628039806d12 100644 (file)
@@ -501,6 +501,38 @@ void wait_for_storage_daemon_termination(JCR *jcr)
    jcr->setJobStatus(JS_Terminated);
 }
 
+void terminate_sd_msg_chan_thread(JCR *jcr)   /* Send BNET_TERMINATE on the SD socket, then signal the SD message thread until it reports done or the retries run out */
+{
+   if (jcr->store_bsock) {                     /* without a Storage daemon socket this function does nothing */
+      jcr->store_bsock->signal(BNET_TERMINATE);
+      jcr->lock();                             /* sd_msg_thread_done is tested with the JCR lock held */
+      if (  !jcr->sd_msg_thread_done
+          && jcr->SD_msg_chan_started
+          && !pthread_equal(jcr->SD_msg_chan, pthread_self())) {   /* skip when the caller is the message thread itself */
+         Dmsg1(800, "Send kill to SD msg chan jid=%d\n", jcr->JobId);
+         int cnt = 6; // 6*5sec: at most 6 rounds, each waiting up to about 5 seconds
+         while (!jcr->sd_msg_thread_done && cnt>0) {
+            jcr->unlock();                      /* the JCR lock is released while signalling and waiting */
+            pthread_kill(jcr->SD_msg_chan, TIMEOUT_SIGNAL);   /* presumably interrupts a blocking call in the message thread -- TODO confirm */
+            struct timeval tv;
+            struct timezone tz;
+            struct timespec timeout;
+
+            gettimeofday(&tv, &tz);
+            timeout.tv_nsec = 0;                /* sub-second part of "now" is dropped */
+            timeout.tv_sec = tv.tv_sec + 5; /* wait 5 seconds */
+            Dmsg0(00, "I'm waiting for message thread termination.\n");   /* NOTE(review): level 00 differs from the 800 used above -- confirm intended */
+            P(mutex);                           /* `mutex` is not declared in this block; presumably file scope -- verify */
+            pthread_cond_timedwait(&jcr->term_wait, &mutex, &timeout);   /* return value ignored; the loop re-tests the done flag */
+            V(mutex);
+            jcr->lock();                        /* re-acquire before the loop condition reads sd_msg_thread_done */
+            cnt--;
+         }
+      }
+      jcr->unlock();                           /* pairs with the lock() before the if, or with the last lock() in the loop */
+   }
+}
+
 /*
  * Send bootstrap file to Storage daemon.
  *  This is used for restore, verify VolumeToCatalog, migration,
index bac1b68af166a47bccc2726d8fc1c2be39972df9..a35d969dda72d93dfafbe28ebb0ae34b4355d43c 100644 (file)
@@ -151,6 +151,7 @@ extern void dird_free_jcr_pointers(JCR *jcr);
 extern void cancel_storage_daemon_job(JCR *jcr);
 extern bool run_console_command(JCR *jcr, const char *cmd);
 extern void sd_msg_thread_send_signal(JCR *jcr, int sig);
+void terminate_sd_msg_chan_thread(JCR *jcr);
 
 /* jobq.c */
 extern bool inc_read_store(JCR *jcr);
index 022e9479b2f97d3687ad8a2833a69e7b73c450bd..7848a07f8c8286ab8d8b22d84649914699baa344 100644 (file)
@@ -659,21 +659,11 @@ void restore_cleanup(JCR *jcr, int TermCode)
    case JS_ErrorTerminated:
       term_msg = _("*** Restore Error ***");
       msg_type = M_ERROR;          /* Generate error message */
-      if (jcr->store_bsock) {
-         jcr->store_bsock->signal(BNET_TERMINATE);
-         if (jcr->SD_msg_chan_started) {
-            pthread_cancel(jcr->SD_msg_chan);
-         }
-      }
+      terminate_sd_msg_chan_thread(jcr);
       break;
    case JS_Canceled:
       term_msg = _("Restore Canceled");
-      if (jcr->store_bsock) {
-         jcr->store_bsock->signal(BNET_TERMINATE);
-         if (jcr->SD_msg_chan_started) {
-            pthread_cancel(jcr->SD_msg_chan);
-         }
-      }
+      terminate_sd_msg_chan_thread(jcr);
       break;
    default:
       term_msg = term_code;
index 278d96f9351236624fd889af642c4b7582a61f55..ab7c2d4fcb10267d8f815a3e2c55bdf104e44d63 100644 (file)
@@ -423,21 +423,11 @@ void vbackup_cleanup(JCR *jcr, int TermCode)
       case JS_ErrorTerminated:
          term_msg = _("*** Backup Error ***");
          msg_type = M_ERROR;          /* Generate error message */
-         if (jcr->store_bsock) {
-            jcr->store_bsock->signal(BNET_TERMINATE);
-            if (jcr->SD_msg_chan_started) {
-               pthread_cancel(jcr->SD_msg_chan);
-            }
-         }
+         terminate_sd_msg_chan_thread(jcr);
          break;
       case JS_Canceled:
          term_msg = _("Backup Canceled");
-         if (jcr->store_bsock) {
-            jcr->store_bsock->signal(BNET_TERMINATE);
-            if (jcr->SD_msg_chan_started) {
-               pthread_cancel(jcr->SD_msg_chan);
-            }
-         }
+         terminate_sd_msg_chan_thread(jcr);
          break;
       default:
          term_msg = term_code;