From: Michal Rakowski Date: Mon, 28 Jun 2021 07:02:02 +0000 (+0200) Subject: Fix #7671 About stopping job waiting for the device X-Git-Tag: Release-11.3.2~473 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=dd6c3c4dd734fef880ceae87827a0930628a2adc;p=thirdparty%2Fbacula.git Fix #7671 About stopping job waiting for the device 1. Decrement the timeout value for running jobs when we are waiting on the SD to stop the job and free all of its related JCRs. 2. When 'stop' is called against a job which has not been started, upgrade the command to 'cancel' instead and inform the user about that. --- diff --git a/bacula/src/dird/backup.c b/bacula/src/dird/backup.c index 686d9af59..e5b5a2c81 100644 --- a/bacula/src/dird/backup.c +++ b/bacula/src/dird/backup.c @@ -807,7 +807,7 @@ int wait_for_job_termination(JCR *jcr, int timeout) Jmsg(jcr, M_WARNING, 0, _("Unexpected Client Job message: %s\n"), fd->msg); } - if (job_canceled(jcr)) { + if (job_canceled(jcr) || jcr->is_incomplete()) { break; } } diff --git a/bacula/src/dird/job.c b/bacula/src/dird/job.c index ddc5b4111..b86dd997b 100644 --- a/bacula/src/dird/job.c +++ b/bacula/src/dird/job.c @@ -685,7 +685,10 @@ int cancel_inactive_job(UAContext *ua) } jcr->store_mngr->set_wstorage(store.store, store.store_source); - cancel_sd_job(ua, "cancel", jcr); + if (!cancel_sd_job(ua, "cancel", jcr)) { + ua->error_msg(_("Failed to cancel storage dameon job for JobId=%d\n"), jcr->JobId); + goto bail_out; + } bail_out: jcr->JobId = 0; @@ -706,6 +709,7 @@ cancel_job(UAContext *ua, JCR *jcr, int wait, bool cancel) char ed1[50]; int32_t old_status = jcr->JobStatus; int status; + bool ret = false, not_running = false; const char *reason, *cmd; /* Keep track of this important event */ @@ -723,7 +727,24 @@ cancel_job(UAContext *ua, JCR *jcr, int wait, bool cancel) } } - if (cancel) { + switch (old_status) { + case JS_Created: + case JS_WaitJobRes: + case JS_WaitClientRes: + case JS_WaitStoreRes: + case JS_WaitPriority: + case
JS_WaitMaxJobs: + case JS_WaitStartTime: + case JS_WaitDevice: + not_running = true; + break; + default: + break; + } + + /* If job has not been started at all, there is no need to stoping it, + * can be simply canceled and removed from the watiting queue*/ + if (cancel || not_running) { status = JS_Canceled; reason = _("canceled"); cmd = NT_("cancel"); @@ -736,29 +757,34 @@ cancel_job(UAContext *ua, JCR *jcr, int wait, bool cancel) jcr->setJobStatus(status); - switch (old_status) { - case JS_Created: - case JS_WaitJobRes: - case JS_WaitClientRes: - case JS_WaitStoreRes: - case JS_WaitPriority: - case JS_WaitMaxJobs: - case JS_WaitStartTime: - case JS_WaitDevice: - ua->info_msg(_("JobId %s, Job %s marked to be %s.\n"), - edit_uint64(jcr->JobId, ed1), jcr->Job, - reason); - jobq_remove(&job_queue, jcr); /* attempt to remove it from queue */ - break; - - default: + if (not_running) { + status = jobq_remove(&job_queue, jcr); /* attempt to remove it from queue */ + if (status != 0) { + ua->error_msg(_("Cannot %s JobId %s, Job %s is not in work queue\n"), + cmd, edit_uint64(jcr->JobId, ed1), jcr->Job); + goto bail_out; + } + if (!cancel) { + /* Inform user about command 'upgrade' */ + ua->info_msg(_("Canceling JobId %s, Job %s because it was not started at all.\n"), + edit_uint64(jcr->JobId, ed1), jcr->Job, + reason); + } + ret = true; + } else { + ret = true; /* This will be set to false in case of error from any daemon below */ /* Cancel File daemon */ if (jcr->file_bsock) { btimer_t *tid; - /* do not return now, we want to try to cancel the sd */ + /* do not return now, we want to try to cancel the sd. + * We don't want to wait too long because it's pretty hard to synchronize jcrs for both daemons + * and we want to shutdown the connection anyway. 
*/ tid = start_bsock_timer(jcr->file_bsock, 120); - cancel_file_daemon_job(ua, cmd, jcr); + if (!cancel_file_daemon_job(ua, cmd, jcr)) { + Dmsg1(400, "Failed to cancel file dameon job id=%d\n", jcr->JobId); + ret = false; + } stop_bsock_timer(tid); } @@ -773,15 +799,18 @@ cancel_job(UAContext *ua, JCR *jcr, int wait, bool cancel) /* Cancel Storage daemon */ if (jcr->store_bsock) { btimer_t *tid; - /* do not return now, we want to try to cancel the sd socket */ - tid = start_bsock_timer(jcr->store_bsock, 120); - cancel_sd_job(ua, cmd, jcr); + /* Do not return now, we want to try to cancel the sd socket. + * We don't want to wait too long because it's pretty hard to synchronize jcrs freeing for both daemons + * and we want to shutdown the connection anyway. */ + tid = start_bsock_timer(jcr->store_bsock, 20); + if (!cancel_sd_job(ua, cmd, jcr)) { + Dmsg1(400, "Failed to cancel storage dameon job id=%d\n", jcr->JobId); + ret = false; + } stop_bsock_timer(tid); } - /* We test file_bsock because the previous operation can take - * several minutes - */ + /* We test file_bsock because the previous operation can take some time */ if (jcr->store_bsock && cancel) { jcr->store_bsock->set_timed_out(); jcr->store_bsock->set_terminated(); @@ -796,14 +825,17 @@ cancel_job(UAContext *ua, JCR *jcr, int wait, bool cancel) if (wjcr->store_bsock) { btimer_t *tid; - /* do not return now, we want to try to cancel the sd socket */ - tid = start_bsock_timer(wjcr->store_bsock, 120); - cancel_sd_job(ua, cmd, wjcr); + /* Do not return now, we want to try to cancel the sd socket. + * We don't want to wait too long because it's pretty hard to synchronize jcrs freeing for both daemons + * and we want to shutdown the connection anyway. 
*/ + tid = start_bsock_timer(wjcr->store_bsock, 20); + if (!cancel_sd_job(ua, cmd, wjcr)) { + Dmsg1(400, "Failed to cancel storage dameon job id=%d\n", jcr->JobId); + ret = false; + } stop_bsock_timer(tid); } - /* We test store_bsock because the previous operation can take - * several minutes - */ + /* We test store_bsock because the previous operation can take some time */ if (wjcr->store_bsock && cancel) { wjcr->store_bsock->set_timed_out(); wjcr->store_bsock->set_terminated(); @@ -811,10 +843,15 @@ cancel_job(UAContext *ua, JCR *jcr, int wait, bool cancel) wjcr->my_thread_send_signal(TIMEOUT_SIGNAL); } } - break; } - return true; + if (ret) { + ua->info_msg(_("JobId %s, Job %s marked to be %s.\n"), + edit_uint64(jcr->JobId, ed1), jcr->Job, + reason); + } +bail_out: + return ret; } void cancel_storage_daemon_job(JCR *jcr) @@ -1522,6 +1559,7 @@ void dird_free_jcr(JCR *jcr) /* Free bsock packets */ free_bsock(jcr->file_bsock); free_bsock(jcr->store_bsock); + if (jcr->term_wait_inited) { pthread_cond_destroy(&jcr->term_wait); jcr->term_wait_inited = false; diff --git a/bacula/src/filed/job.c b/bacula/src/filed/job.c index c29412f7c..09713d29a 100644 --- a/bacula/src/filed/job.c +++ b/bacula/src/filed/job.c @@ -3485,7 +3485,7 @@ int response(JCR *jcr, BSOCK *sd, char *resp, const char *cmd) return 1; } } - if (job_canceled(jcr)) { + if (job_canceled(jcr) || jcr->is_incomplete()) { return 0; /* if canceled avoid useless error messages */ } if (sd->is_error()) { diff --git a/bacula/src/jcr.h b/bacula/src/jcr.h index 3c2bae804..f8ef5c314 100644 --- a/bacula/src/jcr.h +++ b/bacula/src/jcr.h @@ -107,7 +107,6 @@ enum { #define job_canceled(jcr) \ (jcr->JobStatus == JS_Canceled || \ - jcr->JobStatus == JS_Incomplete || \ jcr->JobStatus == JS_ErrorTerminated || \ jcr->JobStatus == JS_FatalError \ ) diff --git a/bacula/src/stored/acquire.c b/bacula/src/stored/acquire.c index b1436ced1..9421df907 100644 --- a/bacula/src/stored/acquire.c +++ b/bacula/src/stored/acquire.c @@ 
-418,7 +418,7 @@ DCR *acquire_device_for_append(DCR *dcr) dev->Unlock(); Dmsg1(190, "jid=%u Do mount_next_write_vol\n", (uint32_t)jcr->JobId); if (!dcr->mount_next_write_volume()) { - if (!job_canceled(jcr)) { + if (!job_canceled(jcr) && !jcr->is_incomplete()) { /* Reduce "noise" -- don't print if job canceled */ Mmsg2(jcr->errmsg, _("Could not ready %s device %s for append.\n"), dev->print_type(), dev->print_name()); diff --git a/bacula/src/stored/askdir.c b/bacula/src/stored/askdir.c index 1c3125092..7bcedc2b8 100644 --- a/bacula/src/stored/askdir.c +++ b/bacula/src/stored/askdir.c @@ -822,14 +822,14 @@ bool dir_ask_sysop_to_create_appendable_volume(DCR *dcr) JCR *jcr = dcr->jcr; bool got_vol = false; - if (job_canceled(jcr)) { + if (job_canceled(jcr) || jcr->is_incomplete()) { dev->poll = false; return false; } Dmsg0(400, "enter dir_ask_sysop_to_create_appendable_volume\n"); ASSERT(dev->blocked()); for ( ;; ) { - if (job_canceled(jcr)) { + if (job_canceled(jcr) || jcr->is_incomplete()) { Mmsg(dev->errmsg, _("Job %s canceled while waiting for mount on Storage Device \"%s\".\n"), jcr->Job, dev->print_name()); @@ -866,6 +866,14 @@ bool dir_ask_sysop_to_create_appendable_volume(DCR *dcr) Dmsg1(dbglvl, "Poll timeout in create append vol on device %s\n", dev->print_name()); continue; } + if (stat == W_WAKE) { + /* Job could be marked to stopped, need to break */ + Mmsg0(dev->errmsg, _("Job was stopped by the user.\n")); + Jmsg(jcr, M_INFO, 0, "%s", dev->errmsg); + Dmsg1(dbglvl, "Job marked to be stopped. 
Gave up waiting on device %s\n", dev->print_name()); + dev->poll = false; + return false; + } if (stat == W_TIMEOUT) { if (!double_dev_wait_time(dev)) { Mmsg(dev->errmsg, _("Max time exceeded waiting to mount Storage Device %s for Job %s\n"), @@ -930,7 +938,7 @@ bool dir_ask_sysop_to_mount_volume(DCR *dcr, bool write_access) } for ( ;; ) { - if (job_canceled(jcr)) { + if (job_canceled(jcr) || jcr->is_incomplete()) { Mmsg(dev->errmsg, _("Job %s canceled while waiting for mount on Storage Device %s.\n"), jcr->Job, dev->print_name()); dev->poll = false; @@ -978,6 +986,14 @@ bool dir_ask_sysop_to_mount_volume(DCR *dcr, bool write_access) goto get_out; } + if (stat == W_WAKE) { + /* Job could be marked to stopped, need to break */ + Mmsg0(dev->errmsg, _("Job was stopped by the user.\n")); + Jmsg(jcr, M_INFO, 0, "%s", dev->errmsg); + Dmsg1(dbglvl, "Job marked to be stopped. Gave up waiting on device %s\n", dev->print_name()); + dev->poll = false; + return false; + } if (stat == W_TIMEOUT) { if (!double_dev_wait_time(dev)) { Mmsg(dev->errmsg, _("Max time exceeded waiting to mount Storage Device %s for Job %s\n"), @@ -1001,7 +1017,7 @@ bool dir_ask_sysop_to_mount_volume(DCR *dcr, bool write_access) } get_out: - if (job_canceled(jcr)) { + if (job_canceled(jcr) || jcr->is_incomplete()) { Mmsg(dev->errmsg, _("Job %s canceled while waiting for mount on Storage Device %s.\n"), jcr->Job, dev->print_name()); dev->poll = false; diff --git a/bacula/src/stored/dircmd.c b/bacula/src/stored/dircmd.c index 264109760..d203e205a 100644 --- a/bacula/src/stored/dircmd.c +++ b/bacula/src/stored/dircmd.c @@ -503,7 +503,7 @@ static bool cancel_cmd(JCR *cjcr) BSOCK *dir = cjcr->dir_bsock; int oldStatus; char Job[MAX_NAME_LENGTH]; - JCR *jcr; + JCR *jcr = NULL; int status; const char *reason; @@ -547,13 +547,17 @@ static bool cancel_cmd(JCR *cjcr) pthread_cond_broadcast(&wait_device_release); } jcr->my_thread_send_signal(TIMEOUT_SIGNAL); + /* Inform the Director about the result and send EOD 
signal before free_jcr() below, + * since it might take some time if we hold the last reference */ dir->fsend(_("3000 JobId=%ld Job=\"%s\" marked to be %s.\n"), jcr->JobId, jcr->Job, reason); - free_jcr(jcr); } bail_out: dir->signal(BNET_EOD); + if (jcr) { + free_jcr(jcr); + } return 1; } diff --git a/bacula/src/stored/fd_cmds.c b/bacula/src/stored/fd_cmds.c index c9e3169db..d9b0ce97c 100644 --- a/bacula/src/stored/fd_cmds.c +++ b/bacula/src/stored/fd_cmds.c @@ -235,7 +235,7 @@ void do_client_commands(JCR *jcr) jcr->errmsg[0] = 0; if (!fd_cmds[i].func(jcr)) { /* do command */ /* Note fd->msg command may be destroyed by comm activity */ - if (!job_canceled(jcr)) { + if (!job_canceled(jcr) && !jcr->is_incomplete()) { strip_trailing_junk(fd->msg); if (jcr->errmsg[0]) { strip_trailing_junk(jcr->errmsg); diff --git a/bacula/src/stored/job.c b/bacula/src/stored/job.c index a0ad05fdb..7155bfc81 100644 --- a/bacula/src/stored/job.c +++ b/bacula/src/stored/job.c @@ -247,7 +247,7 @@ bool run_cmd(JCR *jcr) memset(jcr->sd_auth_key, 0, strlen(jcr->sd_auth_key)); - if (jcr->authenticated && !job_canceled(jcr)) { + if (jcr->authenticated && !job_canceled(jcr) && !jcr->is_incomplete()) { Dmsg2(800, "Running jid=%d %p\n", jcr->JobId, jcr); run_job(jcr); /* Run the job */ } diff --git a/bacula/src/stored/mount.c b/bacula/src/stored/mount.c index 6fc35baf2..083f0259c 100644 --- a/bacula/src/stored/mount.c +++ b/bacula/src/stored/mount.c @@ -99,7 +99,7 @@ mount_next_vol: P(mount_mutex); Dmsg1(90, "Continue after dir_ask_sysop_to_mount. must_load=%d\n", dev->must_load()); } - if (job_canceled(jcr)) { + if (job_canceled(jcr) || jcr->is_incomplete()) { Jmsg(jcr, M_FATAL, 0, _("Job %d canceled.\n"), jcr->JobId); goto bail_out; } @@ -116,7 +116,7 @@ mount_next_vol: goto bail_out; } - if (job_canceled(jcr)) { + if (job_canceled(jcr) || jcr->is_incomplete()) { goto bail_out; } Dmsg3(100, "After find_a_volume. 
Vol=%s Slot=%d VolType=%d\n", @@ -175,7 +175,7 @@ mount_next_vol: } P(mount_mutex); } - if (job_canceled(jcr)) { + if (job_canceled(jcr) || jcr->is_incomplete()) { goto bail_out; } Dmsg3(100, "want vol=%s devvol=%s dev=%s\n", VolumeName, @@ -370,7 +370,7 @@ bool DCR::find_a_volume() Dmsg0(200, "Before dir_find_next_appendable_volume.\n"); while (!dir_find_next_appendable_volume(dcr)) { Dmsg0(200, "not dir_find_next\n"); - if (job_canceled(jcr)) { + if (job_canceled(jcr) || jcr->is_incomplete()) { return false; } /* @@ -387,7 +387,7 @@ bool DCR::find_a_volume() ok = dir_ask_sysop_to_create_appendable_volume(dcr); } P(mount_mutex); - if (!ok || job_canceled(jcr)) { + if (!ok || job_canceled(jcr) || jcr->is_incomplete()) { return false; } Dmsg0(150, "Again dir_find_next_append...\n"); diff --git a/bacula/src/stored/reserve.c b/bacula/src/stored/reserve.c index 79d272428..e85e58bd6 100644 --- a/bacula/src/stored/reserve.c +++ b/bacula/src/stored/reserve.c @@ -367,7 +367,7 @@ static bool use_device_cmd(JCR *jcr) rctx.jcr->read_dcr = jcr->dcr; } lock_reservations(); - for ( ; !fail && !job_canceled(jcr); ) { + for ( ; !fail && !job_canceled(jcr) && !jcr->is_incomplete(); ) { int i; pop_reserve_messages(jcr); rctx.suitable_device = false; diff --git a/bacula/src/stored/vol_mgr.c b/bacula/src/stored/vol_mgr.c index f1de74ffd..d25556cd6 100644 --- a/bacula/src/stored/vol_mgr.c +++ b/bacula/src/stored/vol_mgr.c @@ -370,7 +370,7 @@ VOLRES *reserve_volume(DCR *dcr, const char *VolumeName) JCR *jcr = dcr->jcr; jcr->errmsg[0] = 0; - if (job_canceled(dcr->jcr)) { + if (job_canceled(dcr->jcr) || jcr->is_incomplete()) { Mmsg1(jcr->errmsg, _("Could not reserve volume \"%s\", because job canceled.\n"), dev->VolHdr.VolumeName); return NULL; @@ -846,7 +846,7 @@ bool DCR::can_i_use_volume() bool rtn = true; VOLRES *vol; - if (job_canceled(jcr)) { + if (job_canceled(jcr) || jcr->is_incomplete()) { Mmsg(jcr->errmsg, "Job is canceled\n"); return false; } diff --git 
a/bacula/src/stored/wait.c b/bacula/src/stored/wait.c index ef7f1efc2..48ed3f6e7 100644 --- a/bacula/src/stored/wait.c +++ b/bacula/src/stored/wait.c @@ -83,7 +83,7 @@ int wait_for_sysop(DCR *dcr) dev->set_blocked(BST_WAITING_FOR_SYSOP); /* indicate waiting for mount */ } - for ( ; !job_canceled(jcr); ) { + for ( ; !job_canceled(jcr) && !jcr->is_incomplete(); ) { time_t now, start, total_waited; gettimeofday(&tv, &tz);