From: Michal Rakowski Date: Thu, 11 Feb 2021 18:22:49 +0000 (+0100) Subject: Fix #7321 About issue when stopping jobs waiting for resources X-Git-Tag: Release-11.0.2~35 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=e269d7eb2e856516db4b1ea17abe68d7aac15c4c;p=thirdparty%2Fbacula.git Fix #7321 About issue when stopping jobs waiting for resources Description: ----------- The user reports that the jobs were queued on Jan 25th and that he issued the stop command at the end of January. However, they still started on Feb 3rd: jobid: 51,070 schedtime: 2021-01-25 20:33:36 starttime: 2021-02-03 13:59:05 Workaround: ---------- Users can issue the cancel command instead of the stop command for jobs that are waiting on resources. --- diff --git a/bacula/src/dird/job.c b/bacula/src/dird/job.c index c3457b948..023cfa265 100644 --- a/bacula/src/dird/job.c +++ b/bacula/src/dird/job.c @@ -696,6 +696,7 @@ cancel_job(UAContext *ua, JCR *jcr, int wait, bool cancel) char ed1[50]; int32_t old_status = jcr->JobStatus; int status; + bool ret = false; const char *reason, *cmd; /* Keep track of this important event */ @@ -735,20 +736,26 @@ cancel_job(UAContext *ua, JCR *jcr, int wait, bool cancel) case JS_WaitMaxJobs: case JS_WaitStartTime: case JS_WaitDevice: - ua->info_msg(_("JobId %s, Job %s marked to be %s.\n"), - edit_uint64(jcr->JobId, ed1), jcr->Job, - reason); - jobq_remove(&job_queue, jcr); /* attempt to remove it from queue */ + status = jobq_remove(&job_queue, jcr); /* attempt to remove it from queue */ + if (status != 0) { + ua->error_msg(_("Cannot %s JobId %s, Job %s is not in work queue\n"), + cmd, edit_uint64(jcr->JobId, ed1), jcr->Job); + goto bail_out; + } break; default: + ret = true; /* This will be set to false in case of error from any daemon below */ /* Cancel File daemon */ if (jcr->file_bsock) { btimer_t *tid; /* do not return now, we want to try to cancel the sd */ tid = start_bsock_timer(jcr->file_bsock, 120); - cancel_file_daemon_job(ua, cmd, 
jcr); + if (!cancel_file_daemon_job(ua, cmd, jcr)) { + Dmsg1(400, "Failed to cancel file dameon job id=%d\n", jcr->JobId); + ret = false; + } stop_bsock_timer(tid); } @@ -765,7 +772,10 @@ cancel_job(UAContext *ua, JCR *jcr, int wait, bool cancel) btimer_t *tid; /* do not return now, we want to try to cancel the sd socket */ tid = start_bsock_timer(jcr->store_bsock, 120); - cancel_sd_job(ua, cmd, jcr); + if (!cancel_sd_job(ua, cmd, jcr)) { + Dmsg1(400, "Failed to cancel storage dameon job id=%d\n", jcr->JobId); + ret = false; + } stop_bsock_timer(tid); } @@ -788,7 +798,10 @@ cancel_job(UAContext *ua, JCR *jcr, int wait, bool cancel) btimer_t *tid; /* do not return now, we want to try to cancel the sd socket */ tid = start_bsock_timer(wjcr->store_bsock, 120); - cancel_sd_job(ua, cmd, wjcr); + if (!cancel_sd_job(ua, cmd, wjcr)) { + Dmsg1(400, "Failed to cancel storage dameon job id=%d\n", jcr->JobId); + ret = false; + } stop_bsock_timer(tid); } /* We test store_bsock because the previous operation can take @@ -801,10 +814,20 @@ cancel_job(UAContext *ua, JCR *jcr, int wait, bool cancel) wjcr->my_thread_send_signal(TIMEOUT_SIGNAL); } } + + if (!ret) { + goto bail_out; + } + break; } - return true; + ua->info_msg(_("JobId %s, Job %s marked to be %s.\n"), + edit_uint64(jcr->JobId, ed1), jcr->Job, + reason); + +bail_out: + return ret; } void cancel_storage_daemon_job(JCR *jcr) diff --git a/bacula/src/dird/ua_cmds.c b/bacula/src/dird/ua_cmds.c index dbb577047..2d2d69d65 100644 --- a/bacula/src/dird/ua_cmds.c +++ b/bacula/src/dird/ua_cmds.c @@ -507,6 +507,14 @@ static int cancel_cmd(UAContext *ua, const char *cmd) foreach_alist(jcr, jcrs) { /* Execute the cancel command only if we don't have an error */ if (nb != -1) { + /* If runtime is not set it means that job is waiting on something (e.g. some resource to appear). + * It then makes more sense to cancel job rather than stop it since it hasn't been started at all. 
*/ + if (!jcr->run_time && !cancel) { + ua->info_msg(_("1000 Trying to cancel Job %s since it was not started yet hence " + "no need for stopping it.\n"), jcr->Job); + } + cancel = jcr->run_time ? cancel : true; + ret &= cancel_job(ua, jcr, 60, cancel); } free_jcr(jcr); @@ -2367,6 +2375,7 @@ int wait_cmd(UAContext *ua, const char *cmd) for (bool running=true; running; ) { running = false; foreach_jcr(jcr) { + if (!jcr->is_internal_job()) { running = true; break; diff --git a/bacula/src/stored/acquire.c b/bacula/src/stored/acquire.c index a94da236f..4ef880e62 100644 --- a/bacula/src/stored/acquire.c +++ b/bacula/src/stored/acquire.c @@ -418,7 +418,7 @@ DCR *acquire_device_for_append(DCR *dcr) dev->Unlock(); Dmsg1(190, "jid=%u Do mount_next_write_vol\n", (uint32_t)jcr->JobId); if (!dcr->mount_next_write_volume()) { - if (!job_canceled(jcr)) { + if (!job_canceled(jcr) && !jcr->is_incomplete()) { /* Reduce "noise" -- don't print if job canceled */ Mmsg2(jcr->errmsg, _("Could not ready %s device %s for append.\n"), dev->print_type(), dev->print_name()); diff --git a/bacula/src/stored/askdir.c b/bacula/src/stored/askdir.c index e1a2dc2f4..b467f4bbe 100644 --- a/bacula/src/stored/askdir.c +++ b/bacula/src/stored/askdir.c @@ -867,6 +867,15 @@ bool dir_ask_sysop_to_create_appendable_volume(DCR *dcr) continue; } + if (stat == W_WAKE) { + /* Job could be marked to be stopped, need to break */ + Mmsg0(dev->errmsg, _("Job was stopped by the user.\n")); + Jmsg(jcr, M_FATAL, 0, "%s", dev->errmsg); + Dmsg1(dbglvl, "Job marked to be stopped. 
Gave up waiting on device %s\n", dev->print_name()); + dev->poll = false; + return false; + } + if (stat == W_TIMEOUT) { if (!double_dev_wait_time(dev)) { Mmsg(dev->errmsg, _("Max time exceeded waiting to mount Storage Device %s for Job %s\n"), diff --git a/bacula/src/stored/fd_cmds.c b/bacula/src/stored/fd_cmds.c index 2a5244271..e1e787964 100644 --- a/bacula/src/stored/fd_cmds.c +++ b/bacula/src/stored/fd_cmds.c @@ -225,7 +225,7 @@ void do_client_commands(JCR *jcr) jcr->errmsg[0] = 0; if (!fd_cmds[i].func(jcr)) { /* do command */ /* Note fd->msg command may be destroyed by comm activity */ - if (!job_canceled(jcr)) { + if (!job_canceled(jcr) && !jcr->is_incomplete()) { strip_trailing_junk(fd->msg); if (jcr->errmsg[0]) { strip_trailing_junk(jcr->errmsg);