From: Alain Spineux Date: Wed, 17 Jan 2024 15:25:45 +0000 (+0100) Subject: Detect unsolvable volume cycle in split_bsr_loop() X-Git-Tag: Release-15.0.3~57 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=8bdeca571d9d0044b93a985ad7ccc12fba9f400c;p=thirdparty%2Fbacula.git Detect unsolvable volume cycle in split_bsr_loop() - Fix empty part: when a cycle inside a job was detected, the function was creating an empty part (splitting at offset 0), that was causing a fatal error in the restore - this makes the first part to restore totally empty and generates the following messages from the SD and from the FD: SD - Fatal error: No Volume names found for restore. FD - Fatal error: job.c:3694 Bad response from SD to Read Data command. Wanted 3000 OK data - in the trace file we have a split at off=0 BSR: Split the BSR at off=0 at the beginning of jobnum=1 .... Signed-off-by: Alain Spineux --- diff --git a/bacula/src/dird/bsr.c b/bacula/src/dird/bsr.c index 243e9b271..34d11061e 100644 --- a/bacula/src/dird/bsr.c +++ b/bacula/src/dird/bsr.c @@ -839,7 +839,12 @@ struct bsr_vol_list { }; /* Generate a list of split position in the BSR to break any cycle - * returns true if a cycle was detected + * return + * 0: OK no split needed + * 1: OK but some split are required + * -1: ERROR We found a cycles inside one job, the code in the SD must + * be improved to handle this situation. See last sample at the end of the + * comment * * When a volume appears multiple time into a BSR, interleaved with other * volumes, this can be a problem because the SD mount each volume only once and @@ -867,12 +872,13 @@ struct bsr_vol_list { * As the BSR is written job by job the split is enough, no need to reorganize * the BSR an move all the parts of the first job before the second one. * - * The only case that this function cannot solve is the end of a job is written - * before the beginning inside the same volume. 
This could happens in a copy job - * if not copying the volume in the wrong order. + * This function don't work inside a job. If two parts of a job are already + * written in reverse order on the same volume (after a copy job) or if a part + * between the two are written into another volume. It can detect the problem + * but the "split" must be done inside the SD. * * It is possible to split the BSR at multiple place, I split it at the beginning - * of the job where the problem is detected. + * of the job where the problem is detected. * * For example: * number are jobs, letters are volumes @@ -881,11 +887,11 @@ struct bsr_vol_list { * 2 B * 2 A * 3 A - * Here I must split between 1A & 2B to be sure that 2A will be read after 2B - * else the file that span 2B & 2A will never be fully restored as the blocks - * should have been read just after 1A and befor 2B by the SD + * Normally here the SD will mount A once read blocks 1A, 2A, 3A, then mount + * B for 2B. Reading 2B before 2A is a mistake. + * Here I must split between 1A & 2B to be sure that 2A will be read after 2B. * - * Another example + * Another example where offset matter * 1 A start=0 end=1000 * --- * 2 A start=0 end=2000 @@ -902,8 +908,26 @@ struct bsr_vol_list { * 4 B start=0 end=2000 * I must split between 3C & 4B to avoid to include blocks of 4B while restoring * 2B. But this is handled by the 1st example + * + * In this case the volume 2 was reused for the same job, despite it was already + * full. This is a bug, the DIR don't know about the right size of the volume! + * The job start writing (with other jobs) on the volume 2, when the Maximum + * Size is reached, the SD close the volume and switch to another volume, that + * get full quickly and the DIR tell to continue back on volume 2. The SD + * notice the differrence in size between the Catalog and the file but still + * write on it. 
+ * In this situation despite the data are not "corrupted", the SD is unable + * to restore the backup and no split can be used inside a job. + * +---------+------------+----------+ + * | MediaId | StartBlock | EndBlock | + * +---------+------------+----------+ + * | 2 | 10028425 | 10355984 | + * | 30 | 10201502 | 10267009 | + * | 2 | 10421493 | 10471710 | + * +---------+------------+----------+ + * */ -bool split_bsr_loop(JCR *jcr, bootstrap_info &info) +int split_bsr_loop(JCR *jcr, bootstrap_info &info) { UAContext *ua = info.ua; FILE *bs = info.bs; @@ -924,7 +948,8 @@ bool split_bsr_loop(JCR *jcr, bootstrap_info &info) bool first = true; bool after_eof = false; // used to handle the after EOF inside the loop int job_num = 0; // internal job numbering from 0,1..N - int last_split_job_num = 0; // the volumes that matter in "volumes" have a job_num >= last_split_job_num + int last_split_job_num = 1; // the volumes that matter in "volumes" have a job_num >= last_split_job_num + bool internal_cycle = false; /* no cycle detecte inside a job */ if (info.split_list == NULL) { info.split_list = New(alist(100, owned_by_alist)); @@ -972,7 +997,13 @@ bool split_bsr_loop(JCR *jcr, bootstrap_info &info) Dmsg2(120, "BSR: insert volume %s jobnum=%d\n", item->volume, item->job_num); } else { /* we already know about this volume, but is it used in the current part of the BSR? 
*/ - if (item->job_num >= last_split_job_num) { + if (item->job_num == job_num) { + /* the volume is used again in the same job, we have an issue */ + /* This is would be up to the SD to fix the problem */ + Dmsg8(1, "BSR: unfixable cycle inside the BSR at off=%lld at the beginning of jobnum=%d sess=%lu:%lu because volume %s is previously used by jobnum=%d sess=%lu:%lu\n", + start_job_off, job_num, VolSessionTime, VolSessionId, volume.c_str(), item->job_num, item->VolSessionTime, item->VolSessionId); + internal_cycle = true; + } else if (item->job_num >= last_split_job_num) { /* the volume is used again in this part, we need to split the BSR into a new part */ boffset_t *p = (boffset_t *)malloc(sizeof(boffset_t)); *p = start_job_off; @@ -1049,5 +1080,11 @@ bool split_bsr_loop(JCR *jcr, bootstrap_info &info) } fseeko(bs, 0, SEEK_SET); info.next_split_off = (boffset_t *)info.split_list->first(); - return info.next_split_off != NULL; + if (internal_cycle) { + return -1; // we have a probleme that we cannot solve + } else if (info.next_split_off != NULL) { + return 1; // we have detected cycles and added a splits + } else { + return 0; // no cycle found + } } diff --git a/bacula/src/dird/protos.h b/bacula/src/dird/protos.h index 1750dba3e..885c12a07 100644 --- a/bacula/src/dird/protos.h +++ b/bacula/src/dird/protos.h @@ -76,7 +76,7 @@ RBSR_FINDEX *new_findex(); void make_unique_restore_filename(UAContext *ua, POOLMEM **fname); void print_bsr(UAContext *ua, RESTORE_CTX &rx); void scan_bsr(JCR *jcr); -bool split_bsr_loop(JCR *jcr, bootstrap_info &info); +int split_bsr_loop(JCR *jcr, bootstrap_info &info); /* catreq.c */ diff --git a/bacula/src/dird/restore.c b/bacula/src/dird/restore.c index 396bd673c..ed45dc285 100644 --- a/bacula/src/dird/restore.c +++ b/bacula/src/dird/restore.c @@ -307,14 +307,20 @@ bool restore_bootstrap(JCR *jcr) bootstrap_info info; POOL_MEM restore_cmd(PM_MESSAGE), buf(PM_FNAME); bool ret = false; + int r; /* Open the bootstrap file */ if 
(!open_bootstrap_file(jcr, info)) { goto bail_out; } - if (split_bsr_loop(jcr, info)) { /* create the split list to break volume cycle */ + r = split_bsr_loop(jcr, info); + if (r == 0) { + // Everything is ok, no change + } else if (r == 1) { Jmsg(jcr, M_INFO, 0, _("Found a volume cycle in the bootstrap, fixing automatically the reading process\n")); + } else { + Jmsg(jcr, M_WARNING, 0, _("Found a volume cycle in the bootstrap that cannot be solved, try to restore the data anyway\n")); } /* Read the bootstrap file */ diff --git a/bacula/src/dird/vbackup.c b/bacula/src/dird/vbackup.c index 58144b06d..40abf5238 100644 --- a/bacula/src/dird/vbackup.c +++ b/bacula/src/dird/vbackup.c @@ -319,7 +319,7 @@ _("This Job is not an Accurate backup so is not equivalent to a Full backup.\n") return false; } - if (split_bsr_loop(jcr, info)) { /* create the split list to break volume cycle */ + if (split_bsr_loop(jcr, info) != 0 ) { Jmsg(jcr, M_FATAL, 0, _("Found a volume cycle in the bootstrap, Virtual Full is not possible on this Job\n")); }