PR28514: debuginfod: limit groom operation times

author Frank Ch. Eigler <fche@redhat.com>

Thu, 4 Nov 2021 17:08:35 +0000 (13:08 -0400)

committer Frank Ch. Eigler <fche@redhat.com>

Fri, 5 Nov 2021 12:16:06 +0000 (08:16 -0400)
author Frank Ch. Eigler <fche@redhat.com>
Thu, 4 Nov 2021 17:08:35 +0000 (13:08 -0400)
committer Frank Ch. Eigler <fche@redhat.com>
Fri, 5 Nov 2021 12:16:06 +0000 (08:16 -0400)
diff --git a/NEWS b/NEWS

index 9ab58422ab3b8db6b063e5fd94a5e73db6469322..b812b7438967179b9ac838b110fc831b058bec37 100644 (file)
--- a/NEWS
+++ b/NEWS
@@ -13,6 +13,7 @@ debuginfod: Supply extra HTTP response headers, describing archive/file
              Protect against loops in federated server configurations.
              Add -r option to use -I/-X regexes for grooming stale files.
              Protect against wasted CPU from duplicate concurrent requests.
+            Limit the duration of groom ops roughly to rescan (-t) times.
              Several other performance improvements & prometheus metrics.
  
  Version 0.185
diff --git a/debuginfod/ChangeLog b/debuginfod/ChangeLog

index e0616b0d576fddff4b21ec1c8493467b09d247f2..15b2ba40fa0f748b435409140ee120f9e62ca446 100644 (file)
--- a/debuginfod/ChangeLog
+++ b/debuginfod/ChangeLog
@@ -1,3 +1,10 @@
+2021-11-04  Frank Ch. Eigler  <fche@redhat.com>
+
+       PR28514
+       * debuginfod.cxx (groom): Rework into separate decision/action
+       phases.  Add new metrics to monitor progress.  Limit indefinite
+       operation times to avoid starving rescan.
+
  2021-10-23  Frank Ch. Eigler  <fche@redhat.com>
  
         PR28240
diff --git a/debuginfod/debuginfod.cxx b/debuginfod/debuginfod.cxx

index d22571ad3041c6bdce3b897c1c230c53c55a286b..45981d8d42791371fd4cc4dac666cdc539058c0d 100644 (file)
--- a/debuginfod/debuginfod.cxx
+++ b/debuginfod/debuginfod.cxx
@@ -3431,16 +3431,30 @@ void groom()
    clock_gettime (CLOCK_MONOTONIC, &ts_start);
  
    // scan for files that have disappeared
-  sqlite_ps files (db, "check old files", "select s.mtime, s.file, f.name from "
-                       BUILDIDS "_file_mtime_scanned s, " BUILDIDS "_files f "
-                       "where f.id = s.file");
-  sqlite_ps files_del_f_de (db, "nuke f_de", "delete from " BUILDIDS "_f_de where file = ? and mtime = ?");
-  sqlite_ps files_del_r_de (db, "nuke r_de", "delete from " BUILDIDS "_r_de where file = ? and mtime = ?");
-  sqlite_ps files_del_scan (db, "nuke f_m_s", "delete from " BUILDIDS "_file_mtime_scanned "
-                            "where file = ? and mtime = ?");
+  sqlite_ps files (db, "check old files",
+                   "select distinct s.mtime, s.file, f.name from "
+                   BUILDIDS "_file_mtime_scanned s, " BUILDIDS "_files f "
+                   "where f.id = s.file");
+  // NB: Because _ftime_mtime_scanned can contain both F and
+  // R records for the same file, this query would return duplicates if the
+  // DISTINCT qualifier were not there.
    files.reset();
+
+  // DECISION TIME - we enumerate stale fileids/mtimes
+  deque<pair<int64_t,int64_t> > stale_fileid_mtime;
+  
+  time_t time_start = time(NULL);
    while(1)
      {
+      // PR28514: limit grooming iteration to O(rescan time), to avoid
+      // slow filesystem tests over many files locking out rescans for
+      // too long.
+      if (rescan_s > 0 && (long)time(NULL) > (long)(time_start + rescan_s))
+        {
+          inc_metric("groomed_total", "decision", "aborted");
+          break;
+        }
+
        if (interrupted) break;
  
        int rc = files.step();
@@ -3458,19 +3472,67 @@ void groom()
        if ( (regex_groom && reg_exclude && !reg_include) ||  rc < 0 || (mtime != (int64_t) s.st_mtime) )
          {
            if (verbose > 2)
-            obatched(clog) << "groom: forgetting file=" << filename << " mtime=" << mtime << endl;
-          files_del_f_de.reset().bind(1,fileid).bind(2,mtime).step_ok_done();
-          files_del_r_de.reset().bind(1,fileid).bind(2,mtime).step_ok_done();
-          files_del_scan.reset().bind(1,fileid).bind(2,mtime).step_ok_done();
+            obatched(clog) << "groom: stale file=" << filename << " mtime=" << mtime << endl;
+          stale_fileid_mtime.push_back(make_pair(fileid,mtime));
            inc_metric("groomed_total", "decision", "stale");
+          set_metric("thread_work_pending","role","groom", stale_fileid_mtime.size());
          }
        else
          inc_metric("groomed_total", "decision", "fresh");
+      
        if (sigusr1 != forced_rescan_count) // stop early if scan triggered
          break;
      }
    files.reset();
  
+  // ACTION TIME
+
+  // Now that we know which file/mtime tuples are stale, actually do
+  // the deletion from the database.  Doing this during the SELECT
+  // iteration above results in undefined behaviour in sqlite, as per
+  // https://www.sqlite.org/isolation.html
+
+  // We could shuffle stale_fileid_mtime[] here.  It'd let aborted
+  // sequences of nuke operations resume at random locations, instead
+  // of just starting over.  But it doesn't matter much either way,
+  // as long as we make progress.
+
+  sqlite_ps files_del_f_de (db, "nuke f_de", "delete from " BUILDIDS "_f_de where file = ? and mtime = ?");
+  sqlite_ps files_del_r_de (db, "nuke r_de", "delete from " BUILDIDS "_r_de where file = ? and mtime = ?");
+  sqlite_ps files_del_scan (db, "nuke f_m_s", "delete from " BUILDIDS "_file_mtime_scanned "
+                            "where file = ? and mtime = ?");
+
+  while (! stale_fileid_mtime.empty())
+    {
+      auto stale = stale_fileid_mtime.front();
+      stale_fileid_mtime.pop_front();
+      set_metric("thread_work_pending","role","groom", stale_fileid_mtime.size());
+
+      // PR28514: limit grooming iteration to O(rescan time), to avoid
+      // slow nuke_* queries over many files locking out rescans for too
+      // long.  We iterate over the files in random() sequence to avoid
+      // partial checks going over the same set.
+      if (rescan_s > 0 && (long)time(NULL) > (long)(time_start + rescan_s))
+        {
+          inc_metric("groomed_total", "action", "aborted");
+          break;
+        }
+
+      if (interrupted) break;
+
+      int64_t fileid = stale.first;
+      int64_t mtime = stale.second;
+      files_del_f_de.reset().bind(1,fileid).bind(2,mtime).step_ok_done();
+      files_del_r_de.reset().bind(1,fileid).bind(2,mtime).step_ok_done();
+      files_del_scan.reset().bind(1,fileid).bind(2,mtime).step_ok_done();
+      inc_metric("groomed_total", "action", "cleaned");
+      
+       if (sigusr1 != forced_rescan_count) // stop early if scan triggered
+        break;
+    }
+  stale_fileid_mtime.clear(); // no need for this any longer
+  set_metric("thread_work_pending","role","groom", stale_fileid_mtime.size());
+      
    // delete buildids with no references in _r_de or _f_de tables;
    // cascades to _r_sref & _f_s records
    sqlite_ps buildids_del (db, "nuke orphan buildids",
diff --git a/tests/ChangeLog b/tests/ChangeLog

index 46302b5680cd062e9e611bb8a4cb0f0c29fa7a67..b791cd7f0d9567764a142397ce68195d95417949 100644 (file)
--- a/tests/ChangeLog
+++ b/tests/ChangeLog
@@ -1,3 +1,8 @@
+2021-11-04  Frank Ch. Eigler  <fche@redhat.com>
+
+       PR28514
+       * run-debuginfod-archive-groom.sh: Look for new groom metric.
+
  2021-10-23  Frank Ch. Eigler  <fche@redhat.com>
  
         PR28240
diff --git a/tests/run-debuginfod-archive-groom.sh b/tests/run-debuginfod-archive-groom.sh

index 7813ee28b8f2088c486c24969b7b13c9c4ef68a7..030e0aa62de46149605da0f06427361d64a847d3 100755 (executable)
--- a/tests/run-debuginfod-archive-groom.sh
+++ b/tests/run-debuginfod-archive-groom.sh
@@ -150,6 +150,8 @@ kill -USR2 $PID1  # groom cycle
  wait_ready $PORT1 'thread_work_total{role="groom"}' 2
  # Expect 4 rpms containing 2 buildids to be deleted by the groom
  wait_ready $PORT1 'groomed_total{decision="stale"}' 4
+# Expect no more groom actions pending
+wait_ready $PORT1 'thread_work_pending{role="groom"}' 0
  
  rm -rf $DEBUGINFOD_CACHE_PATH # clean it from previous tests
author	Frank Ch. Eigler <fche@redhat.com>
	Thu, 4 Nov 2021 17:08:35 +0000 (13:08 -0400)
committer	Frank Ch. Eigler <fche@redhat.com>
	Fri, 5 Nov 2021 12:16:06 +0000 (08:16 -0400)
NEWS		patch \| blob \| blame \| history
debuginfod/ChangeLog		patch \| blob \| blame \| history
debuginfod/debuginfod.cxx		patch \| blob \| blame \| history
tests/ChangeLog		patch \| blob \| blame \| history
tests/run-debuginfod-archive-groom.sh		patch \| blob \| blame \| history