debuginfod: extracted-from-archive file cache

author Frank Ch. Eigler <fche@redhat.com>

Fri, 27 Dec 2019 00:06:46 +0000 (19:06 -0500)

committer Frank Ch. Eigler <fche@redhat.com>

Sun, 12 Jan 2020 03:41:54 +0000 (22:41 -0500)
author Frank Ch. Eigler <fche@redhat.com>
Fri, 27 Dec 2019 00:06:46 +0000 (19:06 -0500)
committer Frank Ch. Eigler <fche@redhat.com>
Sun, 12 Jan 2020 03:41:54 +0000 (22:41 -0500)
diff --git a/config/ChangeLog b/config/ChangeLog

index 2296e8155ae32e9bce9d177d4dfad270cf7b1ae0..e5e0a55c6ce04f26f671e56f6d5b908b12570ed8 100644 (file)
--- a/config/ChangeLog
+++ b/config/ChangeLog
@@ -1,3 +1,7 @@
+2020-01-11  Frank Ch. Eigler  <fche@redhat.com>
+
+       * debuginfod.service: Set PrivateTmp=yes.
+
  2020-01-01  Dmitry V. Levin  <ldv@altlinux.org>
  
         * Makefile.am (pkgconfig_DATA): Conditionalize libdebuginfod.pc
diff --git a/config/debuginfod.service b/config/debuginfod.service

index d8ef072be9ef40f331c9f27e96760058fe836891..8fca343fb70edd97bce00bd0df2c662c322d36ac 100644 (file)
--- a/config/debuginfod.service
+++ b/config/debuginfod.service
@@ -10,6 +10,7 @@ Group=debuginfod
  #CacheDirectory=debuginfod
  ExecStart=/usr/bin/debuginfod -d /var/cache/debuginfod/debuginfod.sqlite -p $DEBUGINFOD_PORT $DEBUGINFOD_VERBOSE $DEBUGINFOD_PRAGMAS $DEBUGINFOD_PATHS
  TimeoutStopSec=10
+PrivateTmp=yes
  
  [Install]
  WantedBy=multi-user.target
diff --git a/debuginfod/ChangeLog b/debuginfod/ChangeLog

index 795b617ba4e1c7741bbf46e6e3a57b4bb3c467fa..8a25e1f7a7805d7d9b3165699a395c274fc3dd82 100644 (file)
--- a/debuginfod/ChangeLog
+++ b/debuginfod/ChangeLog
@@ -1,3 +1,15 @@
+2020-01-11  Frank Ch. Eigler  <fche@redhat.com>
+
+       * debuginfod.cxx (libarchive_fdcache): New class/facility to own a
+       cache of temporary files that were previously extracted from an
+       archive.  If only it could store just unlinked fd's instead of
+       filenames.
+       (handle_buildid_r_match): Use it to answer dwz/altdebug and webapi
+       requests.
+       (groom): Clean it.
+       (main): Initialize the cache control parameters from heuristics.
+       Use a consistent tmpdir for these and tmp files elsewhere.
+
  2020-01-11  Frank Ch. Eigler  <fche@redhat.com>
  
         * debuginfod.cxx (conninfo): Print User-Agent and X-Forwarded-For
diff --git a/debuginfod/debuginfod.cxx b/debuginfod/debuginfod.cxx

index 1c60e8d39e15eb195b8e390ada71ea4237515f27..9ca9bc61b097656989f2bb7fa34de6a534ac672f 100644 (file)
--- a/debuginfod/debuginfod.cxx
+++ b/debuginfod/debuginfod.cxx
@@ -52,6 +52,7 @@ extern "C" {
  #include <signal.h>
  #include <sys/stat.h>
  #include <sys/time.h>
+#include <sys/vfs.h>
  #include <unistd.h>
  #include <fcntl.h>
  #include <netdb.h>
@@ -349,7 +350,10 @@ static const struct argp_option options[] =
     { "database", 'd', "FILE", 0, "Path to sqlite database.", 0 },
     { "ddl", 'D', "SQL", 0, "Apply extra sqlite ddl/pragma to connection.", 0 },
     { "verbose", 'v', NULL, 0, "Increase verbosity.", 0 },
-
+#define ARGP_KEY_FDCACHE_FDS 0x1001
+   { "fdcache-fds", ARGP_KEY_FDCACHE_FDS, "NUM", 0, "Maximum number of archive files to keep in fdcache.", 0 },
+#define ARGP_KEY_FDCACHE_MBS 0x1002
+   { "fdcache-mbs", ARGP_KEY_FDCACHE_MBS, "MB", 0, "Maximum total size of archive file fdcache.", 0 },
     { NULL, 0, NULL, 0, NULL, 0 }
    };
  
@@ -378,7 +382,7 @@ static volatile sig_atomic_t sigusr2 = 0;
  static unsigned http_port = 8002;
  static unsigned rescan_s = 300;
  static unsigned groom_s = 86400;
-static unsigned maxigroom = false;
+static bool maxigroom = false;
  static unsigned concurrency = std::thread::hardware_concurrency() ?: 1;
  static set<string> source_paths;
  static bool scan_files = false;
@@ -387,6 +391,9 @@ static vector<string> extra_ddl;
  static regex_t file_include_regex;
  static regex_t file_exclude_regex;
  static bool traverse_logical;
+static long fdcache_fds;
+static long fdcache_mbs;
+static string tmpdir;
  
  static void set_metric(const string& key, int64_t value);
  // static void inc_metric(const string& key);
@@ -451,6 +458,12 @@ parse_opt (int key, char *arg,
        if (rc != 0)
          argp_failure(state, 1, EINVAL, "regular expession");
        break;
+    case ARGP_KEY_FDCACHE_FDS:
+      fdcache_fds = atol (arg);
+      break;
+    case ARGP_KEY_FDCACHE_MBS:
+      fdcache_mbs = atol (arg);
+      break;
      case ARGP_KEY_ARG:
        source_paths.insert(string(arg));
        break;
@@ -886,6 +899,148 @@ shell_escape(const string& str)
  }
  
  
+// A map-like class that owns a cache of file descriptors (indexed by
+// file / content names).
+//
+// If only it could use fd's instead of file names ... but we can't
+// dup(2) to create independent descriptors for the same unlinked
+// files, so would have to use some goofy linux /proc/self/fd/%d
+// hack such as the following
+
+#if 0
+int superdup(int fd)
+{
+#ifdef __linux__
+  char *fdpath = NULL;
+  int rc = asprintf(& fdpath, "/proc/self/fd/%d", fd);
+  int newfd;
+  if (rc >= 0)
+    newfd = open(fdpath, O_RDONLY);
+  else
+    newfd = -1;
+  free (fdpath);
+  return newfd;
+#else
+  return -1;
+#endif
+}
+#endif
+
+class libarchive_fdcache
+{
+private:
+  mutex fdcache_lock;
+
+  struct fdcache_entry
+  {
+    string archive;
+    string entry;
+    string fd;
+    long fd_size_mb; // rounded up megabytes
+  };
+  deque<fdcache_entry> lru; // @head: most recently used
+  long max_fds;
+  long max_mbs;
+
+public:
+  void intern(const string& a, const string& b, string fd, off_t sz)
+  {
+    {
+      unique_lock<mutex> lock(fdcache_lock);
+      for (auto i = lru.begin(); i < lru.end(); i++) // nuke preexisting copy
+        {
+          if (i->archive == a && i->entry == b)
+            {
+              unlink (i->fd.c_str());
+              lru.erase(i);
+              break; // must not continue iterating
+            }
+        }
+      long mb = ((sz+1023)/1024+1023)/1024;
+      fdcache_entry n = { a, b, fd, mb };
+      lru.push_front(n);
+    if (verbose > 3)
+      obatched(clog) << "fdcache interned a=" << a << " b=" << b << " fd=" << fd << " mb=" << mb << endl;
+    }
+
+    this->limit(max_fds, max_mbs); // age cache if required
+  }
+
+  int lookup(const string& a, const string& b)
+  {
+    unique_lock<mutex> lock(fdcache_lock);
+    for (auto i = lru.begin(); i < lru.end(); i++)
+      {
+        if (i->archive == a && i->entry == b)
+          { // found it; move it to head of lru
+            fdcache_entry n = *i;
+            lru.erase(i); // invalidates i, so no more iteration!
+            lru.push_front(n);
+
+            return open(n.fd.c_str(), O_RDONLY); // NB: no problem if dup() fails; looks like cache miss
+          }
+      }
+    return -1;
+  }
+
+  void clear(const string& a, const string& b)
+  {
+    unique_lock<mutex> lock(fdcache_lock);
+    for (auto i = lru.begin(); i < lru.end(); i++)
+      {
+        if (i->archive == a && i->entry == b)
+          { // found it; move it to head of lru
+            fdcache_entry n = *i;
+            lru.erase(i); // invalidates i, so no more iteration!
+            unlink (n.fd.c_str());
+            return;
+          }
+      }
+  }
+
+  void limit(long maxfds, long maxmbs)
+  {
+    if (verbose > 3 && (this->max_fds != maxfds || this->max_mbs != maxmbs))
+      obatched(clog) << "fdcache limited to maxfds=" << maxfds << " maxmbs=" << maxmbs << endl;
+
+    unique_lock<mutex> lock(fdcache_lock);
+    this->max_fds = maxfds;
+    this->max_mbs = maxmbs;
+
+    long total_fd = 0;
+    long total_mb = 0;
+    for (auto i = lru.begin(); i < lru.end(); i++)
+      {
+        // accumulate totals from most recently used one going backward
+        total_fd ++;
+        total_mb += i->fd_size_mb;
+        if (total_fd > max_fds || total_mb > max_mbs)
+          {
+            // found the cut here point!
+
+            for (auto j = i; j < lru.end(); j++) // close all the fds from here on in
+              {
+                if (verbose > 3)
+                  obatched(clog) << "fdcache evicted a=" << j->archive << " b=" << j->entry
+                                 << " fd=" << j->fd << " mb=" << j->fd_size_mb << endl;
+                unlink (j->fd.c_str());
+              }
+
+            lru.erase(i, lru.end()); // erase the nodes generally
+            break;
+          }
+
+      }
+  }
+
+  ~libarchive_fdcache()
+  {
+    limit(0, 0);
+  }
+};
+static libarchive_fdcache fdcache;
+
+
  static struct MHD_Response*
  handle_buildid_r_match (int64_t b_mtime,
                          const string& b_source0,
@@ -904,6 +1059,41 @@ handle_buildid_r_match (int64_t b_mtime,
        return 0;
      }
  
+  int fd = fdcache.lookup(b_source0, b_source1);
+  while (fd >= 0) // got one!; NB: this is really an if() with a possible branch out to the end
+    {
+      rc = fstat(fd, &fs);
+      if (rc < 0) // disappeared?
+        {
+          if (verbose)
+            obatched(clog) << "cannot fstat fdcache " << b_source0 << endl;
+          close(fd);
+          fdcache.clear(b_source0, b_source1);
+          break; // branch out of if "loop", to try new libarchive fetch attempt
+        }
+
+      struct MHD_Response* r = MHD_create_response_from_fd (fs.st_size, fd);
+      if (r == 0)
+        {
+          if (verbose)
+            obatched(clog) << "cannot create fd-response for " << b_source0 << endl;
+          close(fd);
+          break; // branch out of if "loop", to try new libarchive fetch attempt
+        }
+
+      inc_metric ("http_responses_total","result","archive fdcache");
+
+      MHD_add_response_header (r, "Content-Type", "application/octet-stream");
+      add_mhd_last_modified (r, fs.st_mtime);
+      if (verbose > 1)
+        obatched(clog) << "serving fdcache archive " << b_source0 << " file " << b_source1 << endl;
+      /* libmicrohttpd will close it. */
+      if (result_fd)
+        *result_fd = fd;
+      return r;
+      // NB: see, we never go around the 'loop' more than once
+    }
+
    string archive_decoder = "/dev/null";
    string archive_extension = "";
    for (auto&& arch : scan_archives)
@@ -950,19 +1140,28 @@ handle_buildid_r_match (int64_t b_mtime,
          continue;
  
        // extract this file to a temporary file
-      char tmppath[PATH_MAX] = "/tmp/debuginfod.XXXXXX"; // XXX: $TMP_DIR etc.
-      int fd = mkstemp (tmppath);
+      char* tmppath = NULL;
+      rc = asprintf (&tmppath, "%s/debuginfod.XXXXXX", tmpdir.c_str());
+      if (rc < 0)
+        throw libc_exception (ENOMEM, "cannot allocate tmppath");
+      defer_dtor<void*,void> tmmpath_freer (tmppath, free);
+      fd = mkstemp (tmppath);
        if (fd < 0)
          throw libc_exception (errno, "cannot create temporary file");
-      unlink (tmppath); // unlink now so OS will release the file as soon as we close the fd
+      // NB: don't unlink (tmppath), as fdcache will take charge of it.
  
        rc = archive_read_data_into_fd (a, fd);
-      if (rc != ARCHIVE_OK)
+      if (rc != ARCHIVE_OK) // e.g. ENOSPC!
          {
            close (fd);
+          unlink (tmppath);
            throw archive_exception(a, "cannot extract file");
          }
  
+      // NB: now we know we have a complete reusable file; make fdcache
+      // responsible for unlinking it later.
+      fdcache.intern(b_source0, b_source1, tmppath, archive_entry_size(e));
+
        inc_metric ("http_responses_total","result",archive_extension + " archive");
        struct MHD_Response* r = MHD_create_response_from_fd (archive_entry_size(e), fd);
        if (r == 0)
@@ -1836,9 +2035,8 @@ archive_classify (const string& rps, string& archive_extension,
              obatched(clog) << "libarchive checking " << fn << endl;
  
            // extract this file to a temporary file
-          const char *tmpdir_env = getenv ("TMPDIR") ?: "/tmp";
            char* tmppath = NULL;
-          rc = asprintf (&tmppath, "%s/debuginfod.XXXXXX", tmpdir_env);
+          rc = asprintf (&tmppath, "%s/debuginfod.XXXXXX", tmpdir.c_str());
            if (rc < 0)
              throw libc_exception (ENOMEM, "cannot allocate tmppath");
            defer_dtor<void*,void> tmmpath_freer (tmppath, free);
@@ -2373,6 +2571,9 @@ void groom()
  
    sqlite3_db_release_memory(db); // shrink the process if possible
  
+  fdcache.limit(0,0); // release the fdcache contents
+  fdcache.limit(fdcache_fds,fdcache_mbs); // restore status quo parameters
+
    gettimeofday (&tv_end, NULL);
    double deltas = (tv_end.tv_sec - tv_start.tv_sec) + (tv_end.tv_usec - tv_start.tv_usec)*0.000001;
  
@@ -2488,6 +2689,8 @@ main (int argc, char *argv[])
    /* Tell the library which version we are expecting.  */
    elf_version (EV_CURRENT);
  
+  tmpdir = string(getenv("TMPDIR") ?: "/tmp");
+
    /* Set computed default values. */
    db_path = string(getenv("HOME") ?: "/") + string("/.debuginfod.sqlite"); /* XDG? */
    int rc = regcomp (& file_include_regex, ".*", REG_EXTENDED|REG_NOSUB); // match everything
@@ -2497,6 +2700,15 @@ main (int argc, char *argv[])
    if (rc != 0)
      error (EXIT_FAILURE, 0, "regcomp failure: %d", rc);
  
+  // default parameters for fdcache are computed from system stats
+  struct statfs sfs;
+  rc = statfs(tmpdir.c_str(), &sfs);
+  if (rc < 0)
+    fdcache_mbs = 1024; // 1 gigabyte
+  else
+    fdcache_mbs = sfs.f_bavail * sfs.f_bsize / 1024 / 1024 / 4; // 25% of free space
+  fdcache_fds = concurrency * 2;
+
    /* Parse and process arguments.  */
    int remaining;
    argp_program_version_hook = print_version; // this works
@@ -2508,6 +2720,8 @@ main (int argc, char *argv[])
    if (scan_archives.size()==0 && !scan_files && source_paths.size()>0)
      obatched(clog) << "warning: without -F -R -U, ignoring PATHs" << endl;
  
+  fdcache.limit(fdcache_fds, fdcache_mbs);
+
    (void) signal (SIGPIPE, SIG_IGN); // microhttpd can generate it incidentally, ignore
    (void) signal (SIGINT, signal_handler); // ^C
    (void) signal (SIGHUP, signal_handler); // EOF
@@ -2625,6 +2839,9 @@ main (int argc, char *argv[])
  
    obatched(clog) << "search concurrency " << concurrency << endl;
    obatched(clog) << "rescan time " << rescan_s << endl;
+  obatched(clog) << "fdcache fds " << fdcache_fds << endl;
+  obatched(clog) << "fdcache mbs " << fdcache_mbs << endl;
+  obatched(clog) << "fdcache tmpdir " << tmpdir << endl;
    obatched(clog) << "groom time " << groom_s << endl;
    if (scan_archives.size()>0)
      {
diff --git a/doc/ChangeLog b/doc/ChangeLog

index 370b185db17235970058dd00bf70707081775cc1..51088dfaf9b09f2149e166b33e8421006b649d4b 100644 (file)
--- a/doc/ChangeLog
+++ b/doc/ChangeLog
@@ -1,3 +1,7 @@
+2020-01-11  Frank Ch. Eigler  <fche@redhat.com
+
+       * debuginfod.8: Document --fdcache-fds and --fdcache-mbs opts.
+
  2020-01-11  Frank Ch. Eigler  <fche@redhat.com>
  
         * debuginfod.8: Rework sections dealing with traversal/scanning,
diff --git a/doc/debuginfod.8 b/doc/debuginfod.8

index e288e8817d02d1f025eabcffa4a7ce4fdcaddbfc..166c7c4590edc38c841ba9f7f402c071e6d37d35 100644 (file)
--- a/doc/debuginfod.8
+++ b/doc/debuginfod.8
@@ -175,6 +175,16 @@ device, and ignore symlinks - as in \fIfind\ -P\ -xdev\fP.  Caution: a
  loops in the symbolic directory tree might lead to \fIinfinite
  traversal\fP.
  
+.TP
+.B "\-\-fdcache-fds=NUM"  "\-\-fdcache-mbs=MB"
+Configure limits on a cache that keeps recently extracted files from
+archives.  Up to NUM files and up to a total of MB megabytes will be
+kept extracted, in order to avoid having to decompress their archives
+again.  The default NUM and MB values depend on the concurrency of the
+system, and on the available disk space on the $TMPDIR or \fB/tmp\fP
+filesystem.  This is because that is where the most recently used
+extracted files are kept.  Grooming cleans this cache.
+
  .TP
  .B "\-v"
  Increase verbosity of logging to the standard error file descriptor.
diff --git a/tests/ChangeLog b/tests/ChangeLog

index 2057e2146697fab34f703e342ff63d4e96874408..f322b6a447ea86ca2dd1a75bc01f3745cd8909ee 100644 (file)
--- a/tests/ChangeLog
+++ b/tests/ChangeLog
@@ -1,3 +1,7 @@
+2020-01-11  Frank Ch. Eigler  <fche@redhat.com>
+
+       * run-debuginfod-find.sh: Test --fdcache* options.
+
  2020-01-11  Frank Ch. Eigler  <fche@redhat.com>
  
         * run-debuginfod-find.sh: Adjust to new work-queue metrics.
diff --git a/tests/run-debuginfod-find.sh b/tests/run-debuginfod-find.sh

index 9c9b748d1da1f6b85d52acd7d8de73738a6e2d2c..6862db7252763bb1bbf0fc78c871f15eb9be724e 100755 (executable)
--- a/tests/run-debuginfod-find.sh
+++ b/tests/run-debuginfod-find.sh
@@ -1,6 +1,6 @@
  #!/bin/bash
  #
-# Copyright (C) 2019 Red Hat, Inc.
+# Copyright (C) 2019-2020 Red Hat, Inc.
  # This file is part of elfutils.
  #
  # This file is free software; you can redistribute it and/or modify
@@ -87,7 +87,7 @@ wait_ready()
    fi
  }
  
-env LD_LIBRARY_PATH=$ldpath DEBUGINFOD_URLS= ${abs_builddir}/../debuginfod/debuginfod $VERBOSE -F -R -d $DB -p $PORT1 -t0 -g0 R F L &
+env LD_LIBRARY_PATH=$ldpath DEBUGINFOD_URLS= ${abs_builddir}/../debuginfod/debuginfod $VERBOSE -F -R -d $DB -p $PORT1 -t0 -g0 --fdcache-fds 1 --fdcache-mbs 2 R F L &
  PID1=$!
  # Server must become ready
  wait_ready $PORT1 'ready' 1
@@ -213,6 +213,12 @@ archive_test() {
               -a $filename | grep 'Build ID' | cut -d ' ' -f 7`
      test $__BUILDID = $buildid
  
+    # run again to assure that fdcache is being enjoyed
+    filename=`testrun ${abs_top_builddir}/debuginfod/debuginfod-find executable $__BUILDID`
+    buildid=`env LD_LIBRARY_PATH=$ldpath ${abs_builddir}/../src/readelf \
+             -a $filename | grep 'Build ID' | cut -d ' ' -f 7`
+    test $__BUILDID = $buildid
+
      filename=`testrun ${abs_top_builddir}/debuginfod/debuginfod-find debuginfo $__BUILDID`
      buildid=`env LD_LIBRARY_PATH=$ldpath ${abs_builddir}/../src/readelf \
               -a $filename | grep 'Build ID' | cut -d ' ' -f 7`
@@ -275,6 +281,9 @@ env LD_LIBRARY_PATH=$ldpath ${abs_builddir}/../debuginfod/debuginfod $VERBOSE -F
  PID2=$!
  tempfiles ${DB}_2
  wait_ready $PORT2 'ready' 1
+wait_ready $PORT2 'thread_work_total{role="traverse"}' 1
+wait_ready $PORT2 'thread_work_pending{role="scan"}' 0
+wait_ready $PORT2 'thread_busy{role="scan"}' 0
  
  # have clients contact the new server
  export DEBUGINFOD_URLS=http://127.0.0.1:$PORT2
@@ -319,11 +328,12 @@ testrun ${abs_builddir}/debuginfod_build_id_find -e F/prog2 1
  
  # Fetch some metrics, if curl program is installed
  if type curl 2>/dev/null; then
-    curl http://127.0.0.1:$PORT1/badapi
-    curl http://127.0.0.1:$PORT1/metrics
-    curl http://127.0.0.1:$PORT2/metrics
-    curl http://127.0.0.1:$PORT1/metrics | grep -q 'http_responses_total.*result.*error'
-    curl http://127.0.0.1:$PORT2/metrics | grep -q 'http_responses_total.*result.*upstream'
+    curl -s http://127.0.0.1:$PORT1/badapi
+    curl -s http://127.0.0.1:$PORT1/metrics
+    curl -s http://127.0.0.1:$PORT2/metrics
+    curl -s http://127.0.0.1:$PORT1/metrics | grep -q 'http_responses_total.*result.*error'
+    curl -s http://127.0.0.1:$PORT1/metrics | grep -q 'http_responses_total.*result.*fdcache'
+    curl -s http://127.0.0.1:$PORT2/metrics | grep -q 'http_responses_total.*result.*upstream'
  fi
  
  ########################################################################
author	Frank Ch. Eigler <fche@redhat.com>
	Fri, 27 Dec 2019 00:06:46 +0000 (19:06 -0500)
committer	Frank Ch. Eigler <fche@redhat.com>
	Sun, 12 Jan 2020 03:41:54 +0000 (22:41 -0500)
config/ChangeLog		patch \| blob \| blame \| history
config/debuginfod.service		patch \| blob \| blame \| history
debuginfod/ChangeLog		patch \| blob \| blame \| history
debuginfod/debuginfod.cxx		patch \| blob \| blame \| history
doc/ChangeLog		patch \| blob \| blame \| history
doc/debuginfod.8		patch \| blob \| blame \| history
tests/ChangeLog		patch \| blob \| blame \| history
tests/run-debuginfod-find.sh		patch \| blob \| blame \| history