PR29472: debuginfod: add metadata query webapi, C api, client

author Frank Ch. Eigler <fche@redhat.com>

Mon, 31 Oct 2022 21:40:01 +0000 (17:40 -0400)

committer Frank Ch. Eigler <fche@redhat.com>

Mon, 3 Jun 2024 15:22:49 +0000 (11:22 -0400)
author Frank Ch. Eigler <fche@redhat.com>
Mon, 31 Oct 2022 21:40:01 +0000 (17:40 -0400)
committer Frank Ch. Eigler <fche@redhat.com>
Mon, 3 Jun 2024 15:22:49 +0000 (11:22 -0400)
diff --git a/NEWS b/NEWS

index 6f931bb518cce2e7e1ed7ca7ef52d95a9a1c9cdb..300db133526f3da38d4aefda343a65808307a021 100644 (file)
--- a/NEWS
+++ b/NEWS
@@ -3,6 +3,8 @@ Version 0.192 (one after 0.191)
  debuginfod: Add per-file signature verification for integrity
              checking, using RPM IMA scheme from Fedora/RHEL.
  
+debuginfod: New API for metadata queries: file name -> buildid.
+
  Version 0.191 "Bug fixes in C major"
  
  libdw: dwarf_addrdie now supports binaries lacking a .debug_aranges
diff --git a/config/elfutils.spec.in b/config/elfutils.spec.in

index 460729972420199c55c96f526c5ca3b495d3a17b..eff04575573092e74ca7b753313b0b0205081ab5 100644 (file)
--- a/config/elfutils.spec.in
+++ b/config/elfutils.spec.in
@@ -31,6 +31,8 @@ BuildRequires: pkgconfig(libmicrohttpd) >= 0.9.33
  BuildRequires: pkgconfig(libcurl) >= 7.29.0
  BuildRequires: pkgconfig(sqlite3) >= 3.7.17
  BuildRequires: pkgconfig(libarchive) >= 3.1.2
+# For debuginfod metadata query
+BuildRequires: pkgconfig(json-c) >= 0.11
  
  # For tests need to bunzip2 test files.
  BuildRequires: bzip2
@@ -42,6 +44,8 @@ BuildRequires: bsdtar
  BuildRequires: curl
  # For run-debuginfod-response-headers.sh test case
  BuildRequires: socat
+# For run-debuginfod-find-metadata.sh
+BuildRequires: jq
  
  # For debuginfod rpm IMA verification
  BuildRequires: rpm-devel
diff --git a/configure.ac b/configure.ac

index 5adf766720e4f93ae229620c5c2dfc8b4b53ea23..836d61ea6c0d79dcc301a0317ee97f5ace8a2552 100644 (file)
--- a/configure.ac
+++ b/configure.ac
@@ -863,9 +863,6 @@ AS_IF([test "x$enable_libdebuginfod" != "xno"], [
        enable_libdebuginfod=yes # presume success
        PKG_PROG_PKG_CONFIG
        PKG_CHECK_MODULES([libcurl],[libcurl >= 7.29.0],[],[enable_libdebuginfod=no])
-      if test "x$enable_libdebuginfod" = "xno"; then
-        AC_MSG_ERROR([dependencies not found, use --disable-libdebuginfod to disable or --enable-libdebuginfod=dummy to build a (bootstrap) dummy library.])
-      fi
      else
        AC_MSG_NOTICE([building (bootstrap) dummy libdebuginfo library])
      fi
@@ -899,10 +896,8 @@ AS_IF([test "x$enable_debuginfod" != "xno"], [
      PKG_CHECK_MODULES([libmicrohttpd],[libmicrohttpd >= 0.9.33],[],[enable_debuginfod=no])
      PKG_CHECK_MODULES([oldlibmicrohttpd],[libmicrohttpd < 0.9.51],[old_libmicrohttpd=yes],[old_libmicrohttpd=no])
      PKG_CHECK_MODULES([sqlite3],[sqlite3 >= 3.7.17],[],[enable_debuginfod=no])
-    PKG_CHECK_MODULES([libarchive],[libarchive >= 3.1.2],[],[enable_debuginfod=no], AC_DEFINE([HAVE_LIBARCHIVE], [0], [Define to 0 if libarchive is not available]))
-    if test "x$enable_debuginfod" = "xno"; then
-      AC_MSG_ERROR([dependencies not found, use --disable-debuginfod to disable.])
-    fi
+    PKG_CHECK_MODULES([libarchive],[libarchive >= 3.1.2],[],[enable_debuginfod=no])
+    PKG_CHECK_MODULES([jsonc],[json-c >= 0.11],[],[enable_debuginfod=no])    
  ])
  
  AS_IF([test "x$enable_debuginfod" != "xno"],AC_DEFINE([ENABLE_DEBUGINFOD],[1],[Build debuginfod]))
diff --git a/debuginfod/Makefile.am b/debuginfod/Makefile.am

index 5e4f9669d7c1958dfe8e6cdba82891adfe9c63c1..b74e3673a97e96b0ca2e0817c3087fa0efa7972b 100644 (file)
--- a/debuginfod/Makefile.am
+++ b/debuginfod/Makefile.am
@@ -33,7 +33,7 @@ include $(top_srcdir)/config/eu.am
  AM_CPPFLAGS += -I$(srcdir) -I$(srcdir)/../libelf -I$(srcdir)/../libebl \
            -I$(srcdir)/../libdw -I$(srcdir)/../libdwelf \
            $(libmicrohttpd_CFLAGS) $(libcurl_CFLAGS) $(sqlite3_CFLAGS) \
-          $(libarchive_CFLAGS)
+          $(libarchive_CFLAGS) $(jsonc_CFLAGS)
  
  # Disable eu- prefixing for artifacts (binaries & man pages) in this
  # directory, since they do not conflict with binutils tools.
@@ -70,10 +70,10 @@ bin_PROGRAMS += debuginfod-find
  endif
  
  debuginfod_SOURCES = debuginfod.cxx
-debuginfod_LDADD = $(libdw) $(libelf) $(libeu) $(libdebuginfod) $(argp_LDADD) $(fts_LIBS) $(libmicrohttpd_LIBS) $(sqlite3_LIBS) $(libarchive_LIBS) $(rpm_LIBS) -lpthread -ldl
+debuginfod_LDADD = $(libdw) $(libelf) $(libeu) $(libdebuginfod) $(argp_LDADD) $(fts_LIBS) $(libmicrohttpd_LIBS) $(sqlite3_LIBS) $(libarchive_LIBS) $(rpm_LIBS) $(jsonc_LIBS) $(libcurl_LIBS) -lpthread -ldl
  
  debuginfod_find_SOURCES = debuginfod-find.c
-debuginfod_find_LDADD = $(libdw) $(libelf) $(libeu) $(libdebuginfod) $(argp_LDADD) $(fts_LIBS)
+debuginfod_find_LDADD = $(libdw) $(libelf) $(libeu) $(libdebuginfod) $(argp_LDADD) $(fts_LIBS) $(jsonc_LIBS)
  
  if LIBDEBUGINFOD
  noinst_LIBRARIES = libdebuginfod.a
@@ -97,7 +97,7 @@ libdebuginfod_so_LIBS = libdebuginfod_pic.a
  if DUMMY_LIBDEBUGINFOD
  libdebuginfod_so_LDLIBS =
  else
-libdebuginfod_so_LDLIBS = -lpthread $(libcurl_LIBS) $(fts_LIBS) $(libelf) $(crypto_LIBS)
+libdebuginfod_so_LDLIBS = -lpthread $(libcurl_LIBS) $(fts_LIBS) $(libelf) $(crypto_LIBS) $(jsonc_LIBS)
  endif
  $(LIBDEBUGINFOD_SONAME): $(srcdir)/libdebuginfod.map $(libdebuginfod_so_LIBS)
         $(AM_V_CCLD)$(LINK) $(dso_LDFLAGS) -o $@ \
diff --git a/debuginfod/debuginfod-client.c b/debuginfod/debuginfod-client.c

index f01d1f0e55faf5166ba06b604a1933fc5ff0b953..3d6f8d8c4beac8448fc8426d0ca6eb2129970f83 100644 (file)
--- a/debuginfod/debuginfod-client.c
+++ b/debuginfod/debuginfod-client.c
@@ -71,6 +71,8 @@ int debuginfod_find_source (debuginfod_client *c, const unsigned char *b,
  int debuginfod_find_section (debuginfod_client *c, const unsigned char *b,
                              int s, const char *scn, char **p)
                               { return -ENOSYS; }
+int
+debuginfod_find_metadata (debuginfod_client *c, const char *k,
+                          const char *v, char **p)
+{
+  /* Stub: unconditionally reports -ENOSYS, like the neighbouring stubs.  */
+  return -ENOSYS;
+}
  void debuginfod_set_progressfn(debuginfod_client *c,
                                debuginfod_progressfn_t fn) { }
  void debuginfod_set_verbose_fd(debuginfod_client *c, int fd) { }
@@ -104,6 +106,7 @@ void debuginfod_end (debuginfod_client *c) { }
  #include <sys/utsname.h>
  #include <curl/curl.h>
  #include <fnmatch.h>
+#include <json-c/json.h>
  
  /* If fts.h is included before config.h, its indirect inclusions may not
     give us the right LFS aliases of these functions, so map them manually.  */
@@ -211,6 +214,11 @@ static const char *cache_miss_filename = "cache_miss_s";
  static const char *cache_max_unused_age_filename = "max_unused_age_s";
  static const long cache_default_max_unused_age_s = 604800; /* 1 week */
  
+/* The metadata_retention_s file within the debuginfod cache
+   specifies how long metadata query results should be cached;
+   metadata_retention_default_s is the default value, in seconds. */
+static const long metadata_retention_default_s = 3600; /* 1 hour */
+static const char *metadata_retention_filename = "metadata_retention_s";
+
  /* Location of the cache of files downloaded from debuginfods.
     The default parent directory is $HOME, or '/' if $HOME doesn't exist.  */
  static const char *cache_default_name = ".debuginfod_client_cache";
@@ -249,9 +257,14 @@ struct handle_data
       to the cache. Used to ensure that a file is not downloaded from
       multiple servers unnecessarily.  */
    CURL **target_handle;
+
    /* Response http headers for this client handle, sent from the server */
    char *response_data;
    size_t response_data_size;
+
+  /* Response metadata values for this client handle, sent from the server */
+  char *metadata;
+  size_t metadata_size;
  };
  
  
@@ -556,7 +569,8 @@ debuginfod_clean_cache(debuginfod_client *c,
      return -errno;
  
    regex_t re;
-  const char * pattern = ".*/[a-f0-9]+(/debuginfo|/executable|/source.*|)$"; /* include dirs */
+  const char * pattern = ".*/(metadata.*|[a-f0-9]+(/debuginfo|/executable|/source.*|))$"; /* include dirs */
+  /* NB: also matches .../section/ subdirs, so extracted section files also get cleaned. */
    if (regcomp (&re, pattern, REG_EXTENDED | REG_NOSUB) != 0)
      return -ENOMEM;
  
@@ -794,18 +808,9 @@ header_callback (char * buffer, size_t size, size_t numitems, void * userdata)
    }
    /* Temporary buffer for realloc */
    char *temp = NULL;
-  if (data->response_data == NULL)
-    {
-      temp = malloc(numitems);
-      if (temp == NULL)
-        return 0;
-    }
-  else
-    {
-      temp = realloc(data->response_data, data->response_data_size + numitems);
-      if (temp == NULL)
-        return 0;
-    }
+  temp = realloc(data->response_data, data->response_data_size + numitems);
+  if (temp == NULL)
+    return 0;
  
    memcpy(temp + data->response_data_size, buffer, numitems-1);
    data->response_data = temp;
@@ -815,6 +820,384 @@ header_callback (char * buffer, size_t size, size_t numitems, void * userdata)
    return numitems;
  }
  
+
+static size_t
+metadata_callback (char * buffer, size_t size, size_t numitems, void * userdata)
+{
+  /* Only byte-sized items are handled; anything else is reported as
+     zero bytes consumed.  */
+  if (size != 1)
+    return 0;
+
+  struct handle_data *hd = (struct handle_data *) userdata;
+  const size_t old_len = hd->metadata_size;
+  const size_t new_len = old_len + numitems;
+
+  /* Grow the accumulated buffer, keeping one extra byte for the
+     terminating NUL.  The old buffer stays owned by hd on failure.  */
+  char *grown = realloc(hd->metadata, new_len + 1);
+  if (grown == NULL)
+    return 0;
+
+  /* Append the new chunk and keep the result a valid C string.  */
+  memcpy(grown + old_len, buffer, numitems);
+  grown[new_len] = '\0';
+
+  hd->metadata = grown;
+  hd->metadata_size = new_len;
+  return numitems;
+}
+
+
+/* This function takes a copy of DEBUGINFOD_URLS, server_urls, and
+ * separates it into an array of urls to query, each with a
+ * corresponding IMA policy. The url_subdir is either 'buildid' or
+ * 'metadata', corresponding to the query type. Returns 0 on success
+ * and -Posix error on failure.
+ *
+ * server_urls is tokenized in place with strtok_r, so it must be
+ * writable.  *server_url_list and (when url_ima_policies is non-NULL)
+ * *url_ima_policies are grown with reallocarray, so on entry they must
+ * hold NULL or a pointer that may be passed to realloc.  The stored
+ * URL strings are allocated with asprintf.  url_ima_policies itself
+ * may be NULL, in which case no policies are recorded.
+ *
+ * *num_urls is always set to the number of URLs actually stored, also
+ * on failure, so that a partially built list can still be released.
+ * Diagnostics are written to vfd when it is >= 0.
+ */
+int
+init_server_urls(char* url_subdir, const char* type,
+                 char *server_urls, char ***server_url_list, ima_policy_t **url_ima_policies,
+                 int *num_urls, int vfd)
+{
+  char *strtok_saveptr;
+  ima_policy_t verification_mode = ignore; // The default mode
+  char *server_url = strtok_r(server_urls, url_delim, &strtok_saveptr);
+  /* Number of unique URLs stored so far.  */
+  int n = 0;
+  int rc = 0;
+
+  while (server_url != NULL)
+    {
+      // When we encounter a (well-formed) token of the form
+      // ima:foo, we update the policy under which results from that
+      // server will be ima verified
+      if (startswith(server_url, "ima:"))
+        {
+#ifdef ENABLE_IMA_VERIFICATION
+          ima_policy_t m = ima_policy_str2enum(server_url + strlen("ima:"));
+          if(m != undefined)
+            verification_mode = m;
+          else if (vfd >= 0)
+            dprintf(vfd, "IMA mode not recognized, skipping %s\n", server_url);
+#else
+          if (vfd >= 0)
+            dprintf(vfd, "IMA signature verification is not enabled, treating %s as ima:ignore\n", server_url);
+#endif
+          goto continue_next_url;
+        }
+
+      if (verification_mode==enforcing &&
+          0==strcmp(url_subdir, "buildid") &&
+          0==strcmp(type,"section")) // section queries are unsecurable
+        {
+          if (vfd >= 0)
+            dprintf(vfd, "skipping server %s section query in IMA enforcing mode\n", server_url);
+          goto continue_next_url;
+        }
+
+      // Construct actual URL for libcurl
+      int r;
+      char *tmp_url;
+      if (strlen(server_url) > 1 && server_url[strlen(server_url)-1] == '/')
+        r = asprintf(&tmp_url, "%s%s", server_url, url_subdir);
+      else
+        r = asprintf(&tmp_url, "%s/%s", server_url, url_subdir);
+
+      if (r == -1)
+        {
+          rc = -ENOMEM;
+          break;
+        }
+
+      /* PR 27983: If the url is duplicate, skip it */
+      int url_index;
+      for (url_index = 0; url_index < n; ++url_index)
+        {
+          if(strcmp(tmp_url, (*server_url_list)[url_index]) == 0)
+            {
+              url_index = -1;
+              break;
+            }
+        }
+      if (url_index == -1)
+        {
+          if (vfd >= 0)
+            dprintf(vfd, "duplicate url: %s, skipping\n", tmp_url);
+          free(tmp_url);
+        }
+      else
+        {
+          /* Have unique URL, save it, along with its IMA verification tag.
+             Grow through temporaries so that the arrays already handed
+             to the caller stay valid (and freeable) if an allocation
+             fails, instead of being overwritten with NULL.  */
+          char **new_list = reallocarray(*server_url_list, n + 1, sizeof(char*));
+          if (new_list == NULL)
+            {
+              free (tmp_url);
+              rc = -ENOMEM;
+              break;
+            }
+          *server_url_list = new_list;
+
+          /* The policy array is optional; only touch it when requested.  */
+          if (NULL != url_ima_policies)
+            {
+              ima_policy_t *new_policies
+                = reallocarray(*url_ima_policies, n + 1, sizeof(ima_policy_t));
+              if (new_policies == NULL)
+                {
+                  free (tmp_url);
+                  rc = -ENOMEM;
+                  break;
+                }
+              *url_ima_policies = new_policies;
+              new_policies[n] = verification_mode;
+            }
+
+          new_list[n] = tmp_url;
+          n ++;
+        }
+
+    continue_next_url:
+      server_url = strtok_r(NULL, url_delim, &strtok_saveptr);
+    }
+  *num_urls = n;
+  return rc;
+}
+
+/* Some boilerplate for checking curl_easy_setopt.
+   Calls curl_easy_setopt (H,O,P); if the result is not CURLE_OK, the
+   macro logs the curl error string to vfd (when vfd >= 0) and then
+   returns -EINVAL from the *enclosing function*.  It therefore needs
+   an int named vfd in scope at the point of use, and may only be used
+   in functions returning an int error code.  */
+#define curl_easy_setopt_ck(H,O,P) do {                        \
+      CURLcode curl_res = curl_easy_setopt (H,O,P);    \
+      if (curl_res != CURLE_OK)                                \
+           {                                           \
+             if (vfd >= 0)                             \
+               dprintf (vfd,                           \
+                         "Bad curl_easy_setopt: %s\n", \
+                         curl_easy_strerror(curl_res));        \
+             return -EINVAL;                           \
+           }                                           \
+      } while (0)
+
+
+/*
+ * This function initializes a CURL handle. It takes optional callbacks for the write
+ * function and the header function, which if defined will use userdata of type struct handle_data*.
+ * Specifically the data[i] within an array of struct handle_data's.
+ * Returns 0 on success and -Posix error on failure.
+ *
+ * client     - supplies the request header list (client->headers).
+ * w_callback - if non-NULL, installed as CURLOPT_WRITEFUNCTION with data as CURLOPT_WRITEDATA.
+ * h_callback - if non-NULL, installed as CURLOPT_HEADERFUNCTION with data as CURLOPT_HEADERDATA.
+ * data       - receives the new handle in data->handle; data->url is the URL to fetch and
+ *              data->errbuf is registered as the curl error buffer when vfd >= 0.
+ * i          - index of this query; only used in the verbose "url %d %s" message.
+ * timeout    - if > 0, the transfer must sustain 100K per timeout seconds.
+ * vfd        - verbose output file descriptor, or negative for none.
+ *
+ * Failure codes: -ENETUNREACH if curl_easy_init fails; -EINVAL (via
+ * curl_easy_setopt_ck) if any option cannot be set.  In the latter case
+ * data->handle has already been assigned and is not cleaned up here.
+ */
+int
+init_handle(debuginfod_client *client,
+  size_t (*w_callback)(char *buffer, size_t size, size_t nitems, void *userdata),
+  size_t (*h_callback)(char *buffer, size_t size, size_t nitems, void *userdata),
+  struct handle_data *data, int i, long timeout,
+  int vfd)
+{
+  data->handle = curl_easy_init();
+  if (data->handle == NULL)
+    return -ENETUNREACH;
+
+  if (vfd >= 0)
+    dprintf (vfd, "url %d %s\n", i, data->url);
+
+  /* Only allow http:// + https:// + file:// so we aren't being
+    redirected to some unsupported protocol.
+    libcurl will fail if we request a single protocol that is not
+    available. https missing is the most likely issue  */
+#if CURL_AT_LEAST_VERSION(7, 85, 0)
+  curl_easy_setopt_ck(data->handle, CURLOPT_PROTOCOLS_STR,
+                      curl_has_https ? "https,http,file" : "http,file");
+#else
+  curl_easy_setopt_ck(data->handle, CURLOPT_PROTOCOLS,
+                      ((curl_has_https ? CURLPROTO_HTTPS : 0) | CURLPROTO_HTTP | CURLPROTO_FILE));
+#endif
+  curl_easy_setopt_ck(data->handle, CURLOPT_URL, data->url);
+  if (vfd >= 0)
+    curl_easy_setopt_ck(data->handle, CURLOPT_ERRORBUFFER,
+      data->errbuf);
+  if (w_callback)
+    {
+      curl_easy_setopt_ck(data->handle,
+                          CURLOPT_WRITEFUNCTION, w_callback);
+      curl_easy_setopt_ck(data->handle, CURLOPT_WRITEDATA, data);
+    }
+  if (timeout > 0)
+    {
+      /* Make sure there is at least some progress,
+         try to get at least 100K per timeout seconds.  */
+      curl_easy_setopt_ck (data->handle, CURLOPT_LOW_SPEED_TIME,
+                           timeout);
+      curl_easy_setopt_ck (data->handle, CURLOPT_LOW_SPEED_LIMIT,
+                           100 * 1024L);
+    }
+  curl_easy_setopt_ck(data->handle, CURLOPT_FILETIME, (long) 1);
+  curl_easy_setopt_ck(data->handle, CURLOPT_FOLLOWLOCATION, (long) 1);
+  curl_easy_setopt_ck(data->handle, CURLOPT_FAILONERROR, (long) 1);
+  curl_easy_setopt_ck(data->handle, CURLOPT_NOSIGNAL, (long) 1);
+  if (h_callback)
+    {
+      curl_easy_setopt_ck(data->handle,
+                          CURLOPT_HEADERFUNCTION, h_callback);
+      curl_easy_setopt_ck(data->handle, CURLOPT_HEADERDATA, data);
+    }
+  #if LIBCURL_VERSION_NUM >= 0x072a00 /* 7.42.0 */
+  curl_easy_setopt_ck(data->handle, CURLOPT_PATH_AS_IS, (long) 1);
+  #else
+  /* On old curl; no big deal, canonicalization here is almost the
+      same, except perhaps for ? # type decorations at the tail. */
+  #endif
+  curl_easy_setopt_ck(data->handle, CURLOPT_AUTOREFERER, (long) 1);
+  curl_easy_setopt_ck(data->handle, CURLOPT_ACCEPT_ENCODING, "");
+  curl_easy_setopt_ck(data->handle, CURLOPT_HTTPHEADER, client->headers);
+
+  return 0;
+}
+
+
+/*
+ * This function drives one or more curl queries on curlm until they
+ * complete, polling with curl_multi_wait (at most 1 second per
+ * iteration) and curl_multi_perform.  This can be controlled via
+ * only_one, which, if true, will find the first winner (the handle
+ * stored in *target_handle, if any), remove the other handles from
+ * curlm, and record the winner's index in *committed_to (-1 while no
+ * winner is known).  If positive, maxtime and maxsize dictate the
+ * maximum allowed wait time (seconds) and download size respectively.
+ *
+ * c->winning_headers is freed on entry and, once a winner is found,
+ * takes ownership of that handle's response_data.  If c->progressfn
+ * is set it is called once per iteration after a winner is known; a
+ * nonzero return from it ends the loop early (this function then
+ * still returns 0).  Diagnostics go to vfd when it is >= 0.
+ *
+ * Returns 0 on success and -Posix error on failure: -errno from
+ * clock_gettime, -ETIME on timeout, -ENOMEM / -ENETUNREACH for curl
+ * multi errors, -EFBIG if the download exceeds maxsize.
+ */
+int
+perform_queries(CURLM *curlm, CURL **target_handle, struct handle_data *data, debuginfod_client *c,
+                int num_urls, long maxtime, long maxsize, bool only_one, int vfd, int *committed_to)
+{
+  int still_running = -1;
+  long loops = 0;
+  *committed_to = -1;
+  bool verbose_reported = false;
+  struct timespec start_time, cur_time;
+  if (c->winning_headers != NULL)
+    {
+      free (c->winning_headers);
+      c->winning_headers = NULL;
+    }
+  if (maxtime > 0 && clock_gettime(CLOCK_MONOTONIC_RAW, &start_time) == -1)
+    return -errno;
+  long delta = 0;
+  do
+    {
+      /* Check to see how long querying is taking. */
+      if (maxtime > 0)
+        {
+          if (clock_gettime(CLOCK_MONOTONIC_RAW, &cur_time) == -1)
+            return -errno;
+          delta = cur_time.tv_sec - start_time.tv_sec;
+          if ( delta >  maxtime)
+            {
+              /* Only report when a verbose fd was actually configured.  */
+              if (vfd >= 0)
+                dprintf(vfd, "Timeout with max time=%lds and transfer time=%lds\n", maxtime, delta );
+              return -ETIME;
+            }
+        }
+      /* Wait 1 second, the minimum DEBUGINFOD_TIMEOUT.  */
+      curl_multi_wait(curlm, NULL, 0, 1000, NULL);
+      CURLMcode curlm_res = curl_multi_perform(curlm, &still_running);
+
+      if (only_one)
+        {
+          /* If the target file has been found, abort the other queries.  */
+          if (target_handle && *target_handle != NULL)
+            {
+              for (int i = 0; i < num_urls; i++)
+                if (data[i].handle != *target_handle)
+                  curl_multi_remove_handle(curlm, data[i].handle);
+                else
+                  {
+                    *committed_to = i;
+                    if (c->winning_headers == NULL)
+                      {
+                        /* Hand the winner's headers over to the client.  */
+                        c->winning_headers = data[*committed_to].response_data;
+                        if (vfd >= 0 && c->winning_headers != NULL)
+                          dprintf(vfd, "\n%s", c->winning_headers);
+                        data[*committed_to].response_data = NULL;
+                        data[*committed_to].response_data_size = 0;
+                      }
+                  }
+            }
+
+          if (vfd >= 0 && !verbose_reported && *committed_to >= 0)
+            {
+              bool pnl = (c->default_progressfn_printed_p && vfd == STDERR_FILENO);
+              dprintf (vfd, "%scommitted to url %d\n", pnl ? "\n" : "",
+                       *committed_to);
+              if (pnl)
+                c->default_progressfn_printed_p = 0;
+              verbose_reported = true;
+            }
+        }
+
+      if (curlm_res != CURLM_OK)
+        {
+          switch (curlm_res)
+            {
+            case CURLM_CALL_MULTI_PERFORM: continue;
+            case CURLM_OUT_OF_MEMORY: return -ENOMEM;
+            default: return -ENETUNREACH;
+            }
+        }
+
+      long dl_size = -1;
+      if (target_handle && *target_handle && (c->progressfn || maxsize > 0))
+        {
+          /* Get size of file being downloaded. NB: If going through
+             deflate-compressing proxies, this number is likely to be
+             unavailable, so -1 may show. */
+          CURLcode curl_res;
+#if CURL_AT_LEAST_VERSION(7, 55, 0)
+          curl_off_t cl;
+          curl_res = curl_easy_getinfo(*target_handle,
+                                       CURLINFO_CONTENT_LENGTH_DOWNLOAD_T,
+                                       &cl);
+          if (curl_res == CURLE_OK && cl >= 0)
+            dl_size = (cl > LONG_MAX ? LONG_MAX : (long)cl);
+#else
+          double cl;
+          curl_res = curl_easy_getinfo(*target_handle,
+                                       CURLINFO_CONTENT_LENGTH_DOWNLOAD,
+                                       &cl);
+          if (curl_res == CURLE_OK && cl >= 0)
+            dl_size = (cl >= (double)(LONG_MAX+1UL) ? LONG_MAX : (long)cl);
+#endif
+          /* If Content-Length is -1, try to get the size from
+             X-Debuginfod-Size */
+          if (dl_size == -1 && c->winning_headers != NULL)
+            {
+              long xdl;
+              char *hdr = strcasestr(c->winning_headers, "x-debuginfod-size");
+              size_t off = strlen("x-debuginfod-size:");
+
+              if (hdr != NULL && sscanf(hdr + off, "%ld", &xdl) == 1)
+                dl_size = xdl;
+            }
+        }
+
+      if (c->progressfn) /* inform/check progress callback */
+        {
+          loops ++;
+          long pa = loops; /* default param for progress callback */
+          if (target_handle && *target_handle) /* we've committed to a server; report its download progress */
+            {
+              /* PR30809: Check actual size of cached file.  This same
+                 fd is shared by all the multi-curl handles (but only
+                 one will end up writing to it).  Another way could be
+                 to tabulate totals in debuginfod_write_callback().
+                 *committed_to is only assigned in only_one mode, so do
+                 not index data[] with it while it is still -1.  */
+              struct stat cached;
+              int statrc = (*committed_to >= 0
+                            ? fstat(data[*committed_to].fd, &cached)
+                            : -1);
+              if (statrc == 0)
+                pa = (long) cached.st_size;
+              else
+                {
+                  /* Otherwise, query libcurl for its tabulated total.
+                     However, that counts http body length, not
+                     decoded/decompressed content length, so does not
+                     measure quite the same thing as dl.
+                     NB: target_handle is a CURL **; curl_easy_getinfo
+                     needs the easy handle itself.  */
+                  CURLcode curl_res;
+#if CURL_AT_LEAST_VERSION(7, 55, 0)
+                  curl_off_t dl;
+                  curl_res = curl_easy_getinfo(*target_handle,
+                                               CURLINFO_SIZE_DOWNLOAD_T,
+                                               &dl);
+                  if (curl_res == 0 && dl >= 0)
+                    pa = (dl > LONG_MAX ? LONG_MAX : (long)dl);
+#else
+                  double dl;
+                  curl_res = curl_easy_getinfo(*target_handle,
+                                               CURLINFO_SIZE_DOWNLOAD,
+                                               &dl);
+                  if (curl_res == 0)
+                    pa = (dl >= (double)(LONG_MAX+1UL) ? LONG_MAX : (long)dl);
+#endif
+                }
+
+              /* A nonzero return from the callback cancels the wait.  */
+              if ((*c->progressfn) (c, pa, dl_size == -1 ? 0 : dl_size))
+                break;
+            }
+        }
+      /* Check to see if we are downloading something which exceeds maxsize, if set.*/
+      if (target_handle && *target_handle && dl_size > maxsize && maxsize > 0)
+        {
+          if (vfd >=0)
+            dprintf(vfd, "Content-Length too large.\n");
+          return -EFBIG;
+        }
+    } while (still_running);
+
+  return 0;
+}
+
+
  /* Copy SRC to DEST, s,/,#,g */
  
  static void
@@ -1258,43 +1641,121 @@ debuginfod_validate_imasig (debuginfod_client *c, int fd)
  
  
  
-/* Query each of the server URLs found in $DEBUGINFOD_URLS for the file
-   with the specified build-id and type (debuginfo, executable, source or
-   section).  If type is source, then type_arg should be a filename.  If
-   type is section, then type_arg should be the name of an ELF/DWARF
-   section.  Otherwise type_arg may be NULL.  Return a file descriptor
-   for the target if successful, otherwise return an error code.
-*/
-static int
-debuginfod_query_server (debuginfod_client *c,
-                        const unsigned char *build_id,
-                         int build_id_len,
-                         const char *type,
-                         const char *type_arg,
-                         char **path)
-{
-  char *server_urls;
-  char *urls_envvar;
-  const char *section = NULL;
-  const char *filename = NULL;
-  char *cache_path = NULL;
-  char *maxage_path = NULL;
-  char *interval_path = NULL;
-  char *cache_miss_path = NULL;
-  char *target_cache_dir = NULL;
-  char *target_cache_path = NULL;
-  char *target_cache_tmppath = NULL;
-  char suffix[PATH_MAX + 1]; /* +1 for zero terminator.  */
-  char build_id_bytes[MAX_BUILD_ID_BYTES * 2 + 1];
-  int vfd = c->verbose_fd;
-  int rc;
  
-  c->progressfn_cancel = false;
+/* Helper function to create client cache directory.
+   $XDG_CACHE_HOME takes priority over $HOME/.cache.
+   $DEBUGINFOD_CACHE_PATH takes priority over $HOME/.cache and $XDG_CACHE_HOME.
  
-  if (strcmp (type, "source") == 0)
-    filename = type_arg;
-  else if (strcmp (type, "section") == 0)
-    {
+   Return resulting path name or NULL on error.  Caller must free resulting string.
+ */
+static char *
+make_cache_path(void)
+{
+  char* cache_path = NULL;
+  int rc = 0;
+  /* Determine location of the cache. The path specified by the debuginfod
+     cache environment variable takes priority.
+     NOTE(review): xalloc_str is defined outside this view; presumably it
+     allocates a formatted string into its first argument and uses rc and
+     the "out" label on allocation failure -- confirm against its
+     definition.  */
+  char *cache_var = getenv(DEBUGINFOD_CACHE_PATH_ENV_VAR);
+  if (cache_var != NULL && strlen (cache_var) > 0)
+    xalloc_str (cache_path, "%s", cache_var);
+  else
+    {
+      /* If a cache already exists in $HOME ('/' if $HOME isn't set), then use
+         that. Otherwise use the XDG cache directory naming format.  */
+      xalloc_str (cache_path, "%s/%s", getenv ("HOME") ?: "/", cache_default_name);
+
+      struct stat st;
+      if (stat (cache_path, &st) < 0)
+        {
+          char cachedir[PATH_MAX];
+          char *xdg = getenv ("XDG_CACHE_HOME");
+
+          if (xdg != NULL && strlen (xdg) > 0)
+            snprintf (cachedir, PATH_MAX, "%s", xdg);
+          else
+            snprintf (cachedir, PATH_MAX, "%s/.cache", getenv ("HOME") ?: "/");
+
+          /* Create XDG cache directory if it doesn't exist.
+             An existing non-directory at that path is an error.  */
+          if (stat (cachedir, &st) == 0)
+            {
+              if (! S_ISDIR (st.st_mode))
+                {
+                  rc = -EEXIST;
+                  goto out1;
+                }
+            }
+          else
+            {
+              rc = mkdir (cachedir, 0700);
+
+              /* Also check for EEXIST and S_ISDIR in case another client just
+                 happened to create the cache.  */
+              if (rc < 0
+                  && (errno != EEXIST
+                      || stat (cachedir, &st) != 0
+                      || ! S_ISDIR (st.st_mode)))
+                {
+                  rc = -errno;
+                  goto out1;
+                }
+            }
+
+          free (cache_path);
+          xalloc_str (cache_path, "%s/%s", cachedir, cache_xdg_name);
+        }
+    }
+
+  goto out;
+  
+ out1:
+  (void) rc; /* the specific error code is not propagated; the caller only sees NULL */
+  free (cache_path);
+  cache_path = NULL;
+
+ out:
+  if (cache_path != NULL)
+    (void) mkdir (cache_path, 0700); // best-effort creation of the cache directory itself; failures with this mkdir would be caught later too
+  return cache_path;
+}
+
+
+/* Query each of the server URLs found in $DEBUGINFOD_URLS for the file
+   with the specified build-id and type (debuginfo, executable, source or
+   section).  If type is source, then type_arg should be a filename.  If
+   type is section, then type_arg should be the name of an ELF/DWARF
+   section.  Otherwise type_arg may be NULL.  Return a file descriptor
+   for the target if successful, otherwise return an error code.
+*/
+static int
+debuginfod_query_server_by_buildid (debuginfod_client *c,
+                        const unsigned char *build_id,
+                         int build_id_len,
+                         const char *type,
+                         const char *type_arg,
+                         char **path)
+{
+  char *server_urls;
+  char *urls_envvar;
+  const char *section = NULL;
+  const char *filename = NULL;
+  char *cache_path = NULL;
+  char *maxage_path = NULL;
+  char *interval_path = NULL;
+  char *cache_miss_path = NULL;
+  char *target_cache_dir = NULL;
+  char *target_cache_path = NULL;
+  char *target_cache_tmppath = NULL;
+  char suffix[PATH_MAX + 1]; /* +1 for zero terminator.  */
+  char build_id_bytes[MAX_BUILD_ID_BYTES * 2 + 1];
+  int vfd = c->verbose_fd;
+  int rc, r;
+
+  c->progressfn_cancel = false;
+
+  if (strcmp (type, "source") == 0)
+    filename = type_arg;
+  else if (strcmp (type, "section") == 0)
+    {
        section = type_arg;
        if (section == NULL)
         return -EINVAL;
@@ -1412,70 +1873,22 @@ debuginfod_query_server (debuginfod_client *c,
      dprintf (vfd, "suffix %s\n", suffix);
  
    /* set paths needed to perform the query
-
-     example format
+     example format:
       cache_path:        $HOME/.cache
       target_cache_dir:  $HOME/.cache/0123abcd
       target_cache_path: $HOME/.cache/0123abcd/debuginfo
       target_cache_path: $HOME/.cache/0123abcd/source#PATH#TO#SOURCE ?
-
-     $XDG_CACHE_HOME takes priority over $HOME/.cache.
-     $DEBUGINFOD_CACHE_PATH takes priority over $HOME/.cache and $XDG_CACHE_HOME.
    */
  
-  /* Determine location of the cache. The path specified by the debuginfod
-     cache environment variable takes priority.  */
-  char *cache_var = getenv(DEBUGINFOD_CACHE_PATH_ENV_VAR);
-  if (cache_var != NULL && strlen (cache_var) > 0)
-    xalloc_str (cache_path, "%s", cache_var);
-  else
+  cache_path = make_cache_path();
+  if (!cache_path)
      {
-      /* If a cache already exists in $HOME ('/' if $HOME isn't set), then use
-         that. Otherwise use the XDG cache directory naming format.  */
-      xalloc_str (cache_path, "%s/%s", getenv ("HOME") ?: "/", cache_default_name);
-
-      struct stat st;
-      if (stat (cache_path, &st) < 0)
-        {
-          char cachedir[PATH_MAX];
-          char *xdg = getenv ("XDG_CACHE_HOME");
-
-          if (xdg != NULL && strlen (xdg) > 0)
-            snprintf (cachedir, PATH_MAX, "%s", xdg);
-          else
-            snprintf (cachedir, PATH_MAX, "%s/.cache", getenv ("HOME") ?: "/");
-
-          /* Create XDG cache directory if it doesn't exist.  */
-          if (stat (cachedir, &st) == 0)
-            {
-              if (! S_ISDIR (st.st_mode))
-                {
-                  rc = -EEXIST;
-                  goto out;
-                }
-            }
-          else
-            {
-              rc = mkdir (cachedir, 0700);
-
-              /* Also check for EEXIST and S_ISDIR in case another client just
-                 happened to create the cache.  */
-              if (rc < 0
-                  && (errno != EEXIST
-                      || stat (cachedir, &st) != 0
-                      || ! S_ISDIR (st.st_mode)))
-                {
-                  rc = -errno;
-                  goto out;
-                }
-            }
-
-          free (cache_path);
-          xalloc_str (cache_path, "%s/%s", cachedir, cache_xdg_name);
-        }
+      rc = -ENOMEM;
+      goto out;
      }
-
    xalloc_str (target_cache_dir, "%s/%s", cache_path, build_id_bytes);
+  (void) mkdir (target_cache_dir, 0700); // failures with this mkdir would be caught later too
+
    if (section != NULL)
      xalloc_str (target_cache_path, "%s/%s-%s", target_cache_dir, type, suffix);
    else
@@ -1594,102 +2007,32 @@ debuginfod_query_server (debuginfod_client *c,
    /* thereafter, goto out0 on error*/
  
    /* Because of a race with cache cleanup / rmdir, try to mkdir/mkstemp up to twice. */
-  for(int i=0; i<2; i++) {
-    /* (re)create target directory in cache */
-    (void) mkdir(target_cache_dir, 0700); /* files will be 0400 later */
-
-    /* NB: write to a temporary file first, to avoid race condition of
-       multiple clients checking the cache, while a partially-written or empty
-       file is in there, being written from libcurl. */
-    fd = mkstemp (target_cache_tmppath);
-    if (fd >= 0) break;
-  }
+  for(int i=0; i<2; i++)
+    {
+      /* (re)create target directory in cache */
+      (void) mkdir(target_cache_dir, 0700); /* files will be 0400 later */
+      
+      /* NB: write to a temporary file first, to avoid race condition of
+         multiple clients checking the cache, while a partially-written or empty
+         file is in there, being written from libcurl. */
+      fd = mkstemp (target_cache_tmppath);
+      if (fd >= 0) break;
+    }
    if (fd < 0) /* Still failed after two iterations. */
      {
        rc = -errno;
        goto out0;
      }
  
-  /* Initialize the memory to zero */
-  char *strtok_saveptr;
    char **server_url_list = NULL;
    ima_policy_t* url_ima_policies = NULL;
-  char* server_url;
-  /* Count number of URLs.  */
-  int num_urls = 0;
-
-  ima_policy_t verification_mode = ignore; // The default mode
-  for(server_url = strtok_r(server_urls, url_delim, &strtok_saveptr);
-      server_url != NULL; server_url = strtok_r(NULL, url_delim, &strtok_saveptr))
+  char *server_url;
+  int num_urls;
+  r = init_server_urls("buildid", type, server_urls, &server_url_list, &url_ima_policies, &num_urls, vfd);
+  if (0 != r)
      {
-      // When we encounted a (well-formed) token off the form ima:foo, we update the policy
-      // under which results from that server will be ima verified
-      if(startswith(server_url, "ima:"))
-      {
-#ifdef ENABLE_IMA_VERIFICATION
-        ima_policy_t m = ima_policy_str2enum(server_url + strlen("ima:"));
-        if(m != undefined)
-          verification_mode = m;
-        else if (vfd >= 0)
-          dprintf(vfd, "IMA mode not recognized, skipping %s\n", server_url);
-#else
-        if (vfd >= 0)
-            dprintf(vfd, "IMA signature verification is not enabled, skipping %s\n", server_url);
-#endif
-        continue; // Not a url, just a mode change so keep going
-      }
-
-      if (verification_mode==enforcing && 0==strcmp(type,"section"))
-        {
-          if (vfd >= 0)
-            dprintf(vfd, "skipping server %s section query in IMA enforcing mode\n", server_url);
-          continue;
-        }
-      
-      /* PR 27983: If the url is already set to be used use, skip it */
-      char *slashbuildid;
-      if (strlen(server_url) > 1 && server_url[strlen(server_url)-1] == '/')
-        slashbuildid = "buildid";
-      else
-        slashbuildid = "/buildid";
-
-      char *tmp_url;
-      if (asprintf(&tmp_url, "%s%s", server_url, slashbuildid) == -1)
-        {
-          rc = -ENOMEM;
-          goto out1;
-        }
-      int url_index;
-      for (url_index = 0; url_index < num_urls; ++url_index)
-        {
-          if(strcmp(tmp_url, server_url_list[url_index]) == 0)
-            {
-              url_index = -1;
-              break;
-            }
-        }
-      if (url_index == -1)
-        {
-          if (vfd >= 0)
-            dprintf(vfd, "duplicate url: %s, skipping\n", tmp_url);
-          free(tmp_url);
-        }
-      else
-        {
-          num_urls++;
-          if (NULL == (server_url_list  = reallocarray(server_url_list, num_urls, sizeof(char*)))
-#ifdef ENABLE_IMA_VERIFICATION
-          || NULL == (url_ima_policies = reallocarray(url_ima_policies, num_urls, sizeof(ima_policy_t)))
-#endif
-            )
-            {
-              free (tmp_url);
-              rc = -ENOMEM;
-              goto out1;
-            }
-          server_url_list[num_urls-1] = tmp_url;
-          if(NULL != url_ima_policies) url_ima_policies[num_urls-1] = verification_mode;
-        }
+      rc = r;
+      goto out1;
      }
  
    /* No URLs survived parsing / filtering?  Abort abort abort. */
@@ -1705,7 +2048,6 @@ debuginfod_query_server (debuginfod_client *c,
      retry_limit = atoi (retry_limit_envvar);
  
    CURLM *curlm = c->server_mhandle;
-  assert (curlm != NULL);
  
    /* Tracks which handle should write to fd. Set to the first
       handle that is ready to write the target file to the cache.  */
@@ -1773,262 +2115,43 @@ debuginfod_query_server (debuginfod_client *c,
  
        data[i].fd = fd;
        data[i].target_handle = &target_handle;
-      data[i].handle = curl_easy_init();
-      if (data[i].handle == NULL)
-        {
-          if (filename) curl_free (escaped_string);
-          rc = -ENETUNREACH;
-          goto out2;
-        }
        data[i].client = c;
  
        if (filename) /* must start with / */
-        {
-          /* PR28034 escape characters in completed url to %hh format. */
-          snprintf(data[i].url, PATH_MAX, "%s/%s/%s/%s", server_url,
-                   build_id_bytes, type, escaped_string);
-        }
-      else if (section)
-       snprintf(data[i].url, PATH_MAX, "%s/%s/%s/%s", server_url,
-                build_id_bytes, type, section);
-      else
-        snprintf(data[i].url, PATH_MAX, "%s/%s/%s", server_url, build_id_bytes, type);
-      if (vfd >= 0)
-       dprintf (vfd, "url %d %s\n", i, data[i].url);
-
-      /* Some boilerplate for checking curl_easy_setopt.  */
-#define curl_easy_setopt_ck(H,O,P) do {                        \
-      CURLcode curl_res = curl_easy_setopt (H,O,P);    \
-      if (curl_res != CURLE_OK)                                \
-       {                                               \
-         if (vfd >= 0)                                 \
-           dprintf (vfd,                               \
-                    "Bad curl_easy_setopt: %s\n",      \
-                    curl_easy_strerror(curl_res));     \
-         rc = -EINVAL;                                 \
-         goto out2;                                    \
-       }                                               \
-      } while (0)
-
-      /* Only allow http:// + https:// + file:// so we aren't being
-        redirected to some unsupported protocol.
-         libcurl will fail if we request a single protocol that is not
-         available. https missing is the most likely issue  */
-#if CURL_AT_LEAST_VERSION(7, 85, 0)
-      curl_easy_setopt_ck(data[i].handle, CURLOPT_PROTOCOLS_STR,
-                         curl_has_https ? "https,http,file" : "http,file");
-#else
-      curl_easy_setopt_ck(data[i].handle, CURLOPT_PROTOCOLS,
-                         ((curl_has_https ? CURLPROTO_HTTPS : 0) | CURLPROTO_HTTP | CURLPROTO_FILE));
-#endif
-      curl_easy_setopt_ck(data[i].handle, CURLOPT_URL, data[i].url);
-      if (vfd >= 0)
-       curl_easy_setopt_ck(data[i].handle, CURLOPT_ERRORBUFFER,
-                           data[i].errbuf);
-      curl_easy_setopt_ck(data[i].handle,
-                         CURLOPT_WRITEFUNCTION,
-                         debuginfod_write_callback);
-      curl_easy_setopt_ck(data[i].handle, CURLOPT_WRITEDATA, (void*)&data[i]);
-      if (timeout > 0)
-       {
-         /* Make sure there is at least some progress,
-            try to get at least 100K per timeout seconds.  */
-         curl_easy_setopt_ck (data[i].handle, CURLOPT_LOW_SPEED_TIME,
-                              timeout);
-         curl_easy_setopt_ck (data[i].handle, CURLOPT_LOW_SPEED_LIMIT,
-                              100 * 1024L);
-       }
-      curl_easy_setopt_ck(data[i].handle, CURLOPT_FILETIME, (long) 1);
-      curl_easy_setopt_ck(data[i].handle, CURLOPT_FOLLOWLOCATION, (long) 1);
-      curl_easy_setopt_ck(data[i].handle, CURLOPT_FAILONERROR, (long) 1);
-      curl_easy_setopt_ck(data[i].handle, CURLOPT_NOSIGNAL, (long) 1);
-      curl_easy_setopt_ck(data[i].handle, CURLOPT_HEADERFUNCTION,
-                         header_callback);
-      curl_easy_setopt_ck(data[i].handle, CURLOPT_HEADERDATA,
-                         (void *) &(data[i]));
-#if LIBCURL_VERSION_NUM >= 0x072a00 /* 7.42.0 */
-      curl_easy_setopt_ck(data[i].handle, CURLOPT_PATH_AS_IS, (long) 1);
-#else
-      /* On old curl; no big deal, canonicalization here is almost the
-         same, except perhaps for ? # type decorations at the tail. */
-#endif
-      curl_easy_setopt_ck(data[i].handle, CURLOPT_AUTOREFERER, (long) 1);
-      curl_easy_setopt_ck(data[i].handle, CURLOPT_ACCEPT_ENCODING, "");
-      curl_easy_setopt_ck(data[i].handle, CURLOPT_HTTPHEADER, c->headers);
-
-      curl_multi_add_handle(curlm, data[i].handle);
-    }
-
-  if (filename) curl_free(escaped_string);
-  /* Query servers in parallel.  */
-  if (vfd >= 0)
-    dprintf (vfd, "query %d urls in parallel\n", num_urls);
-  int still_running;
-  long loops = 0;
-  int committed_to = -1;
-  bool verbose_reported = false;
-  struct timespec start_time, cur_time;
-
-  free (c->winning_headers);
-  c->winning_headers = NULL;
-  if ( maxtime > 0 && clock_gettime(CLOCK_MONOTONIC_RAW, &start_time) == -1)
-    {
-      rc = -errno;
-      goto out2;
-    }
-  long delta = 0;
-  do
-    {
-      /* Check to see how long querying is taking. */
-      if (maxtime > 0)
-        {
-          if (clock_gettime(CLOCK_MONOTONIC_RAW, &cur_time) == -1)
-            {
-              rc = -errno;
-              goto out2;
-            }
-          delta = cur_time.tv_sec - start_time.tv_sec;
-          if ( delta >  maxtime)
-            {
-              dprintf(vfd, "Timeout with max time=%lds and transfer time=%lds\n", maxtime, delta );
-              rc = -ETIME;
-              goto out2;
-            }
-        }
-      /* Wait 1 second, the minimum DEBUGINFOD_TIMEOUT.  */
-      curl_multi_wait(curlm, NULL, 0, 1000, NULL);
-      CURLMcode curlm_res = curl_multi_perform(curlm, &still_running);
-
-      /* If the target file has been found, abort the other queries.  */
-      if (target_handle != NULL)
-       {
-         for (int i = 0; i < num_urls; i++)
-           if (data[i].handle != target_handle)
-             curl_multi_remove_handle(curlm, data[i].handle);
-           else
-              {
-               committed_to = i;
-                if (c->winning_headers == NULL)
-                  {
-                    c->winning_headers = data[committed_to].response_data;
-                    data[committed_to].response_data = NULL;
-                    data[committed_to].response_data_size = 0;
-                  }
-
-              }
-       }
-
-      if (vfd >= 0 && !verbose_reported && committed_to >= 0)
-       {
-         bool pnl = (c->default_progressfn_printed_p && vfd == STDERR_FILENO);
-         dprintf (vfd, "%scommitted to url %d\n", pnl ? "\n" : "",
-                  committed_to);
-         if (pnl)
-           c->default_progressfn_printed_p = 0;
-         verbose_reported = true;
-       }
-
-      if (curlm_res != CURLM_OK)
-        {
-          switch (curlm_res)
-            {
-            case CURLM_CALL_MULTI_PERFORM: continue;
-            case CURLM_OUT_OF_MEMORY: rc = -ENOMEM; break;
-            default: rc = -ENETUNREACH; break;
-            }
-          goto out2;
-        }
-
-      long dl_size = -1;
-      if (target_handle && (c->progressfn || maxsize > 0))
-        {
-          /* Get size of file being downloaded. NB: If going through
-             deflate-compressing proxies, this number is likely to be
-             unavailable, so -1 may show. */
-          CURLcode curl_res;
-#if CURL_AT_LEAST_VERSION(7, 55, 0)
-          curl_off_t cl;
-          curl_res = curl_easy_getinfo(target_handle,
-                                       CURLINFO_CONTENT_LENGTH_DOWNLOAD_T,
-                                       &cl);
-          if (curl_res == CURLE_OK && cl >= 0)
-            dl_size = (cl > LONG_MAX ? LONG_MAX : (long)cl);
-#else
-          double cl;
-          curl_res = curl_easy_getinfo(target_handle,
-                                       CURLINFO_CONTENT_LENGTH_DOWNLOAD,
-                                       &cl);
-          if (curl_res == CURLE_OK && cl >= 0)
-            dl_size = (cl >= (double)(LONG_MAX+1UL) ? LONG_MAX : (long)cl);
-#endif
-          /* If Content-Length is -1, try to get the size from
-             X-Debuginfod-Size */
-          if (dl_size == -1 && c->winning_headers != NULL)
-            {
-              long xdl;
-              char *hdr = strcasestr(c->winning_headers, "x-debuginfod-size");
-              size_t off = strlen("x-debuginfod-size:");
-
-              if (hdr != NULL && sscanf(hdr + off, "%ld", &xdl) == 1)
-                dl_size = xdl;
-            }
-        }
-
-      if (c->progressfn) /* inform/check progress callback */
-        {
-          loops ++;
-          long pa = loops; /* default param for progress callback */
-          if (target_handle) /* we've committed to a server; report its download progress */
-            {
-              /* PR30809: Check actual size of cached file.  This same
-                 fd is shared by all the multi-curl handles (but only
-                 one will end up writing to it).  Another way could be
-                 to tabulate totals in debuginfod_write_callback(). */
-              struct stat cached;
-              int statrc = fstat(fd, &cached);
-              if (statrc == 0)
-                pa = (long) cached.st_size;
-              else
-                {
-                  /* Otherwise, query libcurl for its tabulated total.
-                     However, that counts http body length, not
-                     decoded/decompressed content length, so does not
-                     measure quite the same thing as dl. */
-                  CURLcode curl_res;
-#if CURL_AT_LEAST_VERSION(7, 55, 0)
-                  curl_off_t dl;
-                  curl_res = curl_easy_getinfo(target_handle,
-                                               CURLINFO_SIZE_DOWNLOAD_T,
-                                               &dl);
-                  if (curl_res == 0 && dl >= 0)
-                    pa = (dl > LONG_MAX ? LONG_MAX : (long)dl);
-#else
-                  double dl;
-                  curl_res = curl_easy_getinfo(target_handle,
-                                               CURLINFO_SIZE_DOWNLOAD,
-                                               &dl);
-                  if (curl_res == 0)
-                    pa = (dl >= (double)(LONG_MAX+1UL) ? LONG_MAX : (long)dl);
-#endif
-                }
-            }
-
-          if ((*c->progressfn) (c, pa, dl_size == -1 ? 0 : dl_size))
-           {
-             c->progressfn_cancel = true;
-              break;
-           }
+        {
+          /* PR28034 escape characters in completed url to %hh format. */
+          snprintf(data[i].url, PATH_MAX, "%s/%s/%s/%s", server_url,
+                   build_id_bytes, type, escaped_string);
          }
+      else if (section)
+       snprintf(data[i].url, PATH_MAX, "%s/%s/%s/%s", server_url,
+                build_id_bytes, type, section);
+      else
+        snprintf(data[i].url, PATH_MAX, "%s/%s/%s", server_url, build_id_bytes, type);
  
-      /* Check to see if we are downloading something which exceeds maxsize, if set.*/
-      if (target_handle && dl_size > maxsize && maxsize > 0)
+      r = init_handle(c, debuginfod_write_callback, header_callback, &data[i], i, timeout, vfd);
+      if (0 != r)
          {
-          if (vfd >=0)
-            dprintf(vfd, "Content-Length too large.\n");
-          rc = -EFBIG;
+          rc = r;
+          if (filename) curl_free (escaped_string);
            goto out2;
          }
-    } while (still_running);
+
+      curl_multi_add_handle(curlm, data[i].handle);
+    }
+
+  if (filename) curl_free(escaped_string);
+
+  /* Query servers in parallel.  */
+  if (vfd >= 0)
+    dprintf (vfd, "query %d urls in parallel\n", num_urls);
+  int committed_to;
+  r = perform_queries(curlm, &target_handle, data, c, num_urls, maxtime, maxsize, true,  vfd, &committed_to);
+  if (0 != r)
+    {
+      rc = r;
+      goto out2;
+    }
  
    /* Check whether a query was successful. If so, assign its handle
       to verified_handle.  */
@@ -2180,6 +2303,7 @@ debuginfod_query_server (debuginfod_client *c,
                curl_multi_remove_handle(curlm, data[i].handle); /* ok to repeat */
                curl_easy_cleanup (data[i].handle);
                free(data[i].response_data);
+              data[i].response_data = NULL;
              }
              free(c->winning_headers);
              c->winning_headers = NULL;
@@ -2427,7 +2551,7 @@ debuginfod_find_debuginfo (debuginfod_client *client,
                            const unsigned char *build_id, int build_id_len,
                             char **path)
  {
-  return debuginfod_query_server(client, build_id, build_id_len,
+  return debuginfod_query_server_by_buildid(client, build_id, build_id_len,
                                   "debuginfo", NULL, path);
  }
  
@@ -2438,7 +2562,7 @@ debuginfod_find_executable(debuginfod_client *client,
                            const unsigned char *build_id, int build_id_len,
                             char **path)
  {
-  return debuginfod_query_server(client, build_id, build_id_len,
+  return debuginfod_query_server_by_buildid(client, build_id, build_id_len,
                                   "executable", NULL, path);
  }
  
@@ -2447,7 +2571,7 @@ int debuginfod_find_source(debuginfod_client *client,
                            const unsigned char *build_id, int build_id_len,
                             const char *filename, char **path)
  {
-  return debuginfod_query_server(client, build_id, build_id_len,
+  return debuginfod_query_server_by_buildid(client, build_id, build_id_len,
                                   "source", filename, path);
  }
  
@@ -2456,8 +2580,8 @@ debuginfod_find_section (debuginfod_client *client,
                          const unsigned char *build_id, int build_id_len,
                          const char *section, char **path)
  {
-  int rc = debuginfod_query_server(client, build_id, build_id_len,
-                                  "section", section, path);
+  int rc = debuginfod_query_server_by_buildid(client, build_id, build_id_len,
+                                              "section", section, path);
    if (rc != -EINVAL && rc != -ENOSYS)
      return rc;
    /* NB: we fall through in case of ima:enforcing-filtered DEBUGINFOD_URLS servers,
@@ -2508,6 +2632,380 @@ debuginfod_find_section (debuginfod_client *client,
    return rc;
  }
  
+
+/* Ask every server listed in the DEBUGINFOD_URLS_ENV_VAR environment
+   variable for metadata matching KEY / VALUE, merge the "results" arrays
+   of their JSON replies into one document (plus a "complete" flag) and
+   store it under metadata/ in the client cache.  A cached document that
+   is still within the configured retention period is reused instead.
+
+   Returns an open file descriptor (>= 0) for the JSON document, or a
+   negative errno-style code (-EINVAL for a NULL KEY/VALUE, -ENOSYS when
+   no server is configured).  On success, if PATH is non-NULL, ownership
+   of the cache file name is handed to the caller through *PATH.  */
+int debuginfod_find_metadata (debuginfod_client *client,
+                              const char* key, const char* value, char **path)
+{
+  char *server_urls = NULL;
+  char *urls_envvar = NULL;
+  char *cache_path = NULL;
+  char *target_cache_dir = NULL;
+  char *target_cache_path = NULL;
+  char *target_cache_tmppath = NULL;
+  char *target_file_name = NULL;
+  char *key_and_value = NULL;
+  int rc = 0, r;
+  int vfd = client->verbose_fd;
+  struct handle_data *data = NULL;
+
+  json_object *json_metadata = json_object_new_object();
+  json_bool json_metadata_complete = true;
+  json_object *json_metadata_arr = json_object_new_array();
+  if (NULL == json_metadata || NULL == json_metadata_arr)
+    {
+      /* The array is not owned by json_metadata yet; drop it by hand. */
+      json_object_put(json_metadata_arr);
+      rc = -ENOMEM;
+      goto out;
+    }
+  json_object_object_add(json_metadata, "results", json_metadata_arr);
+
+  if (NULL == value || NULL == key)
+    {
+      rc = -EINVAL;
+      goto out;
+    }
+
+  if (vfd >= 0)
+    dprintf (vfd, "debuginfod_find_metadata %s %s\n", key, value);
+
+  /* Without a query-able URL, we can stop here. */
+  urls_envvar = getenv(DEBUGINFOD_URLS_ENV_VAR);
+  if (vfd >= 0)
+    dprintf (vfd, "server urls \"%s\"\n",
+      urls_envvar != NULL ? urls_envvar : "");
+  if (urls_envvar == NULL || urls_envvar[0] == '\0')
+  {
+    rc = -ENOSYS;
+    goto out;
+  }
+
+  /* set paths needed to perform the query
+     example format:
+     cache_path:           $HOME/.cache
+     target_cache_dir:     $HOME/.cache/metadata
+     target_cache_path:    $HOME/.cache/metadata/KEYENCODED_VALUEENCODED
+     target_cache_tmppath: $HOME/.cache/metadata/KEYENCODED_VALUEENCODED.XXXXXX
+  */
+
+  // libcurl > 7.62ish has curl_url_set()/etc. to construct these things more properly.
+  // curl_easy_escape() is older
+  {
+    CURL *c = curl_easy_init();
+    if (!c)
+      {
+        rc = -ENOMEM;
+        goto out;
+      }
+    char *key_escaped = curl_easy_escape(c, key, 0);
+    char *value_escaped = curl_easy_escape(c, value, 0);
+
+    // fallback to unescaped values in unlikely case of error
+    xalloc_str (key_and_value, "key=%s&value=%s", key_escaped ?: key, value_escaped ?: value);
+    xalloc_str (target_file_name, "%s_%s", key_escaped ?: key, value_escaped ?: value);
+    curl_free(value_escaped);
+    curl_free(key_escaped);
+    curl_easy_cleanup(c);
+  }
+
+  /* Check if we have a recent result already in the cache. */
+  cache_path = make_cache_path();
+  if (! cache_path)
+    {
+      rc = -ENOMEM;
+      goto out;
+    }
+  xalloc_str (target_cache_dir, "%s/metadata", cache_path);
+  (void) mkdir (target_cache_dir, 0700);
+  xalloc_str (target_cache_path, "%s/%s", target_cache_dir, target_file_name);
+  xalloc_str (target_cache_tmppath, "%s/%s.XXXXXX", target_cache_dir, target_file_name);
+
+  int fd = open(target_cache_path, O_RDONLY);
+  if (fd >= 0)
+    {
+      struct stat st;
+      int metadata_retention = 0;
+      time_t now = time(NULL);
+      char *metadata_retention_path = NULL;
+
+      xalloc_str (metadata_retention_path, "%s/%s", cache_path, metadata_retention_filename);
+      if (metadata_retention_path)
+        {
+          /* Keep the retention period in its own variable, not in rc,
+             so rc only ever holds an error code or the returned fd. */
+          metadata_retention = debuginfod_config_cache(client, metadata_retention_path,
+                                                       metadata_retention_default_s, &st);
+          free (metadata_retention_path);
+          if (metadata_retention < 0)
+            metadata_retention = 0; /* no usable setting: never trust the cache */
+        }
+
+      if (fstat(fd, &st) != 0)
+        {
+          rc = -errno;
+          close (fd);
+          goto out;
+        }
+
+      if (metadata_retention > 0 && (now - st.st_mtime <= metadata_retention))
+        {
+          if (vfd >= 0)
+            dprintf (vfd, "cached metadata %s\n", target_file_name);
+
+          if (path != NULL)
+            {
+              *path = target_cache_path; // pass over the pointer
+              target_cache_path = NULL; // prevent free() in our own cleanup
+            }
+
+          /* Success!!!! */
+          rc = fd;
+          goto out;
+        }
+
+      /* We don't have to clear the likely-expired cached object here
+         by unlinking.  We will shortly make a new request and save
+         results right on top.  Erasing here could trigger a TOCTOU
+         race with another thread just finishing a query and passing
+         its results back.
+      */
+      close (fd);
+    }
+
+  /* No valid cached metadata found: time to make the queries. */
+
+  free (client->url);
+  client->url = NULL;
+
+  long maxtime = 0;
+  const char *maxtime_envvar;
+  maxtime_envvar = getenv(DEBUGINFOD_MAXTIME_ENV_VAR);
+  if (maxtime_envvar != NULL)
+    maxtime = atol (maxtime_envvar);
+  if (maxtime && vfd >= 0)
+    dprintf(vfd, "using max time %lds\n", maxtime);
+
+  long timeout = default_timeout;
+  const char* timeout_envvar = getenv(DEBUGINFOD_TIMEOUT_ENV_VAR);
+  if (timeout_envvar != NULL)
+    timeout = atoi (timeout_envvar);
+  if (vfd >= 0)
+    dprintf (vfd, "using timeout %ld\n", timeout);
+
+  add_default_headers(client);
+
+  /* Make a copy of the envvar so it can be safely modified.  */
+  server_urls = strdup(urls_envvar);
+  if (server_urls == NULL)
+  {
+    rc = -ENOMEM;
+    goto out;
+  }
+
+  /* Thereafter, goto out1 on error*/
+
+  char **server_url_list = NULL;
+  ima_policy_t* url_ima_policies = NULL;
+  char *server_url;
+  int num_urls = 0;
+  r = init_server_urls("metadata", NULL, server_urls, &server_url_list, &url_ima_policies, &num_urls, vfd);
+  if (0 != r)
+    {
+      rc = r;
+      goto out1;
+    }
+
+  CURLM *curlm = client->server_mhandle;
+
+  CURL *target_handle = NULL;
+  /* Zero-filled so that the cleanup loops below see NULL handle/metadata
+     pointers in any slot that was never (fully) initialized. */
+  data = calloc(num_urls, sizeof *data);
+  if (data == NULL)
+    {
+      rc = -ENOMEM;
+      goto out1;
+    }
+
+  /* thereafter, goto out2 on error.  */
+
+  /* Initialize handle_data  */
+  for (int i = 0; i < num_urls; i++)
+    {
+      if ((server_url = server_url_list[i]) == NULL)
+        break;
+      if (vfd >= 0)
+        dprintf (vfd, "init server %d %s\n", i, server_url);
+
+      data[i].errbuf[0] = '\0';
+      data[i].target_handle = &target_handle;
+      data[i].client = client;
+      data[i].metadata = NULL;
+      data[i].metadata_size = 0;
+      data[i].response_data = NULL;
+      data[i].response_data_size = 0;
+
+      snprintf(data[i].url, PATH_MAX, "%s?%s", server_url, key_and_value);
+
+      r = init_handle(client, metadata_callback, header_callback, &data[i], i, timeout, vfd);
+      if (0 != r)
+        {
+          rc = r;
+          goto out2;
+        }
+      curl_multi_add_handle(curlm, data[i].handle);
+    }
+
+  /* Query servers */
+  if (vfd >= 0)
+    dprintf (vfd, "Starting %d queries\n",num_urls);
+  int committed_to;
+  r = perform_queries(curlm, NULL, data, client, num_urls, maxtime, 0, false, vfd, &committed_to);
+  if (0 != r)
+    {
+      rc = r;
+      goto out2;
+    }
+
+  /* NOTE: We don't check the return codes of the curl messages since
+     a metadata query failing silently is just fine. We want to know what's
+     available from servers which can be connected with no issues.
+     If running with additional verbosity, the failure will be noted in stderr */
+
+  /* Building the new json array from all the upstream data and
+     cleanup while at it.
+   */
+  for (int i = 0; i < num_urls; i++)
+    {
+      curl_multi_remove_handle(curlm, data[i].handle); /* ok to repeat */
+      curl_easy_cleanup (data[i].handle);
+      free (data[i].response_data);
+
+      if (NULL == data[i].metadata)
+        {
+          if (vfd >= 0)
+            dprintf (vfd, "Query to %s failed with error message:\n\t\"%s\"\n",
+                     data[i].url, data[i].errbuf);
+          json_metadata_complete = false;
+          continue;
+        }
+
+      json_object *upstream_metadata = json_tokener_parse(data[i].metadata);
+      /* The raw reply text is not needed again, whether or not it parsed. */
+      free (data[i].metadata);
+      data[i].metadata = NULL;
+
+      json_object *upstream_complete;
+      json_object *upstream_metadata_arr;
+      if (NULL == upstream_metadata ||
+          !json_object_object_get_ex(upstream_metadata, "results", &upstream_metadata_arr) ||
+          !json_object_is_type(upstream_metadata_arr, json_type_array) ||
+          !json_object_object_get_ex(upstream_metadata, "complete", &upstream_complete))
+        {
+          /* Unusable reply: release the parse tree and remember that this
+             server's results are missing from the merged document. */
+          json_object_put(upstream_metadata);
+          json_metadata_complete = false;
+          continue;
+        }
+      json_metadata_complete &= json_object_get_boolean(upstream_complete);
+      // Combine the upstream metadata into the json array
+      for (int j = 0, n = json_object_array_length(upstream_metadata_arr); j < n; j++)
+        {
+          json_object *entry = json_object_array_get_idx(upstream_metadata_arr, j);
+          json_object_get(entry); // increment reference count
+          json_object_array_add(json_metadata_arr, entry);
+        }
+      json_object_put(upstream_metadata);
+    }
+
+  /* Because of race with cache cleanup / rmdir, try to mkdir/mkstemp up to twice. */
+  for (int i=0; i<2; i++)
+    {
+      /* (re)create target directory in cache */
+      (void) mkdir(target_cache_dir, 0700); /* files will be 0400 later */
+
+      /* NB: write to a temporary file first, to avoid race condition of
+         multiple clients checking the cache, while a partially-written or empty
+         file is in there, being written from libcurl. */
+      fd = mkstemp (target_cache_tmppath);
+      if (fd >= 0) break;
+    }
+  if (fd < 0) /* Still failed after two iterations. */
+    {
+      rc = -errno;
+      goto out1;
+    }
+
+  /* Plop the complete json_metadata object into the cache. */
+  json_object_object_add(json_metadata, "complete", json_object_new_boolean(json_metadata_complete));
+  const char* json_string = json_object_to_json_string_ext(json_metadata, JSON_C_TO_STRING_PRETTY);
+  if (json_string == NULL)
+    {
+      rc = -ENOMEM;
+      goto out_tmpfile;
+    }
+  ssize_t res = write_retry (fd, json_string, strlen(json_string));
+  (void) lseek(fd, 0, SEEK_SET); // rewind file so client can read it from the top
+
+  /* NB: json_string is auto deleted when json_metadata object is nuked */
+  if (res < 0 || (size_t) res != strlen(json_string))
+    {
+      rc = -EIO;
+      goto out_tmpfile;
+    }
+  /* PR27571: make cache files casually unwriteable; dirs are already 0700 */
+  (void) fchmod(fd, 0400);
+
+  /* rename tmp->real */
+  rc = rename (target_cache_tmppath, target_cache_path);
+  if (rc < 0)
+    {
+      rc = -errno;
+      goto out_tmpfile;
+      /* Perhaps we need not give up right away; could retry or something ... */
+    }
+
+  /* don't close fd - we're returning it */
+  /* don't unlink the tmppath; it's already been renamed. */
+  if (path != NULL)
+    {
+      *path = target_cache_path; // pass over the pointer
+      target_cache_path = NULL; // prevent free() in our own cleanup
+    }
+
+  rc = fd;
+  goto out1;
+
+/* error exits */
+out_tmpfile:
+  /* The temporary file never became the cache entry: drop it and its
+     descriptor so that failed attempts do not pile up in the cache. */
+  (void) unlink (target_cache_tmppath);
+  close (fd);
+  goto out1;
+
+out2:
+  /* remove all handles from multi */
+  for (int i = 0; i < num_urls; i++)
+  {
+    if (data[i].handle != NULL)
+    {
+      curl_multi_remove_handle(curlm, data[i].handle); /* ok to repeat */
+      curl_easy_cleanup (data[i].handle);
+      free (data[i].response_data);
+      free (data[i].metadata);
+    }
+  }
+
+out1:
+  free(data);
+
+  for (int i = 0; i < num_urls; ++i)
+    free(server_url_list[i]);
+  free(server_url_list);
+  free(url_ima_policies);
+
+out:
+  free (server_urls);
+  json_object_put(json_metadata);
+  /* Reset sent headers */
+  curl_slist_free_all (client->headers);
+  client->headers = NULL;
+  client->user_agent_set_p = 0;
+
+  free (target_cache_dir);
+  free (target_cache_path);
+  free (target_cache_tmppath);
+  free (key_and_value);
+  free (target_file_name);
+  free (cache_path);
+
+  return rc;
+}
+
+
  /* Add an outgoing HTTP header.  */
  int debuginfod_add_http_header (debuginfod_client *client, const char* header)
  {
diff --git a/debuginfod/debuginfod-find.c b/debuginfod/debuginfod-find.c

index 080dd8f2c6a35f491b916de358a10198f83fab6b..0ef80377a81bcad9241079e2c774d1a8ad9c50f3 100644 (file)
--- a/debuginfod/debuginfod-find.c
+++ b/debuginfod/debuginfod-find.c
@@ -1,6 +1,6 @@
  /* Command-line frontend for retrieving ELF / DWARF / source files
     from the debuginfod.
-   Copyright (C) 2019-2020 Red Hat, Inc.
+   Copyright (C) 2019-2023 Red Hat, Inc.
     This file is part of elfutils.
  
     This file is free software; you can redistribute it and/or modify
@@ -30,7 +30,7 @@
  #include <fcntl.h>
  #include <gelf.h>
  #include <libdwelf.h>
-
+#include <json-c/json.h>
  
  /* Name and version of program.  */
  ARGP_PROGRAM_VERSION_HOOK_DEF = print_version;
@@ -49,9 +49,10 @@ static const char args_doc[] = N_("debuginfo BUILDID\n"
                                    "executable PATH\n"
                                    "source BUILDID /FILENAME\n"
                                    "source PATH /FILENAME\n"
-                                 "section BUILDID SECTION-NAME\n"
-                                 "section PATH SECTION-NAME\n");
-
+                                  "section BUILDID SECTION-NAME\n"
+                                  "section PATH SECTION-NAME\n"
+                                  "metadata (glob|file|KEY) (GLOB|FILENAME|VALUE)\n"
+                                  );
  
  /* Definitions of arguments for argp functions.  */
  static const struct argp_option options[] =
@@ -145,49 +146,60 @@ main(int argc, char** argv)
    /* If we were passed an ELF file name in the BUILDID slot, look in there. */
    unsigned char* build_id = (unsigned char*) argv[remaining+1];
    int build_id_len = 0; /* assume text */
-
-  int any_non_hex = 0;
-  int i;
-  for (i = 0; build_id[i] != '\0'; i++)
-    if ((build_id[i] >= '0' && build_id[i] <= '9') ||
-        (build_id[i] >= 'a' && build_id[i] <= 'f'))
-      ;
-    else
-      any_non_hex = 1;
-
-  int fd = -1;
    Elf* elf = NULL;
-  if (any_non_hex) /* raw build-id */
-    {
-      fd = open ((char*) build_id, O_RDONLY);
-      if (fd < 0)
-        fprintf (stderr, "Cannot open %s: %s\n", build_id, strerror(errno));
-    }
-  if (fd >= 0)
-    {
-      elf = dwelf_elf_begin (fd);
-      if (elf == NULL)
-        fprintf (stderr, "Cannot open as ELF file %s: %s\n", build_id,
-                elf_errmsg (-1));
-    }
-  if (elf != NULL)
+
+  /* Process optional buildid given via ELF file name, for some query types only. */
+  if (strcmp(argv[remaining], "debuginfo") == 0
+      || strcmp(argv[remaining], "executable") == 0
+      || strcmp(argv[remaining], "source") == 0
+      || strcmp(argv[remaining], "section") == 0)
      {
-      const void *extracted_build_id;
-      ssize_t s = dwelf_elf_gnu_build_id(elf, &extracted_build_id);
-      if (s > 0)
+      int any_non_hex = 0;
+      int i;
+      for (i = 0; build_id[i] != '\0'; i++)
+        if ((build_id[i] >= '0' && build_id[i] <= '9') ||
+            (build_id[i] >= 'a' && build_id[i] <= 'f'))
+          ;
+        else
+          any_non_hex = 1;
+      
+      int fd = -1;
+      if (any_non_hex) /* raw build-id */
+        {
+          fd = open ((char*) build_id, O_RDONLY);
+          if (fd < 0)
+            fprintf (stderr, "Cannot open %s: %s\n", build_id, strerror(errno));
+        }
+      if (fd >= 0)
+        {
+          elf = dwelf_elf_begin (fd);
+          if (elf == NULL)
+            fprintf (stderr, "Cannot open as ELF file %s: %s\n", build_id,
+                     elf_errmsg (-1));
+        }
+      if (elf != NULL)
          {
-          /* Success: replace the build_id pointer/len with the binary blob
-             that elfutils is keeping for us.  It'll remain valid until elf_end(). */
-          build_id = (unsigned char*) extracted_build_id;
-          build_id_len = s;
+          const void *extracted_build_id;
+          ssize_t s = dwelf_elf_gnu_build_id(elf, &extracted_build_id);
+          if (s > 0)
+            {
+              /* Success: replace the build_id pointer/len with the binary blob
+                 that elfutils is keeping for us.  It'll remain valid until elf_end(). */
+              build_id = (unsigned char*) extracted_build_id;
+              build_id_len = s;
+            }
+          else
+            fprintf (stderr, "Cannot extract build-id from %s: %s\n", build_id, elf_errmsg(-1));
          }
-      else
-        fprintf (stderr, "Cannot extract build-id from %s: %s\n", build_id, elf_errmsg(-1));
      }
  
    char *cache_name;
    int rc = 0;
  
+  /* By default the stdout output is the path of the cached file.
+     Some requests (e.g. a metadata query) may instead choose to produce
+     different output, in that case a stringified JSON object.  */
+  bool print_cached_file = true;
    /* Check whether FILETYPE is valid and call the appropriate
       debuginfod_find_* function. If FILETYPE is "source"
       then ensure a FILENAME was also supplied as an argument.  */
@@ -221,6 +233,38 @@ main(int argc, char** argv)
        rc = debuginfod_find_section(client, build_id, build_id_len,
                                    argv[remaining+2], &cache_name);
      }
+  else if (strcmp(argv[remaining], "metadata") == 0) /* no buildid! */
+    {
+      if (remaining+2 == argc)
+        {
+          fprintf(stderr, "Require KEY and VALUE for \"metadata\"\n");
+          return 1;
+        }
+      
+      rc = debuginfod_find_metadata (client, argv[remaining+1], argv[remaining+2],
+                                     &cache_name);
+      if (rc >= 0)
+        {
+          /* We output a pprinted JSON object, not the regular debuginfod-find cached file path */
+          print_cached_file = false;
+          json_object *metadata = json_object_from_file(cache_name);
+          if(metadata)
+            {
+              printf("%s\n", json_object_to_json_string_ext(metadata,
+                                                            JSON_C_TO_STRING_PRETTY
+#ifdef JSON_C_TO_STRING_NOSLASHESCAPE /* json-c 0.15 */
+                                                            | JSON_C_TO_STRING_NOSLASHESCAPE
+#endif
+                                                            ));
+              json_object_put(metadata);
+            }
+          else
+            {
+              fprintf(stderr, "%s does not contain a valid JSON format object\n", cache_name);
+              return 1;
+            }
+        }
+    }
    else
      {
        argp_help (&argp, stderr, ARGP_HELP_USAGE, argv[0]);
@@ -240,8 +284,6 @@ main(int argc, char** argv)
    debuginfod_end (client);
    if (elf)
      elf_end(elf);
-  if (fd >= 0)
-    close (fd);
  
    if (rc < 0)
      {
@@ -251,7 +293,7 @@ main(int argc, char** argv)
    else
      close (rc);
  
-  printf("%s\n", cache_name);
+  if(print_cached_file) printf("%s\n", cache_name);
    free (cache_name);
  
    return 0;
diff --git a/debuginfod/debuginfod.cxx b/debuginfod/debuginfod.cxx

index d9259ad26bb88741d8dedc3556830e87c720e35f..305edde8102139919332edcb4ac282430e694602 100644 (file)
--- a/debuginfod/debuginfod.cxx
+++ b/debuginfod/debuginfod.cxx
@@ -76,6 +76,7 @@ extern "C" {
  #include <netdb.h>
  #include <math.h>
  #include <float.h>
+#include <fnmatch.h>
  
  
  /* If fts.h is included before config.h, its indirect inclusions may not
@@ -148,6 +149,7 @@ extern "C" {
  #include "printversion.h"
  #include "system.h"
  }
+#include <json-c/json.h>
  
  
  inline bool
@@ -220,7 +222,7 @@ static const char DEBUGINFOD_SQLITE_DDL[] =
    "        foreign key (buildid) references " BUILDIDS "_buildids(id) on update cascade on delete cascade,\n"
    "        primary key (buildid, file, mtime)\n"
    "        ) " WITHOUT_ROWID ";\n"
-  // Index for faster delete by file identifier
+  // Index for faster delete by file identifier and metadata searches
    "create index if not exists " BUILDIDS "_f_de_idx on " BUILDIDS "_f_de (file, mtime);\n"
    "create table if not exists " BUILDIDS "_f_s (\n"
    "        buildid integer not null,\n"
@@ -246,6 +248,8 @@ static const char DEBUGINFOD_SQLITE_DDL[] =
    "        ) " WITHOUT_ROWID ";\n"
    // Index for faster delete by archive file identifier
    "create index if not exists " BUILDIDS "_r_de_idx on " BUILDIDS "_r_de (file, mtime);\n"
+  // Index for metadata searches
+  "create index if not exists " BUILDIDS "_r_de_idx2 on " BUILDIDS "_r_de (content);\n"  
    "create table if not exists " BUILDIDS "_r_sref (\n" // outgoing dwarf sourcefile references from rpm
    "        buildid integer not null,\n"
    "        artifactsrc integer not null,\n"
@@ -454,6 +458,9 @@ static const struct argp_option options[] =
  #define ARGP_KEY_KOJI_SIGCACHE 0x100B
     { "koji-sigcache", ARGP_KEY_KOJI_SIGCACHE, NULL, 0, "Do a koji specific mapping of rpm paths to get IMA signatures.", 0 },
  #endif
+#define ARGP_KEY_METADATA_MAXTIME 0x100C
+   { "metadata-maxtime", ARGP_KEY_METADATA_MAXTIME, "SECONDS", 0,
+     "Number of seconds to limit metadata query run time, 0=unlimited.", 0 },
     { NULL, 0, NULL, 0, NULL, 0 },
    };
  
@@ -509,6 +516,7 @@ static long scan_checkpoint = 256;
  #ifdef ENABLE_IMA_VERIFICATION
  static bool requires_koji_sigcache_mapping = false;
  #endif
+static unsigned metadata_maxtime_s = 5;
  
  static void set_metric(const string& key, double value);
  static void inc_metric(const string& key);
@@ -711,7 +719,10 @@ parse_opt (int key, char *arg,
      case ARGP_SCAN_CHECKPOINT:
        scan_checkpoint = atol (arg);
        if (scan_checkpoint < 0)
-        argp_failure(state, 1, EINVAL, "scan checkpoint");        
+        argp_failure(state, 1, EINVAL, "scan checkpoint");
+      break;
+    case ARGP_KEY_METADATA_MAXTIME:
+      metadata_maxtime_s = (unsigned) atoi(arg);
        break;
  #ifdef ENABLE_IMA_VERIFICATION
      case ARGP_KEY_KOJI_SIGCACHE:
@@ -2382,6 +2393,58 @@ handle_buildid_r_match (bool internal_req_p,
    return r;
  }
  
+// Copy the federation-related request headers of the incoming
+// connection CONN onto the upstream CLIENT:
+//   - User-Agent: is transcribed as-is (empty if the request had none);
+//   - X-Forwarded-For: is the incoming list, extended with the numeric
+//     address of the peer that sent us this request.
+// Throws reportable_exception (MHD_HTTP_NOT_FOUND) if the incoming
+// X-Forwarded-For list already holds forwarded_ttl_limit or more hops;
+// in that case the X-Forwarded-For header is not added to CLIENT.
+void
+add_client_federation_headers(debuginfod_client *client, MHD_Connection* conn){
+  // Transcribe incoming User-Agent:
+  string ua = MHD_lookup_connection_value (conn, MHD_HEADER_KIND, "User-Agent") ?: "";
+  string ua_complete = string("User-Agent: ") + ua;
+  debuginfod_add_http_header (client, ua_complete.c_str());
+
+  // Compute larger XFF:, for avoiding info loss during
+  // federation, and for future cyclicity detection.
+  string xff = MHD_lookup_connection_value (conn, MHD_HEADER_KIND, "X-Forwarded-For") ?: "";
+  if (xff != "")
+    xff += string(", "); // comma separated list
+
+  // Each hop so far is now followed by a comma (including the one
+  // just appended), so the comma count equals the hop count.
+  unsigned int xff_count = 0;
+  for (auto&& i : xff){
+    if (i == ',') xff_count++;
+  }
+
+  // if X-Forwarded-For: exceeds N hops,
+  // do not delegate a local lookup miss to upstream debuginfods.
+  if (xff_count >= forwarded_ttl_limit)
+    throw reportable_exception(MHD_HTTP_NOT_FOUND,
+                               "not found, --forwarded-ttl-limit reached "
+                               "and will not query the upstream servers");
+
+  // Compute the client's numeric IP address only - so can't merge with conninfo()
+  const union MHD_ConnectionInfo *u = MHD_get_connection_info (conn,
+                                                                MHD_CONNECTION_INFO_CLIENT_ADDRESS);
+  struct sockaddr *so = u ? u->client_addr : 0;
+  char hostname[256] = ""; // RFC1035
+  if (so && so->sa_family == AF_INET) {
+    (void) getnameinfo (so, sizeof (struct sockaddr_in), hostname, sizeof (hostname), NULL, 0,
+                        NI_NUMERICHOST);
+  } else if (so && so->sa_family == AF_INET6) {
+    struct sockaddr_in6* addr6 = (struct sockaddr_in6*) so;
+    if (IN6_IS_ADDR_V4MAPPED(&addr6->sin6_addr)) {
+      // Report an IPv4-mapped IPv6 peer by its plain IPv4 address:
+      // the IPv4 part is the last 4 bytes of the 16-byte address.
+      struct sockaddr_in addr4;
+      memset (&addr4, 0, sizeof(addr4));
+      addr4.sin_family = AF_INET;
+      addr4.sin_port = addr6->sin6_port;
+      memcpy (&addr4.sin_addr.s_addr, addr6->sin6_addr.s6_addr+12, sizeof(addr4.sin_addr.s_addr));
+      (void) getnameinfo ((struct sockaddr*) &addr4, sizeof (addr4),
+                          hostname, sizeof (hostname), NULL, 0,
+                          NI_NUMERICHOST);
+    } else {
+      (void) getnameinfo (so, sizeof (struct sockaddr_in6), hostname, sizeof (hostname), NULL, 0,
+                          NI_NUMERICHOST);
+    }
+  }
+
+  // hostname stays "" if the address family is unknown or the lookup
+  // failed; the header is still sent with whatever list we have.
+  string xff_complete = string("X-Forwarded-For: ")+xff+string(hostname);
+  debuginfod_add_http_header (client, xff_complete.c_str());
+}
  
  static struct MHD_Response*
  handle_buildid_match (bool internal_req_p,
@@ -2615,58 +2678,8 @@ handle_buildid (MHD_Connection* conn,
    debuginfod_set_progressfn (client, & debuginfod_find_progress);
  
    if (conn)
-    {
-      // Transcribe incoming User-Agent:
-      string ua = MHD_lookup_connection_value (conn, MHD_HEADER_KIND, "User-Agent") ?: "";
-      string ua_complete = string("User-Agent: ") + ua;
-      debuginfod_add_http_header (client, ua_complete.c_str());
-      
-      // Compute larger XFF:, for avoiding info loss during
-      // federation, and for future cyclicity detection.
-      string xff = MHD_lookup_connection_value (conn, MHD_HEADER_KIND, "X-Forwarded-For") ?: "";
-      if (xff != "")
-        xff += string(", "); // comma separated list
-      
-      unsigned int xff_count = 0;
-      for (auto&& i : xff){
-        if (i == ',') xff_count++;
-      }
+    add_client_federation_headers(client, conn);
  
-      // if X-Forwarded-For: exceeds N hops,
-      // do not delegate a local lookup miss to upstream debuginfods.
-      if (xff_count >= forwarded_ttl_limit)
-        throw reportable_exception(MHD_HTTP_NOT_FOUND, "not found, --forwared-ttl-limit reached \
-and will not query the upstream servers");
-
-      // Compute the client's numeric IP address only - so can't merge with conninfo()
-      const union MHD_ConnectionInfo *u = MHD_get_connection_info (conn,
-                                                                   MHD_CONNECTION_INFO_CLIENT_ADDRESS);
-      struct sockaddr *so = u ? u->client_addr : 0;
-      char hostname[256] = ""; // RFC1035
-      if (so && so->sa_family == AF_INET) {
-        (void) getnameinfo (so, sizeof (struct sockaddr_in), hostname, sizeof (hostname), NULL, 0,
-                            NI_NUMERICHOST);
-      } else if (so && so->sa_family == AF_INET6) {
-        struct sockaddr_in6* addr6 = (struct sockaddr_in6*) so;
-        if (IN6_IS_ADDR_V4MAPPED(&addr6->sin6_addr)) {
-          struct sockaddr_in addr4;
-          memset (&addr4, 0, sizeof(addr4));
-          addr4.sin_family = AF_INET;
-          addr4.sin_port = addr6->sin6_port;
-          memcpy (&addr4.sin_addr.s_addr, addr6->sin6_addr.s6_addr+12, sizeof(addr4.sin_addr.s_addr));
-          (void) getnameinfo ((struct sockaddr*) &addr4, sizeof (addr4),
-                              hostname, sizeof (hostname), NULL, 0,
-                              NI_NUMERICHOST);
-        } else {
-          (void) getnameinfo (so, sizeof (struct sockaddr_in6), hostname, sizeof (hostname), NULL, 0,
-                              NI_NUMERICHOST);
-        }
-      }
-          
-      string xff_complete = string("X-Forwarded-For: ")+xff+string(hostname);
-      debuginfod_add_http_header (client, xff_complete.c_str());
-    }
-  
    if (artifacttype == "debuginfo")
      fd = debuginfod_find_debuginfo (client,
                                      (const unsigned char*) buildid.c_str(),
@@ -2873,6 +2886,225 @@ handle_metrics (off_t* size)
    return r;
  }
  
+
+// Serve one /metadata webapi request.
+//
+// KEY selects the match operator ("glob" or "file"); VALUE is the
+// pattern or exact file name.  The local index is searched first, then
+// the same query is forwarded upstream via debuginfod_find_metadata()
+// and the upstream "results" entries are appended to the local ones.
+//
+// Returns an MHD response whose body is the JSON document
+//   { "results": [ ... ], "complete": true|false }
+// (or NULL if libmicrohttpd could not create the response), and stores
+// the body length in *SIZE.  "complete" is false if the local search
+// ran past metadata_maxtime_s or an upstream reported incompleteness.
+//
+// Throws reportable_exception for an unsupported KEY, and propagates
+// the exceptions thrown by the sqlite / json allocation steps and by
+// add_client_federation_headers().
+static struct MHD_Response*
+handle_metadata (MHD_Connection* conn,
+                 string key, string value, off_t* size)
+{
+  sqlite3 *thisdb = dbq;
+
+  // Query locally for matching e, d files
+  string op;
+  if (key == "glob")
+    op = "glob";
+  else if (key == "file")
+    op = "=";
+  else
+    throw reportable_exception("/metadata webapi error, unsupported key");
+
+  // Since PR30378, the file names are segmented into two tables.  We
+  // could do a glob/= search over the _files_v view that combines
+  // them, but that means that the entire _files_v thing has to be
+  // materialized & scanned to do the query.  Slow!  Instead, we can
+  // segment the incoming file/glob pattern into dirname / basename
+  // parts, and apply them to the corresponding table.  This is done
+  // by splitting the value at the last "/".  If absent, the same
+  // convention as is used in register_file_name().
+
+  string dirname, bname; // basename is a "poisoned" identifier on some distros
+  size_t slash = value.rfind('/');
+  if (slash == std::string::npos) {
+    dirname = "";
+    bname = value;
+  } else {
+    dirname = value.substr(0, slash);
+    bname = value.substr(slash+1);
+  }
+
+  // NB: further optimization is possible: replacing the 'glob' op
+  // with simple equality, if the corresponding value segment lacks
+  // metacharacters.  sqlite may or may not be smart enough to do so,
+  // so we help out.
+  string metacharacters = "[]*?";
+  string dop = (op == "glob" && dirname.find_first_of(metacharacters) == string::npos) ? "=" : op;
+  string bop = (op == "glob" && bname.find_first_of(metacharacters) == string::npos) ? "=" : op;
+
+  string sql = string(
+                      // explicit query r_de and f_de once here, rather than the query_d and query_e
+                      // separately, because they scan the same tables, so we'd double the work
+                      "select d1.executable_p, d1.debuginfo_p, 0 as source_p, "
+                      "       b1.hex, f1d.name || '/' || f1b.name as file, a1.name as archive "
+                      "from " BUILDIDS "_r_de d1, " BUILDIDS "_files f1, " BUILDIDS "_fileparts f1b, " BUILDIDS "_fileparts f1d, "
+                      BUILDIDS "_buildids b1, " BUILDIDS "_files_v a1 "
+                      "where f1.id = d1.content and a1.id = d1.file and d1.buildid = b1.id "
+                      "      and f1d.name " + dop + " ? and f1b.name " + bop + " ? and f1.dirname = f1d.id and f1.basename = f1b.id "
+                      "union all \n"
+                      "select d2.executable_p, d2.debuginfo_p, 0, "
+                      "       b2.hex, f2d.name || '/' || f2b.name, NULL "
+                      "from " BUILDIDS "_f_de d2, " BUILDIDS "_files f2, " BUILDIDS "_fileparts f2b, " BUILDIDS "_fileparts f2d, "
+                      BUILDIDS "_buildids b2 "
+                      "where f2.id = d2.file and d2.buildid = b2.id "
+                      "      and f2d.name " + dop + " ? and f2b.name " + bop + " ? "
+                      "      and f2.dirname = f2d.id and f2.basename = f2b.id");
+
+  // NB: we could query source file names too, thusly:
+  //
+  //    select * from " BUILDIDS "_buildids b, " BUILDIDS "_files_v f1, " BUILDIDS "_r_sref sr
+  //    where b.id = sr.buildid and f1.id = sr.artifactsrc and f1.name " + op + "?"
+  //    UNION ALL something with BUILDIDS "_f_s"
+  //
+  // But the first part of this query cannot run fast without the same index temp-created
+  // during "maxigroom":
+  //    create index " BUILDIDS "_r_sref_arc on " BUILDIDS "_r_sref(artifactsrc);
+  // and unfortunately this index is HUGE.  It's similar to the size of the _r_sref
+  // table, which is already the largest part of a debuginfod index.  Adding that index
+  // would nearly double the .sqlite db size.
+
+  // Take ownership immediately, so the statement is released even if
+  // reset() or one of the bind() calls below throws.
+  sqlite_ps *pp = new sqlite_ps (thisdb, "mhd-query-meta-glob", sql);
+  unique_ptr<sqlite_ps> ps_closer(pp); // release pp if exception or return
+  pp->reset();
+  // Parameters 1,2 feed the archive (r_de) half of the union,
+  // parameters 3,4 the plain-file (f_de) half.
+  pp->bind(1, dirname);
+  pp->bind(2, bname);
+  pp->bind(3, dirname);
+  pp->bind(4, bname);
+
+  json_object *metadata = json_object_new_object();
+  if (!metadata) throw libc_exception(ENOMEM, "json allocation");
+  defer_dtor<json_object*,int> metadata_d(metadata, json_object_put);
+  json_object *metadata_arr = json_object_new_array();
+  if (!metadata_arr) throw libc_exception(ENOMEM, "json allocation");
+  // From here on metadata_arr is owned by metadata.
+  json_object_object_add(metadata, "results", metadata_arr);
+  // consume all the rows
+  struct timespec ts_start;
+  clock_gettime (CLOCK_MONOTONIC, &ts_start);
+
+  int rc;
+  bool metadata_complete = true;
+  while (SQLITE_DONE != (rc = pp->step()))
+    {
+      // break out of loop if we have searched too long
+      struct timespec ts_end;
+      clock_gettime (CLOCK_MONOTONIC, &ts_end);
+      double deltas = (ts_end.tv_sec - ts_start.tv_sec) + (ts_end.tv_nsec - ts_start.tv_nsec)/1.e9;
+      if (metadata_maxtime_s > 0 && deltas > metadata_maxtime_s)
+        {
+          metadata_complete = false;
+          break;
+        }
+
+      if (rc != SQLITE_ROW) throw sqlite_exception(rc, "step");
+
+      int m_executable_p = sqlite3_column_int (*pp, 0);
+      int m_debuginfo_p  = sqlite3_column_int (*pp, 1);
+      int m_source_p     = sqlite3_column_int (*pp, 2);
+      string m_buildid   = (const char*) sqlite3_column_text (*pp, 3) ?: ""; // should always be non-null
+      string m_file      = (const char*) sqlite3_column_text (*pp, 4) ?: "";
+      string m_archive   = (const char*) sqlite3_column_text (*pp, 5) ?: "";
+
+      // Confirm that m_file matches in the fnmatch(FNM_PATHNAME)
+      // sense, since sqlite's GLOB operator is a looser filter.
+      if (key == "glob" && fnmatch(value.c_str(), m_file.c_str(), FNM_PATHNAME) != 0)
+        continue;
+
+      // Append one {"type","buildid","file"[,"archive"]} entry for this row.
+      auto add_metadata = [metadata_arr, m_buildid, m_file, m_archive](const string& type) {
+        json_object* entry = json_object_new_object();
+        if (NULL == entry) throw libc_exception (ENOMEM, "cannot allocate json");
+        defer_dtor<json_object*,int> entry_d(entry, json_object_put);
+
+        // Empty values are omitted from the entry altogether.
+        auto add_entry_metadata = [entry](const char* k, string v) {
+          json_object* s;
+          if(v != "") {
+            s = json_object_new_string(v.c_str());
+            if (NULL == s) throw libc_exception (ENOMEM, "cannot allocate json");
+            json_object_object_add(entry, k, s);
+          }
+        };
+
+        add_entry_metadata("type", type.c_str());
+        add_entry_metadata("buildid", m_buildid);
+        add_entry_metadata("file", m_file);
+        if (m_archive != "") add_entry_metadata("archive", m_archive);
+        if (verbose > 3)
+          obatched(clog) << "metadata found local "
+                         << json_object_to_json_string_ext(entry,
+                                                           JSON_C_TO_STRING_PRETTY)
+                         << endl;
+
+        // Increase ref count to switch its ownership
+        json_object_array_add(metadata_arr, json_object_get(entry));
+      };
+
+      if (m_executable_p) add_metadata("executable");
+      if (m_debuginfo_p) add_metadata("debuginfo");
+      if (m_source_p) add_metadata("source");
+    }
+  pp->reset();
+
+  unsigned num_local_results = json_object_array_length(metadata_arr);
+
+  // Query upstream as well
+  debuginfod_client *client = debuginfod_pool_begin();
+  if (client != NULL)
+  {
+    // add_client_federation_headers() throws once the X-Forwarded-For
+    // hop limit is reached.  Hand the client back to the pool before
+    // propagating that, so it is not lost.
+    try
+      {
+        add_client_federation_headers(client, conn);
+      }
+    catch (...)
+      {
+        debuginfod_pool_end (client);
+        throw;
+      }
+
+    char *upstream_metadata_file = NULL;
+    int upstream_metadata_fd = debuginfod_find_metadata(client, key.c_str(), (char*)value.c_str(),
+                                                        &upstream_metadata_file);
+    if (upstream_metadata_fd >= 0) {
+      /* json-c >= 0.13 has json_object_from_fd(). */
+      json_object *upstream_metadata_json = json_object_from_file(upstream_metadata_file);
+      free (upstream_metadata_file);
+      json_object *upstream_metadata_json_arr;
+      json_object *upstream_complete;
+      // The document comes from another server: insist that "results"
+      // really is an array before using the array accessors on it.
+      if (NULL != upstream_metadata_json &&
+          json_object_object_get_ex(upstream_metadata_json, "results", &upstream_metadata_json_arr) &&
+          json_object_is_type(upstream_metadata_json_arr, json_type_array) &&
+          json_object_object_get_ex(upstream_metadata_json, "complete", &upstream_complete))
+        {
+          metadata_complete &= json_object_get_boolean(upstream_complete);
+          for (int i = 0, n = json_object_array_length(upstream_metadata_json_arr); i < n; i++)
+            {
+              json_object *entry = json_object_array_get_idx(upstream_metadata_json_arr, i);
+              if (verbose > 3)
+                obatched(clog) << "metadata found remote "
+                               << json_object_to_json_string_ext(entry,
+                                                                 JSON_C_TO_STRING_PRETTY)
+                               << endl;
+
+              json_object_get(entry); // increment reference count
+              json_object_array_add(metadata_arr, entry);
+            }
+        }
+      // Drop our reference whether or not the document had the expected
+      // shape; entries copied above hold their own references.
+      if (NULL != upstream_metadata_json)
+        json_object_put(upstream_metadata_json);
+      close(upstream_metadata_fd);
+    }
+    debuginfod_pool_end (client);
+  }
+
+  unsigned num_total_results = json_object_array_length(metadata_arr);
+
+  if (verbose > 2)
+    obatched(clog) << "metadata found local=" << num_local_results
+                   << " remote=" << (num_total_results-num_local_results)
+                   << " total=" << num_total_results
+                   << endl;
+
+  json_object *complete_json = json_object_new_boolean(metadata_complete);
+  if (!complete_json) throw libc_exception(ENOMEM, "json allocation");
+  json_object_object_add(metadata, "complete", complete_json);
+
+  // The returned string is owned by metadata, hence MUST_COPY below.
+  const char* metadata_str = json_object_to_json_string(metadata);
+  if (!metadata_str)
+    throw libc_exception (ENOMEM, "cannot allocate json");
+  size_t metadata_len = strlen(metadata_str);
+  MHD_Response* r = MHD_create_response_from_buffer (metadata_len,
+                                                     (void*) metadata_str,
+                                                     MHD_RESPMEM_MUST_COPY);
+  *size = metadata_len;
+  if (r)
+    add_mhd_response_header(r, "Content-Type", "application/json");
+  return r;
+}
+
+
  static struct MHD_Response*
  handle_root (off_t* size)
  {
@@ -2939,6 +3171,7 @@ handler_cb (void * /*cls*/,
    clock_gettime (CLOCK_MONOTONIC, &ts_start);
    double afteryou = 0.0;
    string artifacttype, suffix;
+  string urlargs; // for logging
  
    try
      {
@@ -3007,6 +3240,19 @@ handler_cb (void * /*cls*/,
            inc_metric("http_requests_total", "type", artifacttype);
            r = handle_metrics(& http_size);
          }
+      else if (url1 == "/metadata")
+        {
+          tmp_inc_metric m ("thread_busy", "role", "http-metadata");
+          const char* key = MHD_lookup_connection_value(connection, MHD_GET_ARGUMENT_KIND, "key");
+          const char* value = MHD_lookup_connection_value(connection, MHD_GET_ARGUMENT_KIND, "value");
+          if (NULL == value || NULL == key)
+            throw reportable_exception("/metadata webapi error, need key and value");
+
+          urlargs = string("?key=") + string(key) + string("&value=") + string(value); // apprx., for logging
+          artifacttype = "metadata";
+          inc_metric("http_requests_total", "type", artifacttype);
+          r = handle_metadata(connection, key, value, &http_size);
+        }
        else if (url1 == "/")
          {
            artifacttype = "/";
@@ -3043,7 +3289,7 @@ handler_cb (void * /*cls*/,
    // afteryou: delay waiting for other client's identical query to complete
    // deltas: total latency, including afteryou waiting
    obatched(clog) << conninfo(connection)
-                 << ' ' << method << ' ' << url
+                 << ' ' << method << ' ' << url << urlargs
                   << ' ' << http_code << ' ' << http_size
                   << ' ' << (int)(afteryou*1000) << '+' << (int)((deltas-afteryou)*1000) << "ms"
                   << endl;
@@ -3396,6 +3642,7 @@ register_file_name(sqlite_ps& ps_upsert_fileparts,
        dirname = name.substr(0, slash);
        filename = name.substr(slash+1);
      }
+  // NB: see also handle_metadata()
  
    // intern the two substrings
    ps_upsert_fileparts
@@ -4379,12 +4626,13 @@ void groom()
    if (interrupted) return;
  
    // NB: "vacuum" is too heavy for even daily runs: it rewrites the entire db, so is done as maxigroom -G
-  sqlite_ps g1 (db, "incremental vacuum", "pragma incremental_vacuum");
-  g1.reset().step_ok_done();
-  sqlite_ps g2 (db, "optimize", "pragma optimize");
-  g2.reset().step_ok_done();
-  sqlite_ps g3 (db, "wal checkpoint", "pragma wal_checkpoint=truncate");
-  g3.reset().step_ok_done();
+  { sqlite_ps g (db, "incremental vacuum", "pragma incremental_vacuum"); g.reset().step_ok_done(); }
+  // https://www.sqlite.org/lang_analyze.html#approx
+  { sqlite_ps g (db, "analyze setup", "pragma analysis_limit = 1000;\n"); g.reset().step_ok_done(); }
+  { sqlite_ps g (db, "analyze", "analyze"); g.reset().step_ok_done(); }
+  { sqlite_ps g (db, "analyze reload", "analyze sqlite_schema"); g.reset().step_ok_done(); } 
+  { sqlite_ps g (db, "optimize", "pragma optimize"); g.reset().step_ok_done(); }
+  { sqlite_ps g (db, "wal checkpoint", "pragma wal_checkpoint=truncate"); g.reset().step_ok_done(); }
  
    database_stats_report();
  
@@ -4769,6 +5017,8 @@ main (int argc, char *argv[])
    if (maxigroom)
      {
        obatched(clog) << "maxigrooming database, please wait." << endl;
+      // NB: this index alone can nearly double the database size!
+      // NB: this index would be necessary to run source-file metadata searches fast
        extra_ddl.push_back("create index if not exists " BUILDIDS "_r_sref_arc on " BUILDIDS "_r_sref(artifactsrc);");
        extra_ddl.push_back("delete from " BUILDIDS "_r_sdef where not exists (select 1 from " BUILDIDS "_r_sref b where " BUILDIDS "_r_sdef.content = b.artifactsrc);");
        extra_ddl.push_back("drop index if exists " BUILDIDS "_r_sref_arc;");
diff --git a/debuginfod/debuginfod.h.in b/debuginfod/debuginfod.h.in

index 73f633f0b8e97433f057b1a10bf80e643326ac42..0a6a4a22efd92e7b34b8e927c3d9ee379d642c38 100644 (file)
--- a/debuginfod/debuginfod.h.in
+++ b/debuginfod/debuginfod.h.in
@@ -63,9 +63,9 @@ debuginfod_client *debuginfod_begin (void);
     it is a binary blob of given length.
  
     If successful, return a file descriptor to the target, otherwise
-   return a posix error code.  If successful, set *path to a
-   strdup'd copy of the name of the same file in the cache.
-   Caller must free() it later. */
+   return a negative POSIX error code.  If successful, set *path to a
+   strdup'd copy of the name of the same file in the cache.  Caller
+   must free() it later. */
  
  int debuginfod_find_debuginfo (debuginfod_client *client,
                                const unsigned char *build_id,
@@ -89,6 +89,22 @@ int debuginfod_find_section (debuginfod_client *client,
                              const char *section,
                              char **path);
  
+/* Query the urls contained in $DEBUGINFOD_URLS for metadata
+   with given query key/value.
+   
+   If successful, return a file descriptor to the JSON document
+   describing matches, otherwise return a negative POSIX error code.  If
+   successful, set *path to a strdup'd copy of the name of the same
+   file in the cache.  Caller must free() it later.
+   
+   See the debuginfod-find(1) man page for examples of the supported types
+   of key/value queries and their JSON results.
+   */
+int debuginfod_find_metadata (debuginfod_client *client,
+                              const char *key,
+                              const char* value,
+                              char **path);
+
  typedef int (*debuginfod_progressfn_t)(debuginfod_client *c, long a, long b);
  void debuginfod_set_progressfn(debuginfod_client *c,
                                debuginfod_progressfn_t fn);
diff --git a/debuginfod/libdebuginfod.map b/debuginfod/libdebuginfod.map

index 6334373f01b0e2d9fcfe28039da406c33e48e625..9cee91cd79aa9eb62f5f55e771381aaf91e07031 100644 (file)
--- a/debuginfod/libdebuginfod.map
+++ b/debuginfod/libdebuginfod.map
@@ -22,3 +22,6 @@ ELFUTILS_0.188 {
    debuginfod_get_headers;
    debuginfod_find_section;
  } ELFUTILS_0.183;
+ELFUTILS_0.192 {
+  debuginfod_find_metadata;
+} ELFUTILS_0.188;
diff --git a/doc/Makefile.am b/doc/Makefile.am

index 87de4f0beb7ff3fd389eefa18e519d9823910270..0c094af2289b8d31e5cd8fbb2b439890b25e9c84 100644 (file)
--- a/doc/Makefile.am
+++ b/doc/Makefile.am
@@ -39,6 +39,7 @@ notrans_dist_man3_MANS += debuginfod_find_debuginfo.3
  notrans_dist_man3_MANS += debuginfod_find_executable.3
  notrans_dist_man3_MANS += debuginfod_find_source.3
  notrans_dist_man3_MANS += debuginfod_find_section.3
+notrans_dist_man3_MANS += debuginfod_find_metadata.3
  notrans_dist_man3_MANS += debuginfod_get_user_data.3
  notrans_dist_man3_MANS += debuginfod_get_url.3
  notrans_dist_man3_MANS += debuginfod_set_progressfn.3
diff --git a/doc/debuginfod-client-config.7 b/doc/debuginfod-client-config.7

index f16612084e9b5656902bd910f4d8c848318905a9..bb33fb0b8b6e9043c902ea6189b96b9717888a2c 100644 (file)
--- a/doc/debuginfod-client-config.7
+++ b/doc/debuginfod-client-config.7
@@ -167,3 +167,11 @@ are short-circuited (returning an immediate failure instead of sending
  a new query to servers).  This accelerates queries that probably would
  still fail.  The default is 600, 10 minutes.  0 means "forget
  immediately".
+
+.TP
+.B metadata_retention_s
+This control file sets how long to remember the results of a metadata
+query.  New queries for the same artifacts within this time window are
+short-circuited (repeating the same results).  This accelerates
+queries that would probably have the same results.  The
+default is 3600, 1 hour.  0 means "do not retain".
diff --git a/doc/debuginfod-find.1 b/doc/debuginfod-find.1

index d7db1bfdd838c46c7b2b78db2ccae10f4e48aebc..38b5b0184dfef9307df48acd60132e5e14a33307 100644 (file)
--- a/doc/debuginfod-find.1
+++ b/doc/debuginfod-find.1
@@ -29,6 +29,8 @@ debuginfod-find \- request debuginfo-related data
  .B debuginfod-find [\fIOPTION\fP]... source \fIBUILDID\fP \fI/FILENAME\fP
  .br
  .B debuginfod-find [\fIOPTION\fP]... source \fIPATH\fP \fI/FILENAME\fP
+.br
+.B debuginfod-find [\fIOPTION\fP]... metadata \fIKEY\fP \fIVALUE\fP
  
  .SH DESCRIPTION
  \fBdebuginfod-find\fP queries one or more \fBdebuginfod\fP servers for
@@ -119,6 +121,64 @@ l l.
  \../bar/foo.c AT_comp_dir=/zoo/        source BUILDID /zoo//../bar/foo.c
  .TE
  
+.SS metadata \fIKEY\fP \fIVALUE\fP
+
+All designated debuginfod servers are queried for metadata about all
+files that match a given key/value query in their index.  The results
+include names and buildids, which may be used in future queries to
+fetch actual files.
+
+.TS
+l l l .
+KEY    VALUE   DESCRIPTION
+
+\fBfile\fP     \fIpath\fP      exact match \fIpath\fP, including in archives
+\fBglob\fP     \fIpattern\fP   shell-style glob match \fIpattern\fP, including in archives, as in fnmatch(FNM_PATHNAME)
+.TE
+
+The resulting output will look something like the following:
+{
+  "results":[
+    {
+      "type":"executable",
+      "buildid":"f0aa15b8aba4f3c28cac3c2a73801fefa644a9f2",
+      "file":"/usr/local/bin/hello",
+      "archive":"/opt/elfutils/tests/test-2290642/R/rhel7/hello2-1.0-2.x86_64.rpm"
+    },
+    {
+      "type":"executable",
+      "buildid":"bc1febfd03ca05e030f0d205f7659db29f8a4b30",
+      "file":"hello2"
+    }
+  ],
+  "complete":true
+}
+
+The results of the search are output to \fBstdout\fP as a JSON object
+containing an array of objects, supplying metadata about each match, as
+well as a boolean value corresponding to the completeness of the result.
+The result is considered complete if all of the queries to upstream servers
+returned complete results and the local query succeeded. This metadata report
+may be cached.  It may be incomplete and may contain duplicates.  
+Additional JSON object fields may be present.
+
+.TS
+l l l .
+NAME   TYPE    DESCRIPTION
+
+\fBbuildid\fP  string  hexadecimal buildid associated with the file
+\fBtype\fP     string  one of \fBdebuginfo\fP or \fBexecutable\fP
+\fBfile\fP     string  matched file name, outside or inside the archive
+\fBarchive\fP  string  archive containing matched file name, if any
+.TE
+
+It's worth noting that \fBtype\fP cannot be \fBsource\fP, since performing
+such a search fast enough would require additional indexing to be added to
+the database, which would nearly double its size.
+
+The search also always combines both files and archives in the results,
+and at this time further granularity is not available.
+
  .SH "OPTIONS"
  
  .TP
diff --git a/doc/debuginfod.8 b/doc/debuginfod.8

index 577f58b6ee2e6c1d24bdead93e31ee5deaca38dd..f35ce6c1a9ca0fe804eb1d6ac029139deda1ef45 100644 (file)
--- a/doc/debuginfod.8
+++ b/doc/debuginfod.8
@@ -132,6 +132,14 @@ scanner/groomer server and multiple passive ones, thereby sharing
  service load.  Archive pattern options must still be given, so
  debuginfod can recognize file name extensions for unpacking.
  
+.TP
+.B "\-\-metadata\-maxtime=SECONDS"
+Impose a limit on the runtime of metadata webapi queries.  These
+queries, especially broad "glob" wildcards, can take a large amount of
+time and produce large results.  Public-facing servers may need to
+throttle them.  The default limit is 5 seconds.  Set 0 to disable this
+limit.
+
  .TP
  .B "\-D SQL" "\-\-ddl=SQL"
  Execute given sqlite statement after the database is opened and
@@ -421,6 +429,16 @@ variety of statistics about the operation of the debuginfod server.
  The exact set of metrics and their meanings may change in future
  versions.
  
+.SS /metadata?key=\fIKEY\fP&value=\fIVALUE\fP
+
+This endpoint triggers a search of the files in the index plus any
+upstream federated servers, based on given key and value.  If
+successful, the result is an application/json textual array, listing
+metadata for the matched files.  See \fIdebuginfod-find(1)\fP for
+documentation of the common key/value search parameters, and the
+resulting data schema.
+
+
  .SH DATA MANAGEMENT
  
  debuginfod stores its index in an sqlite database in a densely packed
diff --git a/doc/debuginfod_find_debuginfo.3 b/doc/debuginfod_find_debuginfo.3

index 4e359c8c4bd45ade9e7818603b35266f803d4d37..589a2c2b63b461e074ef2a6f466b8f591285406b 100644 (file)
--- a/doc/debuginfod_find_debuginfo.3
+++ b/doc/debuginfod_find_debuginfo.3
@@ -48,6 +48,10 @@ LOOKUP FUNCTIONS
  .BI "                           int " build_id_len ","
  .BI "                           const char * " section ","
  .BI "                           char ** " path ");"
+.BI "int debuginfod_find_metadata(debuginfod_client *" client ","
+.BI "                            const char *" key ","
+.BI "                            const char *" value ","
+.BI "                            char ** " path ");"
  
  
  OPTIONAL FUNCTIONS
@@ -114,6 +118,14 @@ section queries, debuginfod_find_section may query the server for the
  debuginfo and/or executable with \fIbuild_id\fP in order to retrieve
  and extract the section.
  
+.BR debuginfod_find_metadata ()
+queries all debuginfod server URLs contained in
+.BR $DEBUGINFOD_URLS
+for metadata for all matches of a given key/value query against files
+in their indexes.  The resulting file is a JSON document.  See the
+\fIdebuginfod-find(1)\fP man page for examples of the supported types
+of key/value queries and their JSON results.
+
  If \fIpath\fP is not NULL and the query is successful, \fIpath\fP is set
  to the path of the file in the cache. The caller must \fBfree\fP() this value.
  
diff --git a/doc/debuginfod_find_metadata.3 b/doc/debuginfod_find_metadata.3

new file mode 100644 (file)

index 0000000..1627993
--- /dev/null
+++ b/doc/debuginfod_find_metadata.3
@@ -0,0 +1 @@
+.so man3/debuginfod_find_debuginfo.3
diff --git a/tests/Makefile.am b/tests/Makefile.am

index 4547d95de76caf7aea6a4374b4329183122baf5c..3cc9ded43b6a0447ab75d8466072eace7f16d9d0 100644 (file)
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -266,12 +266,13 @@ TESTS += run-debuginfod-dlopen.sh \
          run-debuginfod-federation-sqlite.sh \
          run-debuginfod-federation-link.sh \
           run-debuginfod-percent-escape.sh \
-         run-debuginfod-x-forwarded-for.sh \
-         run-debuginfod-response-headers.sh \
-         run-debuginfod-extraction-passive.sh \
+        run-debuginfod-x-forwarded-for.sh \
+        run-debuginfod-response-headers.sh \
+        run-debuginfod-extraction-passive.sh \
          run-debuginfod-webapi-concurrency.sh \
          run-debuginfod-section.sh \
-        run-debuginfod-IXr.sh
+        run-debuginfod-IXr.sh \
+        run-debuginfod-find-metadata.sh
  endif
  if !OLD_LIBMICROHTTPD
  # Will crash on too old libmicrohttpd
@@ -603,7 +604,8 @@ EXTRA_DIST = run-arextract.sh run-arsymtest.sh run-ar.sh \
               run-debuginfod-webapi-concurrency.sh \
              run-debuginfod-section.sh \
              run-debuginfod-IXr.sh \
-                run-debuginfod-ima-verification.sh \
+            run-debuginfod-ima-verification.sh \
+            run-debuginfod-find-metadata.sh \
              debuginfod-rpms/fedora30/hello2-1.0-2.src.rpm \
              debuginfod-rpms/fedora30/hello2-1.0-2.x86_64.rpm \
              debuginfod-rpms/fedora30/hello2-debuginfo-1.0-2.x86_64.rpm \
diff --git a/tests/debuginfod-subr.sh b/tests/debuginfod-subr.sh

index c3b0603ddb2e2370d5fb7ae09e193d9fb3c89513..000e27708192fa845f0c2131c4ca463d3626e706 100755 (executable)
--- a/tests/debuginfod-subr.sh
+++ b/tests/debuginfod-subr.sh
@@ -26,6 +26,7 @@ type curl 2>/dev/null || (echo "need curl"; exit 77)
  type rpm2cpio 2>/dev/null || (echo "need rpm2cpio"; exit 77)
  type cpio 2>/dev/null || (echo "need cpio"; exit 77)
  type bzcat 2>/dev/null || (echo "need bzcat"; exit 77)
+type ss 2>/dev/null || (echo "need ss"; exit 77)
  bsdtar --version | grep -q zstd && zstd=true || zstd=false
  echo "zstd=$zstd bsdtar=`bsdtar --version`"
  
diff --git a/tests/run-debuginfod-find-metadata.sh b/tests/run-debuginfod-find-metadata.sh

new file mode 100755 (executable)

index 0000000..78a34f0
--- /dev/null
+++ b/tests/run-debuginfod-find-metadata.sh
@@ -0,0 +1,113 @@
+#!/usr/bin/env bash
+#
+# Copyright (C) 2022-2024 Red Hat, Inc.
+# This file is part of elfutils.
+#
+# This file is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or
+# (at your option) any later version.
+#
+# elfutils is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+. $srcdir/debuginfod-subr.sh
+
+# for test case debugging, uncomment:
+set -x
+unset VALGRIND_CMD
+# VALGRIND_CMD="valgrind --enable-debuginfod=no"
+
+type curl 2>/dev/null || { echo "need curl"; exit 77; }
+type jq 2>/dev/null || { echo "need jq"; exit 77; }
+
+pkg-config json-c libcurl || { echo "one or more libraries are missing (libjson-c, libcurl)"; exit 77; }
+
+DB=${PWD}/.debuginfod_tmp.sqlite
+export DEBUGINFOD_CACHE_PATH=${PWD}/.client_cache
+tempfiles $DB ${DB}_2
+
+# This variable is essential and ensures no time-race for claiming ports occurs
+# set base to a unique multiple of 100 not used in any other 'run-debuginfod-*' test
+base=13100
+get_ports
+mkdir R D
+cp -rvp ${abs_srcdir}/debuginfod-rpms/rhel7 R
+cp -rvp ${abs_srcdir}/debuginfod-debs/*deb D
+
+env LD_LIBRARY_PATH=$ldpath DEBUGINFOD_URLS= ${VALGRIND_CMD} ${abs_builddir}/../debuginfod/debuginfod $VERBOSE -R \
+    -d $DB -p $PORT1 -t0 -g0 R > vlog$PORT1 2>&1 &
+PID1=$!
+tempfiles vlog$PORT1
+errfiles vlog$PORT1
+
+wait_ready $PORT1 'ready' 1
+wait_ready $PORT1 'thread_work_total{role="traverse"}' 1
+wait_ready $PORT1 'thread_work_pending{role="scan"}' 0
+wait_ready $PORT1 'thread_busy{role="scan"}' 0
+
+env LD_LIBRARY_PATH=$ldpath DEBUGINFOD_URLS="http://127.0.0.1:$PORT1 https://bad/url.web" ${VALGRIND_CMD} ${abs_builddir}/../debuginfod/debuginfod $VERBOSE -U \
+    -d ${DB}_2 -p $PORT2 -t0 -g0 D > vlog$PORT2 2>&1 &
+PID2=$!
+tempfiles vlog$PORT2
+errfiles vlog$PORT2
+
+wait_ready $PORT2 'ready' 1
+wait_ready $PORT2 'thread_work_total{role="traverse"}' 1
+wait_ready $PORT2 'thread_work_pending{role="scan"}' 0
+wait_ready $PORT2 'thread_busy{role="scan"}' 0
+
+# have clients contact the new server
+export DEBUGINFOD_URLS=http://127.0.0.1:$PORT2
+
+tempfiles json.txt
+# Check that we find correct number of files, both via local and federated links
+RESULTJ=`env LD_LIBRARY_PATH=$ldpath ${VALGRIND_CMD} ${abs_builddir}/../debuginfod/debuginfod-find metadata glob "/u?r/bin/*"`
+echo $RESULTJ
+N_FOUND=`echo $RESULTJ | jq '.results | length'`
+test $N_FOUND -eq 1
+RESULTJ=`env LD_LIBRARY_PATH=$ldpath ${VALGRIND_CMD} ${abs_builddir}/../debuginfod/debuginfod-find metadata glob "/usr/lo?al/bin/*"`
+echo $RESULTJ
+N_FOUND=`echo $RESULTJ | jq '.results | length'`
+test $N_FOUND -eq 2
+
+
+# Query via the webapi as well
+curl http://127.0.0.1:$PORT2'/metadata?key=glob&value=/usr/bin/*hi*'
+test `curl -s http://127.0.0.1:$PORT2'/metadata?key=glob&value=/usr/bin/*hi*' | jq '.results[0].buildid == "f17a29b5a25bd4960531d82aa6b07c8abe84fa66"'` = 'true'
+test `curl -s http://127.0.0.1:$PORT2'/metadata?key=glob&value=/usr/bin/*hi*' | jq '.results[0].file == "/usr/bin/hithere"'` = 'true'
+test `curl -s http://127.0.0.1:$PORT2'/metadata?key=glob&value=/usr/bin/*hi*' | jq '.results[0].archive | test(".*hithere.*deb")'` = 'true'
+# Note we query the upstream server too, since the downstream will have an incomplete result due to the badurl
+test `curl -s http://127.0.0.1:$PORT1'/metadata?key=glob&value=/usr/bin/*hi*' | jq '.complete == true'` = 'true'
+test `curl -s http://127.0.0.1:$PORT2'/metadata?key=glob&value=/usr/bin/*hi*' | jq '.complete == false'` = 'true'
+
+# An empty array is returned on server error or if the file does not exist
+RESULTJ=`env LD_LIBRARY_PATH=$ldpath ${VALGRIND_CMD} ${abs_builddir}/../debuginfod/debuginfod-find metadata file "/this/isnt/there"`
+echo $RESULTJ
+test `echo $RESULTJ | jq ".results == [ ]" ` = 'true'
+
+kill $PID1
+kill $PID2
+wait $PID1
+wait $PID2
+PID1=0
+PID2=0
+
+# check it's still in cache
+RESULTJ=`env LD_LIBRARY_PATH=$ldpath ${VALGRIND_CMD} ${abs_builddir}/../debuginfod/debuginfod-find metadata file "/usr/bin/hithere"`
+echo $RESULTJ
+test `echo $RESULTJ | jq ".results == [ ]" ` = 'true'
+
+# invalidate cache, retry previously successful query to now-dead servers
+echo 0 > $DEBUGINFOD_CACHE_PATH/metadata_retention_s
+RESULTJ=`env LD_LIBRARY_PATH=$ldpath ${VALGRIND_CMD} ${abs_builddir}/../debuginfod/debuginfod-find metadata glob "/u?r/bin/*"`
+echo $RESULTJ
+test `echo $RESULTJ | jq ".results == [ ]" ` = 'true'
+test `echo $RESULTJ | jq ".complete == false" ` = 'true'
+
+exit 0
author	Frank Ch. Eigler <fche@redhat.com>
	Mon, 31 Oct 2022 21:40:01 +0000 (17:40 -0400)
committer	Frank Ch. Eigler <fche@redhat.com>
	Mon, 3 Jun 2024 15:22:49 +0000 (11:22 -0400)
NEWS		patch \| blob \| blame \| history
config/elfutils.spec.in		patch \| blob \| blame \| history
configure.ac		patch \| blob \| blame \| history
debuginfod/Makefile.am		patch \| blob \| blame \| history
debuginfod/debuginfod-client.c		patch \| blob \| blame \| history
debuginfod/debuginfod-find.c		patch \| blob \| blame \| history
debuginfod/debuginfod.cxx		patch \| blob \| blame \| history
debuginfod/debuginfod.h.in		patch \| blob \| blame \| history
debuginfod/libdebuginfod.map		patch \| blob \| blame \| history
doc/Makefile.am		patch \| blob \| blame \| history
doc/debuginfod-client-config.7		patch \| blob \| blame \| history
doc/debuginfod-find.1		patch \| blob \| blame \| history
doc/debuginfod.8		patch \| blob \| blame \| history
doc/debuginfod_find_debuginfo.3		patch \| blob \| blame \| history
doc/debuginfod_find_metadata.3	[new file with mode: 0644]	patch \| blob
tests/Makefile.am		patch \| blob \| blame \| history
tests/debuginfod-subr.sh		patch \| blob \| blame \| history
tests/run-debuginfod-find-metadata.sh	[new file with mode: 0755]	patch \| blob