]> git.ipfire.org Git - thirdparty/tor.git/commitdiff
Try to diagnose bug 12184
authorNick Mathewson <nickm@torproject.org>
Tue, 10 Jun 2014 17:35:45 +0000 (13:35 -0400)
committerNick Mathewson <nickm@torproject.org>
Sat, 14 Jun 2014 15:00:44 +0000 (11:00 -0400)
Check for consistency between the queued destroy cells and the marked
circuit IDs.  Check for consistency in the count of queued destroy
cells in several ways.  Check to see whether any of the marked circuit
IDs have somehow been marked longer than the channel has existed.

changes/bug12184_diagnostic [new file with mode: 0644]
src/or/circuitbuild.c
src/or/circuitlist.c
src/or/circuitlist.h
src/or/circuitmux.c
src/or/circuitmux.h
src/or/relay.c
src/or/relay.h

diff --git a/changes/bug12184_diagnostic b/changes/bug12184_diagnostic
new file mode 100644 (file)
index 0000000..3c783b1
--- /dev/null
@@ -0,0 +1,4 @@
+  o Minor features (diagnostic):
+    - Add significant new logging code to attempt to diagnose bug 12184,
+      where relays seem to run out of available circuit IDs.
+
index 9d06759dec0e895bfa6481561afc68333c037f1f..e743a25f488cc37ec5f993a6a70f956229eb6f60 100644 (file)
@@ -96,10 +96,12 @@ get_unique_circ_id_by_chan(channel_t *chan)
  * N=256. */
 #define MAX_CIRCID_ATTEMPTS 64
   int in_use;
-  unsigned n_with_circ = 0, n_pending_destroy = 0;
+  unsigned n_with_circ = 0, n_pending_destroy = 0, n_weird_pending_destroy = 0;
   circid_t test_circ_id;
   circid_t attempts=0;
   circid_t high_bit, max_range, mask;
+  int64_t pending_destroy_time_total = 0;
+  int64_t pending_destroy_time_max = 0;
 
   tor_assert(chan);
 
@@ -130,15 +132,57 @@ get_unique_circ_id_by_chan(channel_t *chan)
        * whole circuit ID space every time we extend a circuit, which is
        * not so great either.
        */
-      log_fn_ratelim(&chan->last_warned_circ_ids_exhausted, LOG_WARN,
-                 LD_CIRC,"No unused circIDs found on channel %s wide "
+      int64_t queued_destroys;
+      char *m = rate_limit_log(&chan->last_warned_circ_ids_exhausted,
+                               approx_time());
+      if (m == NULL)
+        return 0; /* This message has been rate-limited away. */
+      if (n_pending_destroy)
+        pending_destroy_time_total /= n_pending_destroy;
+      log_warn(LD_CIRC,"No unused circIDs found on channel %s wide "
                  "circID support, with %u inbound and %u outbound circuits. "
                  "Found %u circuit IDs in use by circuits, and %u with "
-                 "pending destroy cells."
-                 "Failing a circuit.",
+                 "pending destroy cells. (%u of those were marked bogusly.) "
+                 "The ones with pending destroy cells "
+                 "have been marked unusable for an average of %ld seconds "
+                 "and a maximum of %ld seconds. This channel is %ld seconds "
+                 "old. Failing a circuit.%s",
                  chan->wide_circ_ids ? "with" : "without",
                  chan->num_p_circuits, chan->num_n_circuits,
-                 n_with_circ, n_pending_destroy);
+                 n_with_circ, n_pending_destroy, n_weird_pending_destroy,
+                 (long)pending_destroy_time_total,
+                 (long)pending_destroy_time_max,
+                 (long)(approx_time() - chan->timestamp_created),
+                 m);
+      tor_free(m);
+
+      if (!chan->cmux) {
+        /* This warning should be impossible. */
+        log_warn(LD_BUG, "  This channel somehow has no cmux on it!");
+        return 0;
+      }
+
+      /* analysis so far on 12184 suggests that we're running out of circuit
+         IDs because it looks like we have too many pending destroy
+         cells. Let's see how many we really have pending.
+      */
+      queued_destroys = circuitmux_count_queued_destroy_cells(chan,
+                                                              chan->cmux);
+
+      log_warn(LD_CIRC, "  Circuitmux on this channel has %u circuits, "
+               "of which %u are active. It says it has "I64_FORMAT
+               " destroy cells queued.",
+               circuitmux_num_circuits(chan->cmux),
+               circuitmux_num_active_circuits(chan->cmux),
+               I64_PRINTF_ARG(queued_destroys));
+
+      /* Change this into "if (1)" in order to get more information about
+       * possible failure modes here.  You'll need to know how to use gdb with
+       * Tor: this will make Tor exit with an assertion failure if the cmux is
+       * corrupt. */
+      if (0)
+        circuitmux_assert_okay(chan->cmux);
+
       return 0;
     }
 
@@ -152,8 +196,20 @@ get_unique_circ_id_by_chan(channel_t *chan)
     in_use = circuit_id_in_use_on_channel(test_circ_id, chan);
     if (in_use == 1)
       ++n_with_circ;
-    else if (in_use == 2)
+    else if (in_use == 2) {
+      time_t since_when;
       ++n_pending_destroy;
+      since_when =
+        circuit_id_when_marked_unusable_on_channel(test_circ_id, chan);
+      if (since_when) {
+        time_t waiting = approx_time() - since_when;
+        pending_destroy_time_total += waiting;
+        if (waiting > pending_destroy_time_max)
+          pending_destroy_time_max = waiting;
+      } else {
+        ++n_weird_pending_destroy;
+      }
+    }
   } while (in_use);
   return test_circ_id;
 }
index 6238e08e1e4025762cb8615ecc88e2621ba5da18..ba16bb1457e1986f1fdc9d41646ce56cc2332ea9 100644 (file)
@@ -59,6 +59,8 @@ typedef struct chan_circid_circuit_map_t {
   channel_t *chan;
   circid_t circ_id;
   circuit_t *circuit;
+  /* For debugging 12184: when was this placeholder item added? */
+  time_t made_placeholder_at;
 } chan_circid_circuit_map_t;
 
 /** Helper for hash tables: compare the channel and circuit ID for a and
@@ -184,6 +186,7 @@ circuit_set_circid_chan_helper(circuit_t *circ, int direction,
   found = HT_FIND(chan_circid_map, &chan_circid_map, &search);
   if (found) {
     found->circuit = circ;
+    found->made_placeholder_at = 0;
   } else {
     found = tor_malloc_zero(sizeof(chan_circid_circuit_map_t));
     found->circ_id = id;
@@ -241,11 +244,14 @@ channel_mark_circid_unusable(channel_t *chan, circid_t id)
              "a circuit there.", (unsigned)id, chan);
   } else if (ent) {
     /* It's already marked. */
+    if (!ent->made_placeholder_at)
+      ent->made_placeholder_at = approx_time();
   } else {
     ent = tor_malloc_zero(sizeof(chan_circid_circuit_map_t));
     ent->chan = chan;
     ent->circ_id = id;
-    /* leave circuit at NULL */
+    /* leave circuit at NULL. */
+    ent->made_placeholder_at = approx_time();
     HT_INSERT(chan_circid_map, &chan_circid_map, ent);
   }
 }
@@ -1090,6 +1096,27 @@ circuit_id_in_use_on_channel(circid_t circ_id, channel_t *chan)
   return 0;
 }
 
+/** Helper for debugging 12184.  Returns the time since which 'circ_id' has
+ * been marked unusable on 'chan'. */
+time_t
+circuit_id_when_marked_unusable_on_channel(circid_t circ_id, channel_t *chan)
+{
+  chan_circid_circuit_map_t search;
+  chan_circid_circuit_map_t *found;
+
+  memset(&search, 0, sizeof(search));
+  search.circ_id = circ_id;
+  search.chan = chan;
+
+  found = HT_FIND(chan_circid_map, &chan_circid_map, &search);
+
+  if (! found || found->circuit)
+    return 0;
+
+  return found->made_placeholder_at;
+}
+
+
 /** Return the circuit that a given edge connection is using. */
 circuit_t *
 circuit_get_by_edge_conn(edge_connection_t *conn)
index 916afba215ee2f5b5cd29b2673db1fc7973bedc5..d48d7c3963e310ef8db15cab1dd02f202d2344ec 100644 (file)
@@ -28,6 +28,8 @@ void circuit_set_n_circid_chan(circuit_t *circ, circid_t id,
                                channel_t *chan);
 void channel_mark_circid_unusable(channel_t *chan, circid_t id);
 void channel_mark_circid_usable(channel_t *chan, circid_t id);
+time_t circuit_id_when_marked_unusable_on_channel(circid_t circ_id,
+                                                  channel_t *chan);
 void circuit_set_state(circuit_t *circ, uint8_t state);
 void circuit_close_all_marked(void);
 int32_t circuit_initial_package_window(void);
index 52ebfef084b7d4c8bf90b25f5aefe5e7ce701b7a..563166eb46293427460471d5323d0dfad72dd58b 100644 (file)
@@ -1909,3 +1909,38 @@ circuitmux_append_destroy_cell(channel_t *chan,
   }
 }
 
+/*DOCDOC; for debugging 12184.  This runs slowly. */
+int64_t
+circuitmux_count_queued_destroy_cells(const channel_t *chan,
+                                      const circuitmux_t *cmux)
+{
+  int64_t n_destroy_cells = cmux->destroy_ctr;
+  int64_t destroy_queue_size = cmux->destroy_cell_queue.n;
+
+  int64_t manual_total = 0;
+  int64_t manual_total_in_map = 0;
+  packed_cell_t *cell;
+
+  TOR_SIMPLEQ_FOREACH(cell, &cmux->destroy_cell_queue.head, next) {
+    circid_t id;
+    ++manual_total;
+
+    id = packed_cell_get_circid(cell, chan->wide_circ_ids);
+    if (circuit_id_in_use_on_channel(id, (channel_t*)chan))
+      ++manual_total_in_map;
+  }
+
+  if (n_destroy_cells != destroy_queue_size ||
+      n_destroy_cells != manual_total ||
+      n_destroy_cells != manual_total_in_map) {
+    log_warn(LD_BUG, "  Discrepancy in counts for queued destroy cells on "
+             "circuitmux. n="I64_FORMAT". queue_size="I64_FORMAT". "
+             "manual_total="I64_FORMAT". manual_total_in_map="I64_FORMAT".",
+             I64_PRINTF_ARG(n_destroy_cells),
+             I64_PRINTF_ARG(destroy_queue_size),
+             I64_PRINTF_ARG(manual_total),
+             I64_PRINTF_ARG(manual_total_in_map));
+  }
+
+  return n_destroy_cells;
+}
index c4c0649c6c0f4d7aa576f9247c7b5557299454fe..2b5fb7e51e0290da1f485cd73187459f741dde7f 100644 (file)
@@ -121,6 +121,10 @@ unsigned int circuitmux_num_cells(circuitmux_t *cmux);
 unsigned int circuitmux_num_circuits(circuitmux_t *cmux);
 unsigned int circuitmux_num_active_circuits(circuitmux_t *cmux);
 
+/* Debuging interface - slow. */
+int64_t circuitmux_count_queued_destroy_cells(const channel_t *chan,
+                                              const circuitmux_t *cmux);
+
 /* Channel interface */
 circuit_t * circuitmux_get_first_active_circuit(circuitmux_t *cmux,
                                            cell_queue_t **destroy_queue_out);
index 509d7ced0479ac9e029f2ac1c9ffb43a3a318311..58d6db2b2f8415b0d833a2a23a14916870d53c48 100644 (file)
@@ -2563,6 +2563,17 @@ packed_cell_get_command(const packed_cell_t *cell, int wide_circ_ids)
   }
 }
 
+/** Extract the circuit ID from a packed cell. */
+circid_t
+packed_cell_get_circid(const packed_cell_t *cell, int wide_circ_ids)
+{
+  if (wide_circ_ids) {
+    return ntohl(get_uint32(cell->body));
+  } else {
+    return ntohs(get_uint16(cell->body));
+  }
+}
+
 /** Pull as many cells as possible (but no more than <b>max</b>) from the
  * queue of the first active circuit on <b>chan</b>, and write them to
  * <b>chan</b>-&gt;outbuf.  Return the number of cells written.  Advance
index 479d474b3552c90d211a536c56fef1abbbe35eac..969c6fb61d329e72ea64a40bdbe18368b300465f 100644 (file)
@@ -82,6 +82,8 @@ void stream_choice_seed_weak_rng(void);
 int relay_crypt(circuit_t *circ, cell_t *cell, cell_direction_t cell_direction,
                 crypt_path_t **layer_hint, char *recognized);
 
+circid_t packed_cell_get_circid(const packed_cell_t *cell, int wide_circ_ids);
+
 #ifdef RELAY_PRIVATE
 STATIC int connected_cell_parse(const relay_header_t *rh, const cell_t *cell,
                          tor_addr_t *addr_out, int *ttl_out);