Implement experimental metric tracking time spent within PowerDNS per query

author bert hubert <bert.hubert@netherlabs.nl>

Fri, 6 Oct 2017 11:20:15 +0000 (13:20 +0200)

committer bert hubert <bert.hubert@netherlabs.nl>

Fri, 6 Oct 2017 11:20:15 +0000 (13:20 +0200)
author bert hubert <bert.hubert@netherlabs.nl>
Fri, 6 Oct 2017 11:20:15 +0000 (13:20 +0200)
committer bert hubert <bert.hubert@netherlabs.nl>
Fri, 6 Oct 2017 11:20:15 +0000 (13:20 +0200)
diff --git a/pdns/pdns_recursor.cc b/pdns/pdns_recursor.cc

index 01fc6e3415f0cf28024d46834c55dc03069ccb63..55b88eeb0e1528e93eefcf122a7581d7b0bf410d 100644 (file)
--- a/pdns/pdns_recursor.cc
+++ b/pdns/pdns_recursor.cc
@@ -1237,7 +1237,7 @@ static void startDoResolve(void *p)
      }
  
      sr.d_outqueries ? t_RC->cacheMisses++ : t_RC->cacheHits++;
-
+            
      if(spent < 0.001)
        g_stats.answers0_1++;
      else if(spent < 0.010)
@@ -1253,6 +1253,28 @@ static void startDoResolve(void *p)
      newLat = min(newLat,(uint64_t)(((uint64_t) g_networkTimeoutMsec)*1000)); // outliers of several minutes exist..
      g_stats.avgLatencyUsec=(1-1.0/g_latencyStatSize)*g_stats.avgLatencyUsec + (float)newLat/g_latencyStatSize;
      // no worries, we do this for packet cache hits elsewhere
+
+    auto ourtime = 1000.0*spent-sr.d_totUsec/1000.0; // in msec
+    if(ourtime < 1)
+      g_stats.ourtime0_1++;
+    else if(ourtime < 2)
+      g_stats.ourtime1_2++;
+    else if(ourtime < 4)
+      g_stats.ourtime2_4++;
+    else if(ourtime < 8)
+      g_stats.ourtime4_8++;
+    else if(ourtime < 16)
+      g_stats.ourtime8_16++;
+    else if(ourtime < 32)
+      g_stats.ourtime16_32++;
+    else {
+      //      cerr<<"SLOW: "<<ourtime<<"ms -> "<<dc->d_mdp.d_qname<<"|"<<DNSRecordContent::NumberToType(dc->d_mdp.d_qtype)<<endl;
+      g_stats.ourtimeSlow++;
+    }
+
+    newLat=ourtime*1000; // usec
+    g_stats.avgLatencyOursUsec=(1-1.0/g_latencyStatSize)*g_stats.avgLatencyOursUsec + (float)newLat/g_latencyStatSize;
+    
      //    cout<<dc->d_mdp.d_qname<<"\t"<<MT->getUsec()<<"\t"<<sr.d_outqueries<<endl;
      delete dc;
      dc=0;
@@ -1691,6 +1713,7 @@ static string* doProcessUDPQuestion(const std::string& question, const ComboAddr
          updateResponseStats(tmpdh.rcode, fromaddr, response.length(), 0, 0);
        }
        g_stats.avgLatencyUsec=(1-1.0/g_latencyStatSize)*g_stats.avgLatencyUsec + 0.0; // we assume 0 usec
+      g_stats.avgLatencyOursUsec=(1-1.0/g_latencyStatSize)*g_stats.avgLatencyOursUsec + 0.0; // we assume 0 usec
        return 0;
      }
    }
diff --git a/pdns/rec_channel_rec.cc b/pdns/rec_channel_rec.cc

index 0e6b34854e3259fa523a9e805ccc867171a804d0..ee26c9de7a7401e84adb939a890aee338266a8bc 100644 (file)
--- a/pdns/rec_channel_rec.cc
+++ b/pdns/rec_channel_rec.cc
@@ -873,6 +873,15 @@ void registerAllStats()
    addGetStat("answers100-1000", &g_stats.answers100_1000);
    addGetStat("answers-slow", &g_stats.answersSlow);
  
+  addGetStat("x-ourtime0-1", &g_stats.ourtime0_1);
+  addGetStat("x-ourtime1-2", &g_stats.ourtime1_2);
+  addGetStat("x-ourtime2-4", &g_stats.ourtime2_4);
+  addGetStat("x-ourtime4-8", &g_stats.ourtime4_8);
+  addGetStat("x-ourtime8-16", &g_stats.ourtime8_16);
+  addGetStat("x-ourtime16-32", &g_stats.ourtime16_32);
+  addGetStat("x-ourtime-slow", &g_stats.ourtimeSlow);
+  
+  
    addGetStat("auth4-answers0-1", &g_stats.auth4Answers0_1);
    addGetStat("auth4-answers1-10", &g_stats.auth4Answers1_10);
    addGetStat("auth4-answers10-100", &g_stats.auth4Answers10_100);
@@ -887,6 +896,7 @@ void registerAllStats()
  
  
    addGetStat("qa-latency", doGetAvgLatencyUsec);
+  addGetStat("x-our-latency", []() { return g_stats.avgLatencyOursUsec; });
    addGetStat("unexpected-packets", &g_stats.unexpectedCount);
    addGetStat("case-mismatches", &g_stats.caseMismatchCount);
    addGetStat("spoof-prevents", &g_stats.spoofCount);
diff --git a/pdns/recursordist/docs/metrics.rst b/pdns/recursordist/docs/metrics.rst

index 760b06e737c4c894eb44040fdd236f9f59b37402..9eb408b22b0d2ebabefc0f00bde53976fe1b2726 100644 (file)
--- a/pdns/recursordist/docs/metrics.rst
+++ b/pdns/recursordist/docs/metrics.rst
@@ -439,3 +439,45 @@ number of seconds process has been running (since 3.1.5)
  user-msec
  ^^^^^^^^^
  number of CPU milliseconds spent in 'user' mode
+
+x-our-latency
+^^^^^^^^^^^^^
+New metric, which is not yet proven to be reliable. PowerDNS measures per query how much time has been spent waiting on
+authoritative servers. In addition, the Recursor measures the total amount of time needed to answer a question. The
+difference between these two durations is a measure of how much time was spent within PowerDNS. This metric is the average
+of that difference, in microseconds. Since 4.1.
+
+x-ourtime0-1
+^^^^^^^^^^^^
+New metric, which is not yet proven to be reliable. See x-our-latency for further details. Counts responses
+where between 0 and 1 milliseconds was spent within the Recursor. Since 4.1.
+
+x-ourtime1-2
+^^^^^^^^^^^^
+New metric, which is not yet proven to be reliable. See x-our-latency for further details. Counts responses
+where between 1 and 2 milliseconds was spent within the Recursor. Since 4.1.
+
+x-ourtime2-4
+^^^^^^^^^^^^
+New metric, which is not yet proven to be reliable. See x-our-latency for further details. Counts responses
+where between 2 and 4 milliseconds was spent within the Recursor. Since 4.1.
+
+x-ourtime4-8
+^^^^^^^^^^^^
+New metric, which is not yet proven to be reliable. See x-our-latency for further details. Counts responses
+where between 4 and 8 milliseconds was spent within the Recursor. Since 4.1.
+
+x-ourtime8-16
+^^^^^^^^^^^^^
+New metric, which is not yet proven to be reliable. See x-our-latency for further details. Counts responses
+where between 8 and 16 milliseconds was spent within the Recursor. Since 4.1.
+
+x-ourtime16-32
+^^^^^^^^^^^^^^
+New metric, which is not yet proven to be reliable. See x-our-latency for further details. Counts responses
+where between 16 and 32 milliseconds was spent within the Recursor. Since 4.1.
+
+x-ourtime-slow
+^^^^^^^^^^^^^^
+New metric, which is not yet proven to be reliable. See x-our-latency for further details. Counts responses
+where more than 32 milliseconds was spent within the Recursor. Since 4.1.
diff --git a/pdns/syncres.hh b/pdns/syncres.hh

index 5e7f20a1a8ac65b2654fc55872a432eddba63e33..e0018d26e8a5fd22152b4e4f4c1d1e5053ee7229 100644 (file)
--- a/pdns/syncres.hh
+++ b/pdns/syncres.hh
@@ -870,7 +870,9 @@ struct RecursorStats
    std::atomic<uint64_t> answers0_1, answers1_10, answers10_100, answers100_1000, answersSlow;
    std::atomic<uint64_t> auth4Answers0_1, auth4Answers1_10, auth4Answers10_100, auth4Answers100_1000, auth4AnswersSlow;
    std::atomic<uint64_t> auth6Answers0_1, auth6Answers1_10, auth6Answers10_100, auth6Answers100_1000, auth6AnswersSlow;
-  double avgLatencyUsec;
+  std::atomic<uint64_t> ourtime0_1, ourtime1_2, ourtime2_4, ourtime4_8, ourtime8_16, ourtime16_32, ourtimeSlow;
+  double avgLatencyUsec{0};
+  double avgLatencyOursUsec{0};
    std::atomic<uint64_t> qcounter;     // not increased for unauth packets
    std::atomic<uint64_t> ipv6qcounter;
    std::atomic<uint64_t> tcpqcounter;
author	bert hubert <bert.hubert@netherlabs.nl>
	Fri, 6 Oct 2017 11:20:15 +0000 (13:20 +0200)
committer	bert hubert <bert.hubert@netherlabs.nl>
	Fri, 6 Oct 2017 11:20:15 +0000 (13:20 +0200)
pdns/pdns_recursor.cc		patch \| blob \| blame \| history
pdns/rec_channel_rec.cc		patch \| blob \| blame \| history
pdns/recursordist/docs/metrics.rst		patch \| blob \| blame \| history
pdns/syncres.hh		patch \| blob \| blame \| history