git.ipfire.org Git - thirdparty/pdns.git/blobdiff - pdns/dnsdist.hh
dnsdist: Add steal, iowait and UDP errors metrics
[thirdparty/pdns.git] / pdns / dnsdist.hh
index e654b9e1556a1b0db57342e435966cb823052470..496e2dd6df8e9ba3077c8ed96a4640a09cee19cf 100644 (file)
@@ -113,8 +113,8 @@ struct DNSQuestion
 
 struct DNSResponse : DNSQuestion
 {
-  DNSResponse(const DNSName* name, uint16_t type, uint16_t class_, unsigned int consumed, const ComboAddress* lc, const ComboAddress* rem, struct dnsheader* header, size_t bufferSize, uint16_t responseLen, bool isTcp, const struct timespec* queryTime_):
-    DNSQuestion(name, type, class_, consumed, lc, rem, header, bufferSize, responseLen, isTcp, queryTime_) { }
+  DNSResponse(const DNSName* name, uint16_t type, uint16_t class_, unsigned int consumed_, const ComboAddress* lc, const ComboAddress* rem, struct dnsheader* header, size_t bufferSize, uint16_t responseLen, bool isTcp, const struct timespec* queryTime_):
+    DNSQuestion(name, type, class_, consumed_, lc, rem, header, bufferSize, responseLen, isTcp, queryTime_) { }
   DNSResponse(const DNSResponse&) = delete;
   DNSResponse& operator=(const DNSResponse&) = delete;
   DNSResponse(DNSResponse&&) = default;
@@ -290,14 +290,20 @@ struct DNSDistStats
     {"uptime", uptimeOfProcess},
     {"real-memory-usage", getRealMemoryUsage},
     {"special-memory-usage", getSpecialMemoryUsage},
+    {"udp-in-errors", boost::bind(udpErrorStats, "udp-in-errors")},
+    {"udp-noport-errors", boost::bind(udpErrorStats, "udp-noport-errors")},
+    {"udp-recvbuf-errors", boost::bind(udpErrorStats, "udp-recvbuf-errors")},
+    {"udp-sndbuf-errors", boost::bind(udpErrorStats, "udp-sndbuf-errors")},
     {"noncompliant-queries", &nonCompliantQueries},
     {"noncompliant-responses", &nonCompliantResponses},
     {"rdqueries", &rdQueries},
     {"empty-queries", &emptyQueries},
     {"cache-hits", &cacheHits},
     {"cache-misses", &cacheMisses},
-    {"cpu-user-msec", getCPUTimeUser},
+    {"cpu-iowait", getCPUIOWait},
+    {"cpu-steal", getCPUSteal},
     {"cpu-sys-msec", getCPUTimeSystem},
+    {"cpu-user-msec", getCPUTimeUser},
     {"fd-usage", getOpenFileDescriptors},
     {"dyn-blocked", &dynBlocked},
     {"dyn-block-nmg-size", [](const std::string&) { return g_dynblockNMG.getLocal()->size(); }},
@@ -308,98 +314,6 @@ struct DNSDistStats
   };
 };
 
-// Metric types for Prometheus
-enum class PrometheusMetricType: int {
-    counter = 1,
-    gauge = 2
-};
-
-// Keeps additional information about metrics
-struct MetricDefinition {
-  MetricDefinition(PrometheusMetricType _prometheusType, const std::string& _description): description(_description), prometheusType(_prometheusType) {
-  }
-  MetricDefinition() = default;
-
-  // Metric description
-  std::string description;
-  // Metric type for Prometheus
-  PrometheusMetricType prometheusType;
-};
-
-struct MetricDefinitionStorage {
-  // Return metric definition by name
-  bool getMetricDetails(std::string metricName, MetricDefinition& metric) {
-  auto metricDetailsIter = metrics.find(metricName);
-
-  if (metricDetailsIter == metrics.end()) {
-    return false;
-  }
-
-  metric = metricDetailsIter->second;
-    return true;
-  };
-
-  // Return string representation of Prometheus metric type
-  std::string getPrometheusStringMetricType(PrometheusMetricType metricType) {
-    switch (metricType) { 
-      case PrometheusMetricType::counter:
-        return "counter";
-        break;
-      case PrometheusMetricType::gauge:
-        return "gauge";
-        break;
-      default:
-        return "";
-        break;
-    }
-  };
-
-  std::map<std::string, MetricDefinition> metrics = {
-    { "responses",              MetricDefinition(PrometheusMetricType::counter, "Number of responses received from backends") },
-    { "servfail-responses",     MetricDefinition(PrometheusMetricType::counter, "Number of SERVFAIL answers received from backends") },
-    { "queries",                MetricDefinition(PrometheusMetricType::counter, "Number of received queries")},
-    { "frontend-nxdomain",      MetricDefinition(PrometheusMetricType::counter, "Number of NXDomain answers sent to clients")},
-    { "frontend-servfail",      MetricDefinition(PrometheusMetricType::counter, "Number of SERVFAIL answers sent to clients")},
-    { "frontend-noerror",       MetricDefinition(PrometheusMetricType::counter, "Number of NoError answers sent to clients")},
-    { "acl-drops",              MetricDefinition(PrometheusMetricType::counter, "Number of packets dropped because of the ACL")},
-    { "rule-drop",              MetricDefinition(PrometheusMetricType::counter, "Number of queries dropped because of a rule")},
-    { "rule-nxdomain",          MetricDefinition(PrometheusMetricType::counter, "Number of NXDomain answers returned because of a rule")},
-    { "rule-refused",           MetricDefinition(PrometheusMetricType::counter, "Number of Refused answers returned because of a rule")},
-    { "rule-servfail",          MetricDefinition(PrometheusMetricType::counter, "Number of SERVFAIL answers received because of a rule")},
-    { "self-answered",          MetricDefinition(PrometheusMetricType::counter, "Number of self-answered responses")},
-    { "downstream-timeouts",    MetricDefinition(PrometheusMetricType::counter, "Number of queries not answered in time by a backend")},
-    { "downstream-send-errors", MetricDefinition(PrometheusMetricType::counter, "Number of errors when sending a query to a backend")},
-    { "trunc-failures",         MetricDefinition(PrometheusMetricType::counter, "Number of errors encountered while truncating an answer")},
-    { "no-policy",              MetricDefinition(PrometheusMetricType::counter, "Number of queries dropped because no server was available")},
-    { "latency0-1",             MetricDefinition(PrometheusMetricType::counter, "Number of queries answered in less than 1ms")},
-    { "latency1-10",            MetricDefinition(PrometheusMetricType::counter, "Number of queries answered in 1-10 ms")},
-    { "latency10-50",           MetricDefinition(PrometheusMetricType::counter, "Number of queries answered in 10-50 ms")},
-    { "latency50-100",          MetricDefinition(PrometheusMetricType::counter, "Number of queries answered in 50-100 ms")},
-    { "latency100-1000",        MetricDefinition(PrometheusMetricType::counter, "Number of queries answered in 100-1000 ms")},
-    { "latency-slow",           MetricDefinition(PrometheusMetricType::counter, "Number of queries answered in more than 1 second")},
-    { "latency-avg100",         MetricDefinition(PrometheusMetricType::gauge,   "Average response latency in microseconds of the last 100 packets")},
-    { "latency-avg1000",        MetricDefinition(PrometheusMetricType::gauge,   "Average response latency in microseconds of the last 1000 packets")},
-    { "latency-avg10000",       MetricDefinition(PrometheusMetricType::gauge,   "Average response latency in microseconds of the last 10000 packets")},
-    { "latency-avg1000000",     MetricDefinition(PrometheusMetricType::gauge,   "Average response latency in microseconds of the last 1000000 packets")},
-    { "uptime",                 MetricDefinition(PrometheusMetricType::gauge,   "Uptime of the dnsdist process in seconds")},
-    { "real-memory-usage",      MetricDefinition(PrometheusMetricType::gauge,   "Current memory usage in bytes")},
-    { "noncompliant-queries",   MetricDefinition(PrometheusMetricType::counter, "Number of queries dropped as non-compliant")},
-    { "noncompliant-responses", MetricDefinition(PrometheusMetricType::counter, "Number of answers from a backend dropped as non-compliant")},
-    { "rdqueries",              MetricDefinition(PrometheusMetricType::counter, "Number of received queries with the recursion desired bit set")},
-    { "empty-queries",          MetricDefinition(PrometheusMetricType::counter, "Number of empty queries received from clients")},
-    { "cache-hits",             MetricDefinition(PrometheusMetricType::counter, "Number of times an answer was retrieved from cache")},
-    { "cache-misses",           MetricDefinition(PrometheusMetricType::counter, "Number of times an answer not found in the cache")},
-    { "cpu-user-msec",          MetricDefinition(PrometheusMetricType::counter, "Milliseconds spent by dnsdist in the user state")},
-    { "cpu-sys-msec",           MetricDefinition(PrometheusMetricType::counter, "Milliseconds spent by dnsdist in the system state")},
-    { "fd-usage",               MetricDefinition(PrometheusMetricType::gauge,   "Number of currently used file descriptors")},
-    { "dyn-blocked",            MetricDefinition(PrometheusMetricType::counter, "Number of queries dropped because of a dynamic block")},
-    { "dyn-block-nmg-size",     MetricDefinition(PrometheusMetricType::gauge,   "Number of dynamic blocks entries") },
-    { "security-status",        MetricDefinition(PrometheusMetricType::gauge,   "Security status of this software. 0=unknown, 1=OK, 2=upgrade recommended, 3=upgrade mandatory") },
-  };
-};
-
-extern MetricDefinitionStorage g_metricDefinitions;
 extern struct DNSDistStats g_stats;
 void doLatencyStats(double udiff);
 
@@ -655,6 +569,10 @@ struct QueryCount {
   {
     pthread_rwlock_init(&queryLock, nullptr);
   }
+  ~QueryCount()
+  {
+    pthread_rwlock_destroy(&queryLock);
+  }
   QueryCountRecords records;
   QueryCountFilter filter;
   pthread_rwlock_t queryLock;
@@ -665,7 +583,7 @@ extern QueryCount g_qcount;
 
 struct ClientState
 {
-  ClientState(const ComboAddress& local_, bool isTCP, bool doReusePort, int fastOpenQueue, const std::string& itfName, const std::set<int>& cpus_): cpus(cpus_), local(local_), interface(itfName), fastOpenQueueSize(fastOpenQueue), tcp(isTCP), reuseport(doReusePort)
+  ClientState(const ComboAddress& local_, bool isTCP_, bool doReusePort, int fastOpenQueue, const std::string& itfName, const std::set<int>& cpus_): cpus(cpus_), local(local_), interface(itfName), fastOpenQueueSize(fastOpenQueue), tcp(isTCP_), reuseport(doReusePort)
   {
   }
 
@@ -685,7 +603,13 @@ struct ClientState
   std::atomic<uint64_t> tcpCurrentConnections{0};
   std::atomic<uint64_t> tlsNewSessions{0}; // A new TLS session has been negotiated, no resumption
   std::atomic<uint64_t> tlsResumptions{0}; // A TLS session has been resumed, either via session id or via a TLS ticket
-
+  std::atomic<uint64_t> tlsUnknownTicketKey{0}; // A TLS ticket has been presented but we don't have the associated key (might have expired)
+  std::atomic<uint64_t> tlsInactiveTicketKey{0}; // A TLS ticket has been successfully resumed but the key is no longer active, we should issue a new one
+  std::atomic<uint64_t> tls10queries{0};   // valid DNS queries received via TLSv1.0
+  std::atomic<uint64_t> tls11queries{0};   // valid DNS queries received via TLSv1.1
+  std::atomic<uint64_t> tls12queries{0};   // valid DNS queries received via TLSv1.2
+  std::atomic<uint64_t> tls13queries{0};   // valid DNS queries received via TLSv1.3
+  std::atomic<uint64_t> tlsUnknownqueries{0};   // valid DNS queries received via unknown TLS version
   std::atomic<double> tcpAvgQueriesPerConnection{0.0};
   /* in ms */
   std::atomic<double> tcpAvgConnectionDuration{0.0};
@@ -712,6 +636,11 @@ struct ClientState
     return udpFD == -1;
   }
 
+  bool hasTLS() const
+  {
+    return tlsFrontend != nullptr || dohFrontend != nullptr;
+  }
+
   std::string getType() const
   {
     std::string result = udpFD != -1 ? "UDP" : "TCP";
@@ -749,9 +678,9 @@ struct ClientState
   }
 #endif /* HAVE_EBPF */
 
-  void updateTCPMetrics(size_t queries, uint64_t durationMs)
+  void updateTCPMetrics(size_t nbQueries, uint64_t durationMs)
   {
-    tcpAvgQueriesPerConnection = (99.0 * tcpAvgQueriesPerConnection / 100.0) + (queries / 100.0);
+    tcpAvgQueriesPerConnection = (99.0 * tcpAvgQueriesPerConnection / 100.0) + (nbQueries / 100.0);
     tcpAvgConnectionDuration = (99.0 * tcpAvgConnectionDuration / 100.0) + (durationMs / 100.0);
   }
 };
@@ -824,8 +753,8 @@ struct DownstreamState
 {
    typedef std::function<std::tuple<DNSName, uint16_t, uint16_t>(const DNSName&, uint16_t, uint16_t, dnsheader*)> checkfunc_t;
 
-  DownstreamState(const ComboAddress& remote_, const ComboAddress& sourceAddr_, unsigned int sourceItf, size_t numberOfSockets);
-  DownstreamState(const ComboAddress& remote_): DownstreamState(remote_, ComboAddress(), 0, 1) {}
+  DownstreamState(const ComboAddress& remote_, const ComboAddress& sourceAddr_, unsigned int sourceItf, const std::string& sourceItfName, size_t numberOfSockets, bool connect);
+  DownstreamState(const ComboAddress& remote_): DownstreamState(remote_, ComboAddress(), 0, std::string(), 1, true) {}
   ~DownstreamState()
   {
     for (auto& fd : sockets) {
@@ -834,11 +763,13 @@ struct DownstreamState
         fd = -1;
       }
     }
+    pthread_rwlock_destroy(&d_lock);
   }
   boost::uuids::uuid id;
   std::set<unsigned int> hashes;
   mutable pthread_rwlock_t d_lock;
   std::vector<int> sockets;
+  const std::string sourceItfName;
   std::mutex socketsLock;
   std::mutex connectLock;
   std::unique_ptr<FDMultiplexer> mplexer{nullptr};
@@ -943,9 +874,9 @@ struct DownstreamState
   void setId(const boost::uuids::uuid& newId);
   void setWeight(int newWeight);
 
-  void updateTCPMetrics(size_t queries, uint64_t durationMs)
+  void updateTCPMetrics(size_t nbQueries, uint64_t durationMs)
   {
-    tcpAvgQueriesPerConnection = (99.0 * tcpAvgQueriesPerConnection / 100.0) + (queries / 100.0);
+    tcpAvgQueriesPerConnection = (99.0 * tcpAvgQueriesPerConnection / 100.0) + (nbQueries / 100.0);
     tcpAvgConnectionDuration = (99.0 * tcpAvgConnectionDuration / 100.0) + (durationMs / 100.0);
   }
 };
@@ -988,6 +919,10 @@ struct ServerPool
   {
     pthread_rwlock_init(&d_lock, nullptr);
   }
+  ~ServerPool()
+  {
+    pthread_rwlock_destroy(&d_lock);
+  }
 
   const std::shared_ptr<DNSDistPacketCache> getCache() const { return packetCache; };
 
@@ -1137,7 +1072,6 @@ extern size_t g_maxTCPConnectionDuration;
 extern size_t g_maxTCPConnectionsPerClient;
 extern std::atomic<uint16_t> g_cacheCleaningDelay;
 extern std::atomic<uint16_t> g_cacheCleaningPercentage;
-extern bool g_verboseHealthChecks;
 extern uint32_t g_staleCacheEntriesTTL;
 extern bool g_apiReadWrite;
 extern std::string g_apiConfigDirectory;
@@ -1149,6 +1083,7 @@ extern size_t g_udpVectorSize;
 extern bool g_preserveTrailingData;
 extern bool g_allowEmptyResponse;
 extern bool g_roundrobinFailOnNoServer;
+extern double g_consistentHashBalancingFactor;
 
 #ifdef HAVE_EBPF
 extern shared_ptr<BPFFilter> g_defaultBPFFilter;
@@ -1175,7 +1110,6 @@ struct LocalHolders
 struct dnsheader;
 
 void controlThread(int fd, ComboAddress local);
-vector<std::function<void(void)>> setupLua(bool client, const std::string& config);
 std::shared_ptr<ServerPool> getPool(const pools_t& pools, const std::string& poolName);
 std::shared_ptr<ServerPool> createPoolIfNotExists(pools_t& pools, const string& poolName);
 NumberedServerVector getDownstreamCandidates(const pools_t& pools, const std::string& poolName);
@@ -1232,7 +1166,9 @@ extern bool g_snmpTrapsEnabled;
 extern DNSDistSNMPAgent* g_snmpAgent;
 extern bool g_addEDNSToSelfGeneratedResponses;
 
-static const size_t s_udpIncomingBufferSize{1500};
+extern std::set<std::string> g_capabilitiesToRetain;
+static const uint16_t s_udpIncomingBufferSize{1500}; // don't accept UDP queries larger than this value
+static const size_t s_maxPacketCacheEntrySize{4096}; // don't cache responses larger than this value
 
 enum class ProcessQueryResult { Drop, SendAnswer, PassToBackend };
 ProcessQueryResult processQuery(DNSQuestion& dq, ClientState& cs, LocalHolders& holders, std::shared_ptr<DownstreamState>& selectedBackend);