dnsdist: Add mitigations against misbehaving TCP/TLS clients

author Remi Gacogne <remi.gacogne@powerdns.com>

Fri, 28 Mar 2025 14:52:08 +0000 (15:52 +0100)

committer Remi Gacogne <remi.gacogne@powerdns.com>

Mon, 31 Mar 2025 14:19:05 +0000 (16:19 +0200)
author Remi Gacogne <remi.gacogne@powerdns.com>
Fri, 28 Mar 2025 14:52:08 +0000 (15:52 +0100)
committer Remi Gacogne <remi.gacogne@powerdns.com>
Mon, 31 Mar 2025 14:19:05 +0000 (16:19 +0200)
diff --git a/pdns/dnsdistdist/Makefile.am b/pdns/dnsdistdist/Makefile.am

index fe8185fa833a392ea7d47d76f9a59fe8260f017c..c8aafab7a22e5e26d4e1ecbccbcd8186b8b45bdd 100644 (file)
--- a/pdns/dnsdistdist/Makefile.am
+++ b/pdns/dnsdistdist/Makefile.am
@@ -180,7 +180,7 @@ dnsdist_SOURCES = \
         dnsdist-backoff.hh \
         dnsdist-cache.cc dnsdist-cache.hh \
         dnsdist-carbon.cc dnsdist-carbon.hh \
-       dnsdist-concurrent-connections.hh \
+       dnsdist-concurrent-connections.cc dnsdist-concurrent-connections.hh \
         dnsdist-configuration-yaml-internal.hh \
         dnsdist-configuration-yaml.cc dnsdist-configuration-yaml.hh \
         dnsdist-configuration.cc dnsdist-configuration.hh \
@@ -316,7 +316,7 @@ testrunner_SOURCES = \
         dnsdist-backend.cc dnsdist-backend.hh \
         dnsdist-backoff.hh \
         dnsdist-cache.cc dnsdist-cache.hh \
-       dnsdist-concurrent-connections.hh \
+       dnsdist-concurrent-connections.cc dnsdist-concurrent-connections.hh \
         dnsdist-configuration.cc dnsdist-configuration.hh \
         dnsdist-crypto.cc dnsdist-crypto.hh \
         dnsdist-dnsparser.cc dnsdist-dnsparser.hh \
diff --git a/pdns/dnsdistdist/dnsdist-carbon.cc b/pdns/dnsdistdist/dnsdist-carbon.cc

index 596e0eae1091f38cc9d618348dc0b08a4b79c1ca..27c5001ee314c6b636fae9c586d8081e74b610c7 100644 (file)
--- a/pdns/dnsdistdist/dnsdist-carbon.cc
+++ b/pdns/dnsdistdist/dnsdist-carbon.cc
@@ -149,6 +149,7 @@ static bool doOneCarbonExport(const Carbon::Endpoint& endpoint)
        str << base << "tcpmaxconcurrentconnections" << ' ' << front->tcpMaxConcurrentConnections.load() << " " << now << "\r\n";
        str << base << "tcpavgqueriesperconnection" << ' ' << front->tcpAvgQueriesPerConnection.load() << " " << now << "\r\n";
        str << base << "tcpavgconnectionduration" << ' ' << front->tcpAvgConnectionDuration.load() << " " << now << "\r\n";
+      str << base << "tcpavgreadios" << ' ' << front->tcpAvgIOsPerConnection.load() << " " << now << "\r\n";
        str << base << "tls10-queries" << ' ' << front->tls10queries.load() << " " << now << "\r\n";
        str << base << "tls11-queries" << ' ' << front->tls11queries.load() << " " << now << "\r\n";
        str << base << "tls12-queries" << ' ' << front->tls12queries.load() << " " << now << "\r\n";
diff --git a/pdns/dnsdistdist/dnsdist-concurrent-connections.cc b/pdns/dnsdistdist/dnsdist-concurrent-connections.cc

new file mode 100644 (file)

index 0000000..ba191b9
--- /dev/null
+++ b/pdns/dnsdistdist/dnsdist-concurrent-connections.cc
@@ -0,0 +1,329 @@
+/*
+ * This file is part of PowerDNS or dnsdist.
+ * Copyright -- PowerDNS.COM B.V. and its contributors
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * In addition, for the avoidance of any doubt, permission is granted to
+ * link this program with OpenSSL and to (re)distribute the binaries
+ * produced as the result of such linking.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include "dnsdist-concurrent-connections.hh"
+
+#include <boost/multi_index_container.hpp>
+#include <boost/multi_index/ordered_index.hpp>
+#include <boost/multi_index/hashed_index.hpp>
+#include <boost/multi_index/key_extractors.hpp>
+
+#include <utility>
+
+#include "circular_buffer.hh"
+#include "dnsdist-configuration.hh"
+#include "dolog.hh"
+#include "lock.hh"
+
+namespace dnsdist
+{
+
+/* Number of shards the per-client table is split into: getShardID() reduces
+   the hash of a client's key modulo this value to select a shard */
+static constexpr size_t NB_SHARDS = 10;
+
+/* Counters for one accounting bucket of a client's activity. Buckets are
+   created by getCurrentClientActivity(), which sets their end 60 seconds
+   after their creation. That function initializes the members positionally,
+   so their order matters. */
+struct ClientActivity
+{
+  /* TCP connections accounted in this bucket */
+  uint64_t tcpConnections{0};
+  uint64_t tlsNewSessions{0}; /* without resumption */
+  /* incremented by accountTLSResumedSession() */
+  uint64_t tlsResumedSessions{0};
+  /* once this time has passed, a new bucket is started instead of reusing this one */
+  time_t bucketEndTime{0};
+};
+
+/* State kept for one client, or for all the clients sharing the same masked
+   address (see getRange()). The members that are updated through the const
+   references handed out by the container are declared mutable; d_addr and
+   d_lastSeen are index keys of map_t, and d_lastSeen is only updated via modify(). */
+struct ClientEntry
+{
+  /* activity buckets, the most recent one at the front */
+  mutable boost::circular_buffer<ClientActivity> d_activity;
+  /* key of the hashed (AddressTag) index */
+  AddressAndPortRange d_addr;
+  /* incremented for every accounted connection, decremented by accountClosedTCPConnection() */
+  mutable uint64_t d_concurrentConnections{0};
+  /* new connections are refused until this time has passed, 0 means that no ban was set */
+  mutable time_t d_bannedUntil{0};
+  /* key of the ordered (TimeTag) index, used by cleanup() to find stale entries */
+  time_t d_lastSeen{0};
+};
+
+/* Tag naming the map_t index ordered on ClientEntry::d_lastSeen */
+struct TimeTag
+{
+};
+/* Tag naming the map_t index hashed on ClientEntry::d_addr */
+struct AddressTag
+{
+};
+
+/* Container of ClientEntry objects with two indices:
+   - a unique hashed index on the (masked) client address, used for lookups
+   - a non-unique ordered index on the last-seen timestamp, used by cleanup()
+     to walk the entries from the oldest to the most recent one */
+using map_t = boost::multi_index_container<
+  ClientEntry,
+  boost::multi_index::indexed_by<
+    boost::multi_index::hashed_unique<boost::multi_index::tag<AddressTag>,
+                                      boost::multi_index::member<ClientEntry, AddressAndPortRange, &ClientEntry::d_addr>, AddressAndPortRange::hash>,
+    boost::multi_index::ordered_non_unique<boost::multi_index::tag<TimeTag>,
+                                           boost::multi_index::member<ClientEntry, time_t, &ClientEntry::d_lastSeen>>>>;
+
+/* One separately locked table per shard. The size has to match NB_SHARDS since
+   getShardID() returns values in [0, NB_SHARDS) which are then used to index this vector. */
+static std::vector<LockGuarded<map_t>> s_tcpClientsConnectionMetrics(NB_SHARDS);
+
+/* Builds the key under which a client is accounted: its address masked with
+   the configured IPv4 or IPv6 mask. The port mask is only applied to IPv4
+   addresses, IPv6 ones always use a port mask of 0. */
+static AddressAndPortRange getRange(const ComboAddress& from)
+{
+  const auto& config = dnsdist::configuration::getImmutableConfiguration();
+  if (from.isIPv4()) {
+    return AddressAndPortRange(from, config.d_tcpConnectionsMaskV4, config.d_tcpConnectionsMaskV4Port);
+  }
+  return AddressAndPortRange(from, config.d_tcpConnectionsMaskV6, 0);
+}
+
+/* Returns the index, in [0, NB_SHARDS), of the shard holding the entry for this key */
+static size_t getShardID(const AddressAndPortRange& from)
+{
+  const AddressAndPortRange::hash hasher{};
+  return hasher(from) % NB_SHARDS;
+}
+
+/* Tells whether the recorded activity of a client stays within the configured rates.
+   - activity: the activity buckets of the client
+   - now: current time
+   - maxTCPRate, maxTLSNewRate, maxTLSResumedRate: limits, 0 disables the corresponding check
+   - interval: how far back to look, in minutes
+   - isTLS: the TLS limits are only enforced when this is set
+   Returns true if no enforced limit is exceeded, false otherwise.
+   The value compared to each limit is the total over the buckets considered divided
+   by the number of these buckets.
+   NOTE(review): buckets span 60 seconds (see getCurrentClientActivity()) while the console
+   help describes these limits as "per second" -- confirm the intended unit. */
+static bool checkTCPConnectionsRate(const boost::circular_buffer<ClientActivity>& activity, time_t now, uint64_t maxTCPRate, uint64_t maxTLSNewRate, uint64_t maxTLSResumedRate, uint64_t interval, bool isTLS)
+{
+  const bool tcpLimited = maxTCPRate != 0;
+  const bool tlsNewLimited = isTLS && maxTLSNewRate != 0;
+  const bool tlsResumedLimited = isTLS && maxTLSResumedRate != 0;
+  if (!tcpLimited && !tlsNewLimited && !tlsResumedLimited) {
+    /* nothing to enforce */
+    return true;
+  }
+
+  /* buckets that ended before this point in time are ignored */
+  const time_t oldestAccepted = now - (interval * 60);
+  uint64_t buckets = 0;
+  uint64_t tcpTotal = 0;
+  uint64_t tlsNewTotal = 0;
+  uint64_t tlsResumedTotal = 0;
+  for (const auto& bucket : activity) {
+    if (bucket.bucketEndTime >= oldestAccepted) {
+      ++buckets;
+      tcpTotal += bucket.tcpConnections;
+      tlsNewTotal += bucket.tlsNewSessions;
+      tlsResumedTotal += bucket.tlsResumedSessions;
+    }
+  }
+  if (buckets == 0) {
+    /* no recent activity at all */
+    return true;
+  }
+
+  const auto exceeds = [buckets](uint64_t total, uint64_t limit) {
+    return (total / buckets) > limit;
+  };
+  if (tcpLimited && exceeds(tcpTotal, maxTCPRate)) {
+    return false;
+  }
+  if (tlsNewLimited && exceeds(tlsNewTotal, maxTLSNewRate)) {
+    return false;
+  }
+  if (tlsResumedLimited && exceeds(tlsResumedTotal, maxTLSResumedRate)) {
+    return false;
+  }
+  return true;
+}
+
+/* Removes, from every shard, the entries whose last-seen timestamp is older than
+   the configured rate interval (expressed in minutes) relative to `now`.
+   Only d_lastSeen is looked at to decide whether an entry is removed. */
+void IncomingConcurrentTCPConnectionsManager::cleanup(time_t now)
+{
+  const auto intervalInMinutes = dnsdist::configuration::getImmutableConfiguration().d_tcpConnectionsRatePerClientInterval;
+  const time_t cutOff = now - (intervalInMinutes * 60);
+  for (auto& shard : s_tcpClientsConnectionMetrics) {
+    auto table = shard.lock();
+    auto& byLastSeen = table->get<TimeTag>();
+    /* the index is sorted on the timestamp, so the stale entries are all
+       at the beginning: stop at the first one that is recent enough */
+    auto candidate = byLastSeen.begin();
+    while (candidate != byLastSeen.end() && candidate->d_lastSeen < cutOff) {
+      candidate = byLastSeen.erase(candidate);
+    }
+  }
+}
+
+/* Returns the bucket in which the activity happening at `now` should be accounted,
+   starting a new one (ending 60 seconds from now) at the front of the buffer if
+   there is none or if the most recent one is over. The entry is taken by const
+   reference because d_activity is mutable.
+   NOTE(review): a new bucket starts with tcpConnections set to 1, and the caller in
+   accountNewTCPConnection() increments it right after -- confirm this is intended. */
+static ClientActivity& getCurrentClientActivity(const ClientEntry& entry, time_t now)
+{
+  auto& buckets = entry.d_activity;
+  const bool needsNewBucket = buckets.empty() || buckets.front().bucketEndTime < now;
+  if (needsNewBucket) {
+    ClientActivity fresh;
+    fresh.tcpConnections = 1;
+    fresh.bucketEndTime = now + 60;
+    buckets.push_front(fresh);
+  }
+  return buckets.front();
+}
+
+/* Accounts for a new incoming TCP connection from `from` (`isTLS` tells whether the
+   TLS session rates should be enforced as well) and returns how it should be handled:
+   - Allowed: the connection is accepted
+   - Denied: the client is currently banned, has reached the maximum number of
+     concurrent connections, or has exceeded one of the rates, in which case it is
+     also banned for d_tcpBanDurationForExceedingTCPTLSRate seconds
+   - Restricted: the connection is accepted but the client has reached the
+     configured percentage (overload threshold) of its allowed concurrent connections
+   Denied connections are not accounted. The first connection seen from a client
+   creates its entry and is always allowed. When none of the per-client limits is
+   configured nothing is tracked and the connection is allowed. */
+IncomingConcurrentTCPConnectionsManager::NewConnectionResult IncomingConcurrentTCPConnectionsManager::accountNewTCPConnection(const ComboAddress& from, bool isTLS)
+{
+  const auto& immutable = dnsdist::configuration::getImmutableConfiguration();
+  const auto maxConnsPerClient = immutable.d_maxTCPConnectionsPerClient;
+  const auto threshold = immutable.d_tcpConnectionsOverloadThreshold;
+  const auto tcpRate = immutable.d_maxTCPConnectionsRatePerClient;
+  const auto tlsNewRate = immutable.d_maxTLSNewSessionsRatePerClient;
+  const auto tlsResumedRate = immutable.d_maxTLSResumedSessionsRatePerClient;
+  const auto interval = immutable.d_tcpConnectionsRatePerClientInterval;
+  /* the read IOs setting is part of the check because banClientFor() can only
+     ban clients that already have an entry */
+  if (maxConnsPerClient == 0 && tcpRate == 0 && tlsResumedRate == 0 && tlsNewRate == 0 && immutable.d_maxTCPReadIOsPerQuery == 0) {
+    return NewConnectionResult::Allowed;
+  }
+
+  auto now = time(nullptr);
+  /* applied via modify() since d_lastSeen is the key of the time-ordered index */
+  auto updateActivity = [now](ClientEntry& entry) {
+    ++entry.d_concurrentConnections;
+    entry.d_lastSeen = now;
+    auto& activity = getCurrentClientActivity(entry, now);
+    ++activity.tcpConnections;
+  };
+
+  /* only called synchronously below, so `from` and `immutable` can safely be captured by reference */
+  auto checkConnectionAllowed = [now, &from, maxConnsPerClient, threshold, tcpRate, tlsNewRate, tlsResumedRate, interval, isTLS, &immutable](const ClientEntry& entry) {
+    if (entry.d_bannedUntil != 0 && entry.d_bannedUntil >= now) {
+      vinfolog("Refusing TCP connection from %s: banned", from.toStringWithPort());
+      return NewConnectionResult::Denied;
+    }
+    if (maxConnsPerClient > 0 && entry.d_concurrentConnections >= maxConnsPerClient) {
+      vinfolog("Refusing TCP connection from %s: too many connections", from.toStringWithPort());
+      return NewConnectionResult::Denied;
+    }
+    if (!checkTCPConnectionsRate(entry.d_activity, now, tcpRate, tlsNewRate, tlsResumedRate, interval, isTLS)) {
+      entry.d_bannedUntil = now + immutable.d_tcpBanDurationForExceedingTCPTLSRate;
+      vinfolog("Banning TCP connections from %s for %d seconds: too many new TCP/TLS connections per second", from.toStringWithPort(), immutable.d_tcpBanDurationForExceedingTCPTLSRate);
+      return NewConnectionResult::Denied;
+    }
+
+    if (maxConnsPerClient == 0 || threshold == 0) {
+      return NewConnectionResult::Allowed;
+    }
+
+    /* percentage of the allowed concurrent connections currently in use */
+    auto current = (100 * entry.d_concurrentConnections) / maxConnsPerClient;
+    if (current < threshold) {
+      return NewConnectionResult::Allowed;
+    }
+    vinfolog("Restricting TCP connection from %s: nearly reaching the maximum number of concurrent TCP connections", from.toStringWithPort());
+    return NewConnectionResult::Restricted;
+  };
+
+  auto addr = getRange(from);
+  {
+    auto shardID = getShardID(addr);
+    auto db = s_tcpClientsConnectionMetrics.at(shardID).lock();
+    const auto& entry = db->find(addr);
+    if (entry == db->end()) {
+      ClientEntry newEntry;
+      /* a circular buffer with a capacity of zero silently discards insertions,
+         after which getCurrentClientActivity() would call front() on an empty
+         buffer: always keep room for at least one bucket, even if the interval
+         has been set to 0 */
+      newEntry.d_activity.set_capacity(interval > 0 ? interval : 1);
+      newEntry.d_addr = addr;
+      newEntry.d_concurrentConnections = 1;
+      newEntry.d_lastSeen = now;
+      db->insert(std::move(newEntry));
+      return NewConnectionResult::Allowed;
+    }
+    auto result = checkConnectionAllowed(*entry);
+    if (result != NewConnectionResult::Denied) {
+      db->modify(entry, updateActivity);
+    }
+    return result;
+  }
+}
+
+/* Tells whether the number of concurrent connections currently accounted for this
+   client, expressed as a percentage of the per-client maximum, has reached the
+   overload threshold. Returns false if either setting is 0 or if the client has no entry. */
+bool IncomingConcurrentTCPConnectionsManager::isClientOverThreshold(const ComboAddress& from)
+{
+  const auto& config = dnsdist::configuration::getImmutableConfiguration();
+  const auto limit = config.d_maxTCPConnectionsPerClient;
+  if (limit == 0 || config.d_tcpConnectionsOverloadThreshold == 0) {
+    return false;
+  }
+
+  const auto key = getRange(from);
+  const auto shardID = getShardID(key);
+  size_t openConnections = 0;
+  {
+    /* only hold the lock for the time needed to read the counter */
+    auto table = s_tcpClientsConnectionMetrics.at(shardID).lock();
+    const auto found = table->find(key);
+    if (found == table->end()) {
+      return false;
+    }
+    openConnections = found->d_concurrentConnections;
+  }
+
+  const auto usedPercentage = (100 * openConnections) / limit;
+  return usedPercentage >= config.d_tcpConnectionsOverloadThreshold;
+}
+
+/* Bans the client (or range) `from` belongs to until `now` + `seconds`, and
+   refreshes its last-seen timestamp. Nothing is done, and nothing is logged,
+   if there is no existing entry for this client. */
+void IncomingConcurrentTCPConnectionsManager::banClientFor(const ComboAddress& from, time_t now, uint32_t seconds)
+{
+  const auto key = getRange(from);
+  const auto shardID = getShardID(key);
+  {
+    auto table = s_tcpClientsConnectionMetrics.at(shardID).lock();
+    const auto found = table->find(key);
+    if (found == table->end()) {
+      return;
+    }
+    /* d_lastSeen is an index key, hence the use of modify() */
+    const auto applyBan = [now, seconds](ClientEntry& entry) {
+      entry.d_lastSeen = now;
+      entry.d_bannedUntil = now + seconds;
+    };
+    table->modify(found, applyBan);
+  }
+  vinfolog("Banned TCP client %s for %d seconds", from.toStringWithPort(), seconds);
+}
+
+/* Accounts for the closing of a TCP connection from `from` by decrementing the
+   number of concurrent connections of the corresponding entry. Nothing is done
+   when the number of connections per client is not limited, or when there is
+   no entry for this client. */
+void IncomingConcurrentTCPConnectionsManager::accountClosedTCPConnection(const ComboAddress& from)
+{
+  const auto maxConnsPerClient = dnsdist::configuration::getImmutableConfiguration().d_maxTCPConnectionsPerClient;
+  if (maxConnsPerClient == 0) {
+    return;
+  }
+  auto addr = getRange(from);
+  auto shardID = getShardID(addr);
+  {
+    auto db = s_tcpClientsConnectionMetrics.at(shardID).lock();
+    auto it = db->find(addr);
+    if (it == db->end()) {
+      return;
+    }
+    /* d_concurrentConnections is mutable, so it can be updated in place */
+    auto& count = it->d_concurrentConnections;
+    /* cleanup() removes entries based on their last-seen timestamp only, so an
+       entry can be removed while connections are still open and then be
+       re-created by a later connection: the counter may thus be lower than the
+       number of connections that will eventually be closed. Never let it wrap
+       around, as the resulting huge value would get all new connections denied. */
+    if (count > 0) {
+      --count;
+    }
+  }
+}
+
+/* Records a new (not resumed) TLS session in the current activity bucket of the
+   client. Nothing is done when the rate of new TLS sessions is not limited, or
+   when there is no entry for this client. */
+void IncomingConcurrentTCPConnectionsManager::accountTLSNewSession(const ComboAddress& from)
+{
+  const bool rateLimitEnabled = dnsdist::configuration::getImmutableConfiguration().d_maxTLSNewSessionsRatePerClient > 0;
+  if (!rateLimitEnabled) {
+    return;
+  }
+  const auto key = getRange(from);
+  const auto shardID = getShardID(key);
+  auto table = s_tcpClientsConnectionMetrics.at(shardID).lock();
+  const auto found = table->find(key);
+  if (found == table->end()) {
+    return;
+  }
+  ++getCurrentClientActivity(*found, time(nullptr)).tlsNewSessions;
+}
+
+/* Records a resumed TLS session in the current activity bucket of the client.
+   Nothing is done when the rate of resumed TLS sessions is not limited, or when
+   there is no entry for this client. */
+void IncomingConcurrentTCPConnectionsManager::accountTLSResumedSession(const ComboAddress& from)
+{
+  const bool rateLimitEnabled = dnsdist::configuration::getImmutableConfiguration().d_maxTLSResumedSessionsRatePerClient > 0;
+  if (!rateLimitEnabled) {
+    return;
+  }
+  const auto key = getRange(from);
+  const auto shardID = getShardID(key);
+  auto table = s_tcpClientsConnectionMetrics.at(shardID).lock();
+  const auto found = table->find(key);
+  if (found == table->end()) {
+    return;
+  }
+  ++getCurrentClientActivity(*found, time(nullptr)).tlsResumedSessions;
+}
+
+}
diff --git a/pdns/dnsdistdist/dnsdist-concurrent-connections.hh b/pdns/dnsdistdist/dnsdist-concurrent-connections.hh

index 9827bbc33fecc90b65e5b8d6252f8234946ddd53..a77b2fa3a568a430a0928af4a481088d90db3af9 100644 (file)
--- a/pdns/dnsdistdist/dnsdist-concurrent-connections.hh
+++ b/pdns/dnsdistdist/dnsdist-concurrent-connections.hh
@@ -21,47 +21,25 @@
   */
  #pragma once
  
-#include <map>
  #include "iputils.hh"
-#include "lock.hh"
-#include "dnsdist-configuration.hh"
  
  namespace dnsdist
  {
  class IncomingConcurrentTCPConnectionsManager
  {
  public:
-  static bool accountNewTCPConnection(const ComboAddress& from)
+  /* Outcome of accountNewTCPConnection() */
+  enum class NewConnectionResult : uint8_t
    {
-    const auto maxConnsPerClient = dnsdist::configuration::getImmutableConfiguration().d_maxTCPConnectionsPerClient;
-    if (maxConnsPerClient == 0) {
-      return true;
-    }
-    auto db = s_tcpClientsConcurrentConnectionsCount.lock();
-    auto& count = (*db)[from];
-    if (count >= maxConnsPerClient) {
-      return false;
-    }
-    ++count;
-    return true;
-  }
-
-  static void accountClosedTCPConnection(const ComboAddress& from)
-  {
-    const auto maxConnsPerClient = dnsdist::configuration::getImmutableConfiguration().d_maxTCPConnectionsPerClient;
-    if (maxConnsPerClient == 0) {
-      return;
-    }
-    auto db = s_tcpClientsConcurrentConnectionsCount.lock();
-    auto& count = db->at(from);
-    count--;
-    if (count == 0) {
-      db->erase(from);
-    }
-  }
-
-private:
-  static LockGuarded<std::map<ComboAddress, size_t, ComboAddress::addressOnlyLessThan>> s_tcpClientsConcurrentConnectionsCount;
+    /* the connection is accepted */
+    Allowed = 0,
+    /* the connection is refused: client banned, too many concurrent connections or rate exceeded */
+    Denied = 1,
+    /* the connection is accepted, but the client has reached the overload threshold */
+    Restricted = 2,
+  };
+  /* accounts for a new connection from this client and tells how it should be handled */
+  static NewConnectionResult accountNewTCPConnection(const ComboAddress& from, bool isTLS);
+  /* whether this client has reached the overload threshold of concurrent connections */
+  static bool isClientOverThreshold(const ComboAddress& from);
+  /* records a new (not resumed) TLS session for this client */
+  static void accountTLSNewSession(const ComboAddress& from);
+  /* records a resumed TLS session for this client */
+  static void accountTLSResumedSession(const ComboAddress& from);
+  /* accounts for the closing of a connection from this client */
+  static void accountClosedTCPConnection(const ComboAddress& from);
+  /* refuses new connections from this client until now + seconds, if the client is already known */
+  static void banClientFor(const ComboAddress& from, time_t now, uint32_t seconds);
+  /* removes the entries of clients that have not been seen recently */
+  static void cleanup(time_t now);
  };
-
  }
diff --git a/pdns/dnsdistdist/dnsdist-configuration.hh b/pdns/dnsdistdist/dnsdist-configuration.hh

index 435cc0964db946330cb651ce0aa5c7b996521630..29903199d26bed3b993792948d6f19b470184210 100644 (file)
--- a/pdns/dnsdistdist/dnsdist-configuration.hh
+++ b/pdns/dnsdistdist/dnsdist-configuration.hh
@@ -80,6 +80,10 @@ struct ImmutableConfiguration
    uint64_t d_outgoingDoHMaxIdlePerBackend{10};
    uint64_t d_outgoingTCPMaxIdlePerBackend{10};
    uint64_t d_maxTCPClientThreads{10};
+  uint64_t d_maxTCPConnectionsRatePerClient{0};
+  uint64_t d_maxTLSResumedSessionsRatePerClient{0};
+  uint64_t d_maxTLSNewSessionsRatePerClient{0};
+  uint64_t d_tcpConnectionsRatePerClientInterval{5};
    size_t d_maxTCPConnectionsPerClient{0};
    size_t d_udpVectorSize{1};
    size_t d_ringsCapacity{10000};
@@ -88,8 +92,15 @@ struct ImmutableConfiguration
    uint32_t d_socketUDPSendBuffer{0};
    uint32_t d_socketUDPRecvBuffer{0};
    uint32_t d_hashPerturbation{0};
+  uint32_t d_maxTCPReadIOsPerQuery{50};
+  uint32_t d_tcpBanDurationForExceedingMaxReadIOsPerQuery{60};
+  uint32_t d_tcpBanDurationForExceedingTCPTLSRate{10};
    uint16_t d_maxUDPOutstanding{std::numeric_limits<uint16_t>::max()};
    uint8_t d_udpTimeout{2};
+  uint8_t d_tcpConnectionsOverloadThreshold{90};
+  uint8_t d_tcpConnectionsMaskV4{32};
+  uint8_t d_tcpConnectionsMaskV6{128};
+  uint8_t d_tcpConnectionsMaskV4Port{0};
    bool d_randomizeUDPSocketsToBackend{false};
    bool d_randomizeIDsToBackend{false};
    bool d_ringsRecordQueries{true};
diff --git a/pdns/dnsdistdist/dnsdist-console.cc b/pdns/dnsdistdist/dnsdist-console.cc

index dcdd436cb75dde38689fc4d270c67bbc1830de3b..3f1ac4a08bc5e76eb63de39f0ad112d37cabb791 100644 (file)
--- a/pdns/dnsdistdist/dnsdist-console.cc
+++ b/pdns/dnsdistdist/dnsdist-console.cc
@@ -706,6 +706,8 @@ static const std::vector<dnsdist::console::ConsoleKeyword> s_consoleKeywords{
    {"setAddEDNSToSelfGeneratedResponses", true, "add", "set whether to add EDNS to self-generated responses, provided that the initial query had EDNS"},
    {"setAllowEmptyResponse", true, "allow", "Set to true (defaults to false) to allow empty responses (qdcount=0) with a NoError or NXDomain rcode (default) from backends"},
    {"setAPIWritable", true, "bool, dir", "allow modifications via the API. if `dir` is set, it must be a valid directory where the configuration files will be written by the API"},
+  {"setBanDurationForExceedingMaxReadIOsPerQuery", true, "n", "Set for how long, in seconds, a client (or range) will be prevented from opening a new TCP connection when it has exceeded the maximum number of read IOs per query over a TCP connection"},
+  {"setBanDurationForExceedingTCPTLSRate", true, "n", "Set for how long, in seconds, a client (or range) will be prevented from opening a new TCP connection when it has exceeded the TCP connection or TLS session rates"},
    {"setCacheCleaningDelay", true, "num", "Set the interval in seconds between two runs of the cache cleaning algorithm, removing expired entries"},
    {"setCacheCleaningPercentage", true, "num", "Set the percentage of the cache that the cache cleaning algorithm will try to free by removing expired entries. By default (100), all expired entries are remove"},
    {"setConsistentHashingBalancingFactor", true, "factor", "Set the balancing factor for bounded-load consistent hashing"},
@@ -728,9 +730,13 @@ static const std::vector<dnsdist::console::ConsoleKeyword> s_consoleKeywords{
    {"setMaxCachedTCPConnectionsPerDownstream", true, "max", "Set the maximum number of inactive TCP connections to a backend cached by each worker TCP thread"},
    {"setMaxTCPClientThreads", true, "n", "set the maximum of TCP client threads, handling TCP connections"},
    {"setMaxTCPConnectionDuration", true, "n", "set the maximum duration of an incoming TCP connection, in seconds. 0 means unlimited"},
+  {"setMaxTCPConnectionRatePerClient", true, "n", "set the maximum number of new TCP connections that a given client can open per second"},
    {"setMaxTCPConnectionsPerClient", true, "n", "set the maximum number of TCP connections per client. 0 means unlimited"},
    {"setMaxTCPQueriesPerConnection", true, "n", "set the maximum number of queries in an incoming TCP connection. 0 means unlimited"},
    {"setMaxTCPQueuedConnections", true, "n", "set the maximum number of TCP connections queued (waiting to be picked up by a client thread)"},
+  {"setMaxTCPReadIOsPerQuery", true, "n", "set the maximum number of read events needed to receive a new query on a TCP connection"},
+  {"setMaxTLSNewSessionRatePerClient", true, "n", "set the maximum number of new TLS sessions that a given client can open per second"},
+  {"setMaxTLSResumedSessionRatePerClient", true, "n", "set the maximum number of resumed TLS sessions that a given client can open per second"},
    {"setMaxUDPOutstanding", true, "n", "set the maximum number of outstanding UDP queries to a given backend server. This can only be set at configuration time and defaults to 65535"},
    {"setMetric", true, "name, value", "Set the value of a custom metric to the supplied value"},
    {"setPayloadSizeOnSelfGeneratedAnswers", true, "payloadSize", "set the UDP payload size advertised via EDNS on self-generated responses"},
@@ -759,9 +765,15 @@ static const std::vector<dnsdist::console::ConsoleKeyword> s_consoleKeywords{
    {"setStaleCacheEntriesTTL", true, "n", "allows using cache entries expired for at most n seconds when there is no backend available to answer for a query"},
    {"setStructuredLogging", true, "value [, options]", "set whether log messages should be in structured-logging-like format"},
    {"setSyslogFacility", true, "facility", "set the syslog logging facility to 'facility'. Defaults to LOG_DAEMON"},
+  {"setTCPConnectionsMaskV4", true, "n", "Mask to apply to IPv4 addresses when enforcing the TCP connection or TLS sessions rates"},
+  {"setTCPConnectionsMaskV4Port", true, "n", "Mask to apply to the port when enforcing the TCP connection or TLS sessions rates for IPv4 addresses"},
+  {"setTCPConnectionsMaskV6", true, "n", "Mask to apply to IPv6 addresses when enforcing the TCP connection or TLS sessions rates"},
+  {"setTCPConnectionRateInterval", true, "n", "Set the interval, in minutes, over which new TCP and TLS per client connection rates are computed"},
    {"setTCPDownstreamCleanupInterval", true, "interval", "minimum interval in seconds between two cleanups of the idle TCP downstream connections"},
-  {"setTCPFastOpenKey", true, "string", "TCP Fast Open Key"},
    {"setTCPDownstreamMaxIdleTime", true, "time", "Maximum time in seconds that a downstream TCP connection to a backend might stay idle"},
+  {"setTCPConnectionsOverloadThreshold", true, "n", "Set a threshold as a percentage to the maximum number of incoming TCP connections per frontend or per client. When this threshold is reached, new incoming TCP connections are restricted: only query per connection is allowed (no out-of-order processing, no idle time allowed), the receive timeout is reduced to 500 milliseconds and the total duration of the TCP connection is limited to 5 seconds"},
+  {"setTCPFastOpenKey", true, "string", "TCP Fast Open Key"},
    {"setTCPInternalPipeBufferSize", true, "size", "Set the size in bytes of the internal buffer of the pipes used internally to distribute connections to TCP (and DoT) workers threads"},
    {"setTCPRecvTimeout", true, "n", "set the read timeout on TCP connections from the client, in seconds"},
    {"setTCPSendTimeout", true, "n", "set the write timeout on TCP connections from the client, in seconds"},
diff --git a/pdns/dnsdistdist/dnsdist-lua-configuration-items.cc b/pdns/dnsdistdist/dnsdist-lua-configuration-items.cc

index 6c07c07a10e860871c2568037bb2c78fc02dcf46..546f027b20c1512328e4304ca9209cd5169cec55 100644 (file)
--- a/pdns/dnsdistdist/dnsdist-lua-configuration-items.cc
+++ b/pdns/dnsdistdist/dnsdist-lua-configuration-items.cc
@@ -149,6 +149,18 @@ static const std::map<std::string, UnsignedIntegerImmutableConfigurationItems> s
    {"setUDPTimeout", {[](dnsdist::configuration::ImmutableConfiguration& config, uint64_t newValue) { config.d_udpTimeout = newValue; }, std::numeric_limits<uint8_t>::max()}},
    {"setConsoleMaximumConcurrentConnections", {[](dnsdist::configuration::ImmutableConfiguration& config, uint64_t newValue) { config.d_consoleMaxConcurrentConnections = newValue; }, std::numeric_limits<uint32_t>::max()}},
    {"setRingBuffersLockRetries", {[](dnsdist::configuration::ImmutableConfiguration& config, uint64_t newValue) { config.d_ringsNbLockTries = newValue; }, std::numeric_limits<uint64_t>::max()}},
+  {"setMaxTCPConnectionRatePerClient", {[](dnsdist::configuration::ImmutableConfiguration& config, uint64_t newValue) { config.d_maxTCPConnectionsRatePerClient = newValue; }, std::numeric_limits<uint64_t>::max()}},
+  {"setMaxTLSResumedSessionRatePerClient", {[](dnsdist::configuration::ImmutableConfiguration& config, uint64_t newValue) { config.d_maxTLSResumedSessionsRatePerClient = newValue; }, std::numeric_limits<uint64_t>::max()}},
+  {"setMaxTLSNewSessionRatePerClient", {[](dnsdist::configuration::ImmutableConfiguration& config, uint64_t newValue) { config.d_maxTLSNewSessionsRatePerClient = newValue; }, std::numeric_limits<uint64_t>::max()}},
+  {"setTCPConnectionRateInterval", {[](dnsdist::configuration::ImmutableConfiguration& config, uint64_t newValue) { config.d_tcpConnectionsRatePerClientInterval = newValue; }, std::numeric_limits<uint64_t>::max()}},
+  {"setMaxTCPReadIOsPerQuery", {[](dnsdist::configuration::ImmutableConfiguration& config, uint64_t newValue) { config.d_maxTCPReadIOsPerQuery = newValue; }, std::numeric_limits<uint32_t>::max()}},
+  {"setBanDurationForExceedingMaxReadIOsPerQuery", {[](dnsdist::configuration::ImmutableConfiguration& config, uint64_t newValue) { config.d_tcpBanDurationForExceedingMaxReadIOsPerQuery = newValue; }, std::numeric_limits<uint32_t>::max()}},
+  {"setBanDurationForExceedingTCPTLSRate", {[](dnsdist::configuration::ImmutableConfiguration& config, uint64_t newValue) { config.d_tcpBanDurationForExceedingTCPTLSRate = newValue; }, std::numeric_limits<uint32_t>::max()}},
+  {"setTCPConnectionsMaskV4", {[](dnsdist::configuration::ImmutableConfiguration& config, uint64_t newValue) { config.d_tcpConnectionsMaskV4 = newValue; }, std::numeric_limits<uint8_t>::max()}},
+  {"setTCPConnectionsMaskV6", {[](dnsdist::configuration::ImmutableConfiguration& config, uint64_t newValue) { config.d_tcpConnectionsMaskV6 = newValue; }, std::numeric_limits<uint8_t>::max()}},
+  {"setTCPConnectionsMaskV4Port", {[](dnsdist::configuration::ImmutableConfiguration& config, uint64_t newValue) { config.d_tcpConnectionsMaskV4Port = newValue; }, std::numeric_limits<uint8_t>::max()}},
+  /* this is a percentage, hence the maximum of 100. A std::map only keeps one entry per key, so this setting must be listed exactly once for that maximum to be enforced */
+  {"setTCPConnectionsOverloadThreshold", {[](dnsdist::configuration::ImmutableConfiguration& config, uint64_t newValue) { config.d_tcpConnectionsOverloadThreshold = newValue; }, 100}},
  };
  
  static const std::map<std::string, DoubleImmutableConfigurationItems> s_doubleImmutableConfigItems{
diff --git a/pdns/dnsdistdist/dnsdist-lua-inspection.cc b/pdns/dnsdistdist/dnsdist-lua-inspection.cc

index 95be35ad37cbf77a25d660ece66668deee1cbdf9..ceaa4ba917bdee798cc30788b582431f4f656349 100644 (file)
--- a/pdns/dnsdistdist/dnsdist-lua-inspection.cc
+++ b/pdns/dnsdistdist/dnsdist-lua-inspection.cc
@@ -738,12 +738,12 @@ void setupLuaInspection(LuaContext& luaCtx)
      ret << endl;
  
      ret << "Frontends:" << endl;
-    fmt = boost::format("%-3d %-20.20s %-20d %-20d %-20d %-25d %-20d %-20d %-20d %-20f %-20f %-20d %-20d %-25d %-25d %-15d %-15d %-15d %-15d %-15d");
-    ret << (fmt % "#" % "Address" % "Connections" % "Max concurrent conn" % "Died reading query" % "Died sending response" % "Gave up" % "Client timeouts" % "Downstream timeouts" % "Avg queries/conn" % "Avg duration" % "TLS new sessions" % "TLS Resumptions" % "TLS unknown ticket keys" % "TLS inactive ticket keys" % "TLS 1.0" % "TLS 1.1" % "TLS 1.2" % "TLS 1.3" % "TLS other") << endl;
+    fmt = boost::format("%-3d %-20.20s %-20d %-20d %-20d %-25d %-20d %-20d %-20d %-20f %-20f %-20d %-20d %-25d %-25d %-15d %-15d %-15d %-15d %-15d %-15d");
+    ret << (fmt % "#" % "Address" % "Connections" % "Max concurrent conn" % "Died reading query" % "Died sending response" % "Gave up" % "Client timeouts" % "Downstream timeouts" % "Avg queries/conn" % "Avg duration" % "Avg read IOs/conn" % "TLS new sessions" % "TLS Resumptions" % "TLS unknown ticket keys" % "TLS inactive ticket keys" % "TLS 1.0" % "TLS 1.1" % "TLS 1.2" % "TLS 1.3" % "TLS other") << endl;
  
      size_t counter = 0;
      for (const auto& frontend : dnsdist::getFrontends()) {
-      ret << (fmt % counter % frontend->local.toStringWithPort() % frontend->tcpCurrentConnections % frontend->tcpMaxConcurrentConnections % frontend->tcpDiedReadingQuery % frontend->tcpDiedSendingResponse % frontend->tcpGaveUp % frontend->tcpClientTimeouts % frontend->tcpDownstreamTimeouts % frontend->tcpAvgQueriesPerConnection % frontend->tcpAvgConnectionDuration % frontend->tlsNewSessions % frontend->tlsResumptions % frontend->tlsUnknownTicketKey % frontend->tlsInactiveTicketKey % frontend->tls10queries % frontend->tls11queries % frontend->tls12queries % frontend->tls13queries % frontend->tlsUnknownqueries) << endl;
+      ret << (fmt % counter % frontend->local.toStringWithPort() % frontend->tcpCurrentConnections % frontend->tcpMaxConcurrentConnections % frontend->tcpDiedReadingQuery % frontend->tcpDiedSendingResponse % frontend->tcpGaveUp % frontend->tcpClientTimeouts % frontend->tcpDownstreamTimeouts % frontend->tcpAvgQueriesPerConnection % frontend->tcpAvgConnectionDuration % frontend->tcpAvgIOsPerConnection % frontend->tlsNewSessions % frontend->tlsResumptions % frontend->tlsUnknownTicketKey % frontend->tlsInactiveTicketKey % frontend->tls10queries % frontend->tls11queries % frontend->tls12queries % frontend->tls13queries % frontend->tlsUnknownqueries) << endl;
        ++counter;
      }
      ret << endl;
diff --git a/pdns/dnsdistdist/dnsdist-nghttp2-in.cc b/pdns/dnsdistdist/dnsdist-nghttp2-in.cc

index 8c92b17eea958bf8111b9ccf476dd3bede79b304..d7ff12bf4a5ff4c8668199937a8a8e625e8b4fbf 100644 (file)
--- a/pdns/dnsdistdist/dnsdist-nghttp2-in.cc
+++ b/pdns/dnsdistdist/dnsdist-nghttp2-in.cc
@@ -286,7 +286,8 @@ bool IncomingHTTP2Connection::checkALPN()
  void IncomingHTTP2Connection::handleConnectionReady()
  {
    constexpr std::array<nghttp2_settings_entry, 1> settings{{{NGHTTP2_SETTINGS_MAX_CONCURRENT_STREAMS, 100U}}};
-  auto ret = nghttp2_submit_settings(d_session.get(), NGHTTP2_FLAG_NONE, settings.data(), settings.size());
+  constexpr std::array<nghttp2_settings_entry, 1> nearLimitsSettings{{{NGHTTP2_SETTINGS_MAX_CONCURRENT_STREAMS, 1U}}};
+  auto ret = nghttp2_submit_settings(d_session.get(), NGHTTP2_FLAG_NONE, isNearTCPLimits() ? nearLimitsSettings.data() : settings.data(), isNearTCPLimits() ? nearLimitsSettings.size() : settings.size());
    if (ret != 0) {
      throw std::runtime_error("Fatal error: " + std::string(nghttp2_strerror(ret)));
    }
diff --git a/pdns/dnsdistdist/dnsdist-rust-lib/dnsdist-configuration-yaml-items-generated.cc b/pdns/dnsdistdist/dnsdist-rust-lib/dnsdist-configuration-yaml-items-generated.cc

index ddbea89ed4946b8ff60c2613cbb3468798d41897..1c03496abc883c07faab28743f4c3290930cd197 100644 (file)
--- a/pdns/dnsdistdist/dnsdist-rust-lib/dnsdist-configuration-yaml-items-generated.cc
+++ b/pdns/dnsdistdist/dnsdist-rust-lib/dnsdist-configuration-yaml-items-generated.cc
@@ -166,6 +166,39 @@ void convertImmutableFlatSettingsFromRust(const dnsdist::rust::settings::GlobalC
    if (config.d_maxTCPConnectionsPerClient == 0) {
      config.d_maxTCPConnectionsPerClient = yamlConfig.tuning.tcp.max_connections_per_client;
    }
+  if (config.d_tcpConnectionsOverloadThreshold == 90) {
+    config.d_tcpConnectionsOverloadThreshold = yamlConfig.tuning.tcp.connections_overload_threshold;
+  }
+  if (config.d_maxTCPConnectionsRatePerClient == 0) {
+    config.d_maxTCPConnectionsRatePerClient = yamlConfig.tuning.tcp.max_connection_rate_per_client;
+  }
+  if (config.d_tcpConnectionsRatePerClientInterval == 5) {
+    config.d_tcpConnectionsRatePerClientInterval = yamlConfig.tuning.tcp.connection_rate_interval;
+  }
+  if (config.d_maxTLSNewSessionsRatePerClient == 0) {
+    config.d_maxTLSNewSessionsRatePerClient = yamlConfig.tuning.tcp.max_tls_new_session_rate_per_client;
+  }
+  if (config.d_maxTLSResumedSessionsRatePerClient == 0) {
+    config.d_maxTLSResumedSessionsRatePerClient = yamlConfig.tuning.tcp.max_tls_resumed_session_rate_per_client;
+  }
+  if (config.d_maxTCPReadIOsPerQuery == 50) {
+    config.d_maxTCPReadIOsPerQuery = yamlConfig.tuning.tcp.max_read_ios_per_query;
+  }
+  if (config.d_tcpBanDurationForExceedingMaxReadIOsPerQuery == 60) {
+    config.d_tcpBanDurationForExceedingMaxReadIOsPerQuery = yamlConfig.tuning.tcp.ban_duration_for_exceeding_max_read_ios_per_query;
+  }
+  if (config.d_tcpBanDurationForExceedingTCPTLSRate == 10) {
+    config.d_tcpBanDurationForExceedingTCPTLSRate = yamlConfig.tuning.tcp.ban_duration_for_exceeding_tcp_tls_rate;
+  }
+  if (config.d_tcpConnectionsMaskV4 == 32) {
+    config.d_tcpConnectionsMaskV4 = yamlConfig.tuning.tcp.connections_mask_v4;
+  }
+  if (config.d_tcpConnectionsMaskV6 == 128) {
+    config.d_tcpConnectionsMaskV6 = yamlConfig.tuning.tcp.connections_mask_v6;
+  }
+  if (config.d_tcpConnectionsMaskV4Port == 0) {
+    config.d_tcpConnectionsMaskV4Port = yamlConfig.tuning.tcp.connections_mask_v4_port;
+  }
    if (config.d_udpVectorSize == 1) {
      config.d_udpVectorSize = yamlConfig.tuning.udp.messages_per_round;
    }
diff --git a/pdns/dnsdistdist/dnsdist-rust-lib/rust/src/lib.rs b/pdns/dnsdistdist/dnsdist-rust-lib/rust/src/lib.rs

index 9452e1fcdc0e056f6e2cb1de35766d696379e5b6..1eb879e7ce4f6b7cf28f007217fca1cafd0d995c 100644 (file)
--- a/pdns/dnsdistdist/dnsdist-rust-lib/rust/src/lib.rs
+++ b/pdns/dnsdistdist/dnsdist-rust-lib/rust/src/lib.rs
@@ -1885,6 +1885,28 @@ mod dnsdistsettings {
          max_connections_per_client: u32,
          #[serde(default, skip_serializing_if = "crate::is_default")]
          fast_open_key: String,
+        #[serde(default = "crate::U8::<90>::value", skip_serializing_if = "crate::U8::<90>::is_equal")]
+        connections_overload_threshold: u8,
+        #[serde(default, skip_serializing_if = "crate::is_default")]
+        max_connection_rate_per_client: u64,
+        #[serde(default = "crate::U64::<5>::value", skip_serializing_if = "crate::U64::<5>::is_equal")]
+        connection_rate_interval: u64,
+        #[serde(default, skip_serializing_if = "crate::is_default")]
+        max_tls_new_session_rate_per_client: u64,
+        #[serde(default, skip_serializing_if = "crate::is_default")]
+        max_tls_resumed_session_rate_per_client: u64,
+        #[serde(default = "crate::U32::<50>::value", skip_serializing_if = "crate::U32::<50>::is_equal")]
+        max_read_ios_per_query: u32,
+        #[serde(default = "crate::U32::<60>::value", skip_serializing_if = "crate::U32::<60>::is_equal")]
+        ban_duration_for_exceeding_max_read_ios_per_query: u32,
+        #[serde(default = "crate::U32::<10>::value", skip_serializing_if = "crate::U32::<10>::is_equal")]
+        ban_duration_for_exceeding_tcp_tls_rate: u32,
+        #[serde(default = "crate::U8::<32>::value", skip_serializing_if = "crate::U8::<32>::is_equal")]
+        connections_mask_v4: u8,
+        #[serde(default = "crate::U8::<128>::value", skip_serializing_if = "crate::U8::<128>::is_equal")]
+        connections_mask_v6: u8,
+        #[serde(default, skip_serializing_if = "crate::is_default")]
+        connections_mask_v4_port: u8,
      }
  
      #[derive(Deserialize, Serialize, Debug, PartialEq)]
diff --git a/pdns/dnsdistdist/dnsdist-settings-definitions.yml b/pdns/dnsdistdist/dnsdist-settings-definitions.yml

index 754ccf0cd868d87080fe3830e4e394e39952e03b..a1f4104ce5559d1ab269c76be72ebccd404be8e6 100644 (file)
--- a/pdns/dnsdistdist/dnsdist-settings-definitions.yml
+++ b/pdns/dnsdistdist/dnsdist-settings-definitions.yml
@@ -676,7 +676,7 @@ dynamic_rules:
      - name: "mask_port"
        type: u8
        default: "0"
-      description: "Number of bits of port to consider over IPv4, for CGNAT deployments. Default is 0 meaning that the port is not taken into account. For example passing ``2`` here, which only makes sense if the IPv4 parameter is set to ``32``, will split a given IPv4 address into four port ranges: ``0-16383``, ``16384-32767``, ``32768-49151`` and ``49152-65535``"
+      description: "Number of bits of the port number to consider over IPv4, for CGNAT deployments. Default is 0 meaning that the port is not taken into account. For example passing ``2`` here, which only makes sense if the IPv4 parameter is set to ``32``, will split a given IPv4 address into four port ranges: ``0-16383``, ``16384-32767``, ``32768-49151`` and ``49152-65535``"
      - name: "exclude_ranges"
        type: "Vec<String>"
        default: ""
@@ -1515,6 +1515,83 @@ tcp_tuning:
        default: ""
        lua-name: "setTCPFastOpenKey"
        runtime-configurable: false
+    - name: "connections_overload_threshold"
+      type: "u8"
+      default: "90"
+      lua-name: "setTCPConnectionsOverloadThreshold"
+      internal-field-name: "d_tcpConnectionsOverloadThreshold"
+      runtime-configurable: false
+      description: "Set a threshold as a percentage of the maximum number of incoming TCP connections per frontend or per client. When this threshold is reached, new incoming TCP connections are restricted: only one query per connection is allowed (no out-of-order processing, no idle time allowed), the receive timeout is reduced to 500 milliseconds and the total duration of the TCP connection is limited to 5 seconds"
+    - name: "max_connection_rate_per_client"
+      type: "u64"
+      default: "0"
+      lua-name: "setMaxTCPConnectionRatePerClient"
+      internal-field-name: "d_maxTCPConnectionsRatePerClient"
+      runtime-configurable: false
+      description: "Set the maximum number of new TCP connections that a given client (see ``connections_mask_v4``, ``connections_mask_v6`` and ``connections_mask_v4_port`` to see how clients can be aggregated) can open, per second, over the last ``connection_rate_interval`` minutes. Clients exceeding this rate will not be able to open new TCP connections for ``ban_duration_for_exceeding_tcp_tls_rate`` seconds. See also ``max_tls_new_session_rate_per_client`` and ``max_tls_resumed_session_rate_per_client``"
+    - name: "connection_rate_interval"
+      type: "u64"
+      default: "5"
+      lua-name: "setTCPConnectionRateInterval"
+      internal-field-name: "d_tcpConnectionsRatePerClientInterval"
+      runtime-configurable: false
+      description: "Set the interval, in minutes, over which new TCP and TLS per client connection rates are computed (see ``max_connection_rate_per_client``, ``max_tls_new_session_rate_per_client`` and ``max_tls_resumed_session_rate_per_client``)"
+    - name: "max_tls_new_session_rate_per_client"
+      type: "u64"
+      default: "0"
+      lua-name: "setMaxTLSNewSessionRatePerClient"
+      internal-field-name: "d_maxTLSNewSessionsRatePerClient"
+      runtime-configurable: false
+      description: "Set the maximum number of new TLS sessions, without resumption, that a given client (see ``connections_mask_v4``, ``connections_mask_v6`` and ``connections_mask_v4_port`` to see how clients can be aggregated) can open, per second, over the last ``connection_rate_interval`` minutes. Clients exceeding this rate will not be able to open new TCP connections for ``ban_duration_for_exceeding_tcp_tls_rate`` seconds. See also ``max_connection_rate_per_client`` and ``max_tls_resumed_session_rate_per_client``"
+    - name: "max_tls_resumed_session_rate_per_client"
+      type: "u64"
+      default: "0"
+      lua-name: "setMaxTLSResumedSessionRatePerClient"
+      internal-field-name: "d_maxTLSResumedSessionsRatePerClient"
+      runtime-configurable: false
+      description: "Set the maximum number of resumed TLS sessions that a given client (see ``connections_mask_v4``, ``connections_mask_v6`` and ``connections_mask_v4_port`` to see how clients can be aggregated) can open, per second, over the last ``connection_rate_interval`` minutes. Clients exceeding this rate will not be able to open new TCP connections for ``ban_duration_for_exceeding_tcp_tls_rate`` seconds. See also ``max_connection_rate_per_client`` and ``max_tls_new_session_rate_per_client``"
+    - name: "max_read_ios_per_query"
+      type: "u32"
+      default: "50"
+      lua-name: "setMaxTCPReadIOsPerQuery"
+      internal-field-name: "d_maxTCPReadIOsPerQuery"
+      runtime-configurable: false
+      description: "Set the maximum number of read events needed to receive a new query on a TCP connection. Usually reading a DNS query over a TCP connection requires two read events, one to read the query size and one to read the query itself. For large queries, on congested networks, a few short reads might occur, increasing the number of read operations needed to read the full query, but if a large number of read events is needed the client might be misbehaving or even actively trying to hurt the server. When this limit is reached, the TCP connection will be terminated and the offending client IP (or range, see ``connections_mask_v4``, ``connections_mask_v6`` and ``connections_mask_v4_port`` to see how clients can be aggregated) will be prevented from opening a new TCP connection for up to ``ban_duration_for_exceeding_max_read_ios_per_query`` seconds"
+    - name: "ban_duration_for_exceeding_max_read_ios_per_query"
+      type: "u32"
+      default: "60"
+      lua-name: "setBanDurationForExceedingMaxReadIOsPerQuery"
+      internal-field-name: "d_tcpBanDurationForExceedingMaxReadIOsPerQuery"
+      runtime-configurable: false
+      description: "Set for how long, in seconds, a client (or range, see ``connections_mask_v4``, ``connections_mask_v6`` and ``connections_mask_v4_port`` to see how clients can be aggregated) will be prevented from opening a new TCP connection when it has exceeded ``max_read_ios_per_query`` over a TCP connection"
+    - name: "ban_duration_for_exceeding_tcp_tls_rate"
+      type: "u32"
+      default: "10"
+      lua-name: "setBanDurationForExceedingTCPTLSRate"
+      internal-field-name: "d_tcpBanDurationForExceedingTCPTLSRate"
+      runtime-configurable: false
+      description: "Set for how long, in seconds, a client (or range, see ``connections_mask_v4``, ``connections_mask_v6`` and ``connections_mask_v4_port`` to see how clients can be aggregated) will be prevented from opening a new TCP connection when it has exceeded ``max_connection_rate_per_client``, ``max_tls_new_session_rate_per_client`` or ``max_tls_resumed_session_rate_per_client``"
+    - name: "connections_mask_v4"
+      type: "u8"
+      default: "32"
+      lua-name: "setTCPConnectionsMaskV4"
+      internal-field-name: "d_tcpConnectionsMaskV4"
+      runtime-configurable: false
+      description: "Mask to apply to IPv4 addresses when enforcing ``max_connection_rate_per_client``, ``max_tls_new_session_rate_per_client`` and ``max_tls_resumed_session_rate_per_client``. In some scenarios it might make sense to apply these settings to a /28 range rather than a single address, for example"
+    - name: "connections_mask_v6"
+      type: "u8"
+      default: "128"
+      lua-name: "setTCPConnectionsMaskV6"
+      internal-field-name: "d_tcpConnectionsMaskV6"
+      runtime-configurable: false
+      description: "Mask to apply to IPv6 addresses when enforcing ``max_connection_rate_per_client``, ``max_tls_new_session_rate_per_client`` and ``max_tls_resumed_session_rate_per_client``. In some scenarios it might make sense to apply these settings to a whole /64 IPv6 range instead of a single address, for example"
+    - name: "connections_mask_v4_port"
+      type: u8
+      default: "0"
+      lua-name: "setTCPConnectionsMaskV4Port"
+      internal-field-name: "d_tcpConnectionsMaskV4Port"
+      runtime-configurable: false
+      description: "Number of bits of the port number to consider when enforcing ``max_connection_rate_per_client``, ``max_tls_new_session_rate_per_client`` and ``max_tls_resumed_session_rate_per_client`` over IPv4, for CGNAT deployments. Default is 0 meaning that the port is not taken into account. For example passing ``2`` here, which only makes sense if ``connections_mask_v4`` is set to ``32``, will split a given IPv4 address into four port ranges: ``0-16383``, ``16384-32767``, ``32768-49151`` and ``49152-65535``"
  
  udp_tuning:
    category: "tuning.udp"
diff --git a/pdns/dnsdistdist/dnsdist-tcp-upstream.hh b/pdns/dnsdistdist/dnsdist-tcp-upstream.hh

index 984f5d03adfa111e466de67439d649f3dcb79715..27497c0cb0423b9f0f81c89bf462c0db2887036a 100644 (file)
--- a/pdns/dnsdistdist/dnsdist-tcp-upstream.hh
+++ b/pdns/dnsdistdist/dnsdist-tcp-upstream.hh
@@ -51,69 +51,9 @@ public:
  
    void resetForNewQuery();
  
-  boost::optional<struct timeval> getClientReadTTD(struct timeval now) const
-  {
-    const auto& runtimeConfiguration = dnsdist::configuration::getCurrentRuntimeConfiguration();
-    if (runtimeConfiguration.d_maxTCPConnectionDuration == 0 && runtimeConfiguration.d_tcpRecvTimeout == 0) {
-      return boost::none;
-    }
-
-    if (runtimeConfiguration.d_maxTCPConnectionDuration > 0) {
-      auto elapsed = now.tv_sec - d_connectionStartTime.tv_sec;
-      if (elapsed < 0 || (static_cast<size_t>(elapsed) >= runtimeConfiguration.d_maxTCPConnectionDuration)) {
-        return now;
-      }
-      auto remaining = runtimeConfiguration.d_maxTCPConnectionDuration - elapsed;
-      if (runtimeConfiguration.d_tcpRecvTimeout == 0 || remaining <= static_cast<size_t>(runtimeConfiguration.d_tcpRecvTimeout)) {
-        now.tv_sec += remaining;
-        return now;
-      }
-    }
-
-    now.tv_sec += runtimeConfiguration.d_tcpRecvTimeout;
-    return now;
-  }
-
-  boost::optional<struct timeval> getClientWriteTTD(const struct timeval& now) const
-  {
-    const auto& runtimeConfiguration = dnsdist::configuration::getCurrentRuntimeConfiguration();
-    if (runtimeConfiguration.d_maxTCPConnectionDuration == 0 && runtimeConfiguration.d_tcpSendTimeout == 0) {
-      return boost::none;
-    }
-
-    timeval res(now);
-
-    if (runtimeConfiguration.d_maxTCPConnectionDuration > 0) {
-      auto elapsed = res.tv_sec - d_connectionStartTime.tv_sec;
-      if (elapsed < 0 || static_cast<size_t>(elapsed) >= runtimeConfiguration.d_maxTCPConnectionDuration) {
-        return res;
-      }
-      auto remaining = runtimeConfiguration.d_maxTCPConnectionDuration - elapsed;
-      if (runtimeConfiguration.d_tcpSendTimeout == 0 || remaining <= static_cast<size_t>(runtimeConfiguration.d_tcpSendTimeout)) {
-        res.tv_sec += remaining;
-        return res;
-      }
-    }
-
-    res.tv_sec += runtimeConfiguration.d_tcpSendTimeout;
-    return res;
-  }
-
-  bool maxConnectionDurationReached(unsigned int maxConnectionDuration, const struct timeval& now)
-  {
-    if (maxConnectionDuration) {
-      time_t curtime = now.tv_sec;
-      unsigned int elapsed = 0;
-      if (curtime > d_connectionStartTime.tv_sec) { // To prevent issues when time goes backward
-        elapsed = curtime - d_connectionStartTime.tv_sec;
-      }
-      if (elapsed >= maxConnectionDuration) {
-        return true;
-      }
-    }
-
-    return false;
-  }
+  boost::optional<timeval> getClientReadTTD(timeval now) const;
+  boost::optional<timeval> getClientWriteTTD(const timeval& now) const;
+  bool maxConnectionDurationReached(unsigned int maxConnectionDuration, const timeval& now) const;
  
    std::shared_ptr<TCPConnectionToBackend> getDownstreamConnection(std::shared_ptr<DownstreamState>& backend, const std::unique_ptr<std::vector<ProxyProtocolValue>>& tlvs, const struct timeval& now);
    void registerOwnedDownstreamConnection(std::shared_ptr<TCPConnectionToBackend>& conn);
@@ -186,6 +126,7 @@ public:
    IOState handleIncomingQueryReceived(const struct timeval& now);
    void handleExceptionDuringIO(const std::exception& exp);
    bool readIncomingQuery(const timeval& now, IOState& iostate);
+  bool isNearTCPLimits() const;
  
    enum class State : uint8_t { starting, doingHandshake, readingProxyProtocolHeader, waitingForQuery, readingQuerySize, readingQuery, sendingResponse, idle /* in case of XFR, we stop processing queries */ };
  
@@ -206,12 +147,14 @@ public:
    std::unique_ptr<IOStateHandler> d_ioState{nullptr};
    std::unique_ptr<std::vector<ProxyProtocolValue>> d_proxyProtocolValues{nullptr};
    TCPClientThreadData& d_threadData;
+  uint64_t d_readIOsTotal{0};
    size_t d_currentPos{0};
    size_t d_proxyProtocolNeed{0};
    size_t d_queriesCount{0};
    size_t d_currentQueriesCount{0};
    std::thread::id d_creatorThreadID;
    uint16_t d_querySize{0};
+  uint16_t d_readIOsCurrentQuery{0};
    State d_state{State::starting};
    bool d_isXFR{false};
    bool d_proxyProtocolPayloadHasTLV{false};
diff --git a/pdns/dnsdistdist/dnsdist-tcp.cc b/pdns/dnsdistdist/dnsdist-tcp.cc

index c0fc71b32276e5d697180200b9db82ab7e37b93f..a172d1c11dc4aae702c5e48ce7fbabc42dceb4b7 100644 (file)
--- a/pdns/dnsdistdist/dnsdist-tcp.cc
+++ b/pdns/dnsdistdist/dnsdist-tcp.cc
@@ -63,8 +63,6 @@
  
  std::atomic<uint64_t> g_tcpStatesDumpRequested{0};
  
-LockGuarded<std::map<ComboAddress, size_t, ComboAddress::addressOnlyLessThan>> dnsdist::IncomingConcurrentTCPConnectionsManager::s_tcpClientsConcurrentConnectionsCount;
-
  IncomingTCPConnectionState::~IncomingTCPConnectionState()
  {
    dnsdist::IncomingConcurrentTCPConnectionsManager::accountClosedTCPConnection(d_ci.remote);
@@ -74,7 +72,7 @@ IncomingTCPConnectionState::~IncomingTCPConnectionState()
      gettimeofday(&now, nullptr);
  
      auto diff = now - d_connectionStartTime;
-    d_ci.cs->updateTCPMetrics(d_queriesCount, diff.tv_sec * 1000 + diff.tv_usec / 1000);
+    d_ci.cs->updateTCPMetrics(d_queriesCount, diff.tv_sec * 1000 + diff.tv_usec / 1000, d_queriesCount > 0 ? d_readIOsTotal / d_queriesCount : d_readIOsTotal);
    }
  
    // would have been done when the object is destroyed anyway,
@@ -125,6 +123,29 @@ static std::pair<std::shared_ptr<TCPConnectionToBackend>, bool> getOwnedDownstre
    return {nullptr, tlvsMismatch};
  }
  
+bool IncomingTCPConnectionState::isNearTCPLimits() const
+{
+  if (d_ci.d_restricted) {
+    return true;
+  }
+
+  const auto tcpConnectionsOverloadThreshold = dnsdist::configuration::getImmutableConfiguration().d_tcpConnectionsOverloadThreshold;
+  if (tcpConnectionsOverloadThreshold == 0) {
+    return false;
+  }
+
+  const auto& clientState = d_ci.cs;
+  if (clientState->d_tcpConcurrentConnectionsLimit > 0) {
+    auto concurrentConnections = clientState->tcpCurrentConnections.load();
+    auto current = (100 * concurrentConnections) / clientState->d_tcpConcurrentConnectionsLimit;
+    if (current >= tcpConnectionsOverloadThreshold) {
+      return true;
+    }
+  }
+
+  return dnsdist::IncomingConcurrentTCPConnectionsManager::isClientOverThreshold(d_ci.remote);
+}
+
  std::shared_ptr<TCPConnectionToBackend> IncomingTCPConnectionState::getDownstreamConnection(std::shared_ptr<DownstreamState>& backend, const std::unique_ptr<std::vector<ProxyProtocolValue>>& tlvs, const struct timeval& now)
  {
    auto [downstream, tlvsMismatch] = getOwnedDownstreamConnection(d_ownedConnectionsToBackend, backend, tlvs);
@@ -264,6 +285,12 @@ bool IncomingTCPConnectionState::canAcceptNewQueries(const struct timeval& now)
      return false;
    }
  
+  if (isNearTCPLimits()) {
+    d_ci.d_restricted = true;
+    DEBUGLOG("not accepting new queries because we already near our TCP limits");
+    return false;
+  }
+
    // for DoH, this is already handled by the underlying library
    if (!d_ci.cs->dohFrontend && d_currentQueriesCount >= d_ci.cs->d_maxInFlightQueriesPerConn) {
      DEBUGLOG("not accepting new queries because we already have " << d_currentQueriesCount << " out of " << d_ci.cs->d_maxInFlightQueriesPerConn);
@@ -290,6 +317,85 @@ void IncomingTCPConnectionState::resetForNewQuery()
    d_currentPos = 0;
    d_querySize = 0;
    d_state = State::waitingForQuery;
+  d_readIOsTotal += d_readIOsCurrentQuery;
+  d_readIOsCurrentQuery = 0;
+}
+
+boost::optional<timeval> IncomingTCPConnectionState::getClientReadTTD(timeval now) const
+{
+  const auto& runtimeConfiguration = dnsdist::configuration::getCurrentRuntimeConfiguration();
+  if (!isNearTCPLimits() && runtimeConfiguration.d_maxTCPConnectionDuration == 0 && runtimeConfiguration.d_tcpRecvTimeout == 0) {
+    return boost::none;
+  }
+
+  size_t maxTCPConnectionDuration = runtimeConfiguration.d_maxTCPConnectionDuration;
+  uint16_t tcpRecvTimeout = runtimeConfiguration.d_tcpRecvTimeout;
+  uint32_t tcpRecvTimeoutUsec = 0U;
+  if (isNearTCPLimits()) {
+    constexpr size_t maxTCPConnectionDurationNearLimits = 5U;
+    constexpr uint32_t tcpRecvTimeoutUsecNearLimits = 500U * 1000U;
+    maxTCPConnectionDuration = runtimeConfiguration.d_maxTCPConnectionDuration != 0 ? std::min(runtimeConfiguration.d_maxTCPConnectionDuration, maxTCPConnectionDurationNearLimits) : maxTCPConnectionDurationNearLimits;
+    tcpRecvTimeout = 0;
+    tcpRecvTimeoutUsec = tcpRecvTimeoutUsecNearLimits;
+  }
+
+  if (maxTCPConnectionDuration > 0) {
+    auto elapsed = now.tv_sec - d_connectionStartTime.tv_sec;
+    if (elapsed < 0 || (static_cast<size_t>(elapsed) >= maxTCPConnectionDuration)) {
+      return now;
+    }
+    auto remaining = maxTCPConnectionDuration - elapsed;
+    if (!isNearTCPLimits() && (runtimeConfiguration.d_tcpRecvTimeout == 0 || remaining <= static_cast<size_t>(runtimeConfiguration.d_tcpRecvTimeout))) {
+      now.tv_sec += static_cast<time_t>(remaining);
+      return now;
+    }
+  }
+
+  now.tv_sec += static_cast<time_t>(tcpRecvTimeout);
+  now.tv_usec += tcpRecvTimeoutUsec;
+  normalizeTV(now);
+  return now;
+}
+
+boost::optional<timeval> IncomingTCPConnectionState::getClientWriteTTD(const timeval& now) const
+{
+  const auto& runtimeConfiguration = dnsdist::configuration::getCurrentRuntimeConfiguration();
+  if (runtimeConfiguration.d_maxTCPConnectionDuration == 0 && runtimeConfiguration.d_tcpSendTimeout == 0) {
+    return boost::none;
+  }
+
+  timeval res(now);
+
+  if (runtimeConfiguration.d_maxTCPConnectionDuration > 0) {
+    auto elapsed = res.tv_sec - d_connectionStartTime.tv_sec;
+    if (elapsed < 0 || static_cast<size_t>(elapsed) >= runtimeConfiguration.d_maxTCPConnectionDuration) {
+      return res;
+    }
+    auto remaining = runtimeConfiguration.d_maxTCPConnectionDuration - elapsed;
+    if (runtimeConfiguration.d_tcpSendTimeout == 0 || remaining <= static_cast<size_t>(runtimeConfiguration.d_tcpSendTimeout)) {
+      res.tv_sec += static_cast<time_t>(remaining);
+      return res;
+    }
+  }
+
+  res.tv_sec += static_cast<time_t>(runtimeConfiguration.d_tcpSendTimeout);
+  return res;
+}
+
+bool IncomingTCPConnectionState::maxConnectionDurationReached(unsigned int maxConnectionDuration, const timeval& now) const
+{
+  if (maxConnectionDuration > 0) {
+    time_t curtime = now.tv_sec;
+    unsigned int elapsed = 0;
+    if (curtime > d_connectionStartTime.tv_sec) { // To prevent issues when time goes backward
+      elapsed = curtime - d_connectionStartTime.tv_sec;
+    }
+    if (elapsed >= maxConnectionDuration) {
+      return true;
+    }
+  }
+
+  return false;
  }
  
  void IncomingTCPConnectionState::registerOwnedDownstreamConnection(std::shared_ptr<TCPConnectionToBackend>& conn)
@@ -892,9 +998,11 @@ void IncomingTCPConnectionState::handleHandshakeDone(const struct timeval& now)
    if (d_handler.isTLS()) {
      if (!d_handler.hasTLSSessionBeenResumed()) {
        ++d_ci.cs->tlsNewSessions;
+      dnsdist::IncomingConcurrentTCPConnectionsManager::accountTLSNewSession(d_ci.remote);
      }
      else {
        ++d_ci.cs->tlsResumptions;
+      dnsdist::IncomingConcurrentTCPConnectionsManager::accountTLSResumedSession(d_ci.remote);
      }
      if (d_handler.getResumedFromInactiveTicketKey()) {
        ++d_ci.cs->tlsInactiveTicketKey;
@@ -1040,6 +1148,7 @@ bool IncomingTCPConnectionState::readIncomingQuery(const timeval& now, IOState&
    if (!d_lastIOBlocked && (d_state == State::waitingForQuery || d_state == State::readingQuerySize)) {
      DEBUGLOG("reading query size");
      d_buffer.resize(sizeof(uint16_t));
+    d_readIOsCurrentQuery++;
      iostate = d_handler.tryRead(d_buffer, d_currentPos, sizeof(uint16_t));
      if (d_currentPos > 0) {
        /* if we got at least one byte, we can't go around sending responses */
@@ -1070,6 +1179,7 @@ bool IncomingTCPConnectionState::readIncomingQuery(const timeval& now, IOState&
  
    if (!d_lastIOBlocked && d_state == State::readingQuery) {
      DEBUGLOG("reading query");
+    d_readIOsCurrentQuery++;
      iostate = d_handler.tryRead(d_buffer, d_currentPos, d_querySize);
      if (iostate == IOState::Done) {
        iostate = handleIncomingQueryReceived(now);
@@ -1101,6 +1211,13 @@ void IncomingTCPConnectionState::handleIO()
        return;
      }
  
+    const auto& immutable = dnsdist::configuration::getImmutableConfiguration();
+    if (d_readIOsCurrentQuery >= immutable.d_maxTCPReadIOsPerQuery) {
+      vinfolog("Terminating TCP connection from %s for reaching the maximum number of read IO events per query (%d)", d_ci.remote.toStringWithPort(), immutable.d_maxTCPReadIOsPerQuery);
+      dnsdist::IncomingConcurrentTCPConnectionsManager::banClientFor(d_ci.remote, time(nullptr), immutable.d_tcpBanDurationForExceedingMaxReadIOsPerQuery);
+      return;
+    }
+
      d_lastIOBlocked = false;
  
      try {
@@ -1566,6 +1683,7 @@ static void tcpClientThread(pdns::channel::Receiver<ConnectionInfo>&& queryRecei
  
        try {
          t_downstreamTCPConnectionsManager.cleanupClosedConnections(now);
+        dnsdist::IncomingConcurrentTCPConnectionsManager::cleanup(time(nullptr));
  
          if (now.tv_sec > lastTimeoutScan) {
            lastTimeoutScan = now.tv_sec;
@@ -1642,11 +1760,14 @@ static void acceptNewConnection(const TCPAcceptorParam& param, TCPClientThreadDa
        return;
      }
  
-    if (!dnsdist::IncomingConcurrentTCPConnectionsManager::accountNewTCPConnection(remote)) {
-      vinfolog("Dropping TCP connection from %s because we have too many from this client already", remote.toStringWithPort());
+    auto connectionResult = dnsdist::IncomingConcurrentTCPConnectionsManager::accountNewTCPConnection(remote, connInfo.cs->hasTLS());
+    if (connectionResult == dnsdist::IncomingConcurrentTCPConnectionsManager::NewConnectionResult::Denied) {
        return;
      }
      tcpClientCountIncremented = true;
+    if (connectionResult == dnsdist::IncomingConcurrentTCPConnectionsManager::NewConnectionResult::Restricted) {
+      connInfo.d_restricted = true;
+    }
  
      vinfolog("Got TCP connection from %s", remote.toStringWithPort());
  
diff --git a/pdns/dnsdistdist/dnsdist-tcp.hh b/pdns/dnsdistdist/dnsdist-tcp.hh

index f3d827ebf3d30364ce44013b237723332942bbb3..382f1a6187df00d724e90c165b4f1b425da43579 100644 (file)
--- a/pdns/dnsdistdist/dnsdist-tcp.hh
+++ b/pdns/dnsdistdist/dnsdist-tcp.hh
@@ -73,6 +73,7 @@ struct ConnectionInfo
    ComboAddress remote;
    ClientState* cs{nullptr};
    int fd{-1};
+  bool d_restricted{false};
  };
  
  class InternalQuery
diff --git a/pdns/dnsdistdist/dnsdist-web.cc b/pdns/dnsdistdist/dnsdist-web.cc

index 4eb57f6be51a1debfe0841d8e55563a9181a7021..b1f91eebd70e58407f79f3c34cc048c96e7b9bd4 100644 (file)
--- a/pdns/dnsdistdist/dnsdist-web.cc
+++ b/pdns/dnsdistdist/dnsdist-web.cc
@@ -688,6 +688,8 @@ static void handlePrometheus(const YaHTTP::Request& req, YaHTTP::Response& resp)
    output << "# TYPE " << frontsbase << "tcpavgqueriesperconnection " << "gauge" << "\n";
    output << "# HELP " << frontsbase << "tcpavgconnectionduration " << "The average duration of a TCP connection (ms)" << "\n";
    output << "# TYPE " << frontsbase << "tcpavgconnectionduration " << "gauge" << "\n";
+  output << "# HELP " << frontsbase << "tcpavgreadios " << "The average number of read IO operations per query over a TCP connection" << "\n";
+  output << "# TYPE " << frontsbase << "tcpavgreadios " << "gauge" << "\n";
    output << "# HELP " << frontsbase << "tlsqueries " << "Number of queries received by dnsdist over TLS, by TLS version" << "\n";
    output << "# TYPE " << frontsbase << "tlsqueries " << "counter" << "\n";
    output << "# HELP " << frontsbase << "tlsnewsessions " << "Amount of new TLS sessions negotiated" << "\n";
@@ -734,6 +736,7 @@ static void handlePrometheus(const YaHTTP::Request& req, YaHTTP::Response& resp)
        output << frontsbase << "tcpmaxconcurrentconnections" << label << front->tcpMaxConcurrentConnections.load() << "\n";
        output << frontsbase << "tcpavgqueriesperconnection" << label << front->tcpAvgQueriesPerConnection.load() << "\n";
        output << frontsbase << "tcpavgconnectionduration" << label << front->tcpAvgConnectionDuration.load() << "\n";
+      output << frontsbase << "tcpavgreadios" << label << front->tcpAvgIOsPerConnection << "\n";
        if (front->hasTLS()) {
          output << frontsbase << "tlsnewsessions" << label << front->tlsNewSessions.load() << "\n";
          output << frontsbase << "tlsresumptions" << label << front->tlsResumptions.load() << "\n";
diff --git a/pdns/dnsdistdist/dnsdist.hh b/pdns/dnsdistdist/dnsdist.hh

index 20fe358d1fbebd8dbfa041d6233d5073c6047021..cf0cc99fb3616446bd1ae14160948cd6405dc96e 100644 (file)
--- a/pdns/dnsdistdist/dnsdist.hh
+++ b/pdns/dnsdistdist/dnsdist.hh
@@ -344,6 +344,7 @@ struct ClientState
    stat_t tls12queries{0}; // valid DNS queries received via TLSv1.2
    stat_t tls13queries{0}; // valid DNS queries received via TLSv1.3
    stat_t tlsUnknownqueries{0}; // valid DNS queries received via unknown TLS version
+  pdns::stat_double_t tcpAvgIOsPerConnection{0.0};
    pdns::stat_double_t tcpAvgQueriesPerConnection{0.0};
    /* in ms */
    pdns::stat_double_t tcpAvgConnectionDuration{0.0};
@@ -508,10 +509,11 @@ struct ClientState
      d_filter = bpf;
    }
  
-  void updateTCPMetrics(size_t nbQueries, uint64_t durationMs)
+  void updateTCPMetrics(size_t nbQueries, uint64_t durationMs, size_t nbIOs)
    {
      tcpAvgQueriesPerConnection = (99.0 * tcpAvgQueriesPerConnection / 100.0) + (nbQueries / 100.0);
      tcpAvgConnectionDuration = (99.0 * tcpAvgConnectionDuration / 100.0) + (durationMs / 100.0);
+    tcpAvgIOsPerConnection = (99.0 * tcpAvgIOsPerConnection / 100.0) + (nbIOs / 100.0);
    }
  };
  
diff --git a/pdns/dnsdistdist/docs/reference/tuning.rst b/pdns/dnsdistdist/docs/reference/tuning.rst

index 1614793ec55b2ffe4a6624c1f06e18916d91af58..810249edad87e90d5a7fd14570f77ed6e03e905e 100644 (file)
--- a/pdns/dnsdistdist/docs/reference/tuning.rst
+++ b/pdns/dnsdistdist/docs/reference/tuning.rst
@@ -1,6 +1,20 @@
  Tuning related functions
  ========================
  
+.. function:: setBanDurationForExceedingMaxReadIOsPerQuery(num)
+
+  .. versionadded:: 2.0.0
+
+  Set for how long, in seconds, a client (or range, see :func:`setTCPConnectionsMaskV4`, :func:`setTCPConnectionsMaskV6` and :func:`setTCPConnectionsMaskV4Port` to see how clients can be aggregated) will be prevented from opening a new TCP connection when it has exceeded :func:`setMaxTCPReadIOsPerQuery` over a TCP connection. Default is 60 seconds.
+
+.. function:: setBanDurationForExceedingTCPTLSRate(num)
+
+  .. versionadded:: 2.0.0
+
+  Set for how long, in seconds, a client (or range, see :func:`setTCPConnectionsMaskV4`, :func:`setTCPConnectionsMaskV6` and :func:`setTCPConnectionsMaskV4Port` to see how clients can be aggregated) will be prevented from opening a new TCP connection when it has exceeded :func:`setMaxTCPConnectionRatePerClient`, :func:`setMaxTLSNewSessionRatePerClient` or :func:`setMaxTLSResumedSessionRatePerClient`. Default is 10 seconds.
+
+  :param int num: Duration of the ban in seconds
+
  .. function:: setDoHDownstreamCleanupInterval(interval)
  
    .. versionadded:: 1.7.0
@@ -55,19 +69,27 @@ Tuning related functions
  
  .. function:: setMaxTCPConnectionDuration(num)
  
-  Set the maximum duration of an incoming TCP connection, in seconds. 0 (the default) means unlimited
+  Set the maximum duration of an incoming TCP connection, in seconds. 0 (the default) means unlimited.
  
    :param int num:
  
+.. function:: setMaxTCPConnectionRatePerClient(num)
+
+  .. versionadded:: 2.0.0
+
+  Set the maximum number of new TCP connections that a given client (or range, see :func:`setTCPConnectionsMaskV4`, :func:`setTCPConnectionsMaskV6` and :func:`setTCPConnectionsMaskV4Port` to see how clients can be aggregated) can open, per second, over the last :func:`setTCPConnectionRateInterval` minutes. Clients exceeding this rate will not be able to open new TCP connections for :func:`setBanDurationForExceedingTCPTLSRate` seconds. See also :func:`setMaxTLSNewSessionRatePerClient` and :func:`setMaxTLSResumedSessionRatePerClient`. 0 (the default) means unlimited.
+
+  :param int num: Number of new connections per second
+
  .. function:: setMaxTCPConnectionsPerClient(num)
  
-  Set the maximum number of TCP connections per client. 0 (the default) means unlimited
+  Set the maximum number of TCP connections per client. 0 (the default) means unlimited.
  
    :param int num:
  
  .. function:: setMaxTCPQueriesPerConnection(num)
  
-  Set the maximum number of queries in an incoming TCP connection. 0 (the default) means unlimited
+  Set the maximum number of queries in an incoming TCP connection. 0 (the default) means unlimited.
  
    :param int num:
  
@@ -76,28 +98,52 @@ Tuning related functions
    .. versionchanged:: 1.6.0
      Before 1.6.0 the default value was 1000 on all systems.
  
-  Set the maximum number of TCP connections queued (waiting to be picked up by a client thread), defaults to 1000 (10000 on Linux since 1.6.0). 0 means unlimited
+  Set the maximum number of TCP connections queued (waiting to be picked up by a client thread), defaults to 1000 (10000 on Linux since 1.6.0). 0 means unlimited.
  
    :param int num:
  
+.. function:: setMaxTCPReadIOsPerQuery(num)
+
+  .. versionadded:: 2.0.0
+
+  Set the maximum number of read events needed to receive a new query on a TCP connection. Usually reading a DNS query over a TCP connection requires two read events, one to read the query size and one to read the query itself. For large queries, on congested networks, a few short reads might occur, increasing the number of read operations needed to read the full query, but if a large number of read events is needed the client might be misbehaving or even actively trying to hurt the server. When this limit is reached, the TCP connection will be terminated and the offending client IP (or range, see :func:`setTCPConnectionsMaskV4`, :func:`setTCPConnectionsMaskV6` and :func:`setTCPConnectionsMaskV4Port` to see how clients can be aggregated) will be prevented from opening a new TCP connection for up to :func:`setBanDurationForExceedingMaxReadIOsPerQuery` seconds. Default is 50.
+
+  :param int num: Number of read IO events per query
+
  .. function:: setMaxUDPOutstanding(num)
  
    .. versionchanged:: 1.4.0
      Before 1.4.0 the default value was 10240
  
-  Set the maximum number of outstanding UDP queries to a given backend server. This can only be set at configuration time and defaults to 65535 (10240 before 1.4.0)
+  Set the maximum number of outstanding UDP queries to a given backend server. This can only be set at configuration time and defaults to 65535 (10240 before 1.4.0).
  
    :param int num:
  
+.. function:: setMaxTLSNewSessionRatePerClient(num)
+
+  .. versionadded:: 2.0.0
+
+  Set the maximum number of new TLS sessions, without resumption, that a given client (or range, see :func:`setTCPConnectionsMaskV4`, :func:`setTCPConnectionsMaskV6` and :func:`setTCPConnectionsMaskV4Port` to see how clients can be aggregated) can open, per second, over the last :func:`setTCPConnectionRateInterval` minutes. Clients exceeding this rate will not be able to open new TCP connections for :func:`setBanDurationForExceedingTCPTLSRate` seconds. See also :func:`setMaxTLSResumedSessionRatePerClient` and :func:`setMaxTCPConnectionRatePerClient`. 0 (the default) means unlimited.
+
+  :param int num: Number of new sessions per second
+
+.. function:: setMaxTLSResumedSessionRatePerClient(num)
+
+  .. versionadded:: 2.0.0
+
+  Set the maximum number of resumed TLS sessions that a given client (or range, see :func:`setTCPConnectionsMaskV4`, :func:`setTCPConnectionsMaskV6` and :func:`setTCPConnectionsMaskV4Port` to see how clients can be aggregated) can open, per second, over the last :func:`setTCPConnectionRateInterval` minutes. Clients exceeding this rate will not be able to open new TCP connections for :func:`setBanDurationForExceedingTCPTLSRate` seconds. See also :func:`setMaxTLSNewSessionRatePerClient` and :func:`setMaxTCPConnectionRatePerClient`. 0 (the default) means unlimited.
+
+  :param int num: Number of resumed sessions per second
+
  .. function:: setCacheCleaningDelay(num)
  
-  Set the interval in seconds between two runs of the cache cleaning algorithm, removing expired entries. Default is every 60s
+  Set the interval in seconds between two runs of the cache cleaning algorithm, removing expired entries. Default is every 60s.
  
    :param int num:
  
  .. function:: setCacheCleaningPercentage(num)
  
-  Set the percentage of the cache that the cache cleaning algorithm will try to free by removing expired entries. By default (100), all expired entries are removed
+  Set the percentage of the cache that the cache cleaning algorithm will try to free by removing expired entries. By default (100), all expired entries are removed.
  
    :param int num:
  
@@ -109,10 +155,52 @@ Tuning related functions
  
  .. function:: setStaleCacheEntriesTTL(num)
  
-  Allows using cache entries expired for at most n seconds when no backend available to answer for a query
+  Allows using cache entries expired for at most n seconds when no backend is available to answer a query.
  
    :param int num:
  
+.. function:: setTCPConnectionRateInterval(num)
+
+  .. versionadded:: 2.0.0
+
+  Set the interval, in minutes, over which new TCP and TLS per client connection rates are computed (see :func:`setMaxTCPConnectionRatePerClient`, :func:`setMaxTLSNewSessionRatePerClient` and :func:`setMaxTLSResumedSessionRatePerClient`). Default is 5.
+
+  :param int num: Interval in minutes
+
+.. function:: setTCPConnectionsMaskV4(num)
+
+  .. versionadded:: 2.0.0
+
+  Mask to apply to IPv4 addresses when enforcing :func:`setMaxTCPConnectionRatePerClient`, :func:`setMaxTLSNewSessionRatePerClient` and :func:`setMaxTLSResumedSessionRatePerClient`. In some scenarios it might make sense to apply these settings to a /28 range rather than a single address, for example. Default is 32.
+
+  :param int num: Number of bits to keep
+
+.. function:: setTCPConnectionsMaskV4Port(num)
+
+  .. versionadded:: 2.0.0
+
+  Number of bits of the port number to consider when enforcing :func:`setMaxTCPConnectionRatePerClient`, :func:`setMaxTLSNewSessionRatePerClient` and :func:`setMaxTLSResumedSessionRatePerClient` over IPv4 addresses, for CGNAT deployments. Default is 0 meaning that the port is not taken into account. For example passing ``2`` here, which only makes sense if :func:`setTCPConnectionsMaskV4` is set to ``32``, will split a given IPv4 address into four port ranges: ``0-16383``, ``16384-32767``, ``32768-49151`` and ``49152-65535``.
+
+  :param int num: Number of bits to keep
+
+Number of bits of port to consider when enforcing ``max_connection_rate_per_client``, ``max_tls_new_session_rate_per_client`` and ``max_tls_resumed_session_rate_per_client`` over IPv4, for CGNAT deployments.
+
+.. function:: setTCPConnectionsMaskV6(num)
+
+  .. versionadded:: 2.0.0
+
+  Mask to apply to IPv6 addresses when enforcing :func:`setMaxTCPConnectionRatePerClient`, :func:`setMaxTLSNewSessionRatePerClient` and :func:`setMaxTLSResumedSessionRatePerClient`. In some scenarios it might make sense to apply these settings to a whole /64 IPv6 range rather than a single address, for example. Default is 128.
+
+  :param int num: Number of bits to keep
+
+.. function:: setTCPConnectionsOverloadThreshold(num)
+
+  .. versionadded:: 2.0.0
+
+  Set a threshold as a percentage of the maximum number of incoming TCP connections per frontend or per client. When this threshold is reached, new incoming TCP connections are restricted: only one query per connection is allowed (no out-of-order processing, no idle time allowed), the receive timeout is reduced to 500 milliseconds and the total duration of the TCP connection is limited to 5 seconds. Default is 90.
+
+  :param int num: Threshold in percent
+
  .. function:: setTCPDownstreamCleanupInterval(interval)
  
    .. versionadded:: 1.6.0
@@ -168,13 +256,13 @@ Tuning related functions
  
  .. function:: setTCPRecvTimeout(num)
  
-  Set the read timeout on TCP connections from the client, in seconds. Defaults to 2
+  Set the read timeout on TCP connections from the client, in seconds. Defaults to 2.
  
    :param int num:
  
  .. function:: setTCPSendTimeout(num)
  
-  Set the write timeout on TCP connections from the client, in seconds. Defaults to 2
+  Set the write timeout on TCP connections from the client, in seconds. Defaults to 2.
  
    :param int num:
  
@@ -200,6 +288,6 @@ Tuning related functions
  
  .. function:: setUDPTimeout(num)
  
-  Set the maximum time dnsdist will wait for a response from a backend over UDP, in seconds. Defaults to 2
+  Set the maximum time dnsdist will wait for a response from a backend over UDP, in seconds. Defaults to 2.
  
    :param int num:
diff --git a/pdns/dnsdistdist/docs/reference/yaml-settings.rst b/pdns/dnsdistdist/docs/reference/yaml-settings.rst

index ede608b6c1076717d583337708f31ced9e614676..0687b4f71cd25a2cba3edabf9d74708a8ec8b4f5 100644 (file)
--- a/pdns/dnsdistdist/docs/reference/yaml-settings.rst
+++ b/pdns/dnsdistdist/docs/reference/yaml-settings.rst
@@ -926,6 +926,17 @@ TcpTuningConfiguration
  - **outgoing_max_idle_connection_per_backend**: Unsigned integer ``(10)``
  - **max_connections_per_client**: Unsigned integer ``(0)``
  - **fast_open_key**: String ``("")``
+- **connections_overload_threshold**: Unsigned integer ``(90)`` - Set a threshold as a percentage of the maximum number of incoming TCP connections per frontend or per client. When this threshold is reached, new incoming TCP connections are restricted: only one query per connection is allowed (no out-of-order processing, no idle time allowed), the receive timeout is reduced to 500 milliseconds and the total duration of the TCP connection is limited to 5 seconds
+- **max_connection_rate_per_client**: Unsigned integer ``(0)`` - Set the maximum number of new TCP connections that a given client (see ``connections_mask_v4``, ``connections_mask_v6`` and ``connections_mask_v4_port`` to see how clients can be aggregated) can open, per second, over the last ``connection_rate_interval`` minutes. Clients exceeding this rate will not be able to open new TCP connections for ``ban_duration_for_exceeding_tcp_tls_rate`` seconds. See also ``max_tls_new_session_rate_per_client`` and ``max_tls_resumed_session_rate_per_client``
+- **connection_rate_interval**: Unsigned integer ``(5)`` - Set the interval, in minutes, over which new TCP and TLS per client connection rates are computed (see ``max_connection_rate_per_client``, ``max_tls_new_session_rate_per_client`` and ``max_tls_resumed_session_rate_per_client``)
+- **max_tls_new_session_rate_per_client**: Unsigned integer ``(0)`` - Set the maximum number of new TLS sessions, without resumption, that a given client (see ``connections_mask_v4``, ``connections_mask_v6`` and ``connections_mask_v4_port`` to see how clients can be aggregated) can open, per second, over the last ``connection_rate_interval`` minutes. Clients exceeding this rate will not be able to open new TCP connections for ``ban_duration_for_exceeding_tcp_tls_rate`` seconds. See also ``max_connection_rate_per_client`` and ``max_tls_resumed_session_rate_per_client``
+- **max_tls_resumed_session_rate_per_client**: Unsigned integer ``(0)`` - Set the maximum number of resumed TLS sessions that a given client (see ``connections_mask_v4``, ``connections_mask_v6`` and ``connections_mask_v4_port`` to see how clients can be aggregated) can open, per second, over the last ``connection_rate_interval`` minutes. Clients exceeding this rate will not be able to open new TCP connections for ``ban_duration_for_exceeding_tcp_tls_rate`` seconds. See also ``max_connection_rate_per_client`` and ``max_tls_new_session_rate_per_client``
+- **max_read_ios_per_query**: Unsigned integer ``(50)`` - Set the maximum number of read events needed to receive a new query on a TCP connection. Usually reading a DNS query over a TCP connection requires two read events, one to read the query size and one to read the query itself. For large queries, on congested networks, a few short reads might occur, increasing the number of read operations needed to read the full query, but if a large number of read events is needed the client might be misbehaving or even actively trying to hurt the server. When this limit is reached, the TCP connection will be terminated and the offending client IP (or range, see ``connections_mask_v4``, ``connections_mask_v6`` and ``connections_mask_v4_port`` to see how clients can be aggregated) will be prevented from opening a new TCP connection for up to ``ban_duration_for_exceeding_max_read_ios_per_query`` seconds
+- **ban_duration_for_exceeding_max_read_ios_per_query**: Unsigned integer ``(60)`` - Set for how long, in seconds, a client (or range, see ``connections_mask_v4``, ``connections_mask_v6`` and ``connections_mask_v4_port`` to see how clients can be aggregated) will be prevented from opening a new TCP connection when it has exceeded ``max_read_ios_per_query`` over a TCP connection
+- **ban_duration_for_exceeding_tcp_tls_rate**: Unsigned integer ``(10)`` - Set for how long, in seconds, a client (or range, see ``connections_mask_v4``, ``connections_mask_v6`` and ``connections_mask_v4_port`` to see how clients can be aggregated) will be prevented from opening a new TCP connection when it has exceeded ``max_connection_rate_per_client``, ``max_tls_new_session_rate_per_client`` or ``max_tls_resumed_session_rate_per_client``
+- **connections_mask_v4**: Unsigned integer ``(32)`` - Mask to apply to IPv4 addresses when enforcing ``max_connection_rate_per_client``, ``max_tls_new_session_rate_per_client`` and ``max_tls_resumed_session_rate_per_client``. In some scenarios it might make sense to apply these settings to a /28 range rather than a single address, for example
+- **connections_mask_v6**: Unsigned integer ``(128)`` - Mask to apply to IPv6 addresses when enforcing ``max_connection_rate_per_client``, ``max_tls_new_session_rate_per_client`` and ``max_tls_resumed_session_rate_per_client``. In some scenarios it might make sense to apply these settings to a whole /64 IPv6 range instead of a single address, for example
+- **connections_mask_v4_port**: Unsigned integer ``(0)`` - Number of bits of port to consider when enforcing ``max_connection_rate_per_client``, ``max_tls_new_session_rate_per_client`` and ``max_tls_resumed_session_rate_per_client`` over IPv4, for CGNAT deployments. Default is 0 meaning that the port is not taken into account. For example passing ``2`` here, which only makes sense if ``connections_mask_v4`` is set to ``32``, will split a given IPv4 address into four port ranges: ``0-16383``, ``16384-32767``, ``32768-49151`` and ``49152-65535``
  
  
  .. _yaml-settings-TlsEngineConfiguration:
diff --git a/pdns/dnsdistdist/doh.cc b/pdns/dnsdistdist/doh.cc

index 62f564aaacc766758c84900cd053d055cde03f12..2354aaada38d7e0d3245b548e8f71d1565a324e2 100644 (file)
--- a/pdns/dnsdistdist/doh.cc
+++ b/pdns/dnsdistdist/doh.cc
@@ -327,7 +327,7 @@ static void on_socketclose(void *data)
        auto diff = now - conn->d_connectionStartTime;
  
        conn->d_acceptCtx->decrementConcurrentConnections();
-      conn->d_acceptCtx->d_cs->updateTCPMetrics(conn->d_nbQueries, diff.tv_sec * 1000 + diff.tv_usec / 1000);
+      conn->d_acceptCtx->d_cs->updateTCPMetrics(conn->d_nbQueries, diff.tv_sec * 1000 + diff.tv_usec / 1000, 0);
      }
  
      dnsdist::IncomingConcurrentTCPConnectionsManager::accountClosedTCPConnection(conn->d_remote);
@@ -1392,8 +1392,8 @@ static void on_accept(h2o_socket_t *listener, const char *err)
      return;
    }
  
-  if (!dnsdist::IncomingConcurrentTCPConnectionsManager::accountNewTCPConnection(remote)) {
-    vinfolog("Dropping DoH connection from %s because we have too many from this client already", remote.toStringWithPort());
+  auto connectionResult = dnsdist::IncomingConcurrentTCPConnectionsManager::accountNewTCPConnection(remote, false);
+  if (connectionResult == dnsdist::IncomingConcurrentTCPConnectionsManager::NewConnectionResult::Denied) {
      h2o_socket_close(sock);
      return;
    }
diff --git a/pdns/dnsdistdist/meson.build b/pdns/dnsdistdist/meson.build

index 070aff8f0033c6b6864f8317b086b87c55647229..ab84049642330836f2f77ceeeefa9bd5c1bff1b5 100644 (file)
--- a/pdns/dnsdistdist/meson.build
+++ b/pdns/dnsdistdist/meson.build
@@ -129,6 +129,7 @@ common_sources += files(
    src_dir / 'dnsdist-backend.cc',
    src_dir / 'dnsdist-cache.cc',
    src_dir / 'dnsdist-carbon.cc',
+  src_dir / 'dnsdist-concurrent-connections.cc',
    src_dir / 'dnsdist-configuration.cc',
    src_dir / 'dnsdist-configuration-yaml.cc',
    src_dir / 'dnsdist-console.cc',
diff --git a/regression-tests.dnsdist/test_TCPLimits.py b/regression-tests.dnsdist/test_TCPLimits.py

index 7f24f54ce31fd8b1408075ce8c2d5ba3ef70390f..4567b246610749c873089244df41d03b90a4c19d 100644 (file)
--- a/regression-tests.dnsdist/test_TCPLimits.py
+++ b/regression-tests.dnsdist/test_TCPLimits.py
@@ -21,11 +21,13 @@ class TestTCPLimits(DNSDistTest):
      _maxTCPConnsPerClient = 3
      _maxTCPConnDuration = 5
      _config_template = """
-    newServer{address="127.0.0.1:%s"}
-    setTCPRecvTimeout(%s)
-    setMaxTCPQueriesPerConnection(%s)
-    setMaxTCPConnectionsPerClient(%s)
-    setMaxTCPConnectionDuration(%s)
+    newServer{address="127.0.0.1:%d"}
+    setTCPRecvTimeout(%d)
+    setMaxTCPQueriesPerConnection(%d)
+    setMaxTCPConnectionsPerClient(%d)
+    setMaxTCPConnectionDuration(%d)
+    -- disable "near limits" otherwise our tests are broken because connections are forcibly closed
+    setTCPConnectionsOverloadThreshold(0)
      """
      _config_params = ['_testServerPort', '_tcpIdleTimeout', '_maxTCPQueriesPerConn', '_maxTCPConnsPerClient', '_maxTCPConnDuration']
      _verboseMode = True
@@ -141,8 +143,10 @@ class TestTCPFrontendLimits(DNSDistTest):
      _tcpIdleTimeout = 2
      _maxTCPConnsPerFrontend = 10
      _config_template = """
-    newServer{address="127.0.0.1:%s"}
+    newServer{address="127.0.0.1:%d"}
      setLocal("%s:%d", {maxConcurrentTCPConnections=%d})
+    -- disable "near limits" otherwise our tests are broken because connections are forcibly closed
+    setTCPConnectionsOverloadThreshold(0)
      """
      _config_params = ['_testServerPort', '_dnsDistListeningAddr', '_dnsDistPort', '_maxTCPConnsPerFrontend']
      _verboseMode = True
author	Remi Gacogne <remi.gacogne@powerdns.com>
	Fri, 28 Mar 2025 14:52:08 +0000 (15:52 +0100)
committer	Remi Gacogne <remi.gacogne@powerdns.com>
	Mon, 31 Mar 2025 14:19:05 +0000 (16:19 +0200)
pdns/dnsdistdist/Makefile.am		patch \| blob \| blame \| history
pdns/dnsdistdist/dnsdist-carbon.cc		patch \| blob \| blame \| history
pdns/dnsdistdist/dnsdist-concurrent-connections.cc	[new file with mode: 0644]	patch \| blob
pdns/dnsdistdist/dnsdist-concurrent-connections.hh		patch \| blob \| blame \| history
pdns/dnsdistdist/dnsdist-configuration.hh		patch \| blob \| blame \| history
pdns/dnsdistdist/dnsdist-console.cc		patch \| blob \| blame \| history
pdns/dnsdistdist/dnsdist-lua-configuration-items.cc		patch \| blob \| blame \| history
pdns/dnsdistdist/dnsdist-lua-inspection.cc		patch \| blob \| blame \| history
pdns/dnsdistdist/dnsdist-nghttp2-in.cc		patch \| blob \| blame \| history
pdns/dnsdistdist/dnsdist-rust-lib/dnsdist-configuration-yaml-items-generated.cc		patch \| blob \| blame \| history
pdns/dnsdistdist/dnsdist-rust-lib/rust/src/lib.rs		patch \| blob \| blame \| history
pdns/dnsdistdist/dnsdist-settings-definitions.yml		patch \| blob \| blame \| history
pdns/dnsdistdist/dnsdist-tcp-upstream.hh		patch \| blob \| blame \| history
pdns/dnsdistdist/dnsdist-tcp.cc		patch \| blob \| blame \| history
pdns/dnsdistdist/dnsdist-tcp.hh		patch \| blob \| blame \| history
pdns/dnsdistdist/dnsdist-web.cc		patch \| blob \| blame \| history
pdns/dnsdistdist/dnsdist.hh		patch \| blob \| blame \| history
pdns/dnsdistdist/docs/reference/tuning.rst		patch \| blob \| blame \| history
pdns/dnsdistdist/docs/reference/yaml-settings.rst		patch \| blob \| blame \| history
pdns/dnsdistdist/doh.cc		patch \| blob \| blame \| history
pdns/dnsdistdist/meson.build		patch \| blob \| blame \| history
regression-tests.dnsdist/test_TCPLimits.py		patch \| blob \| blame \| history