2 * This file is part of PowerDNS or dnsdist.
3 * Copyright -- PowerDNS.COM B.V. and its contributors
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of version 2 of the GNU General Public License as
7 * published by the Free Software Foundation.
9 * In addition, for the avoidance of any doubt, permission is granted to
10 * link this program with OpenSSL and to (re)distribute the binaries
11 * produced as the result of such linking.
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 #include "ext/luawrapper/include/LuaContext.hpp"
32 #include <unordered_map>
34 #include <boost/variant.hpp>
36 #include "bpf-filter.hh"
37 #include "capabilities.hh"
38 #include "circular_buffer.hh"
39 #include "dnscrypt.hh"
40 #include "dnsdist-cache.hh"
41 #include "dnsdist-dynbpf.hh"
44 #include "ednsoptions.hh"
50 #include "tcpiohandler.hh"
51 #include "uuid-utils.hh"
53 void carbonDumpThread();
54 uint64_t uptimeOfProcess(const std::string& str);
56 extern uint16_t g_ECSSourcePrefixV4;
57 extern uint16_t g_ECSSourcePrefixV6;
58 extern bool g_ECSOverride;
60 typedef std::unordered_map<string, string> QTag;
// Per-query state passed to rules and actions. The constructor keeps the caller's
// pointers (qname, addresses, header, query time) as-is; `rem` is dereferenced in the
// initializer list, so it must not be null.
// ecsPrefixLength starts as g_ECSSourcePrefixV4 when the remote family is AF_INET and
// as g_ECSSourcePrefixV6 otherwise; ecsOverride starts as g_ECSOverride.
64 DNSQuestion(const DNSName* name, uint16_t type, uint16_t class_, unsigned int consumed_, const ComboAddress* lc, const ComboAddress* rem, struct dnsheader* header, size_t bufferSize, uint16_t queryLen, bool isTcp, const struct timespec* queryTime_):
65 qname(name), local(lc), remote(rem), dh(header), queryTime(queryTime_), size(bufferSize), consumed(consumed_), tempFailureTTL(boost::none), qtype(type), qclass(class_), len(queryLen), ecsPrefixLength(rem->sin4.sin_family == AF_INET ? g_ECSSourcePrefixV4 : g_ECSSourcePrefixV6), tcp(isTcp), ecsOverride(g_ECSOverride) {
66 const uint16_t* flags = getFlagsFromDNSHeader(dh);
// Copy construction and copy assignment are disabled; move construction is defaulted.
69 DNSQuestion(const DNSQuestion&) = delete;
70 DNSQuestion& operator=(const DNSQuestion&) = delete;
71 DNSQuestion(DNSQuestion&&) = default;
// Optional values, empty unless explicitly assigned.
74 boost::optional<boost::uuids::uuid> uniqueId;
77 boost::optional<Netmask> subnet;
78 std::string sni; /* Server Name Indication, if any (DoT or DoH) */
// Raw pointers; qname, local, remote, dh and queryTime are set from the constructor arguments.
80 const DNSName* qname{nullptr};
81 const ComboAddress* local{nullptr};
82 const ComboAddress* remote{nullptr};
// Shared handles, all empty by default.
83 std::shared_ptr<QTag> qTag{nullptr};
84 std::shared_ptr<std::map<uint16_t, EDNSOptionView> > ednsOptions;
85 std::shared_ptr<DNSCryptQuery> dnsCryptQuery{nullptr};
86 std::shared_ptr<DNSDistPacketCache> packetCache{nullptr};
87 struct dnsheader* dh{nullptr};
88 const struct timespec* queryTime{nullptr};
89 struct DOHUnit* du{nullptr};
91 unsigned int consumed{0};
// Initialized to boost::none by the constructor.
93 boost::optional<uint32_t> tempFailureTTL;
// NOTE(review): no default initializer here and not in the constructor's initializer list,
// so the value is indeterminate until assigned -- confirm every reader assigns it first.
94 uint32_t cacheKeyNoECS;
97 const uint16_t qclass;
99 uint16_t ecsPrefixLength;
101 uint8_t ednsRCode{0};
// Boolean flags, all false by default.
103 bool skipCache{false};
108 bool ecsAdded{false};
109 bool ednsAdded{false};
110 bool useZeroScope{false};
111 bool dnssecOK{false};
// A response reuses the DNSQuestion layout: the constructor forwards every argument
// unchanged to the DNSQuestion constructor (responseLen is passed as the length).
114 struct DNSResponse : DNSQuestion
116 DNSResponse(const DNSName* name, uint16_t type, uint16_t class_, unsigned int consumed, const ComboAddress* lc, const ComboAddress* rem, struct dnsheader* header, size_t bufferSize, uint16_t responseLen, bool isTcp, const struct timespec* queryTime_):
117 DNSQuestion(name, type, class_, consumed, lc, rem, header, bufferSize, responseLen, isTcp, queryTime_) { }
// Like the base type, copying is disabled and move construction is defaulted.
118 DNSResponse(const DNSResponse&) = delete;
119 DNSResponse& operator=(const DNSResponse&) = delete;
120 DNSResponse(DNSResponse&&) = default;
123 /* so what could you do:
126 provide actual answer,
127 allow & and stop processing,
129 modify header: (servfail|refused|notimp), set TC=1,
// Verdicts an action applied to a query can return.
135 enum class Action { Drop, Nxdomain, Refused, Spoof, Allow, HeaderModify, Pool, Delay, Truncate, ServFail, None, NoOp, NoRecurse };
// Maps an Action value to a short human-readable description.
136 static std::string typeToString(const Action& action)
141 case Action::Nxdomain:
142 return "Send NXDomain";
143 case Action::Refused:
144 return "Send Refused";
146 return "Spoof an answer";
149 case Action::HeaderModify:
150 return "Modify the header";
152 return "Route to a pool";
155 case Action::Truncate:
156 return "Truncate over UDP";
157 case Action::ServFail:
158 return "Send ServFail";
162 case Action::NoRecurse:
// Applies the action to a query and returns the verdict; must be implemented by subclasses.
// Presumably `ruleresult` carries extra data for some verdicts -- confirm in the implementations.
169 virtual Action operator()(DNSQuestion*, string* ruleresult) const =0;
// Human-readable description of the action; must be implemented by subclasses.
173 virtual string toString() const = 0;
// Named numeric statistics for this action; overridable.
174 virtual std::map<string, double> getStats() const
// Interface for actions applied to responses; the set of verdicts is smaller than
// the one declared for query actions.
180 class DNSResponseAction
183 enum class Action { Allow, Delay, Drop, HeaderModify, ServFail, None };
// Applies the action to a response and returns the verdict; must be implemented by subclasses.
184 virtual Action operator()(DNSResponse*, string* ruleresult) const =0;
185 virtual ~DNSResponseAction()
// Human-readable description of the action; must be implemented by subclasses.
188 virtual string toString() const = 0;
// Default block: action None, not a warning.
193 DynBlock(): action(DNSAction::Action::None), warning(false)
// Block with a reason, an `until` timestamp, a domain and the action to apply; not a warning.
197 DynBlock(const std::string& reason_, const struct timespec& until_, const DNSName& domain_, DNSAction::Action action_): reason(reason_), until(until_), domain(domain_), action(action_), warning(false)
// Copy construction and assignment are written by hand because `blocks` is a std::atomic,
// which cannot be copied implicitly; its current value is transferred with store().
201 DynBlock(const DynBlock& rhs): reason(rhs.reason), until(rhs.until), domain(rhs.domain), action(rhs.action), warning(rhs.warning)
203 blocks.store(rhs.blocks);
206 DynBlock& operator=(const DynBlock& rhs)
212 blocks.store(rhs.blocks);
218 struct timespec until;
220 DNSAction::Action action;
// Mutable so the counter can be updated through a const DynBlock.
221 mutable std::atomic<unsigned int> blocks;
225 extern GlobalStateHolder<NetmaskTree<DynBlock>> g_dynblockNMG;
227 extern vector<pair<struct timeval, std::string> > g_confDelta;
229 extern uint64_t getLatencyCount(const std::string&);
// Every counter below is a 64-bit atomic, zero-initialized.
233 using stat_t=std::atomic<uint64_t>; // aww yiss ;-)
235 stat_t servfailResponses{0};
237 stat_t frontendNXDomain{0};
238 stat_t frontendServFail{0};
239 stat_t frontendNoError{0};
240 stat_t nonCompliantQueries{0};
241 stat_t nonCompliantResponses{0};
243 stat_t emptyQueries{0};
245 stat_t dynBlocked{0};
247 stat_t ruleNXDomain{0};
248 stat_t ruleRefused{0};
249 stat_t ruleServFail{0};
250 stat_t selfAnswered{0};
251 stat_t downstreamTimeouts{0};
252 stat_t downstreamSendErrors{0};
256 stat_t cacheMisses{0};
257 stat_t latency0_1{0}, latency1_10{0}, latency10_50{0}, latency50_100{0}, latency100_1000{0}, latencySlow{0}, latencySum{0};
258 stat_t securityStatus{0};
// The averages are plain doubles, not atomics.
260 double latencyAvg100{0}, latencyAvg1000{0}, latencyAvg10000{0}, latencyAvg1000000{0};
// A statistic is exposed as a pointer to a counter, a pointer to a double, or a
// function that receives the metric name and returns the value.
261 typedef std::function<uint64_t(const std::string&)> statfunction_t;
262 typedef boost::variant<stat_t*, double*, statfunction_t> entry_t;
// Exported metric name -> where to read its value. The pointers refer to members of this object.
263 std::vector<std::pair<std::string, entry_t>> entries{
264 {"responses", &responses},
265 {"servfail-responses", &servfailResponses},
266 {"queries", &queries},
267 {"frontend-nxdomain", &frontendNXDomain},
268 {"frontend-servfail", &frontendServFail},
269 {"frontend-noerror", &frontendNoError},
270 {"acl-drops", &aclDrops},
271 {"rule-drop", &ruleDrop},
272 {"rule-nxdomain", &ruleNXDomain},
273 {"rule-refused", &ruleRefused},
274 {"rule-servfail", &ruleServFail},
275 {"self-answered", &selfAnswered},
276 {"downstream-timeouts", &downstreamTimeouts},
277 {"downstream-send-errors", &downstreamSendErrors},
278 {"trunc-failures", &truncFail},
279 {"no-policy", &noPolicy},
280 {"latency0-1", &latency0_1},
281 {"latency1-10", &latency1_10},
282 {"latency10-50", &latency10_50},
283 {"latency50-100", &latency50_100},
284 {"latency100-1000", &latency100_1000},
285 {"latency-slow", &latencySlow},
286 {"latency-avg100", &latencyAvg100},
287 {"latency-avg1000", &latencyAvg1000},
288 {"latency-avg10000", &latencyAvg10000},
289 {"latency-avg1000000", &latencyAvg1000000},
// Function-backed entries: the value is computed when the function is called.
290 {"uptime", uptimeOfProcess},
291 {"real-memory-usage", getRealMemoryUsage},
292 {"special-memory-usage", getSpecialMemoryUsage},
293 {"noncompliant-queries", &nonCompliantQueries},
294 {"noncompliant-responses", &nonCompliantResponses},
295 {"rdqueries", &rdQueries},
296 {"empty-queries", &emptyQueries},
297 {"cache-hits", &cacheHits},
298 {"cache-misses", &cacheMisses},
299 {"cpu-user-msec", getCPUTimeUser},
300 {"cpu-sys-msec", getCPUTimeSystem},
301 {"fd-usage", getOpenFileDescriptors},
302 {"dyn-blocked", &dynBlocked},
// Reports the size of the local view of the dynamic-block netmask tree; the name argument is ignored.
303 {"dyn-block-nmg-size", [](const std::string&) { return g_dynblockNMG.getLocal()->size(); }},
304 {"security-status", &securityStatus},
306 {"latency-sum", &latencySum},
307 {"latency-count", getLatencyCount},
311 // Metric types for Prometheus
312 enum class PrometheusMetricType: int {
317 // Keeps additional information about metrics
// Pairs a metric's description with its Prometheus type.
318 struct MetricDefinition {
319 MetricDefinition(PrometheusMetricType _prometheusType, const std::string& _description): description(_description), prometheusType(_prometheusType) {
// NOTE(review): the defaulted constructor leaves prometheusType without a value, since
// the member has no default initializer -- confirm that default-constructed instances
// are always assigned before being read.
322 MetricDefinition() = default;
324 // Metric description
325 std::string description;
326 // Metric type for Prometheus
327 PrometheusMetricType prometheusType;
// Lookup table from metric name to its description and Prometheus type.
330 struct MetricDefinitionStorage {
331 // Return metric definition by name
// Looks `metricName` up in `metrics`; when found, the definition is copied into `metric`.
// The name is taken by value. The boolean presumably reports whether the name was found
// (the return statements are not visible in this view -- confirm).
332 bool getMetricDetails(std::string metricName, MetricDefinition& metric) {
333 auto metricDetailsIter = metrics.find(metricName);
335 if (metricDetailsIter == metrics.end()) {
339 metric = metricDetailsIter->second;
343 // Return string representation of Prometheus metric type
344 std::string getPrometheusStringMetricType(PrometheusMetricType metricType) {
345 switch (metricType) {
346 case PrometheusMetricType::counter:
349 case PrometheusMetricType::gauge:
// Definitions keyed by the same names used in DNSDistStats::entries.
// NOTE(review): no definition for "special-memory-usage" is visible here although
// DNSDistStats::entries lists that name -- confirm whether the omission is intended.
358 std::map<std::string, MetricDefinition> metrics = {
359 { "responses", MetricDefinition(PrometheusMetricType::counter, "Number of responses received from backends") },
360 { "servfail-responses", MetricDefinition(PrometheusMetricType::counter, "Number of SERVFAIL answers received from backends") },
361 { "queries", MetricDefinition(PrometheusMetricType::counter, "Number of received queries")},
362 { "frontend-nxdomain", MetricDefinition(PrometheusMetricType::counter, "Number of NXDomain answers sent to clients")},
363 { "frontend-servfail", MetricDefinition(PrometheusMetricType::counter, "Number of SERVFAIL answers sent to clients")},
364 { "frontend-noerror", MetricDefinition(PrometheusMetricType::counter, "Number of NoError answers sent to clients")},
365 { "acl-drops", MetricDefinition(PrometheusMetricType::counter, "Number of packets dropped because of the ACL")},
366 { "rule-drop", MetricDefinition(PrometheusMetricType::counter, "Number of queries dropped because of a rule")},
367 { "rule-nxdomain", MetricDefinition(PrometheusMetricType::counter, "Number of NXDomain answers returned because of a rule")},
368 { "rule-refused", MetricDefinition(PrometheusMetricType::counter, "Number of Refused answers returned because of a rule")},
// NOTE(review): this description says "received" where the two sibling rule-* entries
// above say "returned" -- confirm the intended wording before changing the exported text.
369 { "rule-servfail", MetricDefinition(PrometheusMetricType::counter, "Number of SERVFAIL answers received because of a rule")},
370 { "self-answered", MetricDefinition(PrometheusMetricType::counter, "Number of self-answered responses")},
371 { "downstream-timeouts", MetricDefinition(PrometheusMetricType::counter, "Number of queries not answered in time by a backend")},
372 { "downstream-send-errors", MetricDefinition(PrometheusMetricType::counter, "Number of errors when sending a query to a backend")},
373 { "trunc-failures", MetricDefinition(PrometheusMetricType::counter, "Number of errors encountered while truncating an answer")},
374 { "no-policy", MetricDefinition(PrometheusMetricType::counter, "Number of queries dropped because no server was available")},
375 { "latency0-1", MetricDefinition(PrometheusMetricType::counter, "Number of queries answered in less than 1ms")},
376 { "latency1-10", MetricDefinition(PrometheusMetricType::counter, "Number of queries answered in 1-10 ms")},
377 { "latency10-50", MetricDefinition(PrometheusMetricType::counter, "Number of queries answered in 10-50 ms")},
378 { "latency50-100", MetricDefinition(PrometheusMetricType::counter, "Number of queries answered in 50-100 ms")},
379 { "latency100-1000", MetricDefinition(PrometheusMetricType::counter, "Number of queries answered in 100-1000 ms")},
380 { "latency-slow", MetricDefinition(PrometheusMetricType::counter, "Number of queries answered in more than 1 second")},
381 { "latency-avg100", MetricDefinition(PrometheusMetricType::gauge, "Average response latency in microseconds of the last 100 packets")},
382 { "latency-avg1000", MetricDefinition(PrometheusMetricType::gauge, "Average response latency in microseconds of the last 1000 packets")},
383 { "latency-avg10000", MetricDefinition(PrometheusMetricType::gauge, "Average response latency in microseconds of the last 10000 packets")},
384 { "latency-avg1000000", MetricDefinition(PrometheusMetricType::gauge, "Average response latency in microseconds of the last 1000000 packets")},
385 { "uptime", MetricDefinition(PrometheusMetricType::gauge, "Uptime of the dnsdist process in seconds")},
386 { "real-memory-usage", MetricDefinition(PrometheusMetricType::gauge, "Current memory usage in bytes")},
387 { "noncompliant-queries", MetricDefinition(PrometheusMetricType::counter, "Number of queries dropped as non-compliant")},
388 { "noncompliant-responses", MetricDefinition(PrometheusMetricType::counter, "Number of answers from a backend dropped as non-compliant")},
389 { "rdqueries", MetricDefinition(PrometheusMetricType::counter, "Number of received queries with the recursion desired bit set")},
390 { "empty-queries", MetricDefinition(PrometheusMetricType::counter, "Number of empty queries received from clients")},
391 { "cache-hits", MetricDefinition(PrometheusMetricType::counter, "Number of times an answer was retrieved from cache")},
// NOTE(review): the description below is missing a verb ("an answer was not found") --
// it is exported text, so confirm before changing it.
392 { "cache-misses", MetricDefinition(PrometheusMetricType::counter, "Number of times an answer not found in the cache")},
393 { "cpu-user-msec", MetricDefinition(PrometheusMetricType::counter, "Milliseconds spent by dnsdist in the user state")},
394 { "cpu-sys-msec", MetricDefinition(PrometheusMetricType::counter, "Milliseconds spent by dnsdist in the system state")},
395 { "fd-usage", MetricDefinition(PrometheusMetricType::gauge, "Number of currently used file descriptors")},
396 { "dyn-blocked", MetricDefinition(PrometheusMetricType::counter, "Number of queries dropped because of a dynamic block")},
397 { "dyn-block-nmg-size", MetricDefinition(PrometheusMetricType::gauge, "Number of dynamic blocks entries") },
398 { "security-status", MetricDefinition(PrometheusMetricType::gauge, "Security status of this software. 0=unknown, 1=OK, 2=upgrade recommended, 3=upgrade mandatory") },
400 { "latency-sum", MetricDefinition(PrometheusMetricType::counter, "Total response time in milliseconds")},
401 { "latency-count", MetricDefinition(PrometheusMetricType::counter, "Number of queries contributing to response time histogram")},
405 extern MetricDefinitionStorage g_metricDefinitions;
406 extern struct DNSDistStats g_stats;
407 void doLatencyStats(double udiff);
// Records a start timestamp and reports elapsed time in microseconds.
// `realTime` is stored and later passed to gettime() on every reading.
412 StopWatch(bool realTime=false): d_needRealTime(realTime)
415 struct timespec d_start{0,0};
416 bool d_needRealTime{false};
// Captures the current time into d_start; terminates via unixDie() if gettime() fails.
419 if(gettime(&d_start, d_needRealTime) < 0)
420 unixDie("Getting timestamp");
// Takes an externally supplied start time.
424 void set(const struct timespec& from) {
// Microseconds elapsed since d_start: whole seconds scaled by 1e6 plus the
// nanosecond difference divided by 1000. d_start is left untouched.
428 double udiff() const {
430 if(gettime(&now, d_needRealTime) < 0)
431 unixDie("Getting timestamp");
433 return 1000000.0*(now.tv_sec - d_start.tv_sec) + (now.tv_nsec - d_start.tv_nsec)/1000.0;
// Same computation as udiff(); going by the name, it presumably also moves d_start to
// `now` (the assignment is not visible in this view -- confirm).
436 double udiffAndSet() {
438 if(gettime(&now, d_needRealTime) < 0)
439 unixDie("Getting timestamp");
441 auto ret= 1000000.0*(now.tv_sec - d_start.tv_sec) + (now.tv_nsec - d_start.tv_nsec)/1000.0;
// Rate limiter that keeps a fractional token count and the time of the previous check.
448 class BasicQPSLimiter
// Starts with `burst` tokens available.
455 BasicQPSLimiter(unsigned int burst): d_tokens(burst)
// Adds `rate` tokens per elapsed second (delta is in microseconds), compares the total
// against `burst`, then tests whether at least one whole token is available.
// Declared const; it updates state through the mutable members below.
460 bool check(unsigned int rate, unsigned int burst) const // this is not quite fair
462 auto delta = d_prev.udiffAndSet();
464 if(delta > 0.0) // time, frequently, does go backwards..
465 d_tokens += 1.0 * rate * (delta/1000000.0);
467 if(d_tokens > burst) {
472 if(d_tokens >= 1.0) { // we need this because burst=1 is weird otherwise
// True when the stopwatch's stored start time is later than `cutOff`.
480 bool seenSince(const struct timespec& cutOff) const
482 return cutOff < d_prev.d_start;
486 mutable StopWatch d_prev;
487 mutable double d_tokens;
// BasicQPSLimiter with a stored rate and burst plus passed/blocked counters.
490 class QPSLimiter : public BasicQPSLimiter
// Default construction leaves d_passthrough at its default of true.
493 QPSLimiter(): BasicQPSLimiter()
// An explicit rate and burst turn passthrough off.
497 QPSLimiter(unsigned int rate, unsigned int burst): BasicQPSLimiter(burst), d_rate(rate), d_burst(burst), d_passthrough(false)
// Reports 0 while in passthrough mode, the configured rate otherwise.
502 unsigned int getRate() const
504 return d_passthrough ? 0 : d_rate;
507 int getPassed() const
512 int getBlocked() const
// Delegates to BasicQPSLimiter::check() with the stored rate and burst.
517 bool check() const // this is not quite fair
523 bool ret = BasicQPSLimiter::check(d_rate, d_burst);
// Mutable so the const check() can update them.
534 mutable unsigned int d_passed{0};
535 mutable unsigned int d_blocked{0};
537 unsigned int d_burst;
538 bool d_passthrough{true};
// Default state: the stopwatch is constructed with realTime=true, no delay, no
// temp-failure TTL, and the destination address family is zeroed.
545 IDState(): sentTime(true), delayMsec(0), tempFailureTTL(boost::none) { origDest.sin4.sin_family = 0;}
// Hand-written copy: the atomic usage indicator is transferred with load()/store().
// Only the members named below are visibly copied in this view.
546 IDState(const IDState& orig): origRemote(orig.origRemote), origDest(orig.origDest), age(orig.age)
548 usageIndicator.store(orig.usageIndicator.load());
549 origFD = orig.origFD;
550 origID = orig.origID;
551 delayMsec = orig.delayMsec;
552 tempFailureTTL = orig.tempFailureTTL;
// Sentinel stored in usageIndicator while the state is free.
555 static const int64_t unusedIndicator = -1;
// True when the given indicator value differs from the sentinel.
557 static bool isInUse(int64_t usageIndicator)
559 return usageIndicator != unusedIndicator;
564 return usageIndicator != unusedIndicator;
567 /* return true if the value has been successfully replaced meaning that
568 no-one updated the usage indicator in the meantime */
// Atomically swaps in the sentinel only if the indicator still holds the expected value.
569 bool tryMarkUnused(int64_t expectedUsageIndicator)
571 return usageIndicator.compare_exchange_strong(expectedUsageIndicator, unusedIndicator);
574 /* mark as used no matter what, return true if the state was in use before */
577 auto currentGeneration = generation++;
578 return markAsUsed(currentGeneration);
581 /* mark as used no matter what, return true if the state was in use before */
582 bool markAsUsed(int64_t currentGeneration)
584 int64_t oldUsage = usageIndicator.exchange(currentGeneration);
585 return oldUsage != unusedIndicator;
588 /* We use this value to detect whether this state is in use.
589 For performance reasons we don't want to use a lock here, but that means
590 we need to be very careful when modifying this value. Modifications happen
592 - one of the UDP or DoH 'client' threads receiving a query, selecting a backend
593 then picking one of the states associated to this backend (via the idOffset).
594 Most of the time this state should not be in use and usageIndicator is -1, but we
595 might not yet have received a response for the query previously associated to this
596 state, meaning that we will 'reuse' this state and erase the existing state.
597 If we ever receive a response for this state, it will be discarded. This is
598 mostly fine for UDP except that we still need to be careful in order not to mess up
599 the 'outstanding' counters, which should only be increased when we are picking
600 an empty state, and not when reusing ;
601 For DoH, though, we have dynamically allocated a DOHUnit object that needs to
602 be freed, as well as objects internal to libh2o.
603 - one of the UDP receiver threads receiving a response from a backend, picking
604 the corresponding state and sending the response to the client ;
605 - the 'healthcheck' thread scanning the states to actively discover timeouts,
606 mostly to keep some counters like the 'outstanding' one sane.
607 We previously based that logic on the origFD (FD on which the query was received,
608 and therefore from where the response should be sent) but this suffered from an
609 ABA problem since it was quite likely that a UDP 'client thread' would reset it to the
610 same value since we only have so many incoming sockets:
611 - 1/ 'client' thread gets a query and sets origFD to its FD, say 5 ;
612 - 2/ 'receiver' thread gets a response, reads the value of origFD as 5, checks that the qname,
613 qtype and qclass match
614 - 3/ during that time the 'client' thread reuses the state, setting again origFD to 5 ;
615 - 4/ the 'receiver' thread uses compare_exchange_strong() to only replace the value if it's still
616 5, except it's not the same 5 anymore and it overrides a fresh state.
617 We now use a 32-bit unsigned counter instead, which is incremented every time the state is set,
618 wrapping around if necessary, and we set an atomic signed 64-bit value, so that we still have -1
619 when the state is unused and the value of our counter otherwise.
// Data members. The trailing numbers in the existing comments appear to be the
// authors' per-member size notes in bytes -- not verified here.
621 std::atomic<int64_t> usageIndicator{unusedIndicator}; // set to unusedIndicator to indicate this state is empty // 8
622 std::atomic<uint32_t> generation{0}; // increased every time a state is used, to be able to detect an ABA issue // 4
623 ComboAddress origRemote; // 28
624 ComboAddress origDest; // 28
625 StopWatch sentTime; // 16
// Optional values and shared handles, all empty by default.
627 std::shared_ptr<DNSCryptQuery> dnsCryptQuery{nullptr};
629 boost::optional<boost::uuids::uuid> uniqueId;
631 boost::optional<Netmask> subnet{boost::none};
632 std::shared_ptr<DNSDistPacketCache> packetCache{nullptr};
633 std::shared_ptr<QTag> qTag{nullptr};
634 const ClientState* cs{nullptr};
635 DOHUnit* du{nullptr};
// NOTE(review): the integer members below have no default initializer; origID is visibly
// assigned only in the copy constructor -- confirm the others are set before being read.
636 uint32_t cacheKey; // 4
637 uint32_t cacheKeyNoECS; // 4
640 uint16_t qclass; // 2
641 uint16_t origID; // 2
642 uint16_t origFlags; // 2
645 boost::optional<uint32_t> tempFailureTTL;
646 bool ednsAdded{false};
647 bool ecsAdded{false};
648 bool skipCache{false};
649 bool destHarvested{false}; // if true, origDest holds the original dest addr, otherwise the listening addr
650 bool dnssecOK{false};
// Per-key counters, keyed by string.
654 typedef std::unordered_map<string, unsigned int> QueryCountRecords;
// Callback that receives a query and returns a (bool, string) pair; presumably
// "count this query?" and the key to count it under -- confirm against the users.
655 typedef std::function<std::tuple<bool, string>(const DNSQuestion* dq)> QueryCountFilter;
// The read/write lock is initialized with default attributes.
659 pthread_rwlock_init(&queryLock, nullptr);
661 QueryCountRecords records;
662 QueryCountFilter filter;
663 pthread_rwlock_t queryLock;
667 extern QueryCount g_qcount;
// State of one listening frontend: its local address, interface name, CPU set,
// TCP fast-open queue size, and whether it is TCP and uses reuseport.
671 ClientState(const ComboAddress& local_, bool isTCP, bool doReusePort, int fastOpenQueue, const std::string& itfName, const std::set<int>& cpus_): cpus(cpus_), local(local_), interface(itfName), fastOpenQueueSize(fastOpenQueue), tcp(isTCP), reuseport(doReusePort)
// Optional protocol contexts; at most one is reported by getType() below.
677 std::shared_ptr<DNSCryptContext> dnscryptCtx{nullptr};
678 std::shared_ptr<TLSFrontend> tlsFrontend{nullptr};
679 std::shared_ptr<DOHFrontend> dohFrontend{nullptr};
680 std::string interface;
// Per-frontend counters, all atomics starting at zero.
681 std::atomic<uint64_t> queries{0};
682 std::atomic<uint64_t> tcpDiedReadingQuery{0};
683 std::atomic<uint64_t> tcpDiedSendingResponse{0};
684 std::atomic<uint64_t> tcpGaveUp{0};
685 std::atomic<uint64_t> tcpClientTimeouts{0};
686 std::atomic<uint64_t> tcpDownstreamTimeouts{0};
687 std::atomic<uint64_t> tcpCurrentConnections{0};
688 std::atomic<double> tcpAvgQueriesPerConnection{0.0};
690 std::atomic<double> tcpAvgConnectionDuration{0.0};
693 int fastOpenQueueSize{0};
// Returns the UDP descriptor when it is set (not -1), the TCP descriptor otherwise.
699 int getSocket() const
701 return udpFD != -1 ? udpFD : tcpFD;
// "UDP" or "TCP" depending on which descriptor is set, followed by a protocol suffix.
704 std::string getType() const
706 std::string result = udpFD != -1 ? "UDP" : "TCP";
709 result += " (DNS over HTTPS)";
711 else if (tlsFrontend) {
712 result += " (DNS over TLS)";
714 else if (dnscryptCtx) {
715 result += " (DNSCrypt)";
722 shared_ptr<BPFFilter> d_filter;
// Removes this frontend's socket from the currently attached filter.
727 d_filter->removeSocket(getSocket());
// Registers this frontend's socket with the given filter.
732 void attachFilter(shared_ptr<BPFFilter> bpf)
736 bpf->addSocket(getSocket());
739 #endif /* HAVE_EBPF */
// Folds a finished connection into the running averages: new = 99% of old + 1% of sample.
// NOTE(review): each statement loads the atomic, computes, then stores; the update as a
// whole is not atomic, so concurrent callers can overwrite each other -- confirm this is acceptable.
741 void updateTCPMetrics(size_t queries, uint64_t durationMs)
743 tcpAvgQueriesPerConnection = (99.0 * tcpAvgQueriesPerConnection / 100.0) + (queries / 100.0);
744 tcpAvgConnectionDuration = (99.0 * tcpAvgConnectionDuration / 100.0) + (durationMs / 100.0);
// Tracks the descriptors used to reach the TCP client threads, plus counters for
// threads, dispatch position and queued items.
748 class TCPClientCollection {
749 std::vector<int> d_tcpclientthreads;
750 std::atomic<uint64_t> d_numthreads{0};
751 std::atomic<uint64_t> d_pos{0};
752 std::atomic<uint64_t> d_queued{0};
753 const uint64_t d_maxthreads{0};
756 const bool d_useSinglePipe;
// Reserves room for maxThreads descriptors. In single-pipe mode it also creates the pipe
// and makes both ends non-blocking; on failure both ends are closed and a
// std::runtime_error is thrown.
759 TCPClientCollection(size_t maxThreads, bool useSinglePipe=false): d_maxthreads(maxThreads), d_singlePipe{-1,-1}, d_useSinglePipe(useSinglePipe)
762 d_tcpclientthreads.reserve(maxThreads);
764 if (d_useSinglePipe) {
765 if (pipe(d_singlePipe) < 0) {
767 throw std::runtime_error("Error creating the TCP single communication pipe: " + stringerror(err));
770 if (!setNonBlocking(d_singlePipe[0])) {
772 close(d_singlePipe[0]);
773 close(d_singlePipe[1]);
774 throw std::runtime_error("Error setting the TCP single communication pipe non-blocking: " + stringerror(err));
777 if (!setNonBlocking(d_singlePipe[1])) {
779 close(d_singlePipe[0]);
780 close(d_singlePipe[1]);
781 throw std::runtime_error("Error setting the TCP single communication pipe non-blocking: " + stringerror(err));
// Picks the next descriptor by post-incrementing d_pos and taking it modulo the thread count.
// NOTE(review): the modulo is undefined when d_numthreads is 0 -- confirm callers only
// reach this after at least one thread was added.
787 uint64_t pos = d_pos++;
789 return d_tcpclientthreads[pos % d_numthreads];
791 bool hasReachedMaxThreads() const
793 return d_numthreads >= d_maxthreads;
795 uint64_t getThreadsCount() const
799 uint64_t getQueuedCount() const
803 void decrementQueuedCount()
// Declared here, defined elsewhere.
807 void addTCPClientThread();
810 extern std::unique_ptr<TCPClientCollection> g_tcpclientthreads;
// State kept for one backend server: addresses, sockets, health-check settings,
// counters and availability.
812 struct DownstreamState
// Health-check customization hook: receives a name, two 16-bit values and the DNS header,
// and returns a (name, uint16_t, uint16_t) tuple. Presumably the two integers are the
// query type and class -- confirm against the callers.
814 typedef std::function<std::tuple<DNSName, uint16_t, uint16_t>(const DNSName&, uint16_t, uint16_t, dnsheader*)> checkfunc_t;
816 DownstreamState(const ComboAddress& remote_, const ComboAddress& sourceAddr_, unsigned int sourceItf, size_t numberOfSockets);
// Convenience form: default source address, source interface 0, one socket.
817 DownstreamState(const ComboAddress& remote_): DownstreamState(remote_, ComboAddress(), 0, 1) {}
820 for (auto& fd : sockets) {
827 boost::uuids::uuid id;
828 std::set<unsigned int> hashes;
829 mutable pthread_rwlock_t d_lock;
830 std::vector<int> sockets;
831 std::mutex socketsLock;
832 std::mutex connectLock;
833 std::unique_ptr<FDMultiplexer> mplexer{nullptr};
835 const ComboAddress remote;
837 vector<IDState> idStates;
838 const ComboAddress sourceAddr;
// Health-check defaults: an A/IN query for a.root-servers.net.
839 checkfunc_t checkFunction;
840 DNSName checkName{"a.root-servers.net."};
841 QType checkType{QType::A};
842 uint16_t checkClass{QClass::IN};
843 std::atomic<uint64_t> idOffset{0};
844 std::atomic<uint64_t> sendErrors{0};
845 std::atomic<uint64_t> outstanding{0};
846 std::atomic<uint64_t> reuseds{0};
847 std::atomic<uint64_t> queries{0};
// NOTE(review): the next three repeat names declared just above; presumably they belong
// to a nested aggregate whose braces are not visible in this view -- confirm.
849 std::atomic<uint64_t> sendErrors{0};
850 std::atomic<uint64_t> reuseds{0};
851 std::atomic<uint64_t> queries{0};
853 std::atomic<uint64_t> tcpDiedSendingQuery{0};
854 std::atomic<uint64_t> tcpDiedReadingResponse{0};
855 std::atomic<uint64_t> tcpGaveUp{0};
856 std::atomic<uint64_t> tcpReadTimeouts{0};
857 std::atomic<uint64_t> tcpWriteTimeouts{0};
858 std::atomic<uint64_t> tcpCurrentConnections{0};
859 std::atomic<double> tcpAvgQueriesPerConnection{0.0};
861 std::atomic<double> tcpAvgConnectionDuration{0.0};
863 size_t socketsOffset{0};
864 double queryLoad{0.0};
865 double dropRate{0.0};
866 double latencyUsec{0.0};
// Timeout defaults; units are not stated here, presumably seconds -- confirm.
869 int tcpConnectTimeout{5};
870 int tcpRecvTimeout{30};
871 int tcpSendTimeout{30};
872 unsigned int checkInterval{1};
873 unsigned int lastCheck{0};
874 const unsigned int sourceItf{0};
876 uint16_t xpfRRCode{0};
877 uint16_t checkTimeout{1000}; /* in milliseconds */
878 uint8_t currentCheckFailures{0};
879 uint8_t consecutiveSuccessfulChecks{0};
880 uint8_t maxCheckFailures{1};
881 uint8_t minRiseSuccesses{1};
// Up and Down are forced states; Auto is the default.
884 enum class Availability { Up, Down, Auto} availability{Availability::Auto};
885 bool mustResolve{false};
886 bool upStatus{false};
889 bool disableZeroScope{false};
890 std::atomic<bool> connected{false};
891 std::atomic_flag threadStarted;
892 bool tcpFastOpen{false};
893 bool ipBindAddrNoPort{true};
// The forced states are checked first. Presumably upStatus decides otherwise, as it does
// in getStatus() below (the returns are not visible in this view -- confirm).
897 if(availability == Availability::Down)
899 if(availability == Availability::Up)
903 void setUp() { availability = Availability::Up; }
904 void setDown() { availability = Availability::Down; }
905 void setAuto() { availability = Availability::Auto; }
// Falls back to the remote address with port; the condition selecting that branch is
// not visible in this view.
906 string getName() const {
908 return remote.toStringWithPort();
// Either the bare remote address with port, or "name (address:port)".
912 string getNameWithAddr() const {
914 return remote.toStringWithPort();
916 return name + " (" + remote.toStringWithPort()+ ")";
// Textual status: the forced states are handled first, otherwise "up" or "down" from upStatus.
918 string getStatus() const
921 if(availability == DownstreamState::Availability::Up)
923 else if(availability == DownstreamState::Availability::Down)
926 status = (upStatus ? "up" : "down");
931 void setId(const boost::uuids::uuid& newId);
932 void setWeight(int newWeight);
// Folds a finished connection into the running averages: new = 99% of old + 1% of sample.
// NOTE(review): load, compute, store -- the update as a whole is not atomic; confirm
// that concurrent callers are acceptable.
934 void updateTCPMetrics(size_t queries, uint64_t durationMs)
936 tcpAvgQueriesPerConnection = (99.0 * tcpAvgQueriesPerConnection / 100.0) + (queries / 100.0);
937 tcpAvgConnectionDuration = (99.0 * tcpAvgConnectionDuration / 100.0) + (durationMs / 100.0);
940 using servers_t =vector<std::shared_ptr<DownstreamState>>;
942 template <class T> using NumberedVector = std::vector<std::pair<unsigned int, T> >;
944 void responderThread(std::shared_ptr<DownstreamState> state);
945 extern std::mutex g_luamutex;
946 extern LuaContext g_lua;
947 extern std::string g_outputBuffer; // locking for this is ok, as locked by g_luamutex
// Returns whether the rule applies to the given query; must be implemented by subclasses.
955 virtual bool matches(const DNSQuestion* dq) const =0;
// Human-readable description of the rule; must be implemented by subclasses.
956 virtual string toString() const = 0;
// Atomic counter, mutable so it can be updated through a const rule. Presumably the
// number of matches -- the increment is not visible in this view.
957 mutable std::atomic<uint64_t> d_matches{0};
960 using NumberedServerVector = NumberedVector<shared_ptr<DownstreamState>>;
961 typedef std::function<shared_ptr<DownstreamState>(const NumberedServerVector& servers, const DNSQuestion*)> policyfunc_t;
// Renders as: ServerPolicy, then " (Lua)" when isLua is set, then the quoted policy name.
968 std::string toString() const {
969 return string("ServerPolicy") + (isLua ? " (Lua)" : "") + " \"" + name + "\"";
// The pool's read/write lock is initialized with default attributes.
977 pthread_rwlock_init(&d_lock, nullptr);
// Returns the pool's cache handle (may be empty).
980 const std::shared_ptr<DNSDistPacketCache> getCache() const { return packetCache; };
987 void setECS(bool useECS)
992 std::shared_ptr<DNSDistPacketCache> packetCache{nullptr};
993 std::shared_ptr<ServerPolicy> policy{nullptr};
// Walks the servers under the read lock; with upOnly set, only servers whose isUp() is
// true satisfy the condition.
995 size_t countServers(bool upOnly)
998 ReadLock rl(&d_lock);
999 for (const auto& server : d_servers) {
1000 if (!upOnly || std::get<1>(server)->isUp() ) {
// Builds and returns a numbered server list while holding the read lock.
1007 NumberedVector<shared_ptr<DownstreamState>> getServers()
1009 NumberedVector<shared_ptr<DownstreamState>> result;
1011 ReadLock rl(&d_lock);
// Under the write lock: appends the server numbered size+1, stable-sorts by each server's
// `order`, then walks the list again to renumber.
1017 void addServer(shared_ptr<DownstreamState>& server)
1019 WriteLock wl(&d_lock);
1020 unsigned int count = (unsigned int) d_servers.size();
1021 d_servers.push_back(make_pair(++count, server));
1022 /* we need to reorder based on the server 'order' */
1023 std::stable_sort(d_servers.begin(), d_servers.end(), [](const std::pair<unsigned int,std::shared_ptr<DownstreamState> >& a, const std::pair<unsigned int,std::shared_ptr<DownstreamState> >& b) {
1024 return a.second->order < b.second->order;
1026 /* and now we need to renumber for Lua (custom policies) */
1028 for (auto& serv : d_servers) {
// Under the write lock: erases the entry holding this server. The iterator is advanced
// manually because erase() returns the next position.
1033 void removeServer(shared_ptr<DownstreamState>& server)
1035 WriteLock wl(&d_lock);
1038 for (auto it = d_servers.begin(); it != d_servers.end();) {
1040 /* we need to renumber the servers placed
1041 after the removed one, for Lua (custom policies) */
1045 else if (it->second == server) {
1046 it = d_servers.erase(it);
// (number, server) pairs; accessed under d_lock in the methods above.
1056 NumberedVector<shared_ptr<DownstreamState>> d_servers;
1057 pthread_rwlock_t d_lock;
1058 bool d_useECS{false};
1060 using pools_t=map<std::string,std::shared_ptr<ServerPool>>;
1061 void setPoolPolicy(pools_t& pools, const string& poolName, std::shared_ptr<ServerPolicy> policy);
1062 void addServerToPool(pools_t& pools, const string& poolName, std::shared_ptr<DownstreamState> server);
1063 void removeServerFromPool(pools_t& pools, const string& poolName, std::shared_ptr<DownstreamState> server);
// Settings for one Carbon export target: server address, the three name components and
// the interval. The interval's unit is not stated here, presumably seconds -- confirm.
1067 ComboAddress server;
1068 std::string namespace_name;
1069 std::string ourname;
1070 std::string instance_name;
1071 unsigned int interval;
// EDNS header flag bits. 32768 is 0x8000, the top bit of a 16-bit field.
1074 enum ednsHeaderFlags {
1075 EDNS_HEADER_FLAG_NONE = 0,
1076 EDNS_HEADER_FLAG_DO = 32768
// Binds a rule to the query action associated with it, plus an id and a creation-order number.
1079 struct DNSDistRuleAction
1081 std::shared_ptr<DNSRule> d_rule;
1082 std::shared_ptr<DNSAction> d_action;
1083 boost::uuids::uuid d_id;
1084 uint64_t d_creationOrder;
// Response-side counterpart of DNSDistRuleAction: same layout, but the action is a DNSResponseAction.
1087 struct DNSDistResponseRuleAction
1089 std::shared_ptr<DNSRule> d_rule;
1090 std::shared_ptr<DNSResponseAction> d_action;
1091 boost::uuids::uuid d_id;
1092 uint64_t d_creationOrder;
/* Shared global state. Everything wrapped in a GlobalStateHolder is read by
   worker code through a local view obtained with getLocal() (see the
   LocalHolders constructor). */
/* dynamic blocks stored in a SuffixMatchTree */
1095 extern GlobalStateHolder<SuffixMatchTree<DynBlock>> g_dynblockSMT;
/* action associated with dynamic blocks; presumably the default one -
   TODO confirm */
1096 extern DNSAction::Action g_dynBlockAction;
/* Carbon export settings, one CarbonConfig per target */
1098 extern GlobalStateHolder<vector<CarbonConfig> > g_carbon;
/* the global server selection policy */
1099 extern GlobalStateHolder<ServerPolicy> g_policy;
/* the backend servers */
1100 extern GlobalStateHolder<servers_t> g_dstates;
/* the server pools, by name */
1101 extern GlobalStateHolder<pools_t> g_pools;
/* query rules */
1102 extern GlobalStateHolder<vector<DNSDistRuleAction> > g_rulactions;
/* response rules; by their names the three lists are for regular
   responses, cache hits and self-answered responses respectively */
1103 extern GlobalStateHolder<vector<DNSDistResponseRuleAction> > g_resprulactions;
1104 extern GlobalStateHolder<vector<DNSDistResponseRuleAction> > g_cachehitresprulactions;
1105 extern GlobalStateHolder<vector<DNSDistResponseRuleAction> > g_selfansweredresprulactions;
/* access control list, as a group of netmasks */
1106 extern GlobalStateHolder<NetmaskGroup> g_ACL;
/* Addresses and frontends this process serves. */
/* address of the control endpoint */
1108 extern ComboAddress g_serverControl; // not changed during runtime
/* one tuple per configured local address.
   NOTE(review): the meaning of the bool/bool/int/string/set<int> fields is
   not documented here - check the code that fills g_locals. */
1110 extern std::vector<std::tuple<ComboAddress, bool, bool, int, std::string, std::set<int>>> g_locals; // not changed at runtime (we hope XXX)
/* TLS and DoH frontends (shared ownership) */
1111 extern std::vector<shared_ptr<TLSFrontend>> g_tlslocals;
1112 extern std::vector<shared_ptr<DOHFrontend>> g_dohlocals;
/* the ClientState objects, owned by this vector through unique_ptr */
1113 extern std::vector<std::unique_ptr<ClientState>> g_frontends;
/* Global settings. Only g_configurationDone, g_cacheCleaningDelay and
   g_cacheCleaningPercentage are std::atomic; all the others are plain
   variables. NOTE(review): units (seconds, bytes, ...) and default values
   are not stated in this header section - check the definitions. */
1114 extern bool g_truncateTC;
1115 extern bool g_fixupCase;
/* timeouts; unit not stated here */
1116 extern int g_tcpRecvTimeout;
1117 extern int g_tcpSendTimeout;
1118 extern int g_udpTimeout;
1119 extern uint16_t g_maxOutstanding;
/* atomic flag; by its name, set once the configuration has been processed */
1120 extern std::atomic<bool> g_configurationDone;
/* TCP limits */
1121 extern uint64_t g_maxTCPClientThreads;
1122 extern uint64_t g_maxTCPQueuedConnections;
1123 extern size_t g_maxTCPQueriesPerConn;
1124 extern size_t g_maxTCPConnectionDuration;
1125 extern size_t g_maxTCPConnectionsPerClient;
/* packet cache cleaning parameters (atomic) */
1126 extern std::atomic<uint16_t> g_cacheCleaningDelay;
1127 extern std::atomic<uint16_t> g_cacheCleaningPercentage;
1128 extern bool g_verboseHealthChecks;
1129 extern uint32_t g_staleCacheEntriesTTL;
/* API settings */
1130 extern bool g_apiReadWrite;
1131 extern std::string g_apiConfigDirectory;
1132 extern bool g_servFailOnNoPolicy;
1133 extern uint32_t g_hashperturb;
1134 extern bool g_useTCPSinglePipe;
1135 extern uint16_t g_downstreamTCPCleanupInterval;
1136 extern size_t g_udpVectorSize;
1137 extern bool g_preserveTrailingData;
1138 extern bool g_allowEmptyResponse;
1139 extern bool g_roundrobinFailOnNoServer;
/* eBPF support: the default BPF filter and the list of dynamic BPF filters,
   both held through shared_ptr. The #endif below closes the HAVE_EBPF
   conditional that guards these declarations. */
1142 extern shared_ptr<BPFFilter> g_defaultBPFFilter;
1143 extern std::vector<std::shared_ptr<DynBPFFilter> > g_dynBPFFilters;
1144 #endif /* HAVE_EBPF */
/* Bundle of local views on the global state holders.
   The constructor fills every member by calling getLocal() on the matching
   global (g_ACL, g_policy, g_rulactions, g_cachehitresprulactions,
   g_selfansweredresprulactions, g_dstates, g_dynblockNMG, g_dynblockSMT,
   g_pools). The initializers are listed in the same order as the members
   are declared below. There is no member for g_resprulactions. */
1148 LocalHolders(): acl(g_ACL.getLocal()), policy(g_policy.getLocal()), rulactions(g_rulactions.getLocal()), cacheHitRespRulactions(g_cachehitresprulactions.getLocal()), selfAnsweredRespRulactions(g_selfansweredresprulactions.getLocal()), servers(g_dstates.getLocal()), dynNMGBlock(g_dynblockNMG.getLocal()), dynSMTBlock(g_dynblockSMT.getLocal()), pools(g_pools.getLocal())
/* view on g_ACL */
1152 LocalStateHolder<NetmaskGroup> acl;
/* view on g_policy */
1153 LocalStateHolder<ServerPolicy> policy;
/* view on g_rulactions (query rules) */
1154 LocalStateHolder<vector<DNSDistRuleAction> > rulactions;
/* view on g_cachehitresprulactions */
1155 LocalStateHolder<vector<DNSDistResponseRuleAction> > cacheHitRespRulactions;
/* view on g_selfansweredresprulactions */
1156 LocalStateHolder<vector<DNSDistResponseRuleAction> > selfAnsweredRespRulactions;
/* view on g_dstates */
1157 LocalStateHolder<servers_t> servers;
/* view on g_dynblockNMG */
1158 LocalStateHolder<NetmaskTree<DynBlock> > dynNMGBlock;
/* view on g_dynblockSMT */
1159 LocalStateHolder<SuffixMatchTree<DynBlock> > dynSMTBlock;
/* view on g_pools */
1160 LocalStateHolder<pools_t> pools;
/* entry point of the control thread; receives a file descriptor and the
   local address (passed by value) */
1165 void controlThread(int fd, ComboAddress local);
/* Sets up Lua from `config` and returns a vector of callables.
   NOTE(review): the role of `client` and of the returned callables is not
   visible from this declaration - check the definition. */
1166 vector<std::function<void(void)>> setupLua(bool client, const std::string& config);
/* looks up poolName in `pools` (read-only);
   NOTE(review): behaviour for an unknown name is not visible here */
1167 std::shared_ptr<ServerPool> getPool(const pools_t& pools, const std::string& poolName);
/* per its name, returns the pool called poolName, adding it to `pools`
   (taken by non-const reference) when it does not exist yet */
1168 std::shared_ptr<ServerPool> createPoolIfNotExists(pools_t& pools, const string& poolName);
/* returns, by value, the numbered server list for the pool called poolName */
1169 NumberedServerVector getDownstreamCandidates(const pools_t& pools, const std::string& poolName);
/* Server selection functions. They all share one signature: given the
   candidate servers and the question being processed, return the chosen
   backend. NOTE(review): presumably the returned pointer is empty when no
   server can be used - confirm in the definitions. The algorithm of each
   function is only indicated by its name here. */
1171 std::shared_ptr<DownstreamState> firstAvailable(const NumberedServerVector& servers, const DNSQuestion* dq);
1173 std::shared_ptr<DownstreamState> leastOutstanding(const NumberedServerVector& servers, const DNSQuestion* dq);
1174 std::shared_ptr<DownstreamState> wrandom(const NumberedServerVector& servers, const DNSQuestion* dq);
1175 std::shared_ptr<DownstreamState> whashed(const NumberedServerVector& servers, const DNSQuestion* dq);
1176 std::shared_ptr<DownstreamState> chashed(const NumberedServerVector& servers, const DNSQuestion* dq);
1177 std::shared_ptr<DownstreamState> roundrobin(const NumberedServerVector& servers, const DNSQuestion* dq);
/* Settings of the built-in web server. */
1179 struct WebserverConfig
/* web server password, see setWebserverPassword() */
1181 std::string password;
/* optional header name -> value map, see setWebserverCustomHeaders();
   boost::optional, so "not set" is distinct from an empty map */
1183 boost::optional<std::map<std::string, std::string> > customHeaders;
/* Setters for the web server settings. The two boost::optional arguments
   are taken by (const) value, so the caller's object is copied. */
1187 void setWebserverAPIKey(const boost::optional<std::string> apiKey);
1188 void setWebserverPassword(const std::string& password);
1189 void setWebserverCustomHeaders(const boost::optional<std::map<std::string, std::string> > customHeaders);
/* entry point of the web server thread; receives a socket and the local
   address */
1191 void dnsdistWebserverThread(int sock, const ComboAddress& local);
/* entry point of the TCP acceptor thread.
   NOTE(review): `p` is an untyped pointer; what it must point to is not
   visible here - check the definition. */
1192 void tcpAcceptorThread(void* p);
/* DNS over HTTPS thread entry point, only declared when
   HAVE_DNS_OVER_HTTPS is defined; receives the ClientState to serve */
1193 #ifdef HAVE_DNS_OVER_HTTPS
1194 void dohThread(ClientState* cs);
1195 #endif /* HAVE_DNS_OVER_HTTPS */
/* Tracking of Lua side effects. As described by the per-function comments
   there are three states: indeterminate (nothing declared yet), "no side
   effect" (only explicit no-side-effect declarations so far) and "side
   effect" (which overrides any no-side-effect declaration). */
1197 void setLuaNoSideEffect(); // if nothing has been declared, set that there are no side effects
1198 void setLuaSideEffect(); // set to report a side effect, cancelling all _no_ side effect calls
1199 bool getLuaNoSideEffect(); // set if there were only explicit declarations of _no_ side effect
1200 void resetLuaSideEffect(); // reset to indeterminate state
/* Checks a response buffer against the expected qname, qtype and qclass
   for the given remote. `consumed` is a non-const reference, so it can be
   written by the callee. NOTE(review): exact meaning of the return value
   and of `consumed` to be confirmed in the definition. */
1202 bool responseContentMatches(const char* response, const uint16_t responseLen, const DNSName& qname, const uint16_t qtype, const uint16_t qclass, const ComboAddress& remote, unsigned int& consumed);
/* Processes a response using the given local view of response rules.
   response, responseLen and responseSize are passed by pointer and
   rewrittenResponse by non-const reference, so all of them may be updated.
   NOTE(review): semantics of addRoom, muted and of the return value are
   not visible here. */
1203 bool processResponse(char** response, uint16_t* responseLen, size_t* responseSize, LocalStateHolder<vector<DNSDistResponseRuleAction> >& localRespRulactions, DNSResponse& dr, size_t addRoom, std::vector<uint8_t>& rewrittenResponse, bool muted);
/* Handles the outcome `action` of a rule for the question dq; ruleresult
   and drop are in/out references. NOTE(review): meaning of the return
   value to be confirmed. */
1204 bool processRulesResult(const DNSAction::Action& action, DNSQuestion& dq, std::string& ruleresult, bool& drop);
/* validates the DNS header of an incoming query (read-only access) */
1206 bool checkQueryHeaders(const struct dnsheader* dh);
/* DNSCrypt support. */
/* the DNSCrypt contexts (shared ownership) */
1208 extern std::vector<std::shared_ptr<DNSCryptContext>> g_dnsCryptLocals;
/* Handles a DNSCrypt packet of `len` bytes. decryptedQueryLen is an output
   pointer and `response` an output buffer.
   NOTE(review): meaning of the int return value is not visible here. */
1209 int handleDNSCryptQuery(char* packet, uint16_t len, std::shared_ptr<DNSCryptQuery> query, uint16_t* decryptedQueryLen, bool tcp, time_t now, std::vector<uint8_t>& response);
/* Checks a query received on `cs`. `len` and dnsCryptQuery are non-const
   references and may be updated. The result is an optional byte vector;
   NOTE(review): presumably a response to send back when set - confirm. */
1210 boost::optional<std::vector<uint8_t>> checkDNSCryptQuery(const ClientState& cs, const char* query, uint16_t& len, std::shared_ptr<DNSCryptQuery>& dnsCryptQuery, time_t now, bool tcp);
/* adds XPF data to the question, using the given option code;
   NOTE(review): meaning of the bool result to be confirmed */
1212 bool addXPF(DNSQuestion& dq, uint16_t optionCode);
/* returns a random 16-bit DNS ID */
1214 uint16_t getRandomDNSID();
1216 #include "dnsdist-snmp.hh"
/* SNMP settings and the SNMP agent, held as a raw pointer;
   NOTE(review): presumably null when no agent has been set up - confirm
   before dereferencing */
1218 extern bool g_snmpEnabled;
1219 extern bool g_snmpTrapsEnabled;
1220 extern DNSDistSNMPAgent* g_snmpAgent;
/* by its name: whether EDNS is added to self-generated responses */
1221 extern bool g_addEDNSToSelfGeneratedResponses;
/* size used for incoming UDP buffers: 1500. Declared static in a header,
   so every translation unit including it gets its own copy of the constant. */
1223 static const size_t s_udpIncomingBufferSize{1500};
/* Possible outcomes of processQuery(): drop the query, send an answer, or
   pass the query to a backend. */
1225 enum class ProcessQueryResult { Drop, SendAnswer, PassToBackend };
/* Processes the question dq received on cs, using the local state views in
   `holders`. selectedBackend is a non-const reference, so it can be set by
   the callee; presumably it holds the chosen backend when the result is
   PassToBackend - TODO confirm. */
1226 ProcessQueryResult processQuery(DNSQuestion& dq, ClientState& cs, LocalHolders& holders, std::shared_ptr<DownstreamState>& selectedBackend);
/* builds a DNSResponse from a stored IDState and the response buffer
   described by dh, bufferSize and responseLen */
1228 DNSResponse makeDNSResponseFromIDState(IDState& ids, struct dnsheader* dh, size_t bufferSize, uint16_t responseLen, bool isTCP);
/* fills ids from the question; qname is taken as an rvalue reference, so
   the caller's DNSName may be moved from */
1229 void setIDStateFromDNSQuestion(IDState& ids, DNSQuestion& dq, DNSName&& qname);
/* returns a socket descriptor of `state` to send a query on */
1231 int pickBackendSocketForSending(std::shared_ptr<DownstreamState>& state);
/* sends requestLen bytes of `request` to the backend ss on descriptor sd.
   healthCheck defaults to false. NOTE(review): the ssize_t result is
   presumably the number of bytes sent, or a negative value on error -
   confirm in the definition. */
1232 ssize_t udpClientSendRequestToBackend(const std::shared_ptr<DownstreamState>& ss, const int sd, const char* request, const size_t requestLen, bool healthCheck=false);