2 * This file is part of PowerDNS or dnsdist.
3 * Copyright -- PowerDNS.COM B.V. and its contributors
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of version 2 of the GNU General Public License as
7 * published by the Free Software Foundation.
9 * In addition, for the avoidance of any doubt, permission is granted to
10 * link this program with OpenSSL and to (re)distribute the binaries
11 * produced as the result of such linking.
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
29 #include <netinet/tcp.h>
31 #include <sys/resource.h>
34 #if defined (__OpenBSD__) || defined(__NetBSD__)
35 #include <readline/readline.h>
37 #include <editline/readline.h>
41 #include <systemd/sd-daemon.h>
45 #include "dnsdist-cache.hh"
46 #include "dnsdist-console.hh"
47 #include "dnsdist-ecs.hh"
48 #include "dnsdist-healthchecks.hh"
49 #include "dnsdist-lua.hh"
50 #include "dnsdist-rings.hh"
51 #include "dnsdist-secpoll.hh"
52 #include "dnsdist-xpf.hh"
55 #include "delaypipe.hh"
58 #include "dnsparser.hh"
59 #include "ednsoptions.hh"
63 #include "sodcrypto.hh"
65 #include "threadname.hh"
69 Receiver is currently single threaded
70 not *that* bad actually, but now that we are thread safe, might want to scale
74 Set of Rules, if one matches, it leads to an Action
75 Both rules and actions could conceivably be Lua based.
76 On the C++ side, both could be inherited from a class Rule and a class Action,
77 on the Lua side we can't do that. */
83 struct DNSDistStats g_stats
;
85 uint16_t g_maxOutstanding
{std::numeric_limits
<uint16_t>::max()};
86 uint32_t g_staleCacheEntriesTTL
{0};
88 bool g_allowEmptyResponse
{false};
90 GlobalStateHolder
<NetmaskGroup
> g_ACL
;
91 string g_outputBuffer
;
93 std::vector
<std::shared_ptr
<TLSFrontend
>> g_tlslocals
;
94 std::vector
<std::shared_ptr
<DOHFrontend
>> g_dohlocals
;
95 std::vector
<std::shared_ptr
<DNSCryptContext
>> g_dnsCryptLocals
;
97 shared_ptr
<BPFFilter
> g_defaultBPFFilter
;
98 std::vector
<std::shared_ptr
<DynBPFFilter
> > g_dynBPFFilters
;
99 #endif /* HAVE_EBPF */
100 std::vector
<std::unique_ptr
<ClientState
>> g_frontends
;
101 GlobalStateHolder
<pools_t
> g_pools
;
102 size_t g_udpVectorSize
{1};
104 bool g_snmpEnabled
{false};
105 bool g_snmpTrapsEnabled
{false};
106 DNSDistSNMPAgent
* g_snmpAgent
{nullptr};
108 /* UDP: the grand design. Per socket we listen on for incoming queries there is one thread.
109 Then we have a bunch of connected sockets for talking to downstream servers.
110 We send directly to those sockets.
112 For the return path, per downstream server we have a thread that listens to responses.
114 Per socket there is an array of 2^16 states, when we send out a packet downstream, we note
115 there the original requestor and the original id. The new ID is the offset in the array.
117 When an answer comes in on a socket, we look up the offset by the id, and lob it to the
120 IDs are assigned by atomic increments of the socket offset.
123 GlobalStateHolder
<vector
<DNSDistRuleAction
> > g_rulactions
;
124 GlobalStateHolder
<vector
<DNSDistResponseRuleAction
> > g_resprulactions
;
125 GlobalStateHolder
<vector
<DNSDistResponseRuleAction
> > g_cachehitresprulactions
;
126 GlobalStateHolder
<vector
<DNSDistResponseRuleAction
> > g_selfansweredresprulactions
;
131 GlobalStateHolder
<servers_t
> g_dstates
;
132 GlobalStateHolder
<NetmaskTree
<DynBlock
>> g_dynblockNMG
;
133 GlobalStateHolder
<SuffixMatchTree
<DynBlock
>> g_dynblockSMT
;
134 DNSAction::Action g_dynBlockAction
= DNSAction::Action::Drop
;
135 int g_tcpRecvTimeout
{2};
136 int g_tcpSendTimeout
{2};
139 bool g_servFailOnNoPolicy
{false};
140 bool g_truncateTC
{false};
141 bool g_fixupCase
{false};
142 bool g_preserveTrailingData
{false};
143 bool g_roundrobinFailOnNoServer
{false};
145 std::set
<std::string
> g_capabilitiesToRetain
;
147 static void truncateTC(char* packet
, uint16_t* len
, size_t responseSize
, unsigned int consumed
)
150 bool hadEDNS
= false;
151 uint16_t payloadSize
= 0;
154 if (g_addEDNSToSelfGeneratedResponses
) {
155 hadEDNS
= getEDNSUDPPayloadSizeAndZ(packet
, *len
, &payloadSize
, &z
);
158 *len
=static_cast<uint16_t>(sizeof(dnsheader
)+consumed
+DNS_TYPE_SIZE
+DNS_CLASS_SIZE
);
159 struct dnsheader
* dh
= reinterpret_cast<struct dnsheader
*>(packet
);
160 dh
->ancount
= dh
->arcount
= dh
->nscount
= 0;
163 addEDNS(dh
, *len
, responseSize
, z
& EDNS_HEADER_FLAG_DO
, payloadSize
, 0);
175 ComboAddress destination
;
176 ComboAddress origDest
;
180 if(origDest
.sin4
.sin_family
== 0) {
181 res
= sendto(fd
, packet
.c_str(), packet
.size(), 0, (struct sockaddr
*)&destination
, destination
.getSocklen());
184 res
= sendfromto(fd
, packet
.c_str(), packet
.size(), 0, origDest
, destination
);
188 vinfolog("Error sending delayed response to %s: %s", destination
.toStringWithPort(), strerror(err
));
193 DelayPipe
<DelayedPacket
>* g_delay
= nullptr;
195 void doLatencyStats(double udiff
)
197 if(udiff
< 1000) ++g_stats
.latency0_1
;
198 else if(udiff
< 10000) ++g_stats
.latency1_10
;
199 else if(udiff
< 50000) ++g_stats
.latency10_50
;
200 else if(udiff
< 100000) ++g_stats
.latency50_100
;
201 else if(udiff
< 1000000) ++g_stats
.latency100_1000
;
202 else ++g_stats
.latencySlow
;
203 g_stats
.latencySum
+= udiff
/ 1000;
205 auto doAvg
= [](double& var
, double n
, double weight
) {
206 var
= (weight
-1) * var
/weight
+ n
/weight
;
209 doAvg(g_stats
.latencyAvg100
, udiff
, 100);
210 doAvg(g_stats
.latencyAvg1000
, udiff
, 1000);
211 doAvg(g_stats
.latencyAvg10000
, udiff
, 10000);
212 doAvg(g_stats
.latencyAvg1000000
, udiff
, 1000000);
215 bool responseContentMatches(const char* response
, const uint16_t responseLen
, const DNSName
& qname
, const uint16_t qtype
, const uint16_t qclass
, const ComboAddress
& remote
, unsigned int& consumed
)
217 if (responseLen
< sizeof(dnsheader
)) {
221 const struct dnsheader
* dh
= reinterpret_cast<const struct dnsheader
*>(response
);
222 if (dh
->qdcount
== 0) {
223 if ((dh
->rcode
!= RCode::NoError
&& dh
->rcode
!= RCode::NXDomain
) || g_allowEmptyResponse
) {
227 ++g_stats
.nonCompliantResponses
;
232 uint16_t rqtype
, rqclass
;
235 rqname
=DNSName(response
, responseLen
, sizeof(dnsheader
), false, &rqtype
, &rqclass
, &consumed
);
237 catch(const std::exception
& e
) {
238 if(responseLen
> 0 && static_cast<size_t>(responseLen
) > sizeof(dnsheader
)) {
239 infolog("Backend %s sent us a response with id %d that did not parse: %s", remote
.toStringWithPort(), ntohs(dh
->id
), e
.what());
241 ++g_stats
.nonCompliantResponses
;
245 if (rqtype
!= qtype
|| rqclass
!= qclass
|| rqname
!= qname
) {
252 static void restoreFlags(struct dnsheader
* dh
, uint16_t origFlags
)
254 static const uint16_t rdMask
= 1 << FLAGS_RD_OFFSET
;
255 static const uint16_t cdMask
= 1 << FLAGS_CD_OFFSET
;
256 static const uint16_t restoreFlagsMask
= UINT16_MAX
& ~(rdMask
| cdMask
);
257 uint16_t * flags
= getFlagsFromDNSHeader(dh
);
258 /* clear the flags we are about to restore */
259 *flags
&= restoreFlagsMask
;
260 /* only keep the flags we want to restore */
261 origFlags
&= ~restoreFlagsMask
;
262 /* set the saved flags as they were */
266 static bool fixUpQueryTurnedResponse(DNSQuestion
& dq
, const uint16_t origFlags
)
268 restoreFlags(dq
.dh
, origFlags
);
270 return addEDNSToQueryTurnedResponse(dq
);
273 static bool fixUpResponse(char** response
, uint16_t* responseLen
, size_t* responseSize
, const DNSName
& qname
, uint16_t origFlags
, bool ednsAdded
, bool ecsAdded
, std::vector
<uint8_t>& rewrittenResponse
, uint16_t addRoom
, bool* zeroScope
)
275 if (*responseLen
< sizeof(dnsheader
)) {
279 struct dnsheader
* dh
= reinterpret_cast<struct dnsheader
*>(*response
);
280 restoreFlags(dh
, origFlags
);
282 if (*responseLen
== sizeof(dnsheader
)) {
287 string realname
= qname
.toDNSString();
288 if (*responseLen
>= (sizeof(dnsheader
) + realname
.length())) {
289 memcpy(*response
+ sizeof(dnsheader
), realname
.c_str(), realname
.length());
293 if (ednsAdded
|| ecsAdded
) {
298 const std::string
responseStr(*response
, *responseLen
);
299 int res
= locateEDNSOptRR(responseStr
, &optStart
, &optLen
, &last
);
302 if (zeroScope
) { // this finds if an EDNS Client Subnet scope was set, and if it is 0
303 size_t optContentStart
= 0;
304 uint16_t optContentLen
= 0;
305 /* we need at least 4 bytes after the option length (family: 2, source prefix-length: 1, scope prefix-length: 1) */
306 if (isEDNSOptionInOpt(responseStr
, optStart
, optLen
, EDNSOptionCode::ECS
, &optContentStart
, &optContentLen
) && optContentLen
>= 4) {
307 /* see if the EDNS Client Subnet SCOPE PREFIX-LENGTH byte in position 3 is set to 0, which is the only thing
309 *zeroScope
= responseStr
.at(optContentStart
+ 3) == 0;
314 /* we added the entire OPT RR,
315 therefore we need to remove it entirely */
317 /* simply remove the last AR */
318 *responseLen
-= optLen
;
319 uint16_t arcount
= ntohs(dh
->arcount
);
321 dh
->arcount
= htons(arcount
);
324 /* Removing an intermediary RR could lead to compression error */
325 if (rewriteResponseWithoutEDNS(responseStr
, rewrittenResponse
) == 0) {
326 *responseLen
= rewrittenResponse
.size();
327 if (addRoom
&& (UINT16_MAX
- *responseLen
) > addRoom
) {
328 rewrittenResponse
.reserve(*responseLen
+ addRoom
);
330 *responseSize
= rewrittenResponse
.capacity();
331 *response
= reinterpret_cast<char*>(rewrittenResponse
.data());
334 warnlog("Error rewriting content");
339 /* the OPT RR was already present, but without ECS,
340 we need to remove the ECS option if any */
342 /* nothing after the OPT RR, we can simply remove the
344 size_t existingOptLen
= optLen
;
345 removeEDNSOptionFromOPT(*response
+ optStart
, &optLen
, EDNSOptionCode::ECS
);
346 *responseLen
-= (existingOptLen
- optLen
);
349 /* Removing an intermediary RR could lead to compression error */
350 if (rewriteResponseWithoutEDNSOption(responseStr
, EDNSOptionCode::ECS
, rewrittenResponse
) == 0) {
351 *responseLen
= rewrittenResponse
.size();
352 if (addRoom
&& (UINT16_MAX
- *responseLen
) > addRoom
) {
353 rewrittenResponse
.reserve(*responseLen
+ addRoom
);
355 *responseSize
= rewrittenResponse
.capacity();
356 *response
= reinterpret_cast<char*>(rewrittenResponse
.data());
359 warnlog("Error rewriting content");
370 static bool encryptResponse(char* response
, uint16_t* responseLen
, size_t responseSize
, bool tcp
, std::shared_ptr
<DNSCryptQuery
> dnsCryptQuery
, dnsheader
** dh
, dnsheader
* dhCopy
)
373 uint16_t encryptedResponseLen
= 0;
375 /* save the original header before encrypting it in place */
376 if (dh
!= nullptr && *dh
!= nullptr && dhCopy
!= nullptr) {
377 memcpy(dhCopy
, *dh
, sizeof(dnsheader
));
381 int res
= dnsCryptQuery
->encryptResponse(response
, *responseLen
, responseSize
, tcp
, &encryptedResponseLen
);
383 *responseLen
= encryptedResponseLen
;
385 /* dropping response */
386 vinfolog("Error encrypting the response, dropping.");
392 #endif /* HAVE_DNSCRYPT */
394 static bool applyRulesToResponse(LocalStateHolder
<vector
<DNSDistResponseRuleAction
> >& localRespRulactions
, DNSResponse
& dr
)
396 DNSResponseAction::Action action
=DNSResponseAction::Action::None
;
397 std::string ruleresult
;
398 for(const auto& lr
: *localRespRulactions
) {
399 if(lr
.d_rule
->matches(&dr
)) {
400 lr
.d_rule
->d_matches
++;
401 action
=(*lr
.d_action
)(&dr
, &ruleresult
);
403 case DNSResponseAction::Action::Allow
:
406 case DNSResponseAction::Action::Drop
:
409 case DNSResponseAction::Action::HeaderModify
:
412 case DNSResponseAction::Action::ServFail
:
413 dr
.dh
->rcode
= RCode::ServFail
;
416 /* non-terminal actions follow */
417 case DNSResponseAction::Action::Delay
:
418 dr
.delayMsec
= static_cast<int>(pdns_stou(ruleresult
)); // sorry
420 case DNSResponseAction::Action::None
:
429 bool processResponse(char** response
, uint16_t* responseLen
, size_t* responseSize
, LocalStateHolder
<vector
<DNSDistResponseRuleAction
> >& localRespRulactions
, DNSResponse
& dr
, size_t addRoom
, std::vector
<uint8_t>& rewrittenResponse
, bool muted
)
431 if (!applyRulesToResponse(localRespRulactions
, dr
)) {
435 bool zeroScope
= false;
436 if (!fixUpResponse(response
, responseLen
, responseSize
, *dr
.qname
, dr
.origFlags
, dr
.ednsAdded
, dr
.ecsAdded
, rewrittenResponse
, addRoom
, dr
.useZeroScope
? &zeroScope
: nullptr)) {
440 if (dr
.packetCache
&& !dr
.skipCache
&& *responseLen
<= s_maxPacketCacheEntrySize
) {
441 if (!dr
.useZeroScope
) {
442 /* if the query was not suitable for zero-scope, for
443 example because it had an existing ECS entry so the hash is
444 not really 'no ECS', so just insert it for the existing subnet
446 - we don't have the correct hash for a non-ECS query
447 - inserting with hash computed before the ECS replacement but with
448 the subnet extracted _after_ the replacement would not work.
452 // if zeroScope, pass the pre-ECS hash-key and do not pass the subnet to the cache
453 dr
.packetCache
->insert(zeroScope
? dr
.cacheKeyNoECS
: dr
.cacheKey
, zeroScope
? boost::none
: dr
.subnet
, dr
.origFlags
, dr
.dnssecOK
, *dr
.qname
, dr
.qtype
, dr
.qclass
, *response
, *responseLen
, dr
.tcp
, dr
.dh
->rcode
, dr
.tempFailureTTL
);
458 if (!encryptResponse(*response
, responseLen
, *responseSize
, dr
.tcp
, dr
.dnsCryptQuery
, nullptr, nullptr)) {
462 #endif /* HAVE_DNSCRYPT */
467 static bool sendUDPResponse(int origFD
, const char* response
, const uint16_t responseLen
, const int delayMsec
, const ComboAddress
& origDest
, const ComboAddress
& origRemote
)
469 if(delayMsec
&& g_delay
) {
470 DelayedPacket dp
{origFD
, string(response
,responseLen
), origRemote
, origDest
};
471 g_delay
->submit(dp
, delayMsec
);
475 if(origDest
.sin4
.sin_family
== 0) {
476 res
= sendto(origFD
, response
, responseLen
, 0, reinterpret_cast<const struct sockaddr
*>(&origRemote
), origRemote
.getSocklen());
479 res
= sendfromto(origFD
, response
, responseLen
, 0, origDest
, origRemote
);
483 vinfolog("Error sending response to %s: %s", origRemote
.toStringWithPort(), stringerror(err
));
491 int pickBackendSocketForSending(std::shared_ptr
<DownstreamState
>& state
)
493 return state
->sockets
[state
->socketsOffset
++ % state
->sockets
.size()];
496 static void pickBackendSocketsReadyForReceiving(const std::shared_ptr
<DownstreamState
>& state
, std::vector
<int>& ready
)
500 if (state
->sockets
.size() == 1) {
501 ready
.push_back(state
->sockets
[0]);
506 std::lock_guard
<std::mutex
> lock(state
->socketsLock
);
507 state
->mplexer
->getAvailableFDs(ready
, -1);
511 // listens on a dedicated socket, lobs answers from downstream servers to original requestors
512 void responderThread(std::shared_ptr
<DownstreamState
> dss
)
514 setThreadName("dnsdist/respond");
515 auto localRespRulactions
= g_resprulactions
.getLocal();
516 char packet
[s_maxPacketCacheEntrySize
+ DNSCRYPT_MAX_RESPONSE_PADDING_AND_MAC_SIZE
];
517 static_assert(sizeof(packet
) <= UINT16_MAX
, "Packet size should fit in a uint16_t");
518 /* when the answer is encrypted in place, we need to get a copy
519 of the original header before encryption to fill the ring buffer */
520 dnsheader cleartextDH
;
521 vector
<uint8_t> rewrittenResponse
;
523 uint16_t queryId
= 0;
524 std::vector
<int> sockets
;
525 sockets
.reserve(dss
->sockets
.size());
528 dnsheader
* dh
= reinterpret_cast<struct dnsheader
*>(packet
);
530 pickBackendSocketsReadyForReceiving(dss
, sockets
);
531 for (const auto& fd
: sockets
) {
532 ssize_t got
= recv(fd
, packet
, sizeof(packet
), 0);
533 char * response
= packet
;
534 size_t responseSize
= sizeof(packet
);
536 if (got
< 0 || static_cast<size_t>(got
) < sizeof(dnsheader
))
539 uint16_t responseLen
= static_cast<uint16_t>(got
);
542 if(queryId
>= dss
->idStates
.size()) {
546 IDState
* ids
= &dss
->idStates
[queryId
];
547 int64_t usageIndicator
= ids
->usageIndicator
;
549 if(!IDState::isInUse(usageIndicator
)) {
550 /* the corresponding state is marked as not in use, meaning that:
551 - it was already cleaned up by another thread and the state is gone ;
552 - we already got a response for this query and this one is a duplicate.
553 Either way, we don't touch it.
558 /* read the potential DOHUnit state as soon as possible, but don't use it
559 until we have confirmed that we own this state by updating usageIndicator */
561 /* setting age to 0 to prevent the maintainer thread from
562 cleaning this IDS while we process the response.
565 int origFD
= ids
->origFD
;
567 unsigned int consumed
= 0;
568 if (!responseContentMatches(response
, responseLen
, ids
->qname
, ids
->qtype
, ids
->qclass
, dss
->remote
, consumed
)) {
572 bool isDoH
= du
!= nullptr;
573 /* atomically mark the state as available, but only if it has not been altered
575 if (ids
->tryMarkUnused(usageIndicator
)) {
576 /* clear the potential DOHUnit asap, it's ours now
577 and since we just marked the state as unused,
578 someone could overwrite it. */
580 /* we only decrement the outstanding counter if the value was not
581 altered in the meantime, which would mean that the state has been actively reused
582 and the other thread has not incremented the outstanding counter, so we don't
583 want it to be decremented twice. */
584 --dss
->outstanding
; // you'd think an attacker could game this, but we're using connected socket
586 /* someone updated the state in the meantime, we can't touch the existing pointer */
588 /* since the state has been updated, we can't safely access it so let's just drop
593 if(dh
->tc
&& g_truncateTC
) {
594 truncateTC(response
, &responseLen
, responseSize
, consumed
);
597 dh
->id
= ids
->origID
;
599 uint16_t addRoom
= 0;
600 DNSResponse dr
= makeDNSResponseFromIDState(*ids
, dh
, sizeof(packet
), responseLen
, false);
601 if (dr
.dnsCryptQuery
) {
602 addRoom
= DNSCRYPT_MAX_RESPONSE_PADDING_AND_MAC_SIZE
;
605 memcpy(&cleartextDH
, dr
.dh
, sizeof(cleartextDH
));
606 if (!processResponse(&response
, &responseLen
, &responseSize
, localRespRulactions
, dr
, addRoom
, rewrittenResponse
, ids
->cs
&& ids
->cs
->muted
)) {
610 if (ids
->cs
&& !ids
->cs
->muted
) {
612 #ifdef HAVE_DNS_OVER_HTTPS
614 du
->response
= std::string(response
, responseLen
);
615 if (send(du
->rsock
, &du
, sizeof(du
), 0) != sizeof(du
)) {
616 /* at this point we have the only remaining pointer on this
617 DOHUnit object since we did set ids->du to nullptr earlier,
618 except if we got the response before the pointer could be
619 released by the frontend */
622 #endif /* HAVE_DNS_OVER_HTTPS */
627 empty
.sin4
.sin_family
= 0;
628 /* if ids->destHarvested is false, origDest holds the listening address.
629 We don't want to use that as a source since it could be 0.0.0.0 for example. */
630 sendUDPResponse(origFD
, response
, responseLen
, dr
.delayMsec
, ids
->destHarvested
? ids
->origDest
: empty
, ids
->origRemote
);
636 ++ids
->cs
->responses
;
640 double udiff
= ids
->sentTime
.udiff();
641 vinfolog("Got answer from %s, relayed to %s%s, took %f usec", dss
->remote
.toStringWithPort(), ids
->origRemote
.toStringWithPort(),
642 isDoH
? " (https)": "", udiff
);
646 g_rings
.insertResponse(ts
, *dr
.remote
, *dr
.qname
, dr
.qtype
, static_cast<unsigned int>(udiff
), static_cast<unsigned int>(got
), cleartextDH
, dss
->remote
);
648 switch (cleartextDH
.rcode
) {
649 case RCode::NXDomain
:
650 ++g_stats
.frontendNXDomain
;
652 case RCode::ServFail
:
653 ++g_stats
.servfailResponses
;
654 ++g_stats
.frontendServFail
;
657 ++g_stats
.frontendNoError
;
660 dss
->latencyUsec
= (127.0 * dss
->latencyUsec
/ 128.0) + udiff
/128.0;
662 doLatencyStats(udiff
);
664 rewrittenResponse
.clear();
667 catch(const std::exception
& e
){
668 vinfolog("Got an error in UDP responder thread while parsing a response from %s, id %d: %s", dss
->remote
.toStringWithPort(), queryId
, e
.what());
672 catch(const std::exception
& e
)
674 errlog("UDP responder thread died because of exception: %s", e
.what());
676 catch(const PDNSException
& e
)
678 errlog("UDP responder thread died because of PowerDNS exception: %s", e
.reason
);
682 errlog("UDP responder thread died because of an exception: %s", "unknown");
685 bool DownstreamState::reconnect()
687 std::unique_lock
<std::mutex
> tl(connectLock
, std::try_to_lock
);
688 if (!tl
.owns_lock()) {
689 /* we are already reconnecting */
694 for (auto& fd
: sockets
) {
696 if (sockets
.size() > 1) {
697 std::lock_guard
<std::mutex
> lock(socketsLock
);
698 mplexer
->removeReadFD(fd
);
700 /* shutdown() is needed to wake up recv() in the responderThread */
701 shutdown(fd
, SHUT_RDWR
);
705 if (!IsAnyAddress(remote
)) {
706 fd
= SSocket(remote
.sin4
.sin_family
, SOCK_DGRAM
, 0);
707 if (!IsAnyAddress(sourceAddr
)) {
708 SSetsockopt(fd
, SOL_SOCKET
, SO_REUSEADDR
, 1);
709 if (!sourceItfName
.empty()) {
710 #ifdef SO_BINDTODEVICE
711 int res
= setsockopt(fd
, SOL_SOCKET
, SO_BINDTODEVICE
, sourceItfName
.c_str(), sourceItfName
.length());
713 infolog("Error setting up the interface on backend socket '%s': %s", remote
.toStringWithPort(), stringerror());
718 SBind(fd
, sourceAddr
);
721 SConnect(fd
, remote
);
722 if (sockets
.size() > 1) {
723 std::lock_guard
<std::mutex
> lock(socketsLock
);
724 mplexer
->addReadFD(fd
, [](int, boost::any
) {});
728 catch(const std::runtime_error
& error
) {
729 infolog("Error connecting to new server with address %s: %s", remote
.toStringWithPort(), error
.what());
736 /* if at least one (re-)connection failed, close all sockets */
738 for (auto& fd
: sockets
) {
740 if (sockets
.size() > 1) {
741 std::lock_guard
<std::mutex
> lock(socketsLock
);
742 mplexer
->removeReadFD(fd
);
744 /* shutdown() is needed to wake up recv() in the responderThread */
745 shutdown(fd
, SHUT_RDWR
);
754 void DownstreamState::hash()
756 vinfolog("Computing hashes for id=%s and weight=%d", id
, weight
);
758 WriteLock
wl(&d_lock
);
761 std::string uuid
= boost::str(boost::format("%s-%d") % id
% w
);
762 unsigned int wshash
= burtleCI((const unsigned char*)uuid
.c_str(), uuid
.size(), g_hashperturb
);
763 hashes
.insert(wshash
);
768 void DownstreamState::setId(const boost::uuids::uuid
& newId
)
771 // compute hashes only if already done
772 if (!hashes
.empty()) {
777 void DownstreamState::setWeight(int newWeight
)
780 errlog("Error setting server's weight: downstream weight value must be greater than 0.");
784 if (!hashes
.empty()) {
789 DownstreamState::DownstreamState(const ComboAddress
& remote_
, const ComboAddress
& sourceAddr_
, unsigned int sourceItf_
, const std::string
& sourceItfName_
, size_t numberOfSockets
, bool connect
=true): sourceItfName(sourceItfName_
), remote(remote_
), sourceAddr(sourceAddr_
), sourceItf(sourceItf_
)
791 pthread_rwlock_init(&d_lock
, nullptr);
793 threadStarted
.clear();
795 mplexer
= std::unique_ptr
<FDMultiplexer
>(FDMultiplexer::getMultiplexerSilent());
797 sockets
.resize(numberOfSockets
);
798 for (auto& fd
: sockets
) {
802 if (connect
&& !IsAnyAddress(remote
)) {
804 idStates
.resize(g_maxOutstanding
);
806 infolog("Added downstream server %s", remote
.toStringWithPort());
811 std::mutex g_luamutex
;
814 GlobalStateHolder
<ServerPolicy
> g_policy
;
816 shared_ptr
<DownstreamState
> firstAvailable(const NumberedServerVector
& servers
, const DNSQuestion
* dq
)
818 for(auto& d
: servers
) {
819 if(d
.second
->isUp() && d
.second
->qps
.check())
822 return leastOutstanding(servers
, dq
);
825 // get server with least outstanding queries, and within those, with the lowest order, and within those: the fastest
826 shared_ptr
<DownstreamState
> leastOutstanding(const NumberedServerVector
& servers
, const DNSQuestion
* dq
)
828 if (servers
.size() == 1 && servers
[0].second
->isUp()) {
829 return servers
[0].second
;
832 vector
<pair
<tuple
<int,int,double>, shared_ptr
<DownstreamState
>>> poss
;
833 /* so you might wonder, why do we go through this trouble? The data on which we sort could change during the sort,
834 which would suck royally and could even lead to crashes. So first we snapshot on what we sort, and then we sort */
835 poss
.reserve(servers
.size());
836 for(auto& d
: servers
) {
837 if(d
.second
->isUp()) {
838 poss
.push_back({make_tuple(d
.second
->outstanding
.load(), d
.second
->order
, d
.second
->latencyUsec
), d
.second
});
842 return shared_ptr
<DownstreamState
>();
843 nth_element(poss
.begin(), poss
.begin(), poss
.end(), [](const decltype(poss
)::value_type
& a
, const decltype(poss
)::value_type
& b
) { return a
.first
< b
.first
; });
844 return poss
.begin()->second
;
847 shared_ptr
<DownstreamState
> valrandom(unsigned int val
, const NumberedServerVector
& servers
, const DNSQuestion
* dq
)
849 vector
<pair
<int, shared_ptr
<DownstreamState
>>> poss
;
851 int max
= std::numeric_limits
<int>::max();
853 for(auto& d
: servers
) { // w=1, w=10 -> 1, 11
854 if(d
.second
->isUp()) {
855 // Don't overflow sum when adding high weights
856 if(d
.second
->weight
> max
- sum
) {
859 sum
+= d
.second
->weight
;
862 poss
.push_back({sum
, d
.second
});
866 // Catch poss & sum are empty to avoid SIGFPE
868 return shared_ptr
<DownstreamState
>();
871 auto p
= upper_bound(poss
.begin(), poss
.end(),r
, [](int r_
, const decltype(poss
)::value_type
& a
) { return r_
< a
.first
;});
873 return shared_ptr
<DownstreamState
>();
877 shared_ptr
<DownstreamState
> wrandom(const NumberedServerVector
& servers
, const DNSQuestion
* dq
)
879 return valrandom(random(), servers
, dq
);
882 uint32_t g_hashperturb
;
883 double g_consistentHashBalancingFactor
= 0;
884 shared_ptr
<DownstreamState
> whashed(const NumberedServerVector
& servers
, const DNSQuestion
* dq
)
886 return valrandom(dq
->qname
->hash(g_hashperturb
), servers
, dq
);
889 shared_ptr
<DownstreamState
> chashed(const NumberedServerVector
& servers
, const DNSQuestion
* dq
)
891 unsigned int qhash
= dq
->qname
->hash(g_hashperturb
);
892 unsigned int sel
= std::numeric_limits
<unsigned int>::max();
893 unsigned int min
= std::numeric_limits
<unsigned int>::max();
894 shared_ptr
<DownstreamState
> ret
= nullptr, first
= nullptr;
896 double targetLoad
= std::numeric_limits
<double>::max();
897 if (g_consistentHashBalancingFactor
> 0) {
898 /* we start with one, representing the query we are currently handling */
899 double currentLoad
= 1;
900 for (const auto& pair
: servers
) {
901 currentLoad
+= pair
.second
->outstanding
;
903 targetLoad
= (currentLoad
/ servers
.size()) * g_consistentHashBalancingFactor
;
906 for (const auto& d
: servers
) {
907 if (d
.second
->isUp() && d
.second
->outstanding
<= targetLoad
) {
908 // make sure hashes have been computed
909 if (d
.second
->hashes
.empty()) {
913 ReadLock
rl(&(d
.second
->d_lock
));
914 const auto& server
= d
.second
;
915 // we want to keep track of the last hash
916 if (min
> *(server
->hashes
.begin())) {
917 min
= *(server
->hashes
.begin());
921 auto hash_it
= server
->hashes
.lower_bound(qhash
);
922 if (hash_it
!= server
->hashes
.end()) {
923 if (*hash_it
< sel
) {
931 if (ret
!= nullptr) {
934 if (first
!= nullptr) {
937 return shared_ptr
<DownstreamState
>();
940 shared_ptr
<DownstreamState
> roundrobin(const NumberedServerVector
& servers
, const DNSQuestion
* dq
)
942 NumberedServerVector poss
;
944 for(auto& d
: servers
) {
945 if(d
.second
->isUp()) {
950 const auto *res
=&poss
;
951 if(poss
.empty() && !g_roundrobinFailOnNoServer
)
955 return shared_ptr
<DownstreamState
>();
957 static unsigned int counter
;
959 return (*res
)[(counter
++) % res
->size()].second
;
962 ComboAddress g_serverControl
{"127.0.0.1:5199"};
964 std::shared_ptr
<ServerPool
> createPoolIfNotExists(pools_t
& pools
, const string
& poolName
)
966 std::shared_ptr
<ServerPool
> pool
;
967 pools_t::iterator it
= pools
.find(poolName
);
968 if (it
!= pools
.end()) {
972 if (!poolName
.empty())
973 vinfolog("Creating pool %s", poolName
);
974 pool
= std::make_shared
<ServerPool
>();
975 pools
.insert(std::pair
<std::string
,std::shared_ptr
<ServerPool
> >(poolName
, pool
));
980 void setPoolPolicy(pools_t
& pools
, const string
& poolName
, std::shared_ptr
<ServerPolicy
> policy
)
982 std::shared_ptr
<ServerPool
> pool
= createPoolIfNotExists(pools
, poolName
);
983 if (!poolName
.empty()) {
984 vinfolog("Setting pool %s server selection policy to %s", poolName
, policy
->name
);
986 vinfolog("Setting default pool server selection policy to %s", policy
->name
);
988 pool
->policy
= policy
;
991 void addServerToPool(pools_t
& pools
, const string
& poolName
, std::shared_ptr
<DownstreamState
> server
)
993 std::shared_ptr
<ServerPool
> pool
= createPoolIfNotExists(pools
, poolName
);
994 if (!poolName
.empty()) {
995 vinfolog("Adding server to pool %s", poolName
);
997 vinfolog("Adding server to default pool");
999 pool
->addServer(server
);
1002 void removeServerFromPool(pools_t
& pools
, const string
& poolName
, std::shared_ptr
<DownstreamState
> server
)
1004 std::shared_ptr
<ServerPool
> pool
= getPool(pools
, poolName
);
1006 if (!poolName
.empty()) {
1007 vinfolog("Removing server from pool %s", poolName
);
1010 vinfolog("Removing server from default pool");
1013 pool
->removeServer(server
);
1016 std::shared_ptr
<ServerPool
> getPool(const pools_t
& pools
, const std::string
& poolName
)
1018 pools_t::const_iterator it
= pools
.find(poolName
);
1020 if (it
== pools
.end()) {
1021 throw std::out_of_range("No pool named " + poolName
);
1027 NumberedServerVector
getDownstreamCandidates(const pools_t
& pools
, const std::string
& poolName
)
1029 std::shared_ptr
<ServerPool
> pool
= getPool(pools
, poolName
);
1030 return pool
->getServers();
1033 static void spoofResponseFromString(DNSQuestion
& dq
, const string
& spoofContent
)
1037 std::vector
<std::string
> addrs
;
1038 stringtok(addrs
, spoofContent
, " ,");
1040 if (addrs
.size() == 1) {
1042 ComboAddress
spoofAddr(spoofContent
);
1043 SpoofAction
sa({spoofAddr
});
1046 catch(const PDNSException
&e
) {
1047 SpoofAction
sa(spoofContent
); // CNAME then
1051 std::vector
<ComboAddress
> cas
;
1052 for (const auto& addr
: addrs
) {
1054 cas
.push_back(ComboAddress(addr
));
1059 SpoofAction
sa(cas
);
1064 bool processRulesResult(const DNSAction::Action
& action
, DNSQuestion
& dq
, std::string
& ruleresult
, bool& drop
)
1067 case DNSAction::Action::Allow
:
1070 case DNSAction::Action::Drop
:
1075 case DNSAction::Action::Nxdomain
:
1076 dq
.dh
->rcode
= RCode::NXDomain
;
1078 ++g_stats
.ruleNXDomain
;
1081 case DNSAction::Action::Refused
:
1082 dq
.dh
->rcode
= RCode::Refused
;
1084 ++g_stats
.ruleRefused
;
1087 case DNSAction::Action::ServFail
:
1088 dq
.dh
->rcode
= RCode::ServFail
;
1090 ++g_stats
.ruleServFail
;
1093 case DNSAction::Action::Spoof
:
1094 spoofResponseFromString(dq
, ruleresult
);
1097 case DNSAction::Action::Truncate
:
1100 dq
.dh
->ra
= dq
.dh
->rd
;
1105 case DNSAction::Action::HeaderModify
:
1108 case DNSAction::Action::Pool
:
1109 dq
.poolname
=ruleresult
;
1112 case DNSAction::Action::NoRecurse
:
1116 /* non-terminal actions follow */
1117 case DNSAction::Action::Delay
:
1118 dq
.delayMsec
= static_cast<int>(pdns_stou(ruleresult
)); // sorry
1120 case DNSAction::Action::None
:
1122 case DNSAction::Action::NoOp
:
1126 /* false means that we don't stop the processing */
/* Run the query-side processing for 'dq' using the thread-local 'holders';
   'now' is the timestamp used for the rings and for dynamic block expiry.
   Steps visible in this extract, in order:
   1. record the query in g_rings;
   2. when g_qcount.enabled, count the query per name in g_qcount.records
      under a WriteLock; an optional g_qcount.filter is called with
      g_luamutex held and yields both the "count it" flag and the name;
   3. dynamic block lookup by client address (holders.dynNMGBlock) and then
      by query name (holders.dynSMTBlock). A block applies only while
      now < until. An entry whose action is None falls back to
      g_dynBlockAction. Nxdomain and Refused set the rcode, Truncate copies
      RD into RA for non-TCP queries (a TCP query is only logged as
      "*not* truncated"), NoRecurse and the remaining case are logged;
   4. walk holders.rulactions: for each matching rule, increment its match
      counter, invoke its action and pass the result to processRulesResult().
   NOTE(review): the return statements and several header updates are not
   visible in this extract, so the exact meaning of the boolean result
   should be confirmed against the caller (processQuery drops the query
   when this returns false). */
1131 static bool applyRulesToQuery(LocalHolders
& holders
, DNSQuestion
& dq
, const struct timespec
& now
)
1133 g_rings
.insertQuery(now
, *dq
.remote
, *dq
.qname
, dq
.qtype
, dq
.len
, *dq
.dh
);
1135 if(g_qcount
.enabled
) {
1136 string qname
= (*dq
.qname
).toLogString();
1137 bool countQuery
{true};
1138 if(g_qcount
.filter
) {
1139 std::lock_guard
<std::mutex
> lock(g_luamutex
);
1140 std::tie (countQuery
, qname
) = g_qcount
.filter(&dq
);
1144 WriteLock
wl(&g_qcount
.queryLock
);
1145 if(!g_qcount
.records
.count(qname
)) {
1146 g_qcount
.records
[qname
] = 0;
1148 g_qcount
.records
[qname
]++;
1152 if(auto got
= holders
.dynNMGBlock
->lookup(*dq
.remote
)) {
1153 auto updateBlockStats
= [&got
]() {
1154 ++g_stats
.dynBlocked
;
1155 got
->second
.blocks
++;
1158 if(now
< got
->second
.until
) {
1159 DNSAction::Action action
= got
->second
.action
;
1160 if (action
== DNSAction::Action::None
) {
1161 action
= g_dynBlockAction
;
1164 case DNSAction::Action::NoOp
:
1168 case DNSAction::Action::Nxdomain
:
1169 vinfolog("Query from %s turned into NXDomain because of dynamic block", dq
.remote
->toStringWithPort());
1172 dq
.dh
->rcode
= RCode::NXDomain
;
1176 case DNSAction::Action::Refused
:
1177 vinfolog("Query from %s refused because of dynamic block", dq
.remote
->toStringWithPort());
1180 dq
.dh
->rcode
= RCode::Refused
;
1184 case DNSAction::Action::Truncate
:
1187 vinfolog("Query from %s truncated because of dynamic block", dq
.remote
->toStringWithPort());
1190 dq
.dh
->ra
= dq
.dh
->rd
;
1196 vinfolog("Query from %s for %s over TCP *not* truncated because of dynamic block", dq
.remote
->toStringWithPort(), dq
.qname
->toLogString());
1199 case DNSAction::Action::NoRecurse
:
1201 vinfolog("Query from %s setting rd=0 because of dynamic block", dq
.remote
->toStringWithPort());
1206 vinfolog("Query from %s dropped because of dynamic block", dq
.remote
->toStringWithPort());
1212 if(auto got
= holders
.dynSMTBlock
->lookup(*dq
.qname
)) {
1213 auto updateBlockStats
= [&got
]() {
1214 ++g_stats
.dynBlocked
;
1218 if(now
< got
->until
) {
1219 DNSAction::Action action
= got
->action
;
1220 if (action
== DNSAction::Action::None
) {
1221 action
= g_dynBlockAction
;
1224 case DNSAction::Action::NoOp
:
1227 case DNSAction::Action::Nxdomain
:
1228 vinfolog("Query from %s for %s turned into NXDomain because of dynamic block", dq
.remote
->toStringWithPort(), dq
.qname
->toLogString());
1231 dq
.dh
->rcode
= RCode::NXDomain
;
1234 case DNSAction::Action::Refused
:
1235 vinfolog("Query from %s for %s refused because of dynamic block", dq
.remote
->toStringWithPort(), dq
.qname
->toLogString());
1238 dq
.dh
->rcode
= RCode::Refused
;
1241 case DNSAction::Action::Truncate
:
1245 vinfolog("Query from %s for %s truncated because of dynamic block", dq
.remote
->toStringWithPort(), dq
.qname
->toLogString());
1248 dq
.dh
->ra
= dq
.dh
->rd
;
1254 vinfolog("Query from %s for %s over TCP *not* truncated because of dynamic block", dq
.remote
->toStringWithPort(), dq
.qname
->toLogString());
1257 case DNSAction::Action::NoRecurse
:
1259 vinfolog("Query from %s setting rd=0 because of dynamic block", dq
.remote
->toStringWithPort());
1264 vinfolog("Query from %s for %s dropped because of dynamic block", dq
.remote
->toStringWithPort(), dq
.qname
->toLogString());
1270 DNSAction::Action action
=DNSAction::Action::None
;
1273 for(const auto& lr
: *holders
.rulactions
) {
1274 if(lr
.d_rule
->matches(&dq
)) {
1275 lr
.d_rule
->d_matches
++;
1276 action
=(*lr
.d_action
)(&dq
, &ruleresult
);
1277 if (processRulesResult(action
, dq
, ruleresult
, drop
)) {
/* Send 'requestLen' bytes of 'request' to the backend 'ss' over the UDP
   socket 'sd' and return the result of the send call.
   When ss->sourceItf is 0 a plain send() is used; otherwise a msghdr is
   built with fillMSGHdr() for ss->remote, the source address and interface
   are attached with addCMsgSrcAddr(), and sendmsg() is used.
   On the error path errno is saved before logging. For a non-health-check
   send failing with EINVAL or ENODEV a recovery step follows; per the
   existing comment this is about reconnecting the real socket, but the
   statement itself is not visible in this extract.
   NOTE(review): processUDPQuery calls this with four arguments, so
   'healthCheck' presumably has a default value in the declaration. */
1290 ssize_t
udpClientSendRequestToBackend(const std::shared_ptr
<DownstreamState
>& ss
, const int sd
, const char* request
, const size_t requestLen
, bool healthCheck
)
1294 if (ss
->sourceItf
== 0) {
1295 result
= send(sd
, request
, requestLen
, 0);
1300 cmsgbuf_aligned cbuf
;
1301 ComboAddress
remote(ss
->remote
);
1302 fillMSGHdr(&msgh
, &iov
, &cbuf
, sizeof(cbuf
), const_cast<char*>(request
), requestLen
, &remote
);
1303 addCMsgSrcAddr(&msgh
, &cbuf
, &ss
->sourceAddr
, ss
->sourceItf
);
1304 result
= sendmsg(sd
, &msgh
, 0);
1308 int savederrno
= errno
;
1309 vinfolog("Error sending request to backend %s: %d", ss
->remote
.toStringWithPort(), savederrno
);
1311 /* This might sound silly, but on Linux send() might fail with EINVAL
1312 if the interface the socket was bound to doesn't exist anymore.
1313 We don't want to reconnect the real socket if the healthcheck failed,
1314 because it's not using the same socket.
1316 if (!healthCheck
&& (savederrno
== EINVAL
|| savederrno
== ENODEV
)) {
/* Early acceptance checks for a datagram received on the frontend 'cs'.
   - a message flagged MSG_TRUNC (too large for the receive buffer) is
     logged and counted in g_stats.nonCompliantQueries;
   - a client address 'remote' not matched by holders.acl is logged as
     dropped because of the ACL.
   'dest' is an output: when HarvestDestinationAddress() succeeds on 'msgh'
   only the address is known, so the port is copied from cs.local;
   otherwise the family is set to 0 to mark the address as unknown.
   NOTE(review): the return statements are not visible in this extract. */
1324 static bool isUDPQueryAcceptable(ClientState
& cs
, LocalHolders
& holders
, const struct msghdr
* msgh
, const ComboAddress
& remote
, ComboAddress
& dest
)
1326 if (msgh
->msg_flags
& MSG_TRUNC
) {
1327 /* message was too large for our buffer */
1328 vinfolog("Dropping message too large for our buffer");
1329 ++g_stats
.nonCompliantQueries
;
1333 if(!holders
.acl
->match(remote
)) {
1334 vinfolog("Query from %s dropped because of ACL", remote
.toStringWithPort());
1342 if (HarvestDestinationAddress(msgh
, &dest
)) {
1343 /* we don't get the port, only the address */
1344 dest
.sin4
.sin_port
= cs
.local
.sin4
.sin_port
;
1347 dest
.sin4
.sin_family
= 0;
/* DNSCrypt pre-processing of an incoming query.
   Only active when the frontend has a DNSCrypt context; the body is
   compiled under HAVE_DNSCRYPT. A DNSCryptQuery is allocated into
   'dnsCryptQuery' and handleDNSCryptQuery() is run on the buffer, which is
   passed as writable via const_cast.
   - a non-empty 'response' produced by the handler is checked first
     (presumably returned to the caller as the optional payload - confirm);
   - a std::runtime_error is thrown with "Unable to decrypt DNSCrypt query,
     dropping." (the guarding condition is not visible in this extract);
   - otherwise 'len' is updated to the decrypted query length.
   'now' and 'tcp' are forwarded to the handler. */
1353 boost::optional
<std::vector
<uint8_t>> checkDNSCryptQuery(const ClientState
& cs
, const char* query
, uint16_t& len
, std::shared_ptr
<DNSCryptQuery
>& dnsCryptQuery
, time_t now
, bool tcp
)
1355 if (cs
.dnscryptCtx
) {
1356 #ifdef HAVE_DNSCRYPT
1357 vector
<uint8_t> response
;
1358 uint16_t decryptedQueryLen
= 0;
1360 dnsCryptQuery
= std::make_shared
<DNSCryptQuery
>(cs
.dnscryptCtx
);
1362 bool decrypted
= handleDNSCryptQuery(const_cast<char*>(query
), len
, dnsCryptQuery
, &decryptedQueryLen
, tcp
, now
, response
);
1365 if (response
.size() > 0) {
1368 throw std::runtime_error("Unable to decrypt DNSCrypt query, dropping.");
1371 len
= decryptedQueryLen
;
1372 #endif /* HAVE_DNSCRYPT */
/* Sanity-check the DNS header of an incoming query and update counters.
   - QR set (the packet is a response): counted as a non-compliant query;
   - QDCOUNT equal to 0: counted as an empty query;
   - g_stats.rdQueries is incremented (the condition guarding it is not
     visible in this extract; presumably the RD bit).
   NOTE(review): return statements are not visible here; callers treat a
   false result as "do not process this query". */
1377 bool checkQueryHeaders(const struct dnsheader
* dh
)
1379 if (dh
->qr
) { // don't respond to responses
1380 ++g_stats
.nonCompliantQueries
;
1384 if (dh
->qdcount
== 0) {
1385 ++g_stats
.emptyQueries
;
1390 ++g_stats
.rdQueries
;
/* Only built when recvmmsg(), sendmmsg() and MSG_WAITFORONE are available.
   Fill 'outMsg' so that 'response' (responseLen bytes) can later be sent to
   'remote' in a batch. The header is first filled without any control
   buffer. When 'dest' has no address family (sin_family == 0) msg_control
   is left null; the addCMsgSrcAddr() call attaches 'dest' as the source
   address using 'cbuf' (presumably in the else branch, which is not
   visible in this extract).
   'iov' and 'cbuf' are caller-provided storage and must outlive the send. */
1396 #if defined(HAVE_RECVMMSG) && defined(HAVE_SENDMMSG) && defined(MSG_WAITFORONE)
1397 static void queueResponse(const ClientState
& cs
, const char* response
, uint16_t responseLen
, const ComboAddress
& dest
, const ComboAddress
& remote
, struct mmsghdr
& outMsg
, struct iovec
* iov
, cmsgbuf_aligned
* cbuf
)
1400 fillMSGHdr(&outMsg
.msg_hdr
, iov
, nullptr, 0, const_cast<char*>(response
), responseLen
, const_cast<ComboAddress
*>(&remote
));
1402 if (dest
.sin4
.sin_family
== 0) {
1403 outMsg
.msg_hdr
.msg_control
= nullptr;
1406 addCMsgSrcAddr(&outMsg
.msg_hdr
, cbuf
, &dest
, 0);
1409 #endif /* defined(HAVE_RECVMMSG) && defined(HAVE_SENDMMSG) && defined(MSG_WAITFORONE) */
/* Finalize a response that is not coming from a backend.
   A DNSResponse view is built over the same buffer as 'dq', inheriting the
   unique id (HAVE_PROTOBUF builds) and the delay. The response rules are
   then applied: holders.cacheHitRespRulactions when 'cacheHit' is true,
   holders.selfAnsweredRespRulactions otherwise. The delay is copied back
   to 'dq' since a rule may have changed it. With HAVE_DNSCRYPT the payload
   is passed to encryptResponse().
   Statistics updated: g_stats.cacheHits, the frontend rcode counters
   (NXDomain, ServFail, NoError) and doLatencyStats(0).
   NOTE(review): conditions around the counters and the return statements
   are not visible in this extract; callers drop the query when the result
   is false. */
1411 /* self-generated responses or cache hits */
1412 static bool prepareOutgoingResponse(LocalHolders
& holders
, ClientState
& cs
, DNSQuestion
& dq
, bool cacheHit
)
1414 DNSResponse
dr(dq
.qname
, dq
.qtype
, dq
.qclass
, dq
.consumed
, dq
.local
, dq
.remote
, reinterpret_cast<dnsheader
*>(dq
.dh
), dq
.size
, dq
.len
, dq
.tcp
, dq
.queryTime
);
1416 #ifdef HAVE_PROTOBUF
1417 dr
.uniqueId
= dq
.uniqueId
;
1420 dr
.delayMsec
= dq
.delayMsec
;
1422 if (!applyRulesToResponse(cacheHit
? holders
.cacheHitRespRulactions
: holders
.selfAnsweredRespRulactions
, dr
)) {
1426 /* in case a rule changed it */
1427 dq
.delayMsec
= dr
.delayMsec
;
1429 #ifdef HAVE_DNSCRYPT
1431 if (!encryptResponse(reinterpret_cast<char*>(dq
.dh
), &dq
.len
, dq
.size
, dq
.tcp
, dq
.dnsCryptQuery
, nullptr, nullptr)) {
1435 #endif /* HAVE_DNSCRYPT */
1438 ++g_stats
.cacheHits
;
1441 switch (dr
.dh
->rcode
) {
1442 case RCode::NXDomain
:
1443 ++g_stats
.frontendNXDomain
;
1445 case RCode::ServFail
:
1446 ++g_stats
.frontendServFail
;
1448 case RCode::NoError
:
1449 ++g_stats
.frontendNoError
;
1453 doLatencyStats(0); // we're not going to measure this
/* Decide what to do with the parsed query 'dq' received on frontend 'cs'.
   Returns one of ProcessQueryResult::Drop, ::SendAnswer (the buffer of
   'dq' now holds a response) or ::PassToBackend (in which case
   'selectedBackend' has been set).
   Flow visible in this extract:
   1. applyRulesToQuery(); a false result drops the query;
   2. if the QR bit is now set, something turned the query into a response:
      fix it up, run prepareOutgoingResponse(), count it as self-answered
      and send it;
   3. select the pool from dq.poolname and remember its packet cache; the
      load-balancing policy is the global one unless the pool has its own.
      The policy is invoked in two places, one of them with g_luamutex
      held (the selecting condition is not visible here);
   4. stale cache entries (g_staleCacheEntriesTTL) are only allowed when no
      backend was selected;
   5. when ECS is to be added, a first cache lookup without ECS
      (cacheKeyNoECS) supports the zero-scope feature, then
      handleEDNSClientSubnet() inserts the option, dropping the query on
      failure;
   6. regular cache lookup (cacheKey); a miss bumps g_stats.cacheMisses;
   7. with no backend: answer ServFail when g_servFailOnNoPolicy is set,
      otherwise drop;
   8. optionally add XPF, count the query on the backend and pass it on.
   Any std::exception is logged together with the query id and the query
   is dropped. */
1457 ProcessQueryResult
processQuery(DNSQuestion
& dq
, ClientState
& cs
, LocalHolders
& holders
, std::shared_ptr
<DownstreamState
>& selectedBackend
)
1459 const uint16_t queryId
= ntohs(dq
.dh
->id
);
1462 /* we need an accurate ("real") value for the response and
1463 to store into the IDS, but not for insertion into the
1464 rings for example */
1465 struct timespec now
;
1468 if (!applyRulesToQuery(holders
, dq
, now
)) {
1469 return ProcessQueryResult::Drop
;
1472 if(dq
.dh
->qr
) { // something turned it into a response
1473 fixUpQueryTurnedResponse(dq
, dq
.origFlags
);
1475 if (!prepareOutgoingResponse(holders
, cs
, dq
, false)) {
1476 return ProcessQueryResult::Drop
;
1479 ++g_stats
.selfAnswered
;
1481 return ProcessQueryResult::SendAnswer
;
1484 std::shared_ptr
<ServerPool
> serverPool
= getPool(*holders
.pools
, dq
.poolname
);
1485 dq
.packetCache
= serverPool
->packetCache
;
1486 auto policy
= *(holders
.policy
);
1487 if (serverPool
->policy
!= nullptr) {
1488 policy
= *(serverPool
->policy
);
1490 auto servers
= serverPool
->getServers();
1492 std::lock_guard
<std::mutex
> lock(g_luamutex
);
1493 selectedBackend
= policy
.policy(servers
, &dq
);
1496 selectedBackend
= policy
.policy(servers
, &dq
);
1499 uint16_t cachedResponseSize
= dq
.size
;
1500 uint32_t allowExpired
= selectedBackend
? 0 : g_staleCacheEntriesTTL
;
1502 if (dq
.packetCache
&& !dq
.skipCache
) {
1503 dq
.dnssecOK
= (getEDNSZ(dq
) & EDNS_HEADER_FLAG_DO
);
1506 if (dq
.useECS
&& ((selectedBackend
&& selectedBackend
->useECS
) || (!selectedBackend
&& serverPool
->getECS()))) {
1507 // we special case our cache in case a downstream explicitly gave us a universally valid response with a 0 scope
1508 // we need ECS parsing (parseECS) to be true so we can be sure that the initial incoming query did not have an existing
1509 // ECS option, which would make it unsuitable for the zero-scope feature.
1510 if (dq
.packetCache
&& !dq
.skipCache
&& (!selectedBackend
|| !selectedBackend
->disableZeroScope
) && dq
.packetCache
->isECSParsingEnabled()) {
1511 if (dq
.packetCache
->get(dq
, dq
.consumed
, dq
.dh
->id
, reinterpret_cast<char*>(dq
.dh
), &cachedResponseSize
, &dq
.cacheKeyNoECS
, dq
.subnet
, dq
.dnssecOK
, allowExpired
)) {
1512 dq
.len
= cachedResponseSize
;
1514 if (!prepareOutgoingResponse(holders
, cs
, dq
, true)) {
1515 return ProcessQueryResult::Drop
;
1518 return ProcessQueryResult::SendAnswer
;
1522 /* there was no existing ECS on the query, enable the zero-scope feature */
1523 dq
.useZeroScope
= true;
1527 if (!handleEDNSClientSubnet(dq
, &(dq
.ednsAdded
), &(dq
.ecsAdded
), g_preserveTrailingData
)) {
1528 vinfolog("Dropping query from %s because we couldn't insert the ECS value", dq
.remote
->toStringWithPort());
1529 return ProcessQueryResult::Drop
;
1533 if (dq
.packetCache
&& !dq
.skipCache
) {
1534 if (dq
.packetCache
->get(dq
, dq
.consumed
, dq
.dh
->id
, reinterpret_cast<char*>(dq
.dh
), &cachedResponseSize
, &dq
.cacheKey
, dq
.subnet
, dq
.dnssecOK
, allowExpired
)) {
1535 dq
.len
= cachedResponseSize
;
1537 if (!prepareOutgoingResponse(holders
, cs
, dq
, true)) {
1538 return ProcessQueryResult::Drop
;
1541 return ProcessQueryResult::SendAnswer
;
1543 ++g_stats
.cacheMisses
;
1546 if(!selectedBackend
) {
1549 vinfolog("%s query for %s|%s from %s, no policy applied", g_servFailOnNoPolicy
? "ServFailed" : "Dropped", dq
.qname
->toLogString(), QType(dq
.qtype
).getName(), dq
.remote
->toStringWithPort());
1550 if (g_servFailOnNoPolicy
) {
1551 restoreFlags(dq
.dh
, dq
.origFlags
);
1553 dq
.dh
->rcode
= RCode::ServFail
;
1556 if (!prepareOutgoingResponse(holders
, cs
, dq
, false)) {
1557 return ProcessQueryResult::Drop
;
1559 // no response-only statistics counter to update.
1560 return ProcessQueryResult::SendAnswer
;
1563 return ProcessQueryResult::Drop
;
1566 if (dq
.addXPF
&& selectedBackend
->xpfRRCode
!= 0) {
1567 addXPF(dq
, selectedBackend
->xpfRRCode
, g_preserveTrailingData
);
1570 selectedBackend
->queries
++;
1571 return ProcessQueryResult::PassToBackend
;
1573 catch(const std::exception
& e
){
1574 vinfolog("Got an error while parsing a %s query from %s, id %d: %s", (dq
.tcp
? "TCP" : "UDP"), dq
.remote
->toStringWithPort(), queryId
, e
.what());
1576 return ProcessQueryResult::Drop
;
/* Handle one UDP datagram received on frontend 'cs'.
   'query' points to a buffer of 'queryBufferSize' bytes holding 'len'
   bytes of payload; 'msgh', 'remote' and 'dest' describe the datagram.
   The last four parameters are either all null (single-message mode) or
   all set (batch mode), as enforced by the assert: in batch mode an
   immediate, non-delayed answer is queued into 'responsesVect' and
   '*queuedResponses' is incremented instead of being sent right away.
   Flow visible in this extract:
   1. isUDPQueryAcceptable() and a real-time timestamp;
   2. DNSCrypt handling: a response produced by checkDNSCryptQuery() is
      sent back directly;
   3. header checks, question parsing into a DNSName and a DNSQuestion
      (the local address is 'dest' when it was harvested, else cs.local);
   4. processQuery(): handles the Drop and SendAnswer results, anything
      other than PassToBackend with a backend stops here;
   5. pick the next IDState slot of the backend round-robin; a slot still
      in use may carry a DOHUnit that is timed out via handleDOHTimeout()
      and counted in g_stats.downstreamTimeouts;
   6. store the origin (fd, id, destination and whether it was harvested)
      in the slot and send the query with udpClientSendRequestToBackend(),
      counting g_stats.downstreamSendErrors on the failure path.
   Any std::exception is logged with the client address and query id. */
1579 static void processUDPQuery(ClientState
& cs
, LocalHolders
& holders
, const struct msghdr
* msgh
, const ComboAddress
& remote
, ComboAddress
& dest
, char* query
, uint16_t len
, size_t queryBufferSize
, struct mmsghdr
* responsesVect
, unsigned int* queuedResponses
, struct iovec
* respIOV
, cmsgbuf_aligned
* respCBuf
)
1581 assert(responsesVect
== nullptr || (queuedResponses
!= nullptr && respIOV
!= nullptr && respCBuf
!= nullptr));
1582 uint16_t queryId
= 0;
1585 if (!isUDPQueryAcceptable(cs
, holders
, msgh
, remote
, dest
)) {
1589 /* we need an accurate ("real") value for the response and
1590 to store into the IDS, but not for insertion into the
1591 rings for example */
1592 struct timespec queryRealTime
;
1593 gettime(&queryRealTime
, true);
1595 std::shared_ptr
<DNSCryptQuery
> dnsCryptQuery
= nullptr;
1596 auto dnsCryptResponse
= checkDNSCryptQuery(cs
, query
, len
, dnsCryptQuery
, queryRealTime
.tv_sec
, false);
1597 if (dnsCryptResponse
) {
1598 sendUDPResponse(cs
.udpFD
, reinterpret_cast<char*>(dnsCryptResponse
->data()), static_cast<uint16_t>(dnsCryptResponse
->size()), 0, dest
, remote
);
1602 struct dnsheader
* dh
= reinterpret_cast<struct dnsheader
*>(query
);
1603 queryId
= ntohs(dh
->id
);
1605 if (!checkQueryHeaders(dh
)) {
1609 uint16_t qtype
, qclass
;
1610 unsigned int consumed
= 0;
1611 DNSName
qname(query
, len
, sizeof(dnsheader
), false, &qtype
, &qclass
, &consumed
);
1612 DNSQuestion
dq(&qname
, qtype
, qclass
, consumed
, dest
.sin4
.sin_family
!= 0 ? &dest
: &cs
.local
, &remote
, dh
, queryBufferSize
, len
, false, &queryRealTime
);
1613 dq
.dnsCryptQuery
= std::move(dnsCryptQuery
);
1614 std::shared_ptr
<DownstreamState
> ss
{nullptr};
1615 auto result
= processQuery(dq
, cs
, holders
, ss
);
1617 if (result
== ProcessQueryResult::Drop
) {
1621 if (result
== ProcessQueryResult::SendAnswer
) {
1622 #if defined(HAVE_RECVMMSG) && defined(HAVE_SENDMMSG) && defined(MSG_WAITFORONE)
1623 if (dq
.delayMsec
== 0 && responsesVect
!= nullptr) {
1624 queueResponse(cs
, reinterpret_cast<char*>(dq
.dh
), dq
.len
, *dq
.local
, *dq
.remote
, responsesVect
[*queuedResponses
], respIOV
, respCBuf
);
1625 (*queuedResponses
)++;
1628 #endif /* defined(HAVE_RECVMMSG) && defined(HAVE_SENDMMSG) && defined(MSG_WAITFORONE) */
1629 /* we use dest, always, because we don't want to use the listening address to send a response since it could be 0.0.0.0 */
1630 sendUDPResponse(cs
.udpFD
, reinterpret_cast<char*>(dq
.dh
), dq
.len
, dq
.delayMsec
, dest
, *dq
.remote
);
1634 if (result
!= ProcessQueryResult::PassToBackend
|| ss
== nullptr) {
1638 unsigned int idOffset
= (ss
->idOffset
++) % ss
->idStates
.size();
1639 IDState
* ids
= &ss
->idStates
[idOffset
];
1641 DOHUnit
* du
= nullptr;
1643 /* that means that the state was in use, possibly with an allocated
1644 DOHUnit that we will need to handle, but we can't touch it before
1645 confirming that we now own this state */
1646 if (ids
->isInUse()) {
1650 /* we atomically replace the value, we now own this state */
1651 if (!ids
->markAsUsed()) {
1652 /* the state was not in use.
1653 we reset 'du' because it might have still been in use when we read it. */
1658 /* we are reusing a state, no change in outstanding but if there was an existing DOHUnit we need
1659 to handle it because it's about to be overwritten. */
1662 ++g_stats
.downstreamTimeouts
;
1663 handleDOHTimeout(du
);
1667 ids
->origFD
= cs
.udpFD
;
1668 ids
->origID
= dh
->id
;
1669 setIDStateFromDNSQuestion(*ids
, dq
, std::move(qname
));
1671 /* If we couldn't harvest the real dest addr, still
1672 write down the listening addr since it will be useful
1673 (especially if it's not an 'any' one).
1674 We need to keep track of which one it is since we may
1675 want to use the real but not the listening addr to reply.
1677 if (dest
.sin4
.sin_family
!= 0) {
1678 ids
->origDest
= dest
;
1679 ids
->destHarvested
= true;
1682 ids
->origDest
= cs
.local
;
1683 ids
->destHarvested
= false;
1688 int fd
= pickBackendSocketForSending(ss
);
1689 ssize_t ret
= udpClientSendRequestToBackend(ss
, fd
, query
, dq
.len
);
1693 ++g_stats
.downstreamSendErrors
;
1696 vinfolog("Got query for %s|%s from %s, relayed to %s", ids
->qname
.toLogString(), QType(ids
->qtype
).getName(), remote
.toStringWithPort(), ss
->getName());
1698 catch(const std::exception
& e
){
1699 vinfolog("Got an error in UDP question thread while parsing a query from %s, id %d: %s", remote
.toStringWithPort(), queryId
, e
.what());
/* Batch variant of the UDP frontend loop, only built when recvmmsg(),
   sendmmsg() and MSG_WAITFORONE are available.
   It allocates g_udpVectorSize receive slots (MMReceiver: packet buffer,
   remote and control buffer are visible here), one mmsghdr vector for
   reception and one for the responses. Each slot accepts payloads up to
   s_udpIncomingBufferSize even though its packet buffer is larger, as
   checked by the static_assert.
   Per iteration, as far as visible in this extract:
   - the IO vectors are reset to the full packet buffer, since they are
     also used to send responses;
   - recvmmsg() waits for at least one message and returns as many as are
     available; a failure is logged;
   - messages shorter than a DNS header are counted as non-compliant, the
     others go through processUDPQuery() in batch mode;
   - responses queued by processUDPQuery() are flushed with a single
     sendmmsg() call; a partial or failed send is logged. */
1703 #if defined(HAVE_RECVMMSG) && defined(HAVE_SENDMMSG) && defined(MSG_WAITFORONE)
1704 static void MultipleMessagesUDPClientThread(ClientState
* cs
, LocalHolders
& holders
)
1708 char packet
[s_maxPacketCacheEntrySize
];
1709 ComboAddress remote
;
1712 /* used by HarvestDestinationAddress */
1713 cmsgbuf_aligned cbuf
;
1715 const size_t vectSize
= g_udpVectorSize
;
1716 /* the actual buffer is larger because:
1717 - we may have to add EDNS and/or ECS
1718 - we use it for self-generated responses (from rule or cache)
1719 but we only accept incoming payloads up to that size
1721 static_assert(s_udpIncomingBufferSize
<= sizeof(MMReceiver::packet
), "the incoming buffer size should not be larger than sizeof(MMReceiver::packet)");
1723 auto recvData
= std::unique_ptr
<MMReceiver
[]>(new MMReceiver
[vectSize
]);
1724 auto msgVec
= std::unique_ptr
<struct mmsghdr
[]>(new struct mmsghdr
[vectSize
]);
1725 auto outMsgVec
= std::unique_ptr
<struct mmsghdr
[]>(new struct mmsghdr
[vectSize
]);
1727 /* initialize the structures needed to receive our messages */
1728 for (size_t idx
= 0; idx
< vectSize
; idx
++) {
1729 recvData
[idx
].remote
.sin4
.sin_family
= cs
->local
.sin4
.sin_family
;
1730 fillMSGHdr(&msgVec
[idx
].msg_hdr
, &recvData
[idx
].iov
, &recvData
[idx
].cbuf
, sizeof(recvData
[idx
].cbuf
), recvData
[idx
].packet
, s_udpIncomingBufferSize
, &recvData
[idx
].remote
);
1736 /* reset the IO vector, since it's also used to send the vector of responses
1737 to avoid having to copy the data around */
1738 for (size_t idx
= 0; idx
< vectSize
; idx
++) {
1739 recvData
[idx
].iov
.iov_base
= recvData
[idx
].packet
;
1740 recvData
[idx
].iov
.iov_len
= sizeof(recvData
[idx
].packet
);
1743 /* block until we have at least one message ready, but return
1744 as many as possible to save the syscall costs */
1745 int msgsGot
= recvmmsg(cs
->udpFD
, msgVec
.get(), vectSize
, MSG_WAITFORONE
| MSG_TRUNC
, nullptr);
1748 vinfolog("Getting UDP messages via recvmmsg() failed with: %s", stringerror());
1752 unsigned int msgsToSend
= 0;
1754 /* process the received messages */
1755 for (int msgIdx
= 0; msgIdx
< msgsGot
; msgIdx
++) {
1756 const struct msghdr
* msgh
= &msgVec
[msgIdx
].msg_hdr
;
1757 unsigned int got
= msgVec
[msgIdx
].msg_len
;
1758 const ComboAddress
& remote
= recvData
[msgIdx
].remote
;
1760 if (static_cast<size_t>(got
) < sizeof(struct dnsheader
)) {
1761 ++g_stats
.nonCompliantQueries
;
1765 processUDPQuery(*cs
, holders
, msgh
, remote
, recvData
[msgIdx
].dest
, recvData
[msgIdx
].packet
, static_cast<uint16_t>(got
), sizeof(recvData
[msgIdx
].packet
), outMsgVec
.get(), &msgsToSend
, &recvData
[msgIdx
].iov
, &recvData
[msgIdx
].cbuf
);
1769 /* immediate (not delayed or sent to a backend) responses (mostly from a rule, dynamic block
1770 or the cache) can be sent in batch too */
1772 if (msgsToSend
> 0 && msgsToSend
<= static_cast<unsigned int>(msgsGot
)) {
1773 int sent
= sendmmsg(cs
->udpFD
, outMsgVec
.get(), msgsToSend
, 0);
1775 if (sent
< 0 || static_cast<unsigned int>(sent
) != msgsToSend
) {
1776 vinfolog("Error sending responses with sendmmsg() (%d on %u): %s", sent
, msgsToSend
, stringerror());
1782 #endif /* defined(HAVE_RECVMMSG) && defined(HAVE_SENDMMSG) && defined(MSG_WAITFORONE) */
/* Entry point of a UDP frontend thread for the listener 'cs'.
   The thread is named "dnsdist/udpClie" and owns its LocalHolders.
   When batch support is compiled in and g_udpVectorSize is greater than 1,
   the work is delegated to MultipleMessagesUDPClientThread().
   Otherwise a single-message path is used: one packet buffer of
   s_maxPacketCacheEntrySize bytes, of which only s_udpIncomingBufferSize
   are offered to recvmsg(); datagrams that fail to be received or are
   shorter than a DNS header are counted as non-compliant, the others are
   handed to processUDPQuery() with the batch parameters set to null.
   Exceptions reaching this level are logged with errlog(): std::exception,
   PDNSException, and a final message for unknown exceptions.
   NOTE(review): the static_assert message below mentions
   MMReceiver::packet although the expression checks the local 'packet'. */
1784 // listens to incoming queries, sends out to downstream servers, noting the intended return path
1785 static void udpClientThread(ClientState
* cs
)
1788 setThreadName("dnsdist/udpClie");
1789 LocalHolders holders
;
1791 #if defined(HAVE_RECVMMSG) && defined(HAVE_SENDMMSG) && defined(MSG_WAITFORONE)
1792 if (g_udpVectorSize
> 1) {
1793 MultipleMessagesUDPClientThread(cs
, holders
);
1797 #endif /* defined(HAVE_RECVMMSG) && defined(HAVE_SENDMMSG) && defined(MSG_WAITFORONE) */
1799 char packet
[s_maxPacketCacheEntrySize
];
1800 /* the actual buffer is larger because:
1801 - we may have to add EDNS and/or ECS
1802 - we use it for self-generated responses (from rule or cache)
1803 but we only accept incoming payloads up to that size
1805 static_assert(s_udpIncomingBufferSize
<= sizeof(packet
), "the incoming buffer size should not be larger than sizeof(MMReceiver::packet)");
1808 /* used by HarvestDestinationAddress */
1809 cmsgbuf_aligned cbuf
;
1811 ComboAddress remote
;
1813 remote
.sin4
.sin_family
= cs
->local
.sin4
.sin_family
;
1814 fillMSGHdr(&msgh
, &iov
, &cbuf
, sizeof(cbuf
), packet
, s_udpIncomingBufferSize
, &remote
);
1817 ssize_t got
= recvmsg(cs
->udpFD
, &msgh
, 0);
1819 if (got
< 0 || static_cast<size_t>(got
) < sizeof(struct dnsheader
)) {
1820 ++g_stats
.nonCompliantQueries
;
1824 processUDPQuery(*cs
, holders
, &msgh
, remote
, dest
, packet
, static_cast<uint16_t>(got
), sizeof(packet
), nullptr, nullptr, nullptr, nullptr);
1828 catch(const std::exception
&e
)
1830 errlog("UDP client thread died because of exception: %s", e
.what());
1832 catch(const PDNSException
&e
)
1834 errlog("UDP client thread died because of PowerDNS exception: %s", e
.reason
);
1838 errlog("UDP client thread died because of an exception: %s", "unknown");
/* Return a random 16-bit DNS message id.
   With HAVE_LIBSODIUM the value comes from randombytes_uniform(65536),
   otherwise from random() reduced modulo 65536. */
1841 uint16_t getRandomDNSID()
1843 #ifdef HAVE_LIBSODIUM
1844 return randombytes_uniform(65536);
1846 return (random() % 65536);
/* Tunables, with their default values:
   - g_maxTCPClientThreads (10): also used by checkFileDescriptorsLimits()
     to estimate the number of descriptors needed;
   - g_cacheCleaningDelay (60): compared against a counter by the
     maintenance code before expired cache entries are purged;
   - g_cacheCleaningPercentage (100): purgeExpired() is asked to bring a
     cache down to (100 - percentage)% of its maximum number of entries. */
1850 uint64_t g_maxTCPClientThreads
{10};
1851 std::atomic
<uint16_t> g_cacheCleaningDelay
{60};
1852 std::atomic
<uint16_t> g_cacheCleaningPercentage
{100};
/* Body of the maintenance thread (named "dnsdist/main").
   NOTE(review): the function signature and the enclosing loop are not
   present in this extract.
   Visible work:
   - with g_luamutex held, the optional Lua function "maintenance" is read
     from g_lua. A std::exception raised in that section is logged with
     infolog() at most once per window: 'secondsToWaitLog' is set to 61
     after logging, decreased by 'interval' otherwise, and reset to 0
     on the success path;
   - once 'counter' reaches g_cacheCleaningDelay, every packet cache used by
     at least one pool is collected into a map whose boolean records
     whether expired entries must be kept: this is the case when the cache
     has keepStaleData() and some pool using it has no server up
     (countServers(true) == 0). The remaining caches get purgeExpired()
     with a target derived from g_cacheCleaningPercentage. */
1856 setThreadName("dnsdist/main");
1859 int32_t secondsToWaitLog
= 0;
1865 std::lock_guard
<std::mutex
> lock(g_luamutex
);
1866 auto f
= g_lua
.readVariable
<boost::optional
<std::function
<void()> > >("maintenance");
1870 secondsToWaitLog
= 0;
1872 catch(std::exception
&e
) {
1873 if (secondsToWaitLog
<= 0) {
1874 infolog("Error during execution of maintenance function: %s", e
.what());
1875 secondsToWaitLog
= 61;
1877 secondsToWaitLog
-= interval
;
1883 if (counter
>= g_cacheCleaningDelay
) {
1884 /* keep track, for each cache, of whether we should keep
1886 std::map
<std::shared_ptr
<DNSDistPacketCache
>, bool> caches
;
1888 /* gather all caches actually used by at least one pool, and see
1889 if something prevents us from cleaning the expired entries */
1890 auto localPools
= g_pools
.getLocal();
1891 for (const auto& entry
: *localPools
) {
1892 auto& pool
= entry
.second
;
1894 auto packetCache
= pool
->packetCache
;
1899 auto pair
= caches
.insert({packetCache
, false});
1900 auto& iter
= pair
.first
;
1901 /* if we need to keep stale data for this cache (ie, not clear
1902 expired entries when at least one pool using this cache
1903 has all its backends down) */
1904 if (packetCache
->keepStaleData() && iter
->second
== false) {
1905 /* so far all pools had at least one backend up */
1906 if (pool
->countServers(true) == 0) {
1907 iter
->second
= true;
1912 for (auto pair
: caches
) {
1913 /* shall we keep expired entries ? */
1914 if (pair
.second
== true) {
1917 auto& packetCache
= pair
.first
;
1918 size_t upTo
= (packetCache
->getMaxEntries()* (100 - g_cacheCleaningPercentage
)) / 100;
1919 packetCache
->purgeExpired(upTo
);
1924 // ponder pruning g_dynblocks of expired entries here
/* Security polling thread (named "dnsdist/secpoll").
   Calls doSecPoll() with g_secPollSuffix and sleeps g_secPollInterval
   seconds between polls.
   NOTE(review): the loop and any error handling around doSecPoll() are not
   visible in this extract. */
1928 static void secPollThread()
1930 setThreadName("dnsdist/secpoll");
1934 doSecPoll(g_secPollSuffix
);
1938 sleep(g_secPollInterval
);
/* Periodic backend supervision thread (named "dnsdist/healthC"), driven by
   a one-second 'interval'.
   Visible work per round:
   - add a TCP client thread when more than one connection is queued and
     the maximum number of threads has not been reached;
   - for every backend whose check interval has elapsed: when its
     availability is Auto, queue a health check on a fresh multiplexer and
     record a failure right away if queueing fails;
   - refresh queryLoad and dropRate from the deltas of the 'queries' and
     'reuseds' counters over the elapsed time, then store the new baseline;
   - scan the backend's IDState slots for timeouts: a slot in use whose age
     exceeds g_udpTimeout is released with tryMarkUnused() (and skipped if
     it changed in the meantime), its DOHUnit is handed to
     handleDOHTimeout(), g_stats.downstreamTimeouts is incremented, the
     timeout is logged and a fake response carrying the original id is
     inserted into g_rings;
   - finally the queued health checks are processed. */
1942 static void healthChecksThread()
1944 setThreadName("dnsdist/healthC");
1946 static const int interval
= 1;
1951 if(g_tcpclientthreads
->getQueuedCount() > 1 && !g_tcpclientthreads
->hasReachedMaxThreads()) {
1952 g_tcpclientthreads
->addTCPClientThread();
1955 auto mplexer
= std::shared_ptr
<FDMultiplexer
>(FDMultiplexer::getMultiplexerSilent());
1956 auto states
= g_dstates
.getLocal(); // this points to the actual shared_ptrs!
1957 for(auto& dss
: *states
) {
1958 if(++dss
->lastCheck
< dss
->checkInterval
) {
1964 if (dss
->availability
== DownstreamState::Availability::Auto
) {
1965 if (!queueHealthCheck(mplexer
, dss
)) {
1966 updateHealthCheckResult(dss
, false);
1970 auto delta
= dss
->sw
.udiffAndSet()/1000000.0;
1971 dss
->queryLoad
= 1.0*(dss
->queries
.load() - dss
->prev
.queries
.load())/delta
;
1972 dss
->dropRate
= 1.0*(dss
->reuseds
.load() - dss
->prev
.reuseds
.load())/delta
;
1973 dss
->prev
.queries
.store(dss
->queries
.load());
1974 dss
->prev
.reuseds
.store(dss
->reuseds
.load());
1976 for (IDState
& ids
: dss
->idStates
) { // timeouts
1977 int64_t usageIndicator
= ids
.usageIndicator
;
1978 if(IDState::isInUse(usageIndicator
) && ids
.age
++ > g_udpTimeout
) {
1979 /* We mark the state as unused as soon as possible
1980 to limit the risk of racing with the
1983 auto oldDU
= ids
.du
;
1985 if (!ids
.tryMarkUnused(usageIndicator
)) {
1986 /* this state has been altered in the meantime,
1987 don't go anywhere near it */
1991 handleDOHTimeout(oldDU
);
1995 ++g_stats
.downstreamTimeouts
; // this is an 'actively' discovered timeout
1996 vinfolog("Had a downstream timeout from %s (%s) for query for %s|%s from %s",
1997 dss
->remote
.toStringWithPort(), dss
->name
,
1998 ids
.qname
.toLogString(), QType(ids
.qtype
).getName(), ids
.origRemote
.toStringWithPort());
2003 struct dnsheader fake
;
2004 memset(&fake
, 0, sizeof(fake
));
2005 fake
.id
= ids
.origID
;
2007 g_rings
.insertResponse(ts
, ids
.origRemote
, ids
.qname
, ids
.qtype
, std::numeric_limits
<unsigned int>::max(), 0, fake
, dss
->remote
);
2012 handleQueuedHealthChecks(mplexer
);
/* Best-effort request to allow binding 'sock' to an address that is not
   (yet) configured locally.
   Four socket options are attempted here: IP_FREEBIND, IP_BINDANY,
   IPV6_BINDANY and SO_BINDANY. A failing setsockopt() only produces a
   warning. 'one' is marked unused because it may not be referenced on
   every platform.
   NOTE(review): the preprocessor guards and the use of 'af' to select the
   option are not visible in this extract. */
2016 static void bindAny(int af
, int sock
)
2018 __attribute__((unused
)) int one
= 1;
2021 if (setsockopt(sock
, IPPROTO_IP
, IP_FREEBIND
, &one
, sizeof(one
)) < 0)
2022 warnlog("Warning: IP_FREEBIND setsockopt failed: %s", stringerror());
2027 if (setsockopt(sock
, IPPROTO_IP
, IP_BINDANY
, &one
, sizeof(one
)) < 0)
2028 warnlog("Warning: IP_BINDANY setsockopt failed: %s", stringerror());
2032 if (setsockopt(sock
, IPPROTO_IPV6
, IPV6_BINDANY
, &one
, sizeof(one
)) < 0)
2033 warnlog("Warning: IPV6_BINDANY setsockopt failed: %s", stringerror());
2036 if (setsockopt(sock
, SOL_SOCKET
, SO_BINDANY
, &one
, sizeof(one
)) < 0)
2037 warnlog("Warning: SO_BINDANY setsockopt failed: %s", stringerror());
/* Switch the process to group 'gid'.
   After a successful setgid() the supplementary groups are cleared with
   setgroups(0, NULL). Failures are reported as warnings only and do not
   abort.
   NOTE(review): an outer guard on 'gid' may exist on a line that is not
   visible in this extract. */
2041 static void dropGroupPrivs(gid_t gid
)
2044 if (setgid(gid
) == 0) {
2045 if (setgroups(0, NULL
) < 0) {
2046 warnlog("Warning: Unable to drop supplementary gids: %s", stringerror());
2050 warnlog("Warning: Unable to set group ID to %d: %s", gid
, stringerror());
/* Switch the process to user 'uid' with setuid(); a failure is reported
   as a warning only and does not abort.
   NOTE(review): an outer guard on 'uid' may exist on a line that is not
   visible in this extract. */
2055 static void dropUserPrivs(uid_t uid
)
2058 if(setuid(uid
) < 0) {
2059 warnlog("Warning: Unable to set user ID to %d: %s", uid
, stringerror());
/* Estimate how many file descriptors this configuration may need and warn
   when the current RLIMIT_NOFILE soft limit is not above that estimate.
   The estimate adds up: the three standard streams, the UDP sockets of
   every backend, one TCP socket per backend and per TCP client thread,
   the UDP and TCP listening sockets, the TCP connections being served,
   two pipe ends per TCP client thread, the queued TCP connections and the
   DelayPipe pipe. Further single descriptors (web server and console main
   sockets among them) are accounted for on lines not visible in this
   extract.
   Nothing is enforced: the function only logs, with a hint that mentions
   LimitNOFILE= or ulimit.
   NOTE(review): the return value of getrlimit() is not checked. */
2064 static void checkFileDescriptorsLimits(size_t udpBindsCount
, size_t tcpBindsCount
)
2066 /* stdin, stdout, stderr */
2067 size_t requiredFDsCount
= 3;
2068 auto backends
= g_dstates
.getLocal();
2069 /* UDP sockets to backends */
2070 size_t backendUDPSocketsCount
= 0;
2071 for (const auto& backend
: *backends
) {
2072 backendUDPSocketsCount
+= backend
->sockets
.size();
2074 requiredFDsCount
+= backendUDPSocketsCount
;
2075 /* TCP sockets to backends */
2076 requiredFDsCount
+= (backends
->size() * g_maxTCPClientThreads
);
2077 /* listening sockets */
2078 requiredFDsCount
+= udpBindsCount
;
2079 requiredFDsCount
+= tcpBindsCount
;
2080 /* max TCP connections currently served */
2081 requiredFDsCount
+= g_maxTCPClientThreads
;
2082 /* max pipes for communicating between TCP acceptors and client threads */
2083 requiredFDsCount
+= (g_maxTCPClientThreads
* 2);
2084 /* max TCP queued connections */
2085 requiredFDsCount
+= g_maxTCPQueuedConnections
;
2086 /* DelayPipe pipe */
2087 requiredFDsCount
+= 2;
2090 /* webserver main socket */
2092 /* console main socket */
2099 getrlimit(RLIMIT_NOFILE
, &rl
);
2100 if (rl
.rlim_cur
<= requiredFDsCount
) {
2101 warnlog("Warning, this configuration can use more than %d file descriptors, web server and console connections not included, and the current limit is %d.", std::to_string(requiredFDsCount
), std::to_string(rl
.rlim_cur
));
2103 warnlog("You can increase this value by using LimitNOFILE= in the systemd unit file or ulimit.");
2105 warnlog("You can increase this value by using ulimit.");
/* Create, configure and bind the listening socket of the frontend 'cs'.
   The descriptor is stored in cs->udpFD or cs->tcpFD depending on cs->tcp.
   'warn' is true for UDP, TLS and DoH frontends so that some warnings are
   not repeated for a TCP frontend that has an identical UDP one.
   Options visible in this extract:
   - SO_REUSEADDR; TCP_DEFER_ACCEPT when defined; TCP_FASTOPEN when a queue
     size is configured, with a warning when it is not supported;
   - IPV6_V6ONLY on IPv6 sockets; bindAny();
   - packet-info options on UDP sockets bound to an "any" address, so the
     real destination address can be harvested;
   - SO_REUSEPORT when requested, with a warning when not supported;
   - ignoring path MTU discovery on IPv4 UDP sockets, a failure being
     logged as a warning;
   - SO_BINDTODEVICE when an interface is configured, with warnings on
     failure or when unsupported;
   - the default BPF filter when one is set (HAVE_EBPF builds).
   TLS frontends are set up before binding and the process exits with
   EXIT_FAILURE if that fails; DoH frontends call setup(). The socket is
   then bound, put in listening state for TCP, and the listening address
   is logged with the protocol flavour (TLS, DoH, DNSCrypt or plain). */
2110 static void setUpLocalBind(std::unique_ptr
<ClientState
>& cs
)
2112 /* skip some warnings if there is an identical UDP context */
2113 bool warn
= cs
->tcp
== false || cs
->tlsFrontend
!= nullptr || cs
->dohFrontend
!= nullptr;
2114 int& fd
= cs
->tcp
== false ? cs
->udpFD
: cs
->tcpFD
;
2117 fd
= SSocket(cs
->local
.sin4
.sin_family
, cs
->tcp
== false ? SOCK_DGRAM
: SOCK_STREAM
, 0);
2120 SSetsockopt(fd
, SOL_SOCKET
, SO_REUSEADDR
, 1);
2121 #ifdef TCP_DEFER_ACCEPT
2122 SSetsockopt(fd
, IPPROTO_TCP
, TCP_DEFER_ACCEPT
, 1);
2124 if (cs
->fastOpenQueueSize
> 0) {
2126 SSetsockopt(fd
, IPPROTO_TCP
, TCP_FASTOPEN
, cs
->fastOpenQueueSize
);
2129 warnlog("TCP Fast Open has been configured on local address '%s' but is not supported", cs
->local
.toStringWithPort());
2135 if(cs
->local
.sin4
.sin_family
== AF_INET6
) {
2136 SSetsockopt(fd
, IPPROTO_IPV6
, IPV6_V6ONLY
, 1);
2139 bindAny(cs
->local
.sin4
.sin_family
, fd
);
2141 if(!cs
->tcp
&& IsAnyAddress(cs
->local
)) {
2143 setsockopt(fd
, IPPROTO_IP
, GEN_IP_PKTINFO
, &one
, sizeof(one
)); // linux supports this, so why not - might fail on other systems
2144 #ifdef IPV6_RECVPKTINFO
2145 setsockopt(fd
, IPPROTO_IPV6
, IPV6_RECVPKTINFO
, &one
, sizeof(one
));
2149 if (cs
->reuseport
) {
2151 SSetsockopt(fd
, SOL_SOCKET
, SO_REUSEPORT
, 1);
2154 /* no need to warn again if configured but support is not available, we already did for UDP */
2155 warnlog("SO_REUSEPORT has been configured on local address '%s' but is not supported", cs
->local
.toStringWithPort());
2161 if (cs
->local
.isIPv4()) {
2163 setSocketIgnorePMTU(cs
->udpFD
);
2165 catch(const std::exception
& e
) {
2166 warnlog("Failed to set IP_MTU_DISCOVER on UDP server socket for local address '%s': %s", cs
->local
.toStringWithPort(), e
.what());
2171 const std::string
& itf
= cs
->interface
;
2173 #ifdef SO_BINDTODEVICE
2174 int res
= setsockopt(fd
, SOL_SOCKET
, SO_BINDTODEVICE
, itf
.c_str(), itf
.length());
2176 warnlog("Error setting up the interface on local address '%s': %s", cs
->local
.toStringWithPort(), stringerror());
2180 warnlog("An interface has been configured on local address '%s' but SO_BINDTODEVICE is not supported", cs
->local
.toStringWithPort());
2186 if (g_defaultBPFFilter
) {
2187 cs
->attachFilter(g_defaultBPFFilter
);
2188 vinfolog("Attaching default BPF Filter to %s frontend %s", (!cs
->tcp
? "UDP" : "TCP"), cs
->local
.toStringWithPort());
2190 #endif /* HAVE_EBPF */
2192 if (cs
->tlsFrontend
!= nullptr) {
2193 if (!cs
->tlsFrontend
->setupTLS()) {
2194 errlog("Error while setting up TLS on local address '%s', exiting", cs
->local
.toStringWithPort());
2195 _exit(EXIT_FAILURE
);
2199 if (cs
->dohFrontend
!= nullptr) {
2200 cs
->dohFrontend
->setup();
2203 SBind(fd
, cs
->local
);
2206 SListen(cs
->tcpFD
, SOMAXCONN
);
2207 if (cs
->tlsFrontend
!= nullptr) {
2208 warnlog("Listening on %s for TLS", cs
->local
.toStringWithPort());
2210 else if (cs
->dohFrontend
!= nullptr) {
2211 warnlog("Listening on %s for DoH", cs
->local
.toStringWithPort());
2213 else if (cs
->dnscryptCtx
!= nullptr) {
2214 warnlog("Listening on %s for DNSCrypt", cs
->local
.toStringWithPort());
2217 warnlog("Listening on %s", cs
->local
.toStringWithPort());
/* State derived from the command line, followed by the flag that marks
   the end of the configuration phase.
   - locals / remotes: lists of strings (presumably the local listen
     addresses and the backend addresses given on the command line);
   - checkConfig, beClient, beSupervised: booleans defaulting to false;
     the usage text documents matching --check-config, --client and
     --supervised options;
   - g_configurationDone: atomic flag, initially false.
   NOTE(review): the first five declarations are probably members of an
   enclosing struct whose opening and closing lines are not visible in
   this extract. */
2226 vector
<string
> locals
;
2227 vector
<string
> remotes
;
2228 bool checkConfig
{false};
2229 bool beClient
{false};
2230 bool beSupervised
{false};
2237 std::atomic
<bool> g_configurationDone
{false};
/* Command-line help text, written to standard output.
   NOTE(review): the signature of the enclosing function is not present in
   this extract.
   The first three lines print the synopsis, the following ones describe
   each option. The -k,--setkey entry is only compiled in when
   HAVE_LIBSODIUM is defined. */
2242 cout
<<"Syntax: dnsdist [-C,--config file] [-c,--client [IP[:PORT]]]\n";
2243 cout
<<"[-e,--execute cmd] [-h,--help] [-l,--local addr]\n";
2244 cout
<<"[-v,--verbose] [--check-config] [--version]\n";
2246 cout
<<"-a,--acl netmask Add this netmask to the ACL\n";
2247 cout
<<"-C,--config file Load configuration from 'file'\n";
2248 cout
<<"-c,--client Operate as a client, connect to dnsdist. This reads\n";
2249 cout
<<" controlSocket from your configuration file, but also\n";
2250 cout
<<" accepts an IP:PORT argument\n";
2251 #ifdef HAVE_LIBSODIUM
2252 cout
<<"-k,--setkey KEY Use KEY for encrypted communication to dnsdist. This\n";
2253 cout
<<" is similar to setting setKey in the configuration file.\n";
2254 cout
<<" NOTE: this will leak this key in your shell's history\n";
2255 cout
<<" and in the systems running process list.\n";
2257 cout
<<"--check-config Validate the configuration file and exit. The exit-code\n";
2258 cout
<<" reflects the validation, 0 is OK, 1 means an error.\n";
2259 cout
<<" Any errors are printed as well.\n";
2260 cout
<<"-e,--execute cmd Connect to dnsdist and execute 'cmd'\n";
2261 cout
<<"-g,--gid gid Change the process group ID after binding sockets\n";
2262 cout
<<"-h,--help Display this helpful message\n";
2263 cout
<<"-l,--local address Listen on this local address\n";
2264 cout
<<"--supervised Don't open a console, I'm supervised\n";
2265 cout
<<" (use with e.g. systemd and daemontools)\n";
2266 cout
<<"--disable-syslog Don't log to syslog, only to stdout\n";
2267 cout
<<" (use with e.g. systemd)\n";
2268 cout
<<"-u,--uid uid Change the process user ID after binding sockets\n";
2269 cout
<<"-v,--verbose Enable verbose mode\n";
2270 cout
<<"-V,--version Show dnsdist version information and exit\n";
// Program entry point. From the statements visible here, main():
//  - sets up readline completion, ignores SIGPIPE/SIGCHLD and opens syslog;
//  - seeds random() and g_hashperturb;
//  - parses the command line with getopt_long() into g_cmdLine;
//  - in client mode, or when a command was given, calls setupLua(), runs
//    doClient() and exits;
//  - otherwise loads the configuration, finalises and binds the frontends,
//    drops privileges, registers backends and starts the worker threads;
//  - logs fatal errors with errlog() and exits with EXIT_FAILURE.
// NOTE(review): a number of lines of this function (braces, case labels, some
// statements) are not visible in this view; the comments below only describe
// what the visible statements do.
2273 int main(int argc
, char** argv
)
// Bind counters, handed to checkFileDescriptorsLimits() further down.
2276 size_t udpBindsCount
= 0;
2277 size_t tcpBindsCount
= 0;
// readline: use my_completion for completion attempts and append no character
// after a completed word.
2278 rl_attempted_completion_function
= my_completion
;
2279 rl_completion_append_character
= 0;
// Ignore SIGPIPE and SIGCHLD rather than keeping their default dispositions.
2281 signal(SIGPIPE
, SIG_IGN
);
2282 signal(SIGCHLD
, SIG_IGN
);
// Log to syslog as "dnsdist" with the daemon facility, including the PID and
// opening the connection right away.
2283 openlog("dnsdist", LOG_PID
|LOG_NDELAY
, LOG_DAEMON
);
// With libsodium: initialise the library, then draw both the hash perturbation
// and the random() seed from it.
2285 #ifdef HAVE_LIBSODIUM
2286 if (sodium_init() == -1) {
2287 cerr
<<"Unable to initialize crypto library"<<endl
;
2290 g_hashperturb
=randombytes_uniform(0xffffffff);
2291 srandom(randombytes_uniform(0xffffffff));
// NOTE(review): presumably the fallback used without libsodium (the #else is
// not visible here): seed random() from the current time and the PID, then
// take the hash perturbation from random().
2295 gettimeofday(&tv
, 0);
2296 srandom(tv
.tv_sec
^ tv
.tv_usec
^ getpid());
2297 g_hashperturb
=random();
// Control address for client mode; stays equal to a default-constructed
// ComboAddress unless a positional argument replaces it below.
2301 ComboAddress clientAddress
= ComboAddress();
// Default configuration file; overwritten with optarg by one of the option
// handlers below.
2302 g_cmdLine
.config
=SYSCONFDIR
"/dnsdist.conf";
// Long options. The ones without a short equivalent return the small integers
// 1 (check-config), 2 (disable-syslog) and 3 (supervised).
2303 struct option longopts
[]={
2304 {"acl", required_argument
, 0, 'a'},
2305 {"check-config", no_argument
, 0, 1},
2306 {"client", no_argument
, 0, 'c'},
2307 {"config", required_argument
, 0, 'C'},
2308 {"disable-syslog", no_argument
, 0, 2},
2309 {"execute", required_argument
, 0, 'e'},
2310 {"gid", required_argument
, 0, 'g'},
2311 {"help", no_argument
, 0, 'h'},
2312 {"local", required_argument
, 0, 'l'},
2313 {"setkey", required_argument
, 0, 'k'},
2314 {"supervised", no_argument
, 0, 3},
2315 {"uid", required_argument
, 0, 'u'},
2316 {"verbose", no_argument
, 0, 'v'},
2317 {"version", no_argument
, 0, 'V'},
// Short options; a trailing ':' marks an option that takes an argument.
2323 int c
=getopt_long(argc
, argv
, "a:cC:e:g:hk:l:u:vV", longopts
, &longindex
);
// Option handlers. The switch statement and its case labels are not visible in
// this view; each visible statement stores a flag or optarg into g_cmdLine, or
// prints a message.
2328 g_cmdLine
.checkConfig
=true;
2334 g_cmdLine
.beSupervised
=true;
2337 g_cmdLine
.config
=optarg
;
2340 g_cmdLine
.beClient
=true;
2343 g_cmdLine
.command
=optarg
;
2346 g_cmdLine
.gid
=optarg
;
2349 cout
<<"dnsdist "<<VERSION
<<endl
;
// Add a netmask to the query ACL. NOTE(review): optstring is presumably a copy
// of optarg; its declaration is not visible here.
2356 g_ACL
.modify([optstring
](NetmaskGroup
& nmg
) { nmg
.addMask(optstring
); });
// Console key: base64-decode optarg into g_consoleKey, or report that the
// option is unsupported when built without libsodium.
2359 #ifdef HAVE_LIBSODIUM
2360 if (B64Decode(string(optarg
), g_consoleKey
) < 0) {
2361 cerr
<<"Unable to decode key '"<<optarg
<<"'."<<endl
;
2365 cerr
<<"dnsdist has been built without libsodium, -k/--setkey is unsupported."<<endl
;
// Listen address: store the trimmed optarg.
2370 g_cmdLine
.locals
.push_back(trim_copy(string(optarg
)));
2373 g_cmdLine
.uid
=optarg
;
// Version banner with the Lua release (and the LuaJIT version when defined),
// followed by the list of compiled-in features.
2379 #ifdef LUAJIT_VERSION
2380 cout
<<"dnsdist "<<VERSION
<<" ("<<LUA_RELEASE
<<" ["<<LUAJIT_VERSION
<<"])"<<endl
;
2382 cout
<<"dnsdist "<<VERSION
<<" ("<<LUA_RELEASE
<<")"<<endl
;
2384 cout
<<"Enabled features: ";
2388 #ifdef HAVE_DNS_OVER_TLS
2389 cout
<<"dns-over-tls(";
2401 #ifdef HAVE_DNS_OVER_HTTPS
2402 cout
<<"dns-over-https(DOH) ";
2404 #ifdef HAVE_DNSCRYPT
2413 #ifdef HAVE_LIBCRYPTO
2416 #ifdef HAVE_LIBSODIUM
2422 #ifdef HAVE_PROTOBUF
2428 #if defined(HAVE_RECVMMSG) && defined(HAVE_SENDMMSG) && defined(MSG_WAITFORONE)
2429 cout
<<"recvmmsg/sendmmsg ";
2431 #ifdef HAVE_NET_SNMP
2441 //getopt_long printed an error message.
// Walk the argv entries up to the terminating null pointer: in client mode an
// entry becomes the control address (5199 is the port passed to ComboAddress),
// otherwise it is recorded as a backend address.
// NOTE(review): argv has presumably been advanced past the parsed options by a
// statement that is not visible here.
2450 for(auto p
= argv
; *p
; ++p
) {
2451 if(g_cmdLine
.beClient
) {
2452 clientAddress
= ComboAddress(*p
, 5199);
2454 g_cmdLine
.remotes
.push_back(*p
);
// Install leastOutstanding as the load-balancing policy before the
// configuration is loaded by setupLua() below.
2458 ServerPolicy leastOutstandingPol
{"leastOutstanding", leastOutstanding
, false};
2460 g_policy
.setState(leastOutstandingPol
);
// Client mode or one-shot command: call setupLua(), let an address given on
// the command line override g_serverControl, run doClient() and exit.
2461 if(g_cmdLine
.beClient
|| !g_cmdLine
.command
.empty()) {
2462 setupLua(true, false, g_cmdLine
.config
);
2463 if (clientAddress
!= ComboAddress())
2464 g_serverControl
= clientAddress
;
2465 doClient(g_serverControl
, g_cmdLine
.command
);
2466 _exit(EXIT_SUCCESS
);
// Query ACL: take a copy, iterate over loopback, private, shared-address-space,
// link-local and unique-local ranges, then publish the copy.
// NOTE(review): the loop body is not visible; presumably it adds each range to
// the copy, possibly only when the ACL is still empty.
2469 auto acl
= g_ACL
.getCopy();
2471 for(auto& addr
: {"127.0.0.0/8", "10.0.0.0/8", "100.64.0.0/10", "169.254.0.0/16", "192.168.0.0/16", "172.16.0.0/12", "::1/128", "fc00::/7", "fe80::/10"})
2473 g_ACL
.setState(acl
);
// Console ACL: add the two loopback masks to a copy and publish it.
2476 auto consoleACL
= g_consoleACL
.getCopy();
2477 for (const auto& mask
: { "127.0.0.1/8", "::1/128" }) {
2478 consoleACL
.addMask(mask
);
2480 g_consoleACL
.setState(consoleACL
);
// Configuration check mode: load the configuration via setupLua(); reaching
// the infolog() call means nothing was thrown, so exit successfully.
2482 if (g_cmdLine
.checkConfig
) {
2483 setupLua(false, true, g_cmdLine
.config
);
2484 // No exception was thrown
2485 infolog("Configuration '%s' OK!", g_cmdLine
.config
);
2486 _exit(EXIT_SUCCESS
);
// Regular start-up: load the configuration. The result is kept in 'todo'; its
// use is not visible in this view.
2489 auto todo
=setupLua(false, false, g_cmdLine
.config
);
2491 auto localPools
= g_pools
.getCopy();
// Look for the "chashed" policy, either as the global policy or on any pool;
// 'precompute' starts out false.
2493 bool precompute
= false;
2494 if (g_policy
.getLocal()->name
== "chashed") {
2497 for (const auto& entry
: localPools
) {
2498 if (entry
.second
->policy
!= nullptr && entry
.second
->policy
->name
== "chashed") {
2505 vinfolog("Pre-computing hashes for consistent hash load-balancing policy");
2506 // pre compute hashes
2507 auto backends
= g_dstates
.getLocal();
2508 for (auto& backend
: *backends
) {
// Listen addresses from the command line take over: erase every frontend that
// is neither DoH, DoT nor DNSCrypt, then add two ClientState entries per
// address using port 53.
// NOTE(review): the two entries differ only in the second constructor
// argument, presumably UDP vs TCP; confirm against ClientState.
2514 if (!g_cmdLine
.locals
.empty()) {
2515 for (auto it
= g_frontends
.begin(); it
!= g_frontends
.end(); ) {
2516 /* DoH, DoT and DNSCrypt frontends are separate */
2517 if ((*it
)->dohFrontend
== nullptr && (*it
)->tlsFrontend
== nullptr && (*it
)->dnscryptCtx
== nullptr) {
2518 it
= g_frontends
.erase(it
);
2525 for(const auto& loc
: g_cmdLine
.locals
) {
2527 g_frontends
.push_back(std::unique_ptr
<ClientState
>(new ClientState(ComboAddress(loc
, 53), false, false, 0, "", {})));
2529 g_frontends
.push_back(std::unique_ptr
<ClientState
>(new ClientState(ComboAddress(loc
, 53), true, false, 0, "", {})));
// No frontend at all: fall back to 127.0.0.1 with port 53, again two entries.
2533 if (g_frontends
.empty()) {
2535 g_frontends
.push_back(std::unique_ptr
<ClientState
>(new ClientState(ComboAddress("127.0.0.1", 53), false, false, 0, "", {})));
2537 g_frontends
.push_back(std::unique_ptr
<ClientState
>(new ClientState(ComboAddress("127.0.0.1", 53), true, false, 0, "", {})));
// The frontend list is final: flag the configuration as done, then bind every
// frontend.
2540 g_configurationDone
= true;
2542 for(auto& frontend
: g_frontends
) {
2543 setUpLocalBind(frontend
);
2545 if (frontend
->tcp
== false) {
2553 warnlog("dnsdist %s comes with ABSOLUTELY NO WARRANTY. This is free software, and you are welcome to redistribute it according to the terms of the GPL version 2", VERSION
);
// Log the query ACL, then the console ACL, from their string representations.
2557 g_ACL
.getLocal()->toStringVector(&vec
);
2558 for(const auto& s
: vec
) {
2563 infolog("ACL allowing queries from: %s", acls
.c_str());
2566 g_consoleACL
.getLocal()->toStringVector(&vec
);
2567 for (const auto& entry
: vec
) {
2568 if (!acls
.empty()) {
2573 infolog("Console ACL allowing connections from: %s", acls
.c_str());
2575 #ifdef HAVE_LIBSODIUM
2576 if (g_consoleEnabled
&& g_consoleKey
.empty()) {
2577 warnlog("Warning, the console has been enabled via 'controlSocket()' but no key has been set with 'setKey()' so all connections will fail until a key has been set");
// Resolve the gid/uid given on the command line (if any), then drop group
// privileges followed by user privileges.
2584 if(!g_cmdLine
.gid
.empty())
2585 newgid
= strToGID(g_cmdLine
.gid
.c_str());
2587 if(!g_cmdLine
.uid
.empty())
2588 newuid
= strToUID(g_cmdLine
.uid
.c_str());
2590 dropGroupPrivs(newgid
);
2591 dropUserPrivs(newuid
);
2593 /* we might still have capabilities remaining,
2594 for example if we have been started as root
2595 without --uid or --gid (please don't do that)
2596 or as an unprivileged user with ambient
2597 capabilities like CAP_NET_BIND_SERVICE.
2599 dropCapabilities(g_capabilitiesToRetain
);
2601 catch(const std::exception
& e
) {
2602 warnlog("%s", e
.what());
2605 /* this needs to be done _after_ dropping privileges */
2606 g_delay
= new DelayPipe
<DelayedPacket
>();
// Create the TCP client thread collection from g_maxTCPClientThreads and
// g_useTCPSinglePipe.
2612 g_tcpclientthreads
= std::unique_ptr
<TCPClientCollection
>(new TCPClientCollection(g_maxTCPClientThreads
, g_useTCPSinglePipe
));
2617 localPools
= g_pools
.getCopy();
2618 /* create the default pool no matter what */
2619 createPoolIfNotExists(localPools
, "");
// Backends given as positional arguments: build a DownstreamState (53 is the
// port passed to ComboAddress), add it to the pool named "", start its
// responder thread if it is connected and not started yet, and append it to
// g_dstates.
2620 if(g_cmdLine
.remotes
.size()) {
2621 for(const auto& address
: g_cmdLine
.remotes
) {
2622 auto ret
=std::make_shared
<DownstreamState
>(ComboAddress(address
, 53));
2623 addServerToPool(localPools
, "", ret
);
2624 if (ret
->connected
&& !ret
->threadStarted
.test_and_set()) {
2625 ret
->tid
= thread(responderThread
, ret
);
2627 g_dstates
.modify([ret
](servers_t
& servers
) { servers
.push_back(ret
); });
2630 g_pools
.setState(localPools
);
2632 if(g_dstates
.getLocal()->empty()) {
2633 errlog("No downstream servers defined: all packets will get dropped");
2634 // you might define them later, but you need to know
// Hand the bind counters to checkFileDescriptorsLimits().
2637 checkFileDescriptorsLimits(udpBindsCount
, tcpBindsCount
);
// Initial health checks: queue one for every backend whose availability is
// Auto; a backend whose check cannot be queued is marked down. The queued
// checks are then handled in one go.
2639 auto mplexer
= std::shared_ptr
<FDMultiplexer
>(FDMultiplexer::getMultiplexerSilent());
2640 for(auto& dss
: g_dstates
.getCopy()) { // it is a copy, but the internal shared_ptrs are the real deal
2641 if (dss
->availability
== DownstreamState::Availability::Auto
) {
2642 if (!queueHealthCheck(mplexer
, dss
, true)) {
2643 dss
->upStatus
= false;
2644 warnlog("Marking downstream %s as 'down'", dss
->getNameWithAddr());
2648 handleQueuedHealthChecks(mplexer
, true);
// One thread per frontend: dohThread for DoH frontends (when built with DoH
// support), udpClientThread when udpFD is valid, else tcpAcceptorThread when
// tcpFD is valid. A non-empty cpus list is applied with mapThreadToCPUList().
2650 for(auto& cs
: g_frontends
) {
2651 if (cs
->dohFrontend
!= nullptr) {
2652 #ifdef HAVE_DNS_OVER_HTTPS
2653 std::thread
t1(dohThread
, cs
.get());
2654 if (!cs
->cpus
.empty()) {
2655 mapThreadToCPUList(t1
.native_handle(), cs
->cpus
);
2658 #endif /* HAVE_DNS_OVER_HTTPS */
2661 if (cs
->udpFD
>= 0) {
2662 thread
t1(udpClientThread
, cs
.get());
2663 if (!cs
->cpus
.empty()) {
2664 mapThreadToCPUList(t1
.native_handle(), cs
->cpus
);
2668 else if (cs
->tcpFD
>= 0) {
2669 thread
t1(tcpAcceptorThread
, cs
.get());
2670 if (!cs
->cpus
.empty()) {
2671 mapThreadToCPUList(t1
.native_handle(), cs
->cpus
);
// Background threads: carbonDumpThread (detached), maintThread,
// healthChecksThread and, when g_secPollSuffix is not empty, secPollThread
// (detached).
2677 thread
carbonthread(carbonDumpThread
);
2678 carbonthread
.detach();
2680 thread
stattid(maintThread
);
2683 thread
healththread(healthChecksThread
);
2685 if (!g_secPollSuffix
.empty()) {
2686 thread
secpollthread(secPollThread
);
2687 secpollthread
.detach();
// Supervised mode: send READY=1 through sd_notify() and block on the
// health-check thread. NOTE(review): the other branch is only partly visible
// here (healththread.detach()).
2690 if(g_cmdLine
.beSupervised
) {
2692 sd_notify(0, "READY=1");
2694 healththread
.join();
2697 healththread
.detach();
2700 _exit(EXIT_SUCCESS
);
// Error handling: every handler logs with errlog(). For Lua execution errors
// the nested exception, if any, is rethrown and logged as well. The handlers
// end the process with EXIT_FAILURE.
2703 catch(const LuaContext::ExecutionErrorException
& e
) {
2705 errlog("Fatal Lua error: %s", e
.what());
2706 std::rethrow_if_nested(e
);
2707 } catch(const std::exception
& ne
) {
2708 errlog("Details: %s", ne
.what());
2710 catch(PDNSException
&ae
)
2712 errlog("Fatal pdns error: %s", ae
.reason
);
2714 _exit(EXIT_FAILURE
);
2716 catch(std::exception
&e
)
2718 errlog("Fatal error: %s", e
.what());
2719 _exit(EXIT_FAILURE
);
2721 catch(PDNSException
&ae
)
2723 errlog("Fatal pdns error: %s", ae
.reason
);
2724 _exit(EXIT_FAILURE
);
// Returns the sum of the responses, selfAnswered and cacheHits counters of
// g_stats. The string argument is unnamed and not used.
2727 uint64_t getLatencyCount(const std::string
&)
2729 return g_stats
.responses
+ g_stats
.selfAnswered
+ g_stats
.cacheHits
;