]> git.ipfire.org Git - thirdparty/pdns.git/blame - pdns/dnsdist.cc
dnsdist: Increase the default value of setMaxUDPOutstanding to 65535
[thirdparty/pdns.git] / pdns / dnsdist.cc
CommitLineData
24d5cb00 1/*
12471842
PL
2 * This file is part of PowerDNS or dnsdist.
3 * Copyright -- PowerDNS.COM B.V. and its contributors
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of version 2 of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * In addition, for the avoidance of any doubt, permission is granted to
10 * link this program with OpenSSL and to (re)distribute the binaries
11 * produced as the result of such linking.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
21 */
5cc8371b
RG
22
23#include "config.h"
24
25#include <fstream>
26#include <getopt.h>
27#include <grp.h>
2c5db3f7 28#include <limits>
5cc8371b
RG
29#include <netinet/tcp.h>
30#include <pwd.h>
31#include <sys/resource.h>
32#include <unistd.h>
424bdfb1 33
4d39d7f3 34#if defined (__OpenBSD__) || defined(__NetBSD__)
424bdfb1
PD
35#include <readline/readline.h>
36#else
37#include <editline/readline.h>
38#endif
39
6ab65223
PL
40#ifdef HAVE_SYSTEMD
41#include <systemd/sd-daemon.h>
42#endif
43
5cc8371b
RG
44#include "dnsdist.hh"
45#include "dnsdist-cache.hh"
b5521206 46#include "dnsdist-console.hh"
5cc8371b
RG
47#include "dnsdist-ecs.hh"
48#include "dnsdist-lua.hh"
03b00917 49#include "dnsdist-rings.hh"
5d4e1ef8 50#include "dnsdist-secpoll.hh"
53c57da7 51#include "dnsdist-xpf.hh"
5cc8371b
RG
52
53#include "base64.hh"
54#include "delaypipe.hh"
55#include "dolog.hh"
56#include "dnsname.hh"
e0fd37ec 57#include "dnsparser.hh"
5cc8371b
RG
58#include "dnswriter.hh"
59#include "ednsoptions.hh"
60#include "gettime.hh"
61#include "lock.hh"
62#include "misc.hh"
63#include "sodcrypto.hh"
64#include "sstuff.hh"
519f5484 65#include "threadname.hh"
5cc8371b 66
a40df301 67/* Known sins:
e48090d1 68
d12cea01 69 Receiver is currently single threaded
e5a14b2b 70 not *that* bad actually, but now that we are thread safe, might want to scale
a40df301 71*/
24d5cb00 72
0940e4eb 73/* the Rulaction plan
74 Set of Rules, if one matches, it leads to an Action
75 Both rules and actions could conceivably be Lua based.
76 On the C++ side, both could be inherited from a class Rule and a class Action,
77 on the Lua side we can't do that. */
78
// Process-wide state and configuration shared by the functions in this file.
64e4ebb4 79using std::atomic;
80using std::thread;
7730131a 81bool g_verbose;
b065f701 82
e48090d1 83struct DNSDistStats g_stats;
37a5c2d5
PO
84MetricDefinitionStorage g_metricDefinitions;
85
// Number of query-state slots allocated per backend: the DownstreamState
// constructor resizes idStates to this value. Defaults to the largest uint16_t.
3b203c83 86uint16_t g_maxOutstanding{std::numeric_limits<uint16_t>::max()};
9e87dcb8 87bool g_verboseHealthChecks{false};
1ea747c0 88uint32_t g_staleCacheEntriesTTL{0};
bbfaaa6f 89bool g_syslog{true};
// When true, responseContentMatches() accepts any response whose question
// section is empty (qdcount == 0), whatever its rcode.
0dffe9e3 90bool g_allowEmptyResponse{false};
cffde2fd 91
92GlobalStateHolder<NetmaskGroup> g_ACL;
c9262563 93string g_outputBuffer;
a227f47d 94
a227f47d 95std::vector<std::shared_ptr<TLSFrontend>> g_tlslocals;
fbf14b03 96std::vector<std::shared_ptr<DOHFrontend>> g_dohlocals;
6e9fd124 97std::vector<std::shared_ptr<DNSCryptContext>> g_dnsCryptLocals;
87b515ed
RG
// The eBPF filters only exist when the build defines HAVE_EBPF.
98#ifdef HAVE_EBPF
99shared_ptr<BPFFilter> g_defaultBPFFilter;
8429ad04 100std::vector<std::shared_ptr<DynBPFFilter> > g_dynBPFFilters;
87b515ed 101#endif /* HAVE_EBPF */
6e9fd124 102std::vector<std::unique_ptr<ClientState>> g_frontends;
886e2cf2 103GlobalStateHolder<pools_t> g_pools;
0beaa5c8 104size_t g_udpVectorSize{1};
7c0860e1 105
9f4eb5cc
RG
// SNMP support is off by default and no agent is allocated at start-up.
106bool g_snmpEnabled{false};
107bool g_snmpTrapsEnabled{false};
108DNSDistSNMPAgent* g_snmpAgent{nullptr};
109
726ddf60 110/* UDP: the grand design. Per socket we listen on for incoming queries there is one thread.
7730131a 111 Then we have a bunch of connected sockets for talking to downstream servers.
112 We send directly to those sockets.
24d5cb00 113
7730131a 114 For the return path, per downstream server we have a thread that listens to responses.
24d5cb00 115
7730131a 116 Per socket there is an array of 2^16 states, when we send out a packet downstream, we note
117 there the original requestor and the original id. The new ID is the offset in the array.
118
119 When an answer comes in on a socket, we look up the offset by the id, and lob it to the
120 original requestor.
121
122 IDs are assigned by atomic increments of the socket offset.
123 */
124
4d5959e6
RG
// Rule/action sets. responderThread() takes a local view of g_resprulactions
// (via getLocal()) and applies it to every response received from a backend.
125GlobalStateHolder<vector<DNSDistRuleAction> > g_rulactions;
126GlobalStateHolder<vector<DNSDistResponseRuleAction> > g_resprulactions;
127GlobalStateHolder<vector<DNSDistResponseRuleAction> > g_cachehitresprulactions;
2d4783a8 128GlobalStateHolder<vector<DNSDistResponseRuleAction> > g_selfansweredresprulactions;
4d5959e6 129
// responderThread() records each relayed response in g_rings via insertResponse().
df111b53 130Rings g_rings;
786e4d8c 131QueryCount g_qcount;
7c0860e1 132
ecbe9133 133GlobalStateHolder<servers_t> g_dstates;
78ffa782 134GlobalStateHolder<NetmaskTree<DynBlock>> g_dynblockNMG;
71c94675 135GlobalStateHolder<SuffixMatchTree<DynBlock>> g_dynblockSMT;
dd46e5e3 136DNSAction::Action g_dynBlockAction = DNSAction::Action::Drop;
3f6d07a4
RG
// NOTE(review): the timeouts below are presumably expressed in seconds --
// confirm against the code that consumes them.
137int g_tcpRecvTimeout{2};
138int g_tcpSendTimeout{2};
e0b5e49d 139int g_udpTimeout{2};
3f6d07a4 140
26a3cdb7 141bool g_servFailOnNoPolicy{false};
// When true, responderThread() passes responses that have the TC bit set
// through truncateTC() before relaying them.
e72fbfc4 142bool g_truncateTC{false};
53c57da7
RG
// When true, fixUpResponse() overwrites the name right after the DNS header
// of the response with the wire form of the original query name.
143bool g_fixupCase{false};
144bool g_preserveTrailingData{false};
32b86928 145bool g_roundrobinFailOnNoServer{false};
0beaa5c8 146
e0fd37ec 147static void truncateTC(char* packet, uint16_t* len, size_t responseSize, unsigned int consumed)
6ad8b29a 148try
149{
e0fd37ec
RG
150 bool hadEDNS = false;
151 uint16_t payloadSize = 0;
152 uint16_t z = 0;
153
154 if (g_addEDNSToSelfGeneratedResponses) {
155 hadEDNS = getEDNSUDPPayloadSizeAndZ(packet, *len, &payloadSize, &z);
156 }
157
2aa2c0aa 158 *len=static_cast<uint16_t>(sizeof(dnsheader)+consumed+DNS_TYPE_SIZE+DNS_CLASS_SIZE);
e7c732b8
RG
159 struct dnsheader* dh = reinterpret_cast<struct dnsheader*>(packet);
160 dh->ancount = dh->arcount = dh->nscount = 0;
e0fd37ec
RG
161
162 if (hadEDNS) {
5e98ccfa 163 addEDNS(dh, *len, responseSize, z & EDNS_HEADER_FLAG_DO, payloadSize, 0);
e0fd37ec 164 }
6ad8b29a 165}
166catch(...)
167{
168 g_stats.truncFail++;
169}
170
7b3865cd 171struct DelayedPacket
172{
173 int fd;
174 string packet;
175 ComboAddress destination;
68f3eb4d 176 ComboAddress origDest;
7b3865cd 177 void operator()()
178 {
20b019fa
RG
179 ssize_t res;
180 if(origDest.sin4.sin_family == 0) {
181 res = sendto(fd, packet.c_str(), packet.size(), 0, (struct sockaddr*)&destination, destination.getSocklen());
182 }
183 else {
184 res = sendfromto(fd, packet.c_str(), packet.size(), 0, origDest, destination);
185 }
186 if (res == -1) {
187 int err = errno;
188 vinfolog("Error sending delayed response to %s: %s", destination.toStringWithPort(), strerror(err));
189 }
7b3865cd 190 }
191};
192
3e425868 193DelayPipe<DelayedPacket>* g_delay = nullptr;
7b3865cd 194
f653b8df 195void doLatencyStats(double udiff)
daacd477 196{
cb167afd
CHB
197 if(udiff < 1000) ++g_stats.latency0_1;
198 else if(udiff < 10000) ++g_stats.latency1_10;
199 else if(udiff < 50000) ++g_stats.latency10_50;
200 else if(udiff < 100000) ++g_stats.latency50_100;
201 else if(udiff < 1000000) ++g_stats.latency100_1000;
202 else ++g_stats.latencySlow;
eb0335ff 203 g_stats.latencySum += udiff / 1000;
be6c318f 204
daacd477 205 auto doAvg = [](double& var, double n, double weight) {
206 var = (weight -1) * var/weight + n/weight;
207 };
208
209 doAvg(g_stats.latencyAvg100, udiff, 100);
210 doAvg(g_stats.latencyAvg1000, udiff, 1000);
211 doAvg(g_stats.latencyAvg10000, udiff, 10000);
212 doAvg(g_stats.latencyAvg1000000, udiff, 1000000);
213}
ca404e94 214
e7c732b8 215bool responseContentMatches(const char* response, const uint16_t responseLen, const DNSName& qname, const uint16_t qtype, const uint16_t qclass, const ComboAddress& remote, unsigned int& consumed)
fcffc585 216{
fcffc585
RG
217 if (responseLen < sizeof(dnsheader)) {
218 return false;
219 }
220
3e425868 221 const struct dnsheader* dh = reinterpret_cast<const struct dnsheader*>(response);
c8c3d4e4 222 if (dh->qdcount == 0) {
ad7ec9a2 223 if ((dh->rcode != RCode::NoError && dh->rcode != RCode::NXDomain) || g_allowEmptyResponse) {
c8c3d4e4
RG
224 return true;
225 }
226 else {
cb167afd 227 ++g_stats.nonCompliantResponses;
c8c3d4e4
RG
228 return false;
229 }
230 }
231
3e425868
RG
232 uint16_t rqtype, rqclass;
233 DNSName rqname;
fcffc585
RG
234 try {
235 rqname=DNSName(response, responseLen, sizeof(dnsheader), false, &rqtype, &rqclass, &consumed);
236 }
3e425868
RG
237 catch(const std::exception& e) {
238 if(responseLen > 0 && static_cast<size_t>(responseLen) > sizeof(dnsheader)) {
fcffc585 239 infolog("Backend %s sent us a response with id %d that did not parse: %s", remote.toStringWithPort(), ntohs(dh->id), e.what());
3e425868 240 }
cb167afd 241 ++g_stats.nonCompliantResponses;
fcffc585
RG
242 return false;
243 }
244
245 if (rqtype != qtype || rqclass != qclass || rqname != qname) {
246 return false;
247 }
248
249 return true;
250}
251
4ab01344 252static void restoreFlags(struct dnsheader* dh, uint16_t origFlags)
fcffc585
RG
253{
254 static const uint16_t rdMask = 1 << FLAGS_RD_OFFSET;
255 static const uint16_t cdMask = 1 << FLAGS_CD_OFFSET;
256 static const uint16_t restoreFlagsMask = UINT16_MAX & ~(rdMask | cdMask);
0f72fd5c
RG
257 uint16_t * flags = getFlagsFromDNSHeader(dh);
258 /* clear the flags we are about to restore */
259 *flags &= restoreFlagsMask;
260 /* only keep the flags we want to restore */
261 origFlags &= ~restoreFlagsMask;
262 /* set the saved flags as they were */
263 *flags |= origFlags;
264}
265
4ab01344 266static bool fixUpQueryTurnedResponse(DNSQuestion& dq, const uint16_t origFlags)
e7c732b8
RG
267{
268 restoreFlags(dq.dh, origFlags);
269
270 return addEDNSToQueryTurnedResponse(dq);
271}
272
3e425868 273static bool fixUpResponse(char** response, uint16_t* responseLen, size_t* responseSize, const DNSName& qname, uint16_t origFlags, bool ednsAdded, bool ecsAdded, std::vector<uint8_t>& rewrittenResponse, uint16_t addRoom, bool* zeroScope)
0f72fd5c 274{
fcffc585
RG
275 if (*responseLen < sizeof(dnsheader)) {
276 return false;
277 }
278
3e425868 279 struct dnsheader* dh = reinterpret_cast<struct dnsheader*>(*response);
c8c3d4e4
RG
280 restoreFlags(dh, origFlags);
281
282 if (*responseLen == sizeof(dnsheader)) {
283 return true;
284 }
285
fcffc585
RG
286 if(g_fixupCase) {
287 string realname = qname.toDNSString();
288 if (*responseLen >= (sizeof(dnsheader) + realname.length())) {
289 memcpy(*response + sizeof(dnsheader), realname.c_str(), realname.length());
290 }
291 }
292
ff73f02b 293 if (ednsAdded || ecsAdded) {
ceae225c 294 uint16_t optStart;
fcffc585
RG
295 size_t optLen = 0;
296 bool last = false;
297
11d5d247
RG
298 const std::string responseStr(*response, *responseLen);
299 int res = locateEDNSOptRR(responseStr, &optStart, &optLen, &last);
fcffc585
RG
300
301 if (res == 0) {
49c33a6c
RG
302 if (zeroScope) { // this finds if an EDNS Client Subnet scope was set, and if it is 0
303 size_t optContentStart = 0;
304 uint16_t optContentLen = 0;
305 /* we need at least 4 bytes after the option length (family: 2, source prefix-length: 1, scope prefix-length: 1) */
306 if (isEDNSOptionInOpt(responseStr, optStart, optLen, EDNSOptionCode::ECS, &optContentStart, &optContentLen) && optContentLen >= 4) {
307 /* see if the EDNS Client Subnet SCOPE PREFIX-LENGTH byte in position 3 is set to 0, which is the only thing
308 we care about. */
309 *zeroScope = responseStr.at(optContentStart + 3) == 0;
310 }
40669042 311 }
9837850d 312
ff73f02b
RG
313 if (ednsAdded) {
314 /* we added the entire OPT RR,
315 therefore we need to remove it entirely */
316 if (last) {
317 /* simply remove the last AR */
318 *responseLen -= optLen;
319 uint16_t arcount = ntohs(dh->arcount);
320 arcount--;
321 dh->arcount = htons(arcount);
322 }
323 else {
324 /* Removing an intermediary RR could lead to compression error */
11d5d247 325 if (rewriteResponseWithoutEDNS(responseStr, rewrittenResponse) == 0) {
ff73f02b
RG
326 *responseLen = rewrittenResponse.size();
327 if (addRoom && (UINT16_MAX - *responseLen) > addRoom) {
328 rewrittenResponse.reserve(*responseLen + addRoom);
329 }
330 *responseSize = rewrittenResponse.capacity();
331 *response = reinterpret_cast<char*>(rewrittenResponse.data());
332 }
333 else {
334 warnlog("Error rewriting content");
335 }
336 }
fcffc585
RG
337 }
338 else {
ff73f02b
RG
339 /* the OPT RR was already present, but without ECS,
340 we need to remove the ECS option if any */
341 if (last) {
342 /* nothing after the OPT RR, we can simply remove the
343 ECS option */
344 size_t existingOptLen = optLen;
11d5d247 345 removeEDNSOptionFromOPT(*response + optStart, &optLen, EDNSOptionCode::ECS);
ff73f02b 346 *responseLen -= (existingOptLen - optLen);
fcffc585
RG
347 }
348 else {
ff73f02b 349 /* Removing an intermediary RR could lead to compression error */
11d5d247 350 if (rewriteResponseWithoutEDNSOption(responseStr, EDNSOptionCode::ECS, rewrittenResponse) == 0) {
ff73f02b
RG
351 *responseLen = rewrittenResponse.size();
352 if (addRoom && (UINT16_MAX - *responseLen) > addRoom) {
353 rewrittenResponse.reserve(*responseLen + addRoom);
354 }
355 *responseSize = rewrittenResponse.capacity();
356 *response = reinterpret_cast<char*>(rewrittenResponse.data());
357 }
358 else {
359 warnlog("Error rewriting content");
360 }
fcffc585
RG
361 }
362 }
363 }
364 }
365
366 return true;
367}
368
#ifdef HAVE_DNSCRYPT
/* Encrypt a response in place when it answers a DNSCrypt query.

   response, responseLen, responseSize - buffer, in/out length and capacity
   tcp           - passed through to DNSCryptQuery::encryptResponse()
   dnsCryptQuery - DNSCrypt context of the query; when null nothing is done
   dh, dhCopy    - when all are non-null, the header pointed to by *dh is
                   copied into dhCopy and *dh is redirected to that copy, so
                   the caller keeps a readable header after encryption

   Returns false when encryption failed, meaning the response has to be dropped. */
static bool encryptResponse(char* response, uint16_t* responseLen, size_t responseSize, bool tcp, std::shared_ptr<DNSCryptQuery> dnsCryptQuery, dnsheader** dh, dnsheader* dhCopy)
{
  if (!dnsCryptQuery) {
    return true;
  }

  /* save the original header before encrypting it in place */
  if (dh != nullptr && *dh != nullptr && dhCopy != nullptr) {
    memcpy(dhCopy, *dh, sizeof(dnsheader));
    *dh = dhCopy;
  }

  uint16_t encryptedLen = 0;
  if (dnsCryptQuery->encryptResponse(response, *responseLen, responseSize, tcp, &encryptedLen) != 0) {
    /* dropping response */
    vinfolog("Error encrypting the response, dropping.");
    return false;
  }

  *responseLen = encryptedLen;
  return true;
}
#endif /* HAVE_DNSCRYPT */
fcffc585 393
3e425868
RG
394static bool applyRulesToResponse(LocalStateHolder<vector<DNSDistResponseRuleAction> >& localRespRulactions, DNSResponse& dr)
395{
396 DNSResponseAction::Action action=DNSResponseAction::Action::None;
397 std::string ruleresult;
398 for(const auto& lr : *localRespRulactions) {
399 if(lr.d_rule->matches(&dr)) {
400 lr.d_rule->d_matches++;
401 action=(*lr.d_action)(&dr, &ruleresult);
402 switch(action) {
403 case DNSResponseAction::Action::Allow:
404 return true;
405 break;
406 case DNSResponseAction::Action::Drop:
407 return false;
408 break;
409 case DNSResponseAction::Action::HeaderModify:
410 return true;
411 break;
412 case DNSResponseAction::Action::ServFail:
413 dr.dh->rcode = RCode::ServFail;
414 return true;
415 break;
416 /* non-terminal actions follow */
417 case DNSResponseAction::Action::Delay:
418 dr.delayMsec = static_cast<int>(pdns_stou(ruleresult)); // sorry
419 break;
420 case DNSResponseAction::Action::None:
421 break;
422 }
423 }
424 }
425
426 return true;
427}
428
429bool processResponse(char** response, uint16_t* responseLen, size_t* responseSize, LocalStateHolder<vector<DNSDistResponseRuleAction> >& localRespRulactions, DNSResponse& dr, size_t addRoom, std::vector<uint8_t>& rewrittenResponse, bool muted)
430{
431 if (!applyRulesToResponse(localRespRulactions, dr)) {
432 return false;
433 }
434
435 bool zeroScope = false;
436 if (!fixUpResponse(response, responseLen, responseSize, *dr.qname, dr.origFlags, dr.ednsAdded, dr.ecsAdded, rewrittenResponse, addRoom, dr.useZeroScope ? &zeroScope : nullptr)) {
437 return false;
438 }
439
440 if (dr.packetCache && !dr.skipCache) {
441 if (!dr.useZeroScope) {
442 /* if the query was not suitable for zero-scope, for
443 example because it had an existing ECS entry so the hash is
444 not really 'no ECS', so just insert it for the existing subnet
445 since:
446 - we don't have the correct hash for a non-ECS query
447 - inserting with hash computed before the ECS replacement but with
448 the subnet extracted _after_ the replacement would not work.
449 */
450 zeroScope = false;
451 }
452 // if zeroScope, pass the pre-ECS hash-key and do not pass the subnet to the cache
453 dr.packetCache->insert(zeroScope ? dr.cacheKeyNoECS : dr.cacheKey, zeroScope ? boost::none : dr.subnet, dr.origFlags, dr.dnssecOK, *dr.qname, dr.qtype, dr.qclass, *response, *responseLen, dr.tcp, dr.dh->rcode, dr.tempFailureTTL);
454 }
455
456#ifdef HAVE_DNSCRYPT
457 if (!muted) {
458 if (!encryptResponse(*response, responseLen, *responseSize, dr.tcp, dr.dnsCryptQuery, nullptr, nullptr)) {
459 return false;
460 }
461 }
7129b5c4 462#endif /* HAVE_DNSCRYPT */
3e425868
RG
463
464 return true;
465}
466
467static bool sendUDPResponse(int origFD, const char* response, const uint16_t responseLen, const int delayMsec, const ComboAddress& origDest, const ComboAddress& origRemote)
0f72fd5c 468{
fcffc585
RG
469 if(delayMsec && g_delay) {
470 DelayedPacket dp{origFD, string(response,responseLen), origRemote, origDest};
471 g_delay->submit(dp, delayMsec);
472 }
473 else {
20b019fa
RG
474 ssize_t res;
475 if(origDest.sin4.sin_family == 0) {
3e425868 476 res = sendto(origFD, response, responseLen, 0, reinterpret_cast<const struct sockaddr*>(&origRemote), origRemote.getSocklen());
20b019fa
RG
477 }
478 else {
479 res = sendfromto(origFD, response, responseLen, 0, origDest, origRemote);
480 }
481 if (res == -1) {
482 int err = errno;
483 vinfolog("Error sending response to %s: %s", origRemote.toStringWithPort(), strerror(err));
484 }
fcffc585
RG
485 }
486
487 return true;
488}
489
150105a2 490
fbf14b03 491int pickBackendSocketForSending(std::shared_ptr<DownstreamState>& state)
150105a2 492{
5bdbb83d 493 return state->sockets[state->socketsOffset++ % state->sockets.size()];
150105a2
RG
494}
495
5bdbb83d 496static void pickBackendSocketsReadyForReceiving(const std::shared_ptr<DownstreamState>& state, std::vector<int>& ready)
150105a2 497{
5bdbb83d 498 ready.clear();
150105a2 499
5bdbb83d
RG
500 if (state->sockets.size() == 1) {
501 ready.push_back(state->sockets[0]);
502 return ;
150105a2
RG
503 }
504
5bdbb83d
RG
505 {
506 std::lock_guard<std::mutex> lock(state->socketsLock);
507 state->mplexer->getAvailableFDs(ready, -1);
150105a2 508 }
150105a2
RG
509}
510
4632045b 511// listens on a dedicated socket, lobs answers from downstream servers to original requestors
9b73b71c 512void responderThread(std::shared_ptr<DownstreamState> dss)
66d1c0bf 513try {
519f5484 514 setThreadName("dnsdist/respond");
d8c19b98 515 auto localRespRulactions = g_resprulactions.getLocal();
11e1e08b 516 char packet[4096 + DNSCRYPT_MAX_RESPONSE_PADDING_AND_MAC_SIZE];
b50ee135 517 static_assert(sizeof(packet) <= UINT16_MAX, "Packet size should fit in a uint16_t");
3e425868
RG
518 /* when the answer is encrypted in place, we need to get a copy
519 of the original header before encryption to fill the ring buffer */
520 dnsheader cleartextDH;
ca404e94 521 vector<uint8_t> rewrittenResponse;
7267213a 522
66d1c0bf 523 uint16_t queryId = 0;
5bdbb83d
RG
524 std::vector<int> sockets;
525 sockets.reserve(dss->sockets.size());
526
7730131a 527 for(;;) {
57847d65 528 dnsheader* dh = reinterpret_cast<struct dnsheader*>(packet);
66d1c0bf 529 try {
5bdbb83d
RG
530 pickBackendSocketsReadyForReceiving(dss, sockets);
531 for (const auto& fd : sockets) {
532 ssize_t got = recv(fd, packet, sizeof(packet), 0);
533 char * response = packet;
534 size_t responseSize = sizeof(packet);
ca404e94 535
2aa2c0aa 536 if (got < 0 || static_cast<size_t>(got) < sizeof(dnsheader))
5bdbb83d 537 continue;
4f380af3 538
3e425868 539 uint16_t responseLen = static_cast<uint16_t>(got);
5bdbb83d 540 queryId = dh->id;
b50ee135 541
fbf14b03 542 if(queryId >= dss->idStates.size()) {
5bdbb83d 543 continue;
fbf14b03 544 }
4f380af3 545
5bdbb83d 546 IDState* ids = &dss->idStates[queryId];
a9489723 547 int64_t usageIndicator = ids->usageIndicator;
c9ba8478 548
311f19d5 549 if(!IDState::isInUse(usageIndicator)) {
9bd1a882 550 /* the corresponding state is marked as not in use, meaning that:
a9489723 551 - it was already cleaned up by another thread and the state is gone ;
9bd1a882
RG
552 - we already got a response for this query and this one is a duplicate.
553 Either way, we don't touch it.
554 */
5bdbb83d 555 continue;
9bd1a882 556 }
7267213a 557
9bd1a882 558 /* read the potential DOHUnit state as soon as possible, but don't use it
a9489723 559 until we have confirmed that we own this state by updating usageIndicator */
0956c5c5 560 auto du = ids->du;
5bdbb83d
RG
561 /* setting age to 0 to prevent the maintainer thread from
562 cleaning this IDS while we process the response.
5bdbb83d
RG
563 */
564 ids->age = 0;
a9489723 565 int origFD = ids->origFD;
fcffc585 566
e7c732b8
RG
567 unsigned int consumed = 0;
568 if (!responseContentMatches(response, responseLen, ids->qname, ids->qtype, ids->qclass, dss->remote, consumed)) {
5bdbb83d
RG
569 continue;
570 }
7267213a 571
3c72bc54 572 bool isDoH = du != nullptr;
a9489723
RG
573 /* atomically mark the state as available, but only if it has not been altered
574 in the meantime */
311f19d5 575 if (ids->tryMarkUnused(usageIndicator)) {
9bd1a882
RG
576 /* clear the potential DOHUnit asap, it's ours now
577 and since we just marked the state as unused,
578 someone could overwrite it. */
246ff31f 579 ids->du = nullptr;
71b86bd8
RG
580 /* we only decrement the outstanding counter if the value was not
581 altered in the meantime, which would mean that the state has been actively reused
582 and the other thread has not incremented the outstanding counter, so we don't
583 want it to be decremented twice. */
584 --dss->outstanding; // you'd think an attacker could game this, but we're using connected socket
0956c5c5 585 } else {
9bd1a882 586 /* someone updated the state in the meantime, we can't touch the existing pointer */
0956c5c5 587 du = nullptr;
f2d0c808
RG
588 /* since the state has been updated, we can't safely access it so let's just drop
589 this response */
590 continue;
71b86bd8 591 }
2c5db3f7 592
5bdbb83d 593 if(dh->tc && g_truncateTC) {
e0fd37ec 594 truncateTC(response, &responseLen, responseSize, consumed);
5bdbb83d 595 }
aeb36780 596
5bdbb83d 597 dh->id = ids->origID;
ca404e94 598
5bdbb83d 599 uint16_t addRoom = 0;
d0ae6360
RG
600 DNSResponse dr = makeDNSResponseFromIDState(*ids, dh, sizeof(packet), responseLen, false);
601 if (dr.dnsCryptQuery) {
5bdbb83d
RG
602 addRoom = DNSCRYPT_MAX_RESPONSE_PADDING_AND_MAC_SIZE;
603 }
ca404e94 604
3e425868
RG
605 memcpy(&cleartextDH, dr.dh, sizeof(cleartextDH));
606 if (!processResponse(&response, &responseLen, &responseSize, localRespRulactions, dr, addRoom, rewrittenResponse, ids->cs && ids->cs->muted)) {
607 continue;
5bdbb83d 608 }
886e2cf2 609
5bdbb83d 610 if (ids->cs && !ids->cs->muted) {
0956c5c5 611 if (du) {
fbf14b03
RG
612#ifdef HAVE_DNS_OVER_HTTPS
613 // DoH query
246ff31f 614 du->response = std::string(response, responseLen);
0956c5c5 615 if (send(du->rsock, &du, sizeof(du), 0) != sizeof(du)) {
9bd1a882
RG
616 /* at this point we have the only remaining pointer on this
617 DOHUnit object since we did set ids->du to nullptr earlier */
0956c5c5 618 delete du;
fbf14b03
RG
619 }
620#endif /* HAVE_DNS_OVER_HTTPS */
0956c5c5 621 du = nullptr;
fbf14b03
RG
622 }
623 else {
624 ComboAddress empty;
625 empty.sin4.sin_family = 0;
626 /* if ids->destHarvested is false, origDest holds the listening address.
627 We don't want to use that as a source since it could be 0.0.0.0 for example. */
628 sendUDPResponse(origFD, response, responseLen, dr.delayMsec, ids->destHarvested ? ids->origDest : empty, ids->origRemote);
629 }
5bdbb83d 630 }
66d1c0bf 631
cb167afd 632 ++g_stats.responses;
66d1c0bf 633
5bdbb83d 634 double udiff = ids->sentTime.udiff();
fbf14b03 635 vinfolog("Got answer from %s, relayed to %s%s, took %f usec", dss->remote.toStringWithPort(), ids->origRemote.toStringWithPort(),
3c72bc54 636 isDoH ? " (https)": "", udiff);
66d1c0bf 637
01f6920b
RG
638 struct timespec ts;
639 gettime(&ts);
3e425868 640 g_rings.insertResponse(ts, *dr.remote, *dr.qname, dr.qtype, static_cast<unsigned int>(udiff), static_cast<unsigned int>(got), cleartextDH, dss->remote);
66d1c0bf 641
61d10a4d
MH
642 switch (dh->rcode) {
643 case RCode::NXDomain:
644 ++g_stats.frontendNXDomain;
645 break;
646 case RCode::ServFail:
cb167afd 647 ++g_stats.servfailResponses;
61d10a4d
MH
648 ++g_stats.frontendServFail;
649 break;
650 case RCode::NoError:
651 ++g_stats.frontendNoError;
652 break;
cb167afd 653 }
5bdbb83d 654 dss->latencyUsec = (127.0 * dss->latencyUsec / 128.0) + udiff/128.0;
66d1c0bf 655
5bdbb83d 656 doLatencyStats(udiff);
fcadd56e 657
5bdbb83d
RG
658 rewrittenResponse.clear();
659 }
66d1c0bf 660 }
df560083 661 catch(const std::exception& e){
cd7fa253 662 vinfolog("Got an error in UDP responder thread while parsing a response from %s, id %d: %s", dss->remote.toStringWithPort(), queryId, e.what());
66d1c0bf 663 }
7730131a 664 }
7730131a 665}
66d1c0bf
RG
666catch(const std::exception& e)
667{
668 errlog("UDP responder thread died because of exception: %s", e.what());
66d1c0bf
RG
669}
670catch(const PDNSException& e)
671{
672 errlog("UDP responder thread died because of PowerDNS exception: %s", e.reason);
66d1c0bf
RG
673}
674catch(...)
675{
676 errlog("UDP responder thread died because of an exception: %s", "unknown");
66d1c0bf 677}
24d5cb00 678
5d7e6765 679bool DownstreamState::reconnect()
3c115e0f 680{
5d7e6765
RG
681 std::unique_lock<std::mutex> tl(connectLock, std::try_to_lock);
682 if (!tl.owns_lock()) {
683 /* we are already reconnecting */
684 return false;
685 }
686
b58f08e5 687 connected = false;
5bdbb83d 688 for (auto& fd : sockets) {
150105a2 689 if (fd != -1) {
5d7e6765 690 if (sockets.size() > 1) {
5bdbb83d
RG
691 std::lock_guard<std::mutex> lock(socketsLock);
692 mplexer->removeReadFD(fd);
693 }
150105a2
RG
694 /* shutdown() is needed to wake up recv() in the responderThread */
695 shutdown(fd, SHUT_RDWR);
696 close(fd);
697 fd = -1;
f99e3aaf 698 }
150105a2
RG
699 if (!IsAnyAddress(remote)) {
700 fd = SSocket(remote.sin4.sin_family, SOCK_DGRAM, 0);
701 if (!IsAnyAddress(sourceAddr)) {
702 SSetsockopt(fd, SOL_SOCKET, SO_REUSEADDR, 1);
703 SBind(fd, sourceAddr);
704 }
705 try {
706 SConnect(fd, remote);
5d7e6765 707 if (sockets.size() > 1) {
5bdbb83d
RG
708 std::lock_guard<std::mutex> lock(socketsLock);
709 mplexer->addReadFD(fd, [](int, boost::any) {});
710 }
150105a2
RG
711 connected = true;
712 }
713 catch(const std::runtime_error& error) {
714 infolog("Error connecting to new server with address %s: %s", remote.toStringWithPort(), error.what());
38069e7e
RG
715 connected = false;
716 break;
717 }
7565f4e6 718 }
38069e7e
RG
719 }
720
721 /* if at least one (re-)connection failed, close all sockets */
722 if (!connected) {
5bdbb83d 723 for (auto& fd : sockets) {
38069e7e 724 if (fd != -1) {
5d7e6765
RG
725 if (sockets.size() > 1) {
726 std::lock_guard<std::mutex> lock(socketsLock);
727 mplexer->removeReadFD(fd);
728 }
38069e7e
RG
729 /* shutdown() is needed to wake up recv() in the responderThread */
730 shutdown(fd, SHUT_RDWR);
731 close(fd);
732 fd = -1;
150105a2 733 }
7565f4e6 734 }
b58f08e5 735 }
5d7e6765
RG
736
737 return connected;
b58f08e5 738}
f2caf657
CHB
739void DownstreamState::hash()
740{
d58e616a 741 vinfolog("Computing hashes for id=%s and weight=%d", id, weight);
f2caf657 742 auto w = weight;
d58e616a 743 WriteLock wl(&d_lock);
f2caf657
CHB
744 hashes.clear();
745 while (w > 0) {
746 std::string uuid = boost::str(boost::format("%s-%d") % id % w);
747 unsigned int wshash = burtleCI((const unsigned char*)uuid.c_str(), uuid.size(), g_hashperturb);
748 hashes.insert(wshash);
749 --w;
750 }
751}
752
753void DownstreamState::setId(const boost::uuids::uuid& newId)
754{
755 id = newId;
d58e616a
CHB
756 // compute hashes only if already done
757 if (!hashes.empty()) {
758 hash();
759 }
f2caf657
CHB
760}
761
762void DownstreamState::setWeight(int newWeight)
763{
5a2bbe8c
CHB
764 if (newWeight < 1) {
765 errlog("Error setting server's weight: downstream weight value must be greater than 0.");
766 return ;
767 }
f2caf657 768 weight = newWeight;
d58e616a
CHB
769 if (!hashes.empty()) {
770 hash();
771 }
f2caf657 772}
b58f08e5 773
150105a2 774DownstreamState::DownstreamState(const ComboAddress& remote_, const ComboAddress& sourceAddr_, unsigned int sourceItf_, size_t numberOfSockets): remote(remote_), sourceAddr(sourceAddr_), sourceItf(sourceItf_)
b58f08e5 775{
d58e616a 776 pthread_rwlock_init(&d_lock, nullptr);
d61aa945 777 id = getUniqueID();
5d7e6765
RG
778 threadStarted.clear();
779
5bdbb83d
RG
780 mplexer = std::unique_ptr<FDMultiplexer>(FDMultiplexer::getMultiplexerSilent());
781
782 sockets.resize(numberOfSockets);
783 for (auto& fd : sockets) {
150105a2
RG
784 fd = -1;
785 }
786
b58f08e5
RG
787 if (!IsAnyAddress(remote)) {
788 reconnect();
f99e3aaf
RG
789 idStates.resize(g_maxOutstanding);
790 sw.start();
791 infolog("Added downstream server %s", remote.toStringWithPort());
fbe2a2e0 792 }
1720247e 793
3c115e0f 794}
795
// NOTE(review): g_luamutex presumably serialises access to g_lua -- confirm at the call sites.
773470ca 796std::mutex g_luamutex;
3f25bce6 797LuaContext g_lua;
3f25bce6 798
// NOTE(review): presumably the active server selection policy (firstAvailable,
// leastOutstanding, wrandom, ... below) -- confirm where it is set.
ecbe9133 799GlobalStateHolder<ServerPolicy> g_policy;
773470ca 800
497a6e3a 801shared_ptr<DownstreamState> firstAvailable(const NumberedServerVector& servers, const DNSQuestion* dq)
773470ca 802{
22b2b326 803 for(auto& d : servers) {
da4e7813 804 if(d.second->isUp() && d.second->qps.check())
805 return d.second;
773470ca 806 }
497a6e3a 807 return leastOutstanding(servers, dq);
2c5db3f7 808}
809
d1fba58e 810// get server with least outstanding queries, and within those, with the lowest order, and within those: the fastest
497a6e3a 811shared_ptr<DownstreamState> leastOutstanding(const NumberedServerVector& servers, const DNSQuestion* dq)
c9262563 812{
886e2cf2
RG
813 if (servers.size() == 1 && servers[0].second->isUp()) {
814 return servers[0].second;
815 }
816
d1fba58e 817 vector<pair<tuple<int,int,double>, shared_ptr<DownstreamState>>> poss;
818 /* so you might wonder, why do we go through this trouble? The data on which we sort could change during the sort,
819 which would suck royally and could even lead to crashes. So first we snapshot on what we sort, and then we sort */
478ce172 820 poss.reserve(servers.size());
0e41337b 821 for(auto& d : servers) {
da4e7813 822 if(d.second->isUp()) {
d1fba58e 823 poss.push_back({make_tuple(d.second->outstanding.load(), d.second->order, d.second->latencyUsec), d.second});
c9262563 824 }
825 }
826 if(poss.empty())
827 return shared_ptr<DownstreamState>();
828 nth_element(poss.begin(), poss.begin(), poss.end(), [](const decltype(poss)::value_type& a, const decltype(poss)::value_type& b) { return a.first < b.first; });
829 return poss.begin()->second;
830}
831
497a6e3a 832shared_ptr<DownstreamState> valrandom(unsigned int val, const NumberedServerVector& servers, const DNSQuestion* dq)
c9262563 833{
834 vector<pair<int, shared_ptr<DownstreamState>>> poss;
278403d3
DM
835 int sum = 0;
836 int max = std::numeric_limits<int>::max();
837
22b2b326 838 for(auto& d : servers) { // w=1, w=10 -> 1, 11
da4e7813 839 if(d.second->isUp()) {
278403d3
DM
840 // Don't overflow sum when adding high weights
841 if(d.second->weight > max - sum) {
842 sum = max;
843 } else {
844 sum += d.second->weight;
845 }
846
da4e7813 847 poss.push_back({sum, d.second});
c9262563 848 }
849 }
d2894425
NJ
850
851 // Catch poss & sum are empty to avoid SIGFPE
852 if(poss.empty())
853 return shared_ptr<DownstreamState>();
854
a7f3108c 855 int r = val % sum;
af619119 856 auto p = upper_bound(poss.begin(), poss.end(),r, [](int r_, const decltype(poss)::value_type& a) { return r_ < a.first;});
c9262563 857 if(p==poss.end())
858 return shared_ptr<DownstreamState>();
859 return p->second;
860}
861
497a6e3a 862shared_ptr<DownstreamState> wrandom(const NumberedServerVector& servers, const DNSQuestion* dq)
a7f3108c 863{
497a6e3a 864 return valrandom(random(), servers, dq);
a7f3108c 865}
866
36e763fa 867uint32_t g_hashperturb;
497a6e3a 868shared_ptr<DownstreamState> whashed(const NumberedServerVector& servers, const DNSQuestion* dq)
a7f3108c 869{
497a6e3a 870 return valrandom(dq->qname->hash(g_hashperturb), servers, dq);
a7f3108c 871}
872
1720247e
CHB
873shared_ptr<DownstreamState> chashed(const NumberedServerVector& servers, const DNSQuestion* dq)
874{
1720247e 875 unsigned int qhash = dq->qname->hash(g_hashperturb);
b712c51e
CHB
876 unsigned int sel = std::numeric_limits<unsigned int>::max();
877 unsigned int min = std::numeric_limits<unsigned int>::max();
878 shared_ptr<DownstreamState> ret = nullptr, first = nullptr;
1720247e
CHB
879
880 for (const auto& d: servers) {
881 if (d.second->isUp()) {
93ca495f 882 // make sure hashes have been computed
d58e616a
CHB
883 if (d.second->hashes.empty()) {
884 d.second->hash();
885 }
886 {
887 ReadLock rl(&(d.second->d_lock));
93ca495f
CHB
888 const auto& server = d.second;
889 // we want to keep track of the last hash
b712c51e
CHB
890 if (min > *(server->hashes.begin())) {
891 min = *(server->hashes.begin());
892 first = server;
93ca495f 893 }
b712c51e
CHB
894
895 auto hash_it = server->hashes.lower_bound(qhash);
896 if (hash_it != server->hashes.end()) {
897 if (*hash_it < sel) {
93ca495f
CHB
898 sel = *hash_it;
899 ret = server;
900 }
d58e616a 901 }
1720247e
CHB
902 }
903 }
904 }
93ca495f
CHB
905 if (ret != nullptr) {
906 return ret;
1720247e 907 }
b712c51e
CHB
908 if (first != nullptr) {
909 return first;
1720247e 910 }
93ca495f 911 return shared_ptr<DownstreamState>();
1720247e 912}
a7f3108c 913
497a6e3a 914shared_ptr<DownstreamState> roundrobin(const NumberedServerVector& servers, const DNSQuestion* dq)
5f504638 915{
da4e7813 916 NumberedServerVector poss;
c9262563 917
22b2b326 918 for(auto& d : servers) {
da4e7813 919 if(d.second->isUp()) {
5f504638 920 poss.push_back(d);
c9262563 921 }
5f504638 922 }
c9262563 923
ecbe9133 924 const auto *res=&poss;
32b86928 925 if(poss.empty() && !g_roundrobinFailOnNoServer)
ecbe9133 926 res = &servers;
c9262563 927
928 if(res->empty())
929 return shared_ptr<DownstreamState>();
930
931 static unsigned int counter;
932
da4e7813 933 return (*res)[(counter++) % res->size()].second;
5f504638 934}
935
e6841c9e 936ComboAddress g_serverControl{"127.0.0.1:5199"};
b076b34a 937
886e2cf2
RG
938std::shared_ptr<ServerPool> createPoolIfNotExists(pools_t& pools, const string& poolName)
939{
940 std::shared_ptr<ServerPool> pool;
941 pools_t::iterator it = pools.find(poolName);
942 if (it != pools.end()) {
943 pool = it->second;
944 }
945 else {
8f4f5186
RG
946 if (!poolName.empty())
947 vinfolog("Creating pool %s", poolName);
886e2cf2
RG
948 pool = std::make_shared<ServerPool>();
949 pools.insert(std::pair<std::string,std::shared_ptr<ServerPool> >(poolName, pool));
950 }
951 return pool;
952}
78c8047b 953
742c079a
RG
954void setPoolPolicy(pools_t& pools, const string& poolName, std::shared_ptr<ServerPolicy> policy)
955{
956 std::shared_ptr<ServerPool> pool = createPoolIfNotExists(pools, poolName);
957 if (!poolName.empty()) {
958 vinfolog("Setting pool %s server selection policy to %s", poolName, policy->name);
959 } else {
960 vinfolog("Setting default pool server selection policy to %s", policy->name);
961 }
962 pool->policy = policy;
963}
964
886e2cf2 965void addServerToPool(pools_t& pools, const string& poolName, std::shared_ptr<DownstreamState> server)
22b2b326 966{
886e2cf2 967 std::shared_ptr<ServerPool> pool = createPoolIfNotExists(pools, poolName);
c218b223 968 if (!poolName.empty()) {
8f4f5186 969 vinfolog("Adding server to pool %s", poolName);
c218b223 970 } else {
8f4f5186 971 vinfolog("Adding server to default pool");
c218b223 972 }
a1b1a29d 973 pool->addServer(server);
886e2cf2
RG
974}
975
976void removeServerFromPool(pools_t& pools, const string& poolName, std::shared_ptr<DownstreamState> server)
977{
8f4f5186 978 std::shared_ptr<ServerPool> pool = getPool(pools, poolName);
886e2cf2 979
c218b223 980 if (!poolName.empty()) {
8f4f5186 981 vinfolog("Removing server from pool %s", poolName);
c218b223 982 }
983 else {
8f4f5186 984 vinfolog("Removing server from default pool");
c218b223 985 }
886e2cf2 986
a1b1a29d 987 pool->removeServer(server);
886e2cf2
RG
988}
989
990std::shared_ptr<ServerPool> getPool(const pools_t& pools, const std::string& poolName)
991{
992 pools_t::const_iterator it = pools.find(poolName);
993
994 if (it == pools.end()) {
995 throw std::out_of_range("No pool named " + poolName);
996 }
997
998 return it->second;
999}
1000
a1b1a29d 1001NumberedServerVector getDownstreamCandidates(const pools_t& pools, const std::string& poolName)
886e2cf2
RG
1002{
1003 std::shared_ptr<ServerPool> pool = getPool(pools, poolName);
a1b1a29d 1004 return pool->getServers();
22b2b326 1005}
c9262563 1006
614239d0 1007static void spoofResponseFromString(DNSQuestion& dq, const string& spoofContent)
7791f83a
RG
1008{
1009 string result;
614239d0
RG
1010
1011 std::vector<std::string> addrs;
1012 stringtok(addrs, spoofContent, " ,");
1013
1014 if (addrs.size() == 1) {
1015 try {
1016 ComboAddress spoofAddr(spoofContent);
1017 SpoofAction sa({spoofAddr});
1018 sa(&dq, &result);
1019 }
1020 catch(const PDNSException &e) {
1021 SpoofAction sa(spoofContent); // CNAME then
1022 sa(&dq, &result);
1023 }
1024 } else {
1025 std::vector<ComboAddress> cas;
1026 for (const auto& addr : addrs) {
1027 try {
1028 cas.push_back(ComboAddress(addr));
1029 }
1030 catch (...) {
1031 }
1032 }
1033 SpoofAction sa(cas);
6ca7a40a 1034 sa(&dq, &result);
7791f83a
RG
1035 }
1036}
1037
2a28db86
RG
1038bool processRulesResult(const DNSAction::Action& action, DNSQuestion& dq, std::string& ruleresult, bool& drop)
1039{
1040 switch(action) {
1041 case DNSAction::Action::Allow:
1042 return true;
1043 break;
1044 case DNSAction::Action::Drop:
1045 ++g_stats.ruleDrop;
1046 drop = true;
1047 return true;
1048 break;
1049 case DNSAction::Action::Nxdomain:
1050 dq.dh->rcode = RCode::NXDomain;
1051 dq.dh->qr=true;
1052 ++g_stats.ruleNXDomain;
1053 return true;
1054 break;
1055 case DNSAction::Action::Refused:
1056 dq.dh->rcode = RCode::Refused;
1057 dq.dh->qr=true;
1058 ++g_stats.ruleRefused;
1059 return true;
1060 break;
1061 case DNSAction::Action::ServFail:
1062 dq.dh->rcode = RCode::ServFail;
1063 dq.dh->qr=true;
1064 ++g_stats.ruleServFail;
1065 return true;
1066 break;
1067 case DNSAction::Action::Spoof:
1068 spoofResponseFromString(dq, ruleresult);
1069 return true;
1070 break;
1071 case DNSAction::Action::Truncate:
1072 dq.dh->tc = true;
1073 dq.dh->qr = true;
1074 return true;
1075 break;
1076 case DNSAction::Action::HeaderModify:
1077 return true;
1078 break;
1079 case DNSAction::Action::Pool:
1080 dq.poolname=ruleresult;
1081 return true;
1082 break;
1083 case DNSAction::Action::NoRecurse:
1084 dq.dh->rd = false;
1085 return true;
1086 break;
1087 /* non-terminal actions follow */
1088 case DNSAction::Action::Delay:
1089 dq.delayMsec = static_cast<int>(pdns_stou(ruleresult)); // sorry
1090 break;
1091 case DNSAction::Action::None:
1092 /* fall-through */
1093 case DNSAction::Action::NoOp:
1094 break;
1095 }
1096
1097 /* false means that we don't stop the processing */
1098 return false;
1099}
1100
1101
1102static bool applyRulesToQuery(LocalHolders& holders, DNSQuestion& dq, const struct timespec& now)
e91084ce 1103{
4ab01344 1104 g_rings.insertQuery(now, *dq.remote, *dq.qname, dq.qtype, dq.len, *dq.dh);
e91084ce 1105
786e4d8c 1106 if(g_qcount.enabled) {
348ef1c6 1107 string qname = (*dq.qname).toLogString();
786e4d8c
RS
1108 bool countQuery{true};
1109 if(g_qcount.filter) {
1110 std::lock_guard<std::mutex> lock(g_luamutex);
dd1a3034 1111 std::tie (countQuery, qname) = g_qcount.filter(&dq);
786e4d8c
RS
1112 }
1113
1114 if(countQuery) {
1115 WriteLock wl(&g_qcount.queryLock);
1116 if(!g_qcount.records.count(qname)) {
1117 g_qcount.records[qname] = 0;
1118 }
1119 g_qcount.records[qname]++;
1120 }
1121 }
1122
0beaa5c8 1123 if(auto got = holders.dynNMGBlock->lookup(*dq.remote)) {
701f690b 1124 auto updateBlockStats = [&got]() {
cb167afd 1125 ++g_stats.dynBlocked;
701f690b 1126 got->second.blocks++;
1127 };
1128
e91084ce 1129 if(now < got->second.until) {
7b925432
RG
1130 DNSAction::Action action = got->second.action;
1131 if (action == DNSAction::Action::None) {
1132 action = g_dynBlockAction;
1133 }
477c86a0
RG
1134 switch (action) {
1135 case DNSAction::Action::NoOp:
1136 /* do nothing */
1137 break;
79ee8ff9
RG
1138
1139 case DNSAction::Action::Nxdomain:
1140 vinfolog("Query from %s turned into NXDomain because of dynamic block", dq.remote->toStringWithPort());
1141 updateBlockStats();
1142
1143 dq.dh->rcode = RCode::NXDomain;
1144 dq.dh->qr=true;
1145 return true;
1146
477c86a0 1147 case DNSAction::Action::Refused:
dd46e5e3 1148 vinfolog("Query from %s refused because of dynamic block", dq.remote->toStringWithPort());
701f690b 1149 updateBlockStats();
8477236d 1150
dd46e5e3 1151 dq.dh->rcode = RCode::Refused;
79ee8ff9 1152 dq.dh->qr = true;
dd46e5e3 1153 return true;
477c86a0
RG
1154
1155 case DNSAction::Action::Truncate:
8477236d 1156 if(!dq.tcp) {
701f690b 1157 updateBlockStats();
8477236d 1158 vinfolog("Query from %s truncated because of dynamic block", dq.remote->toStringWithPort());
1159 dq.dh->tc = true;
1160 dq.dh->qr = true;
1161 return true;
1162 }
1163 else {
348ef1c6 1164 vinfolog("Query from %s for %s over TCP *not* truncated because of dynamic block", dq.remote->toStringWithPort(), dq.qname->toLogString());
8477236d 1165 }
477c86a0 1166 break;
3d60b39a 1167 case DNSAction::Action::NoRecurse:
1168 updateBlockStats();
1169 vinfolog("Query from %s setting rd=0 because of dynamic block", dq.remote->toStringWithPort());
1170 dq.dh->rd = false;
1171 return true;
477c86a0 1172 default:
701f690b 1173 updateBlockStats();
dd46e5e3
RG
1174 vinfolog("Query from %s dropped because of dynamic block", dq.remote->toStringWithPort());
1175 return false;
1176 }
e91084ce
RG
1177 }
1178 }
1179
0beaa5c8 1180 if(auto got = holders.dynSMTBlock->lookup(*dq.qname)) {
701f690b 1181 auto updateBlockStats = [&got]() {
cb167afd 1182 ++g_stats.dynBlocked;
701f690b 1183 got->blocks++;
1184 };
1185
71c94675 1186 if(now < got->until) {
7b925432
RG
1187 DNSAction::Action action = got->action;
1188 if (action == DNSAction::Action::None) {
1189 action = g_dynBlockAction;
1190 }
477c86a0
RG
1191 switch (action) {
1192 case DNSAction::Action::NoOp:
1193 /* do nothing */
1194 break;
79ee8ff9 1195 case DNSAction::Action::Nxdomain:
348ef1c6 1196 vinfolog("Query from %s for %s turned into NXDomain because of dynamic block", dq.remote->toStringWithPort(), dq.qname->toLogString());
79ee8ff9
RG
1197 updateBlockStats();
1198
1199 dq.dh->rcode = RCode::NXDomain;
1200 dq.dh->qr=true;
1201 return true;
477c86a0 1202 case DNSAction::Action::Refused:
348ef1c6 1203 vinfolog("Query from %s for %s refused because of dynamic block", dq.remote->toStringWithPort(), dq.qname->toLogString());
701f690b 1204 updateBlockStats();
8477236d 1205
dd46e5e3
RG
1206 dq.dh->rcode = RCode::Refused;
1207 dq.dh->qr=true;
1208 return true;
477c86a0 1209 case DNSAction::Action::Truncate:
8477236d 1210 if(!dq.tcp) {
701f690b 1211 updateBlockStats();
8477236d 1212
348ef1c6 1213 vinfolog("Query from %s for %s truncated because of dynamic block", dq.remote->toStringWithPort(), dq.qname->toLogString());
8477236d 1214 dq.dh->tc = true;
1215 dq.dh->qr = true;
1216 return true;
1217 }
1218 else {
348ef1c6 1219 vinfolog("Query from %s for %s over TCP *not* truncated because of dynamic block", dq.remote->toStringWithPort(), dq.qname->toLogString());
8477236d 1220 }
477c86a0 1221 break;
3d60b39a 1222 case DNSAction::Action::NoRecurse:
1223 updateBlockStats();
1224 vinfolog("Query from %s setting rd=0 because of dynamic block", dq.remote->toStringWithPort());
1225 dq.dh->rd = false;
1226 return true;
477c86a0 1227 default:
701f690b 1228 updateBlockStats();
348ef1c6 1229 vinfolog("Query from %s for %s dropped because of dynamic block", dq.remote->toStringWithPort(), dq.qname->toLogString());
dd46e5e3
RG
1230 return false;
1231 }
71c94675 1232 }
1233 }
1234
e91084ce
RG
1235 DNSAction::Action action=DNSAction::Action::None;
1236 string ruleresult;
2a28db86 1237 bool drop = false;
0beaa5c8 1238 for(const auto& lr : *holders.rulactions) {
4d5959e6
RG
1239 if(lr.d_rule->matches(&dq)) {
1240 lr.d_rule->d_matches++;
1241 action=(*lr.d_action)(&dq, &ruleresult);
2a28db86 1242 if (processRulesResult(action, dq, ruleresult, drop)) {
3d60b39a 1243 break;
e91084ce
RG
1244 }
1245 }
1246 }
1247
2a28db86
RG
1248 if (drop) {
1249 return false;
1250 }
1251
e91084ce
RG
1252 return true;
1253}
1254
fbf14b03 1255ssize_t udpClientSendRequestToBackend(const std::shared_ptr<DownstreamState>& ss, const int sd, const char* request, const size_t requestLen, bool healthCheck)
fbe2a2e0 1256{
b58f08e5
RG
1257 ssize_t result;
1258
fbe2a2e0 1259 if (ss->sourceItf == 0) {
b58f08e5
RG
1260 result = send(sd, request, requestLen, 0);
1261 }
1262 else {
1263 struct msghdr msgh;
1264 struct iovec iov;
7bec330a 1265 cmsgbuf_aligned cbuf;
a2353842 1266 ComboAddress remote(ss->remote);
7bec330a
OM
1267 fillMSGHdr(&msgh, &iov, &cbuf, sizeof(cbuf), const_cast<char*>(request), requestLen, &remote);
1268 addCMsgSrcAddr(&msgh, &cbuf, &ss->sourceAddr, ss->sourceItf);
b58f08e5 1269 result = sendmsg(sd, &msgh, 0);
fbe2a2e0
RG
1270 }
1271
b58f08e5
RG
1272 if (result == -1) {
1273 int savederrno = errno;
1274 vinfolog("Error sending request to backend %s: %d", ss->remote.toStringWithPort(), savederrno);
1275
1276 /* This might sound silly, but on Linux send() might fail with EINVAL
1b126225
RG
1277 if the interface the socket was bound to doesn't exist anymore.
1278 We don't want to reconnect the real socket if the healthcheck failed,
1279 because it's not using the same socket.
1280 */
1281 if (!healthCheck && (savederrno == EINVAL || savederrno == ENODEV)) {
b58f08e5
RG
1282 ss->reconnect();
1283 }
fbe2a2e0
RG
1284 }
1285
b58f08e5 1286 return result;
fbe2a2e0
RG
1287}
1288
0beaa5c8 1289static bool isUDPQueryAcceptable(ClientState& cs, LocalHolders& holders, const struct msghdr* msgh, const ComboAddress& remote, ComboAddress& dest)
24d5cb00 1290{
0beaa5c8
RG
1291 if (msgh->msg_flags & MSG_TRUNC) {
1292 /* message was too large for our buffer */
1293 vinfolog("Dropping message too large for our buffer");
cb167afd 1294 ++g_stats.nonCompliantQueries;
0beaa5c8
RG
1295 return false;
1296 }
a4652d55 1297
0beaa5c8
RG
1298 if(!holders.acl->match(remote)) {
1299 vinfolog("Query from %s dropped because of ACL", remote.toStringWithPort());
cb167afd 1300 ++g_stats.aclDrops;
0beaa5c8 1301 return false;
2b3eefc3 1302 }
2b3eefc3 1303
0beaa5c8 1304 cs.queries++;
cb167afd 1305 ++g_stats.queries;
2b3eefc3 1306
0beaa5c8
RG
1307 if (HarvestDestinationAddress(msgh, &dest)) {
1308 /* we don't get the port, only the address */
1309 dest.sin4.sin_port = cs.local.sin4.sin_port;
1310 }
1311 else {
1312 dest.sin4.sin_family = 0;
2b3eefc3 1313 }
549d63c9 1314
0beaa5c8
RG
1315 return true;
1316}
1317
4ab01344 1318boost::optional<std::vector<uint8_t>> checkDNSCryptQuery(const ClientState& cs, const char* query, uint16_t& len, std::shared_ptr<DNSCryptQuery>& dnsCryptQuery, time_t now, bool tcp)
0beaa5c8
RG
1319{
1320 if (cs.dnscryptCtx) {
7129b5c4 1321#ifdef HAVE_DNSCRYPT
0beaa5c8
RG
1322 vector<uint8_t> response;
1323 uint16_t decryptedQueryLen = 0;
2b3eefc3 1324
43234e76 1325 dnsCryptQuery = std::make_shared<DNSCryptQuery>(cs.dnscryptCtx);
0beaa5c8 1326
4ab01344 1327 bool decrypted = handleDNSCryptQuery(const_cast<char*>(query), len, dnsCryptQuery, &decryptedQueryLen, tcp, now, response);
0beaa5c8
RG
1328
1329 if (!decrypted) {
1330 if (response.size() > 0) {
4ab01344 1331 return response;
2b3eefc3 1332 }
4ab01344 1333 throw std::runtime_error("Unable to decrypt DNSCrypt query, dropping.");
0beaa5c8 1334 }
2b3eefc3 1335
0beaa5c8 1336 len = decryptedQueryLen;
7129b5c4 1337#endif /* HAVE_DNSCRYPT */
0beaa5c8 1338 }
4ab01344 1339 return boost::none;
0beaa5c8 1340}
2b3eefc3 1341
0beaa5c8
RG
1342bool checkQueryHeaders(const struct dnsheader* dh)
1343{
1344 if (dh->qr) { // don't respond to responses
cb167afd 1345 ++g_stats.nonCompliantQueries;
0beaa5c8
RG
1346 return false;
1347 }
2b3eefc3 1348
0beaa5c8 1349 if (dh->qdcount == 0) {
cb167afd 1350 ++g_stats.emptyQueries;
0beaa5c8
RG
1351 return false;
1352 }
0ba5eecf 1353
0beaa5c8 1354 if (dh->rd) {
cb167afd 1355 ++g_stats.rdQueries;
0beaa5c8 1356 }
e91084ce 1357
0beaa5c8
RG
1358 return true;
1359}
963bef8d 1360
#if defined(HAVE_RECVMMSG) && defined(HAVE_SENDMMSG) && defined(MSG_WAITFORONE)
/* Fills 'outMsg' so that 'response' is sent to 'remote'. The source address is set to
   'dest' through ancillary data stored in 'cbuf', unless 'dest' is unknown (family of 0),
   in which case no ancillary data is attached. 'cs' is not used here. */
static void queueResponse(const ClientState& cs, const char* response, uint16_t responseLen, const ComboAddress& dest, const ComboAddress& remote, struct mmsghdr& outMsg, struct iovec* iov, cmsgbuf_aligned* cbuf)
{
  outMsg.msg_len = 0;
  fillMSGHdr(&outMsg.msg_hdr, iov, nullptr, 0, const_cast<char*>(response), responseLen, const_cast<ComboAddress*>(&remote));

  const bool destKnown = (dest.sin4.sin_family != 0);
  if (destKnown) {
    addCMsgSrcAddr(&outMsg.msg_hdr, cbuf, &dest, 0);
  }
  else {
    outMsg.msg_hdr.msg_control = nullptr;
  }
}
#endif /* defined(HAVE_RECVMMSG) && defined(HAVE_SENDMMSG) && defined(MSG_WAITFORONE) */
43eeadc1 1375
3e425868
RG
1376/* self-generated responses or cache hits */
1377static bool prepareOutgoingResponse(LocalHolders& holders, ClientState& cs, DNSQuestion& dq, bool cacheHit)
54aaa82b 1378{
3e425868 1379 DNSResponse dr(dq.qname, dq.qtype, dq.qclass, dq.consumed, dq.local, dq.remote, reinterpret_cast<dnsheader*>(dq.dh), dq.size, dq.len, dq.tcp, dq.queryTime);
4ab01344 1380
54aaa82b 1381#ifdef HAVE_PROTOBUF
1382 dr.uniqueId = dq.uniqueId;
1383#endif
1384 dr.qTag = dq.qTag;
3e425868 1385 dr.delayMsec = dq.delayMsec;
54aaa82b 1386
3e425868
RG
1387 if (!applyRulesToResponse(cacheHit ? holders.cacheHitRespRulactions : holders.selfAnsweredRespRulactions, dr)) {
1388 return false;
54aaa82b 1389 }
1390
3e425868
RG
1391 /* in case a rule changed it */
1392 dq.delayMsec = dr.delayMsec;
1393
54aaa82b 1394#ifdef HAVE_DNSCRYPT
7129b5c4 1395 if (!cs.muted) {
3e425868
RG
1396 if (!encryptResponse(reinterpret_cast<char*>(dq.dh), &dq.len, dq.size, dq.tcp, dq.dnsCryptQuery, nullptr, nullptr)) {
1397 return false;
54aaa82b 1398 }
54aaa82b 1399 }
7129b5c4 1400#endif /* HAVE_DNSCRYPT */
54aaa82b 1401
389d903a
RG
1402 if (cacheHit) {
1403 ++g_stats.cacheHits;
1404 }
3e425868 1405
61d10a4d
MH
1406 switch (dr.dh->rcode) {
1407 case RCode::NXDomain:
1408 ++g_stats.frontendNXDomain;
1409 break;
1410 case RCode::ServFail:
1411 ++g_stats.frontendServFail;
1412 break;
1413 case RCode::NoError:
1414 ++g_stats.frontendNoError;
1415 break;
1416 }
3e425868 1417
54aaa82b 1418 doLatencyStats(0); // we're not going to measure this
3e425868 1419 return true;
54aaa82b 1420}
1421
3e425868 1422ProcessQueryResult processQuery(DNSQuestion& dq, ClientState& cs, LocalHolders& holders, std::shared_ptr<DownstreamState>& selectedBackend)
0beaa5c8 1423{
4ab01344 1424 const uint16_t queryId = ntohs(dq.dh->id);
2b3eefc3 1425
0beaa5c8 1426 try {
43234e76
RG
1427 /* we need an accurate ("real") value for the response and
1428 to store into the IDS, but not for insertion into the
1429 rings for example */
43234e76
RG
1430 struct timespec now;
1431 gettime(&now);
2efd427d 1432
2a28db86 1433 if (!applyRulesToQuery(holders, dq, now)) {
3e425868 1434 return ProcessQueryResult::Drop;
0beaa5c8 1435 }
b63add03 1436
0beaa5c8 1437 if(dq.dh->qr) { // something turned it into a response
4ab01344 1438 fixUpQueryTurnedResponse(dq, dq.origFlags);
0beaa5c8 1439
3e425868
RG
1440 if (!prepareOutgoingResponse(holders, cs, dq, false)) {
1441 return ProcessQueryResult::Drop;
22b2b326 1442 }
5f504638 1443
3e425868
RG
1444 ++g_stats.selfAnswered;
1445 return ProcessQueryResult::SendAnswer;
0beaa5c8
RG
1446 }
1447
2a28db86 1448 std::shared_ptr<ServerPool> serverPool = getPool(*holders.pools, dq.poolname);
4ab01344 1449 dq.packetCache = serverPool->packetCache;
a1b1a29d 1450 auto policy = *(holders.policy);
0beaa5c8 1451 if (serverPool->policy != nullptr) {
a1b1a29d 1452 policy = *(serverPool->policy);
0beaa5c8 1453 }
a1b1a29d
RG
1454 auto servers = serverPool->getServers();
1455 if (policy.isLua) {
0beaa5c8 1456 std::lock_guard<std::mutex> lock(g_luamutex);
3e425868 1457 selectedBackend = policy.policy(servers, &dq);
a1b1a29d
RG
1458 }
1459 else {
3e425868 1460 selectedBackend = policy.policy(servers, &dq);
0beaa5c8 1461 }
228e4fe8 1462
9837850d 1463 uint16_t cachedResponseSize = dq.size;
3e425868 1464 uint32_t allowExpired = selectedBackend ? 0 : g_staleCacheEntriesTTL;
9837850d 1465
4ab01344
RG
1466 if (dq.packetCache && !dq.skipCache) {
1467 dq.dnssecOK = (getEDNSZ(dq) & EDNS_HEADER_FLAG_DO);
1ef18cab 1468 }
1469
3e425868 1470 if (dq.useECS && ((selectedBackend && selectedBackend->useECS) || (!selectedBackend && serverPool->getECS()))) {
389d903a 1471 // we special case our cache in case a downstream explicitly gave us a universally valid response with a 0 scope
3e425868 1472 if (dq.packetCache && !dq.skipCache && (!selectedBackend || !selectedBackend->disableZeroScope) && dq.packetCache->isECSParsingEnabled()) {
4ab01344 1473 if (dq.packetCache->get(dq, dq.consumed, dq.dh->id, reinterpret_cast<char*>(dq.dh), &cachedResponseSize, &dq.cacheKeyNoECS, dq.subnet, dq.dnssecOK, allowExpired)) {
3e425868
RG
1474 dq.len = cachedResponseSize;
1475
1476 if (!prepareOutgoingResponse(holders, cs, dq, true)) {
1477 return ProcessQueryResult::Drop;
1478 }
1479
1480 return ProcessQueryResult::SendAnswer;
389d903a
RG
1481 }
1482
4ab01344 1483 if (!dq.subnet) {
389d903a 1484 /* there was no existing ECS on the query, enable the zero-scope feature */
4ab01344 1485 dq.useZeroScope = true;
389d903a 1486 }
9837850d 1487 }
389d903a 1488
4ab01344
RG
1489 if (!handleEDNSClientSubnet(dq, &(dq.ednsAdded), &(dq.ecsAdded), g_preserveTrailingData)) {
1490 vinfolog("Dropping query from %s because we couldn't insert the ECS value", dq.remote->toStringWithPort());
3e425868 1491 return ProcessQueryResult::Drop;
886e2cf2 1492 }
0beaa5c8 1493 }
886e2cf2 1494
4ab01344
RG
1495 if (dq.packetCache && !dq.skipCache) {
1496 if (dq.packetCache->get(dq, dq.consumed, dq.dh->id, reinterpret_cast<char*>(dq.dh), &cachedResponseSize, &dq.cacheKey, dq.subnet, dq.dnssecOK, allowExpired)) {
3e425868
RG
1497 dq.len = cachedResponseSize;
1498
1499 if (!prepareOutgoingResponse(holders, cs, dq, true)) {
1500 return ProcessQueryResult::Drop;
1501 }
1502
1503 return ProcessQueryResult::SendAnswer;
886e2cf2 1504 }
cb167afd 1505 ++g_stats.cacheMisses;
0beaa5c8 1506 }
886e2cf2 1507
3e425868 1508 if(!selectedBackend) {
cb167afd 1509 ++g_stats.noPolicy;
26a3cdb7 1510
348ef1c6 1511 vinfolog("%s query for %s|%s from %s, no policy applied", g_servFailOnNoPolicy ? "ServFailed" : "Dropped", dq.qname->toLogString(), QType(dq.qtype).getName(), dq.remote->toStringWithPort());
3e425868 1512 if (g_servFailOnNoPolicy) {
4ab01344 1513 restoreFlags(dq.dh, dq.origFlags);
26a3cdb7 1514
0beaa5c8
RG
1515 dq.dh->rcode = RCode::ServFail;
1516 dq.dh->qr = true;
26a3cdb7 1517
3e425868
RG
1518 if (!prepareOutgoingResponse(holders, cs, dq, false)) {
1519 return ProcessQueryResult::Drop;
1520 }
be6c318f 1521 // no response-only statistics counter to update.
3e425868 1522 return ProcessQueryResult::SendAnswer;
1ea747c0 1523 }
3e425868
RG
1524
1525 return ProcessQueryResult::Drop;
0beaa5c8 1526 }
1ea747c0 1527
3e425868
RG
1528 if (dq.addXPF && selectedBackend->xpfRRCode != 0) {
1529 addXPF(dq, selectedBackend->xpfRRCode, g_preserveTrailingData);
5cc8371b
RG
1530 }
1531
3e425868
RG
1532 selectedBackend->queries++;
1533 return ProcessQueryResult::PassToBackend;
4ab01344
RG
1534 }
1535 catch(const std::exception& e){
1536 vinfolog("Got an error while parsing a %s query from %s, id %d: %s", (dq.tcp ? "TCP" : "UDP"), dq.remote->toStringWithPort(), queryId, e.what());
4ab01344 1537 }
3e425868 1538 return ProcessQueryResult::Drop;
4ab01344
RG
1539}
1540
7bec330a 1541static void processUDPQuery(ClientState& cs, LocalHolders& holders, const struct msghdr* msgh, const ComboAddress& remote, ComboAddress& dest, char* query, uint16_t len, size_t queryBufferSize, struct mmsghdr* responsesVect, unsigned int* queuedResponses, struct iovec* respIOV, cmsgbuf_aligned* respCBuf)
4ab01344
RG
1542{
1543 assert(responsesVect == nullptr || (queuedResponses != nullptr && respIOV != nullptr && respCBuf != nullptr));
1544 uint16_t queryId = 0;
1545
1546 try {
1547 if (!isUDPQueryAcceptable(cs, holders, msgh, remote, dest)) {
1548 return;
1549 }
1550
1551 /* we need an accurate ("real") value for the response and
1552 to store into the IDS, but not for insertion into the
1553 rings for example */
1554 struct timespec queryRealTime;
4ab01344
RG
1555 gettime(&queryRealTime, true);
1556
1557 std::shared_ptr<DNSCryptQuery> dnsCryptQuery = nullptr;
4ab01344
RG
1558 auto dnsCryptResponse = checkDNSCryptQuery(cs, query, len, dnsCryptQuery, queryRealTime.tv_sec, false);
1559 if (dnsCryptResponse) {
1560 sendUDPResponse(cs.udpFD, reinterpret_cast<char*>(dnsCryptResponse->data()), static_cast<uint16_t>(dnsCryptResponse->size()), 0, dest, remote);
1561 return;
1562 }
4ab01344
RG
1563
1564 struct dnsheader* dh = reinterpret_cast<struct dnsheader*>(query);
1565 queryId = ntohs(dh->id);
1566
1567 if (!checkQueryHeaders(dh)) {
1568 return;
1569 }
1570
1571 uint16_t qtype, qclass;
1572 unsigned int consumed = 0;
1573 DNSName qname(query, len, sizeof(dnsheader), false, &qtype, &qclass, &consumed);
1574 DNSQuestion dq(&qname, qtype, qclass, consumed, dest.sin4.sin_family != 0 ? &dest : &cs.local, &remote, dh, queryBufferSize, len, false, &queryRealTime);
1575 dq.dnsCryptQuery = std::move(dnsCryptQuery);
3e425868
RG
1576 std::shared_ptr<DownstreamState> ss{nullptr};
1577 auto result = processQuery(dq, cs, holders, ss);
4ab01344 1578
3e425868
RG
1579 if (result == ProcessQueryResult::Drop) {
1580 return;
1581 }
1582
1583 if (result == ProcessQueryResult::SendAnswer) {
4ab01344 1584#if defined(HAVE_RECVMMSG) && defined(HAVE_SENDMMSG) && defined(MSG_WAITFORONE)
3e425868
RG
1585 if (dq.delayMsec == 0 && responsesVect != nullptr) {
1586 queueResponse(cs, reinterpret_cast<char*>(dq.dh), dq.len, *dq.local, *dq.remote, responsesVect[*queuedResponses], respIOV, respCBuf);
4ab01344
RG
1587 (*queuedResponses)++;
1588 return;
1589 }
1590#endif /* defined(HAVE_RECVMMSG) && defined(HAVE_SENDMMSG) && defined(MSG_WAITFORONE) */
6d192f51
RG
1591 /* we use dest, always, because we don't want to use the listening address to send a response since it could be 0.0.0.0 */
1592 sendUDPResponse(cs.udpFD, reinterpret_cast<char*>(dq.dh), dq.len, dq.delayMsec, dest, *dq.remote);
3e425868
RG
1593 return;
1594 }
4ab01344 1595
3e425868 1596 if (result != ProcessQueryResult::PassToBackend || ss == nullptr) {
4ab01344
RG
1597 return;
1598 }
11e1e08b 1599
0beaa5c8
RG
1600 unsigned int idOffset = (ss->idOffset++) % ss->idStates.size();
1601 IDState* ids = &ss->idStates[idOffset];
1602 ids->age = 0;
0956c5c5
RG
1603 DOHUnit* du = nullptr;
1604
9bd1a882
RG
1605 /* that means that the state was in use, possibly with an allocated
1606 DOHUnit that we will need to handle, but we can't touch it before
1607 confirming that we now own this state */
311f19d5 1608 if (ids->isInUse()) {
0956c5c5
RG
1609 du = ids->du;
1610 }
c9ba8478 1611
a9489723 1612 /* we atomically replace the value, we now own this state */
311f19d5
RG
1613 if (!ids->markAsUsed()) {
1614 /* the state was not in use.
9bd1a882 1615 we reset 'du' because it might have still been in use when we read it. */
0956c5c5 1616 du = nullptr;
9bd1a882 1617 ++ss->outstanding;
71b86bd8 1618 }
0beaa5c8 1619 else {
9bd1a882
RG
1620 /* we are reusing a state, no change in outstanding but if there was an existing DOHUnit we need
1621 to handle it because it's about to be overwritten. */
a9489723 1622 ids->du = nullptr;
fbf14b03 1623 ++ss->reuseds;
cb167afd 1624 ++g_stats.downstreamTimeouts;
0956c5c5 1625 handleDOHTimeout(du);
0beaa5c8 1626 }
0e41337b 1627
0beaa5c8 1628 ids->cs = &cs;
a9489723 1629 ids->origFD = cs.udpFD;
0beaa5c8 1630 ids->origID = dh->id;
d0ae6360 1631 setIDStateFromDNSQuestion(*ids, dq, std::move(qname));
0beaa5c8
RG
1632
1633 /* If we couldn't harvest the real dest addr, still
1634 write down the listening addr since it will be useful
1635 (especially if it's not an 'any' one).
1636 We need to keep track of which one it is since we may
1637 want to use the real but not the listening addr to reply.
1638 */
1639 if (dest.sin4.sin_family != 0) {
1640 ids->origDest = dest;
1641 ids->destHarvested = true;
1642 }
1643 else {
1644 ids->origDest = cs.local;
1645 ids->destHarvested = false;
1646 }
7129b5c4 1647
0beaa5c8 1648 dh->id = idOffset;
ca404e94 1649
38069e7e 1650 int fd = pickBackendSocketForSending(ss);
150105a2 1651 ssize_t ret = udpClientSendRequestToBackend(ss, fd, query, dq.len);
11e1e08b 1652
0beaa5c8 1653 if(ret < 0) {
fbf14b03 1654 ++ss->sendErrors;
cb167afd 1655 ++g_stats.downstreamSendErrors;
0beaa5c8 1656 }
ca404e94 1657
348ef1c6 1658 vinfolog("Got query for %s|%s from %s, relayed to %s", ids->qname.toLogString(), QType(ids->qtype).getName(), remote.toStringWithPort(), ss->getName());
0beaa5c8
RG
1659 }
1660 catch(const std::exception& e){
1661 vinfolog("Got an error in UDP question thread while parsing a query from %s, id %d: %s", remote.toStringWithPort(), queryId, e.what());
1662 }
1663}
1664
#if defined(HAVE_RECVMMSG) && defined(HAVE_SENDMMSG) && defined(MSG_WAITFORONE)
/* Receive loop of a UDP frontend getting up to g_udpVectorSize queries per recvmmsg()
   call. Every received query goes through processUDPQuery(); the responses that can be
   sent right away are gathered and sent with a single sendmmsg() call.
   This function never returns. */
static void MultipleMessagesUDPClientThread(ClientState* cs, LocalHolders& holders)
{
  struct MMReceiver
  {
    char packet[4096];
    ComboAddress remote;
    ComboAddress dest;
    struct iovec iov;
    /* used by HarvestDestinationAddress */
    cmsgbuf_aligned cbuf;
  };
  const size_t vectSize = g_udpVectorSize;
  /* the actual buffer is larger because:
     - we may have to add EDNS and/or ECS
     - we use it for self-generated responses (from rule or cache)
     but we only accept incoming payloads up to that size
  */
  static_assert(s_udpIncomingBufferSize <= sizeof(MMReceiver::packet), "the incoming buffer size should not be larger than sizeof(MMReceiver::packet)");

  auto recvData = std::unique_ptr<MMReceiver[]>(new MMReceiver[vectSize]);
  auto msgVec = std::unique_ptr<struct mmsghdr[]>(new struct mmsghdr[vectSize]);
  auto outMsgVec = std::unique_ptr<struct mmsghdr[]>(new struct mmsghdr[vectSize]);

  /* initialize the structures needed to receive our messages */
  for (size_t slot = 0; slot < vectSize; ++slot) {
    auto& receiver = recvData[slot];
    receiver.remote.sin4.sin_family = cs->local.sin4.sin_family;
    fillMSGHdr(&msgVec[slot].msg_hdr, &receiver.iov, &receiver.cbuf, sizeof(receiver.cbuf), receiver.packet, s_udpIncomingBufferSize, &receiver.remote);
  }

  /* go now */
  while (true) {

    /* reset the IO vector, since it's also used to send the vector of responses
       to avoid having to copy the data around */
    for (size_t slot = 0; slot < vectSize; ++slot) {
      auto& receiver = recvData[slot];
      receiver.iov.iov_base = receiver.packet;
      receiver.iov.iov_len = sizeof(receiver.packet);
    }

    /* block until we have at least one message ready, but return
       as many as possible to save the syscall costs */
    const int msgsGot = recvmmsg(cs->udpFD, msgVec.get(), vectSize, MSG_WAITFORONE | MSG_TRUNC, nullptr);

    if (msgsGot <= 0) {
      vinfolog("Getting UDP messages via recvmmsg() failed with: %s", strerror(errno));
      continue;
    }

    unsigned int msgsToSend = 0;

    /* process the received messages */
    for (int msgIdx = 0; msgIdx < msgsGot; ++msgIdx) {
      auto& receiver = recvData[msgIdx];
      const struct msghdr* msgh = &msgVec[msgIdx].msg_hdr;
      const unsigned int got = msgVec[msgIdx].msg_len;
      const ComboAddress& remote = receiver.remote;

      /* too short to even hold a DNS header */
      if (static_cast<size_t>(got) < sizeof(struct dnsheader)) {
        ++g_stats.nonCompliantQueries;
        continue;
      }

      processUDPQuery(*cs, holders, msgh, remote, receiver.dest, receiver.packet, static_cast<uint16_t>(got), sizeof(receiver.packet), outMsgVec.get(), &msgsToSend, &receiver.iov, &receiver.cbuf);
    }

    /* immediate (not delayed or sent to a backend) responses (mostly from a rule, dynamic block
       or the cache) can be sent in batch too */

    const bool somethingToSend = (msgsToSend > 0 && msgsToSend <= static_cast<unsigned int>(msgsGot));
    if (!somethingToSend) {
      continue;
    }

    const int sent = sendmmsg(cs->udpFD, outMsgVec.get(), msgsToSend, 0);

    if (sent < 0 || static_cast<unsigned int>(sent) != msgsToSend) {
      vinfolog("Error sending responses with sendmmsg() (%d on %u): %s", sent, msgsToSend, strerror(errno));
    }
  }
}
#endif /* defined(HAVE_RECVMMSG) && defined(HAVE_SENDMMSG) && defined(MSG_WAITFORONE) */
1745
1746// listens to incoming queries, sends out to downstream servers, noting the intended return path
9b73b71c 1747static void udpClientThread(ClientState* cs)
0beaa5c8
RG
1748try
1749{
519f5484 1750 setThreadName("dnsdist/udpClie");
0beaa5c8
RG
1751 LocalHolders holders;
1752
1753#if defined(HAVE_RECVMMSG) && defined(HAVE_SENDMMSG) && defined(MSG_WAITFORONE)
1754 if (g_udpVectorSize > 1) {
1755 MultipleMessagesUDPClientThread(cs, holders);
1756
1757 }
1758 else
1759#endif /* defined(HAVE_RECVMMSG) && defined(HAVE_SENDMMSG) && defined(MSG_WAITFORONE) */
1760 {
1761 char packet[4096];
1762 /* the actual buffer is larger because:
1763 - we may have to add EDNS and/or ECS
1764 - we use it for self-generated responses (from rule or cache)
1765 but we only accept incoming payloads up to that size
1766 */
1767 static_assert(s_udpIncomingBufferSize <= sizeof(packet), "the incoming buffer size should not be larger than sizeof(MMReceiver::packet)");
1768 struct msghdr msgh;
1769 struct iovec iov;
1770 /* used by HarvestDestinationAddress */
7bec330a 1771 cmsgbuf_aligned cbuf;
0beaa5c8
RG
1772
1773 ComboAddress remote;
1774 ComboAddress dest;
1775 remote.sin4.sin_family = cs->local.sin4.sin_family;
7bec330a 1776 fillMSGHdr(&msgh, &iov, &cbuf, sizeof(cbuf), packet, sizeof(packet), &remote);
0beaa5c8
RG
1777
1778 for(;;) {
1779 ssize_t got = recvmsg(cs->udpFD, &msgh, 0);
1780
1781 if (got < 0 || static_cast<size_t>(got) < sizeof(struct dnsheader)) {
cb167afd 1782 ++g_stats.nonCompliantQueries;
0beaa5c8
RG
1783 continue;
1784 }
1785
1786 processUDPQuery(*cs, holders, &msgh, remote, dest, packet, static_cast<uint16_t>(got), s_udpIncomingBufferSize, nullptr, nullptr, nullptr, nullptr);
1787 }
1788 }
24d5cb00 1789}
2b3eefc3 1790catch(const std::exception &e)
a4652d55 1791{
1792 errlog("UDP client thread died because of exception: %s", e.what());
a4652d55 1793}
2b3eefc3 1794catch(const PDNSException &e)
a4652d55 1795{
1796 errlog("UDP client thread died because of PowerDNS exception: %s", e.reason);
a4652d55 1797}
1798catch(...)
1799{
1800 errlog("UDP client thread died because of an exception: %s", "unknown");
a4652d55 1801}
24d5cb00 1802
555970c9
RG
/* Returns a random 16-bit value, used as the ID of health check queries.
   The value comes from libsodium when available, from random() otherwise. */
uint16_t getRandomDNSID()
{
#ifdef HAVE_LIBSODIUM
  const uint32_t raw = randombytes_random();
#else
  const long raw = random();
#endif
  /* keep the value in the [0, 65535] range */
  return static_cast<uint16_t>(raw % 65536);
}
1811
4ab01344 1812static bool upCheck(const shared_ptr<DownstreamState>& ds)
773470ca 1813try
1814{
4ab01344
RG
1815 DNSName checkName = ds->checkName;
1816 uint16_t checkType = ds->checkType.getCode();
1817 uint16_t checkClass = ds->checkClass;
98650fde
RG
1818 dnsheader checkHeader;
1819 memset(&checkHeader, 0, sizeof(checkHeader));
1820
1821 checkHeader.qdcount = htons(1);
555970c9 1822 checkHeader.id = getRandomDNSID();
98650fde
RG
1823
1824 checkHeader.rd = true;
4ab01344 1825 if (ds->setCD) {
98650fde
RG
1826 checkHeader.cd = true;
1827 }
1828
4ab01344 1829 if (ds->checkFunction) {
98650fde 1830 std::lock_guard<std::mutex> lock(g_luamutex);
4ab01344 1831 auto ret = ds->checkFunction(checkName, checkType, checkClass, &checkHeader);
98650fde
RG
1832 checkName = std::get<0>(ret);
1833 checkType = std::get<1>(ret);
1834 checkClass = std::get<2>(ret);
21830638 1835 }
773470ca 1836
98650fde
RG
1837 vector<uint8_t> packet;
1838 DNSPacketWriter dpw(packet, checkName, checkType, checkClass);
1839 dnsheader * requestHeader = dpw.getHeader();
1840 *requestHeader = checkHeader;
1841
4ab01344 1842 Socket sock(ds->remote.sin4.sin_family, SOCK_DGRAM);
773470ca 1843 sock.setNonBlocking();
4ab01344 1844 if (!IsAnyAddress(ds->sourceAddr)) {
fbe2a2e0 1845 sock.setReuseAddr();
4ab01344 1846 sock.bind(ds->sourceAddr);
fbe2a2e0 1847 }
4ab01344 1848 sock.connect(ds->remote);
3e425868 1849 ssize_t sent = udpClientSendRequestToBackend(ds, sock.getHandle(), reinterpret_cast<char*>(&packet[0]), packet.size(), true);
9e87dcb8
RG
1850 if (sent < 0) {
1851 int ret = errno;
1852 if (g_verboseHealthChecks)
4ab01344 1853 infolog("Error while sending a health check query to backend %s: %d", ds->getNameWithAddr(), ret);
fbe2a2e0 1854 return false;
9e87dcb8 1855 }
fbe2a2e0 1856
4ab01344 1857 int ret = waitForRWData(sock.getHandle(), true, /* ms to seconds */ ds->checkTimeout / 1000, /* remaining ms to us */ (ds->checkTimeout % 1000) * 1000);
9e87dcb8
RG
1858 if(ret < 0 || !ret) { // error, timeout, both are down!
1859 if (ret < 0) {
1860 ret = errno;
1861 if (g_verboseHealthChecks)
4ab01344 1862 infolog("Error while waiting for the health check response from backend %s: %d", ds->getNameWithAddr(), ret);
9e87dcb8
RG
1863 }
1864 else {
1865 if (g_verboseHealthChecks)
4ab01344 1866 infolog("Timeout while waiting for the health check response from backend %s", ds->getNameWithAddr());
9e87dcb8 1867 }
773470ca 1868 return false;
9e87dcb8
RG
1869 }
1870
773470ca 1871 string reply;
a2353842
RG
1872 ComboAddress from;
1873 sock.recvFrom(reply, from);
1874
1875 /* we are using a connected socket but hey.. */
4ab01344 1876 if (from != ds->remote) {
a2353842 1877 if (g_verboseHealthChecks)
4ab01344 1878 infolog("Invalid health check response received from %s, expecting one from %s", from.toStringWithPort(), ds->remote.toStringWithPort());
a2353842
RG
1879 return false;
1880 }
773470ca 1881
98650fde 1882 const dnsheader * responseHeader = reinterpret_cast<const dnsheader *>(reply.c_str());
fc01e0b1 1883
9e87dcb8
RG
1884 if (reply.size() < sizeof(*responseHeader)) {
1885 if (g_verboseHealthChecks)
4ab01344 1886 infolog("Invalid health check response of size %d from backend %s, expecting at least %d", reply.size(), ds->getNameWithAddr(), sizeof(*responseHeader));
fc01e0b1 1887 return false;
9e87dcb8 1888 }
fc01e0b1 1889
9e87dcb8
RG
1890 if (responseHeader->id != requestHeader->id) {
1891 if (g_verboseHealthChecks)
4ab01344 1892 infolog("Invalid health check response id %d from backend %s, expecting %d", responseHeader->id, ds->getNameWithAddr(), requestHeader->id);
fc01e0b1 1893 return false;
9e87dcb8
RG
1894 }
1895
1896 if (!responseHeader->qr) {
1897 if (g_verboseHealthChecks)
4ab01344 1898 infolog("Invalid health check response from backend %s, expecting QR to be set", ds->getNameWithAddr());
fc01e0b1 1899 return false;
9e87dcb8
RG
1900 }
1901
1902 if (responseHeader->rcode == RCode::ServFail) {
1903 if (g_verboseHealthChecks)
4ab01344 1904 infolog("Backend %s responded to health check with ServFail", ds->getNameWithAddr());
fc01e0b1 1905 return false;
9e87dcb8
RG
1906 }
1907
4ab01344 1908 if (ds->mustResolve && (responseHeader->rcode == RCode::NXDomain || responseHeader->rcode == RCode::Refused)) {
9e87dcb8 1909 if (g_verboseHealthChecks)
4ab01344 1910 infolog("Backend %s responded to health check with %s while mustResolve is set", ds->getNameWithAddr(), responseHeader->rcode == RCode::NXDomain ? "NXDomain" : "Refused");
a6e02424 1911 return false;
9e87dcb8 1912 }
fc01e0b1 1913
98650fde
RG
1914 uint16_t receivedType;
1915 uint16_t receivedClass;
1916 DNSName receivedName(reply.c_str(), reply.size(), sizeof(dnsheader), false, &receivedType, &receivedClass);
1917
1918 if (receivedName != checkName || receivedType != checkType || receivedClass != checkClass) {
1919 if (g_verboseHealthChecks)
4ab01344 1920 infolog("Backend %s responded to health check with an invalid qname (%s vs %s), qtype (%s vs %s) or qclass (%d vs %d)", ds->getNameWithAddr(), receivedName.toLogString(), checkName.toLogString(), QType(receivedType).getName(), QType(checkType).getName(), receivedClass, checkClass);
98650fde
RG
1921 return false;
1922 }
1923
773470ca 1924 return true;
1925}
9e87dcb8
RG
1926catch(const std::exception& e)
1927{
1928 if (g_verboseHealthChecks)
4ab01344 1929 infolog("Error checking the health of backend %s: %s", ds->getNameWithAddr(), e.what());
9e87dcb8
RG
1930 return false;
1931}
773470ca 1932catch(...)
1933{
9e87dcb8 1934 if (g_verboseHealthChecks)
4ab01344 1935 infolog("Unknown exception while checking the health of backend %s", ds->getNameWithAddr());
773470ca 1936 return false;
1937}
726ddf60 1938
/* number of TCP client threads accounted for when estimating the file descriptors needed */
uint64_t g_maxTCPClientThreads = 10;
/* number of iterations of the maintenance loop (which sleeps one second per
   iteration) between two cleanings of the packet caches */
std::atomic<uint16_t> g_cacheCleaningDelay(60);
/* used by the maintenance loop to compute how many entries a cache may keep
   when purging expired ones: max entries * (100 - percentage) / 100 */
std::atomic<uint16_t> g_cacheCleaningPercentage(100);
e41f8165 1942
9b73b71c 1943void maintThread()
886e2cf2 1944{
519f5484 1945 setThreadName("dnsdist/main");
886e2cf2
RG
1946 int interval = 1;
1947 size_t counter = 0;
5f3ea719 1948 int32_t secondsToWaitLog = 0;
886e2cf2
RG
1949
1950 for(;;) {
1951 sleep(interval);
1952
069e59db 1953 {
1954 std::lock_guard<std::mutex> lock(g_luamutex);
5f3ea719
PL
1955 auto f = g_lua.readVariable<boost::optional<std::function<void()> > >("maintenance");
1956 if(f) {
1957 try {
1958 (*f)();
1959 secondsToWaitLog = 0;
1960 }
1961 catch(std::exception &e) {
1962 if (secondsToWaitLog <= 0) {
1963 infolog("Error during execution of maintenance function: %s", e.what());
1964 secondsToWaitLog = 61;
1965 }
1966 secondsToWaitLog -= interval;
1967 }
1968 }
069e59db 1969 }
886e2cf2
RG
1970
1971 counter++;
1972 if (counter >= g_cacheCleaningDelay) {
c1b81381
RG
1973 /* keep track, for each cache, of whether we should keep
1974 expired entries */
1975 std::map<std::shared_ptr<DNSDistPacketCache>, bool> caches;
1976
1977 /* gather all caches actually used by at least one pool, and see
1978 if something prevents us from cleaning the expired entries */
a9c2e4ab 1979 auto localPools = g_pools.getLocal();
a9c2e4ab 1980 for (const auto& entry : *localPools) {
c1b81381
RG
1981 auto& pool = entry.second;
1982
1983 auto packetCache = pool->packetCache;
1984 if (!packetCache) {
1985 continue;
1986 }
1987
1988 auto pair = caches.insert({packetCache, false});
1989 auto& iter = pair.first;
1990 /* if we need to keep stale data for this cache (ie, not clear
1991 expired entries when at least one pool using this cache
1992 has all its backends down) */
1993 if (packetCache->keepStaleData() && iter->second == false) {
1994 /* so far all pools had at least one backend up */
1995 if (pool->countServers(true) == 0) {
1996 iter->second = true;
1997 }
886e2cf2
RG
1998 }
1999 }
c1b81381
RG
2000
2001 for (auto pair : caches) {
2002 /* shall we keep expired entries ? */
2003 if (pair.second == true) {
2004 continue;
886e2cf2 2005 }
c1b81381
RG
2006 auto& packetCache = pair.first;
2007 size_t upTo = (packetCache->getMaxEntries()* (100 - g_cacheCleaningPercentage)) / 100;
2008 packetCache->purgeExpired(upTo);
886e2cf2
RG
2009 }
2010 counter = 0;
2011 }
2012
2013 // ponder pruning g_dynblocks of expired entries here
2014 }
886e2cf2
RG
2015}
2016
9b73b71c 2017static void secPollThread()
5d4e1ef8
RG
2018{
2019 setThreadName("dnsdist/secpoll");
2020
2021 for (;;) {
2022 try {
2023 doSecPoll(g_secPollSuffix);
2024 }
2025 catch(...) {
2026 }
2027 sleep(g_secPollInterval);
2028 }
2029}
2030
9b73b71c 2031static void healthChecksThread()
3ae86514 2032{
519f5484 2033 setThreadName("dnsdist/healthC");
77c9bc9a 2034
fcadd56e 2035 int interval = 1;
64e4ebb4 2036
3ae86514 2037 for(;;) {
2038 sleep(interval);
773470ca 2039
ded1985a 2040 if(g_tcpclientthreads->getQueuedCount() > 1 && !g_tcpclientthreads->hasReachedMaxThreads())
a9bf3ec4 2041 g_tcpclientthreads->addTCPClientThread();
3ae86514 2042
a9c2e4ab
RG
2043 auto states = g_dstates.getLocal(); // this points to the actual shared_ptrs!
2044 for(auto& dss : *states) {
7c9bf18d 2045 if(++dss->lastCheck < dss->checkInterval)
2046 continue;
2047 dss->lastCheck = 0;
773470ca 2048 if(dss->availability==DownstreamState::Availability::Auto) {
4ab01344 2049 bool newState=upCheck(dss);
2993d58d 2050 if (newState) {
1b633bec
RG
2051 /* check succeeded */
2052 dss->currentCheckFailures = 0;
2053
2054 if (!dss->upStatus) {
2055 /* we were marked as down */
853faf61
JS
2056 dss->consecutiveSuccessfulChecks++;
2057 if (dss->consecutiveSuccessfulChecks < dss->minRiseSuccesses) {
1b633bec
RG
2058 /* if we need more than one successful check to rise
2059 and we didn't reach the threshold yet,
2060 let's stay down */
2061 newState = false;
2062 }
2993d58d 2063 }
2064 }
1b633bec
RG
2065 else {
2066 /* check failed */
853faf61 2067 dss->consecutiveSuccessfulChecks = 0;
1b633bec
RG
2068
2069 if (dss->upStatus) {
2070 /* we are currently up */
2071 dss->currentCheckFailures++;
2072 if (dss->currentCheckFailures < dss->maxCheckFailures) {
2073 /* we need more than one failure to be marked as down,
2074 and we did not reach the threshold yet, let's stay down */
2075 newState = true;
2076 }
2993d58d 2077 }
2078 }
2079
2080 if(newState != dss->upStatus) {
2081 warnlog("Marking downstream %s as '%s'", dss->getNameWithAddr(), newState ? "up" : "down");
7565f4e6
RG
2082
2083 if (newState && !dss->connected) {
5d7e6765
RG
2084 newState = dss->reconnect();
2085
2086 if (dss->connected && !dss->threadStarted.test_and_set()) {
150105a2 2087 dss->tid = thread(responderThread, dss);
7565f4e6
RG
2088 }
2089 }
2090
2993d58d 2091 dss->upStatus = newState;
2092 dss->currentCheckFailures = 0;
853faf61 2093 dss->consecutiveSuccessfulChecks = 0;
9f4eb5cc
RG
2094 if (g_snmpAgent && g_snmpTrapsEnabled) {
2095 g_snmpAgent->sendBackendStatusChangeTrap(dss);
2096 }
2993d58d 2097 }
773470ca 2098 }
b076b34a 2099
773470ca 2100 auto delta = dss->sw.udiffAndSet()/1000000.0;
2101 dss->queryLoad = 1.0*(dss->queries.load() - dss->prev.queries.load())/delta;
2102 dss->dropRate = 1.0*(dss->reuseds.load() - dss->prev.reuseds.load())/delta;
3c115e0f 2103 dss->prev.queries.store(dss->queries.load());
773470ca 2104 dss->prev.reuseds.store(dss->reuseds.load());
3c115e0f 2105
773470ca 2106 for(IDState& ids : dss->idStates) { // timeouts
a9489723 2107 int64_t usageIndicator = ids.usageIndicator;
311f19d5
RG
2108 if(IDState::isInUse(usageIndicator) && ids.age++ > g_udpTimeout) {
2109 /* We mark the state as unused as soon as possible
51642fe3
RG
2110 to limit the risk of racing with the
2111 responder thread.
51642fe3 2112 */
0956c5c5
RG
2113 auto oldDU = ids.du;
2114
311f19d5 2115 if (!ids.tryMarkUnused(usageIndicator)) {
71b86bd8
RG
2116 /* this state has been altered in the meantime,
2117 don't go anywhere near it */
2118 continue;
2119 }
fbf14b03 2120 ids.du = nullptr;
9bd1a882 2121 handleDOHTimeout(oldDU);
64e4ebb4 2122 ids.age = 0;
3c115e0f 2123 dss->reuseds++;
2124 --dss->outstanding;
cb167afd 2125 ++g_stats.downstreamTimeouts; // this is an 'actively' discovered timeout
c08a5092 2126 vinfolog("Had a downstream timeout from %s (%s) for query for %s|%s from %s",
2127 dss->remote.toStringWithPort(), dss->name,
348ef1c6 2128 ids.qname.toLogString(), QType(ids.qtype).getName(), ids.origRemote.toStringWithPort());
51642fe3 2129
c2fbeb27 2130 struct timespec ts;
85c7ca75 2131 gettime(&ts);
c2fbeb27 2132
2d11d1b2 2133 struct dnsheader fake;
2134 memset(&fake, 0, sizeof(fake));
2135 fake.id = ids.origID;
c2fbeb27 2136
6d31c8b6 2137 g_rings.insertResponse(ts, ids.origRemote, ids.qname, ids.qtype, std::numeric_limits<unsigned int>::max(), 0, fake, dss->remote);
232f0877 2138 }
3ae86514 2139 }
2140 }
3ae86514 2141 }
3ae86514 2142}
2143
c2a42f97 2144static void bindAny(int af, int sock)
2145{
18f8e493 2146 __attribute__((unused)) int one = 1;
c2a42f97 2147
2148#ifdef IP_FREEBIND
2149 if (setsockopt(sock, IPPROTO_IP, IP_FREEBIND, &one, sizeof(one)) < 0)
2150 warnlog("Warning: IP_FREEBIND setsockopt failed: %s", strerror(errno));
2151#endif
2152
2153#ifdef IP_BINDANY
2154 if (af == AF_INET)
2155 if (setsockopt(sock, IPPROTO_IP, IP_BINDANY, &one, sizeof(one)) < 0)
2156 warnlog("Warning: IP_BINDANY setsockopt failed: %s", strerror(errno));
2157#endif
2158#ifdef IPV6_BINDANY
2159 if (af == AF_INET6)
2160 if (setsockopt(sock, IPPROTO_IPV6, IPV6_BINDANY, &one, sizeof(one)) < 0)
2161 warnlog("Warning: IPV6_BINDANY setsockopt failed: %s", strerror(errno));
2162#endif
2163#ifdef SO_BINDANY
2164 if (setsockopt(sock, SOL_SOCKET, SO_BINDANY, &one, sizeof(one)) < 0)
2165 warnlog("Warning: SO_BINDANY setsockopt failed: %s", strerror(errno));
2166#endif
2167}
2168
a36ce055
RG
2169static void dropGroupPrivs(gid_t gid)
2170{
2171 if (gid) {
2172 if (setgid(gid) == 0) {
2173 if (setgroups(0, NULL) < 0) {
2174 warnlog("Warning: Unable to drop supplementary gids: %s", strerror(errno));
2175 }
2176 }
2177 else {
2178 warnlog("Warning: Unable to set group ID to %d: %s", gid, strerror(errno));
2179 }
2180 }
2181}
2182
2183static void dropUserPrivs(uid_t uid)
2184{
2185 if(uid) {
2186 if(setuid(uid) < 0) {
2187 warnlog("Warning: Unable to set user ID to %d: %s", uid, strerror(errno));
2188 }
2189 }
2190}
2191
41408d3a
RG
2192static void checkFileDescriptorsLimits(size_t udpBindsCount, size_t tcpBindsCount)
2193{
2194 /* stdin, stdout, stderr */
2195 size_t requiredFDsCount = 3;
a9c2e4ab 2196 auto backends = g_dstates.getLocal();
cd73ceeb
RG
2197 /* UDP sockets to backends */
2198 size_t backendUDPSocketsCount = 0;
a9c2e4ab 2199 for (const auto& backend : *backends) {
5bdbb83d 2200 backendUDPSocketsCount += backend->sockets.size();
cd73ceeb
RG
2201 }
2202 requiredFDsCount += backendUDPSocketsCount;
2203 /* TCP sockets to backends */
a9c2e4ab 2204 requiredFDsCount += (backends->size() * g_maxTCPClientThreads);
9fcd6adb 2205 /* listening sockets */
41408d3a
RG
2206 requiredFDsCount += udpBindsCount;
2207 requiredFDsCount += tcpBindsCount;
2208 /* max TCP connections currently served */
2209 requiredFDsCount += g_maxTCPClientThreads;
f2e29d04 2210 /* max pipes for communicating between TCP acceptors and client threads */
41408d3a 2211 requiredFDsCount += (g_maxTCPClientThreads * 2);
41408d3a 2212 /* max TCP queued connections */
9fcd6adb 2213 requiredFDsCount += g_maxTCPQueuedConnections;
41408d3a
RG
2214 /* DelayPipe pipe */
2215 requiredFDsCount += 2;
2216 /* syslog socket */
2217 requiredFDsCount++;
2218 /* webserver main socket */
2219 requiredFDsCount++;
2220 /* console main socket */
2221 requiredFDsCount++;
2222 /* carbon export */
2223 requiredFDsCount++;
2224 /* history file */
2225 requiredFDsCount++;
2226 struct rlimit rl;
2227 getrlimit(RLIMIT_NOFILE, &rl);
9fcd6adb 2228 if (rl.rlim_cur <= requiredFDsCount) {
41408d3a
RG
2229 warnlog("Warning, this configuration can use more than %d file descriptors, web server and console connections not included, and the current limit is %d.", std::to_string(requiredFDsCount), std::to_string(rl.rlim_cur));
2230#ifdef HAVE_SYSTEMD
2231 warnlog("You can increase this value by using LimitNOFILE= in the systemd unit file or ulimit.");
2232#else
2233 warnlog("You can increase this value by using ulimit.");
2234#endif
2235 }
2236}
c2a42f97 2237
6e9fd124
RG
2238static void setUpLocalBind(std::unique_ptr<ClientState>& cs)
2239{
2240 /* skip some warnings if there is an identical UDP context */
fbf14b03 2241 bool warn = cs->tcp == false || cs->tlsFrontend != nullptr || cs->dohFrontend != nullptr;
6e9fd124
RG
2242 int& fd = cs->tcp == false ? cs->udpFD : cs->tcpFD;
2243 (void) warn;
2244
2245 fd = SSocket(cs->local.sin4.sin_family, cs->tcp == false ? SOCK_DGRAM : SOCK_STREAM, 0);
2246
2247 if (cs->tcp) {
2248 SSetsockopt(fd, SOL_SOCKET, SO_REUSEADDR, 1);
2249#ifdef TCP_DEFER_ACCEPT
2250 SSetsockopt(fd, IPPROTO_TCP, TCP_DEFER_ACCEPT, 1);
2251#endif
2252 if (cs->fastOpenQueueSize > 0) {
2253#ifdef TCP_FASTOPEN
2254 SSetsockopt(fd, IPPROTO_TCP, TCP_FASTOPEN, cs->fastOpenQueueSize);
2255#else
2256 if (warn) {
2257 warnlog("TCP Fast Open has been configured on local address '%s' but is not supported", cs->local.toStringWithPort());
2258 }
2259#endif
2260 }
2261 }
2262
2263 if(cs->local.sin4.sin_family == AF_INET6) {
2264 SSetsockopt(fd, IPPROTO_IPV6, IPV6_V6ONLY, 1);
2265 }
2266
2267 bindAny(cs->local.sin4.sin_family, fd);
2268
2269 if(!cs->tcp && IsAnyAddress(cs->local)) {
2270 int one=1;
2271 setsockopt(fd, IPPROTO_IP, GEN_IP_PKTINFO, &one, sizeof(one)); // linux supports this, so why not - might fail on other systems
2272#ifdef IPV6_RECVPKTINFO
2273 setsockopt(fd, IPPROTO_IPV6, IPV6_RECVPKTINFO, &one, sizeof(one));
2274#endif
2275 }
2276
2277 if (cs->reuseport) {
2278#ifdef SO_REUSEPORT
2279 SSetsockopt(fd, SOL_SOCKET, SO_REUSEPORT, 1);
2280#else
2281 if (warn) {
2282 /* no need to warn again if configured but support is not available, we already did for UDP */
2283 warnlog("SO_REUSEPORT has been configured on local address '%s' but is not supported", cs->local.toStringWithPort());
2284 }
2285#endif
2286 }
2287
90f9fbc0
RG
2288 if (!cs->tcp) {
2289 if (cs->local.isIPv4()) {
2290 try {
2291 setSocketIgnorePMTU(cs->udpFD);
2292 }
2293 catch(const std::exception& e) {
2294 warnlog("Failed to set IP_MTU_DISCOVER on UDP server socket for local address '%s': %s", cs->local.toStringWithPort(), e.what());
2295 }
2296 }
2297 }
2298
6e9fd124
RG
2299 const std::string& itf = cs->interface;
2300 if (!itf.empty()) {
2301#ifdef SO_BINDTODEVICE
2302 int res = setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, itf.c_str(), itf.length());
2303 if (res != 0) {
2304 warnlog("Error setting up the interface on local address '%s': %s", cs->local.toStringWithPort(), strerror(errno));
2305 }
2306#else
2307 if (warn) {
2308 warnlog("An interface has been configured on local address '%s' but SO_BINDTODEVICE is not supported", cs->local.toStringWithPort());
2309 }
2310#endif
2311 }
2312
2313#ifdef HAVE_EBPF
2314 if (g_defaultBPFFilter) {
2315 cs->attachFilter(g_defaultBPFFilter);
2316 vinfolog("Attaching default BPF Filter to %s frontend %s", (!cs->tcp ? "UDP" : "TCP"), cs->local.toStringWithPort());
2317 }
2318#endif /* HAVE_EBPF */
2319
2320 if (cs->tlsFrontend != nullptr) {
2321 if (!cs->tlsFrontend->setupTLS()) {
2322 errlog("Error while setting up TLS on local address '%s', exiting", cs->local.toStringWithPort());
2323 _exit(EXIT_FAILURE);
2324 }
2325 }
2326
fbf14b03
RG
2327 if (cs->dohFrontend != nullptr) {
2328 cs->dohFrontend->setup();
2329 }
2330
6e9fd124
RG
2331 SBind(fd, cs->local);
2332
2333 if (cs->tcp) {
fbf14b03 2334 SListen(cs->tcpFD, SOMAXCONN);
6e9fd124
RG
2335 if (cs->tlsFrontend != nullptr) {
2336 warnlog("Listening on %s for TLS", cs->local.toStringWithPort());
2337 }
fbf14b03
RG
2338 else if (cs->dohFrontend != nullptr) {
2339 warnlog("Listening on %s for DoH", cs->local.toStringWithPort());
2340 }
6e9fd124
RG
2341 else if (cs->dnscryptCtx != nullptr) {
2342 warnlog("Listening on %s for DNSCrypt", cs->local.toStringWithPort());
2343 }
2344 else {
2345 warnlog("Listening on %s", cs->local.toStringWithPort());
2346 }
2347 }
2348
2349 cs->ready = true;
2350}
2351
/* settings gathered from the command line by main() */
struct
{
  /* addresses passed via -l,--local */
  std::vector<std::string> locals;
  /* remaining positional arguments, when not running as a client */
  std::vector<std::string> remotes;
  /* --check-config */
  bool checkConfig = false;
  /* -c,--client */
  bool beClient = false;
  /* --supervised */
  bool beSupervised = false;
  /* -e,--execute */
  std::string command;
  /* -C,--config */
  std::string config;
  /* -u,--uid */
  std::string uid;
  /* -g,--gid */
  std::string gid;
} g_cmdLine;
520eb5a0 2364
/* starts out false
   NOTE(review): presumably set once the configuration has been fully loaded, confirm against the code setting it */
std::atomic<bool> g_configurationDone(false);
520eb5a0 2366
7d3ee2bb
PL
/* Prints the command-line help to the standard output: an empty line, the
   syntax summary, then one entry per option. The options are padded to a
   fixed width so that all the descriptions, including the continuation
   lines, start in the same column. */
static void usage()
{
  /* width of the column holding the options */
  static const size_t s_optionColumnWidth = 22;

  /* prints one line of help, use an empty option for a continuation line */
  const auto printOption = [](const std::string& option, const std::string& description) {
    std::string line(option);
    if (line.size() < s_optionColumnWidth) {
      line.append(s_optionColumnWidth - line.size(), ' ');
    }
    std::cout << line << description << "\n";
  };

  std::cout << std::endl;
  std::cout << "Syntax: dnsdist [-C,--config file] [-c,--client [IP[:PORT]]]\n";
  std::cout << "[-e,--execute cmd] [-h,--help] [-l,--local addr]\n";
  std::cout << "[-v,--verbose] [--check-config] [--version]\n";
  std::cout << "\n";
  printOption("-a,--acl netmask", "Add this netmask to the ACL");
  printOption("-C,--config file", "Load configuration from 'file'");
  printOption("-c,--client", "Operate as a client, connect to dnsdist. This reads");
  printOption("", "controlSocket from your configuration file, but also");
  printOption("", "accepts an IP:PORT argument");
#ifdef HAVE_LIBSODIUM
  printOption("-k,--setkey KEY", "Use KEY for encrypted communication to dnsdist. This");
  printOption("", "is similar to setting setKey in the configuration file.");
  printOption("", "NOTE: this will leak this key in your shell's history");
  printOption("", "and in the systems running process list.");
#endif
  printOption("--check-config", "Validate the configuration file and exit. The exit-code");
  printOption("", "reflects the validation, 0 is OK, 1 means an error.");
  printOption("", "Any errors are printed as well.");
  printOption("-e,--execute cmd", "Connect to dnsdist and execute 'cmd'");
  printOption("-g,--gid gid", "Change the process group ID after binding sockets");
  printOption("-h,--help", "Display this helpful message");
  printOption("-l,--local address", "Listen on this local address");
  printOption("--supervised", "Don't open a console, I'm supervised");
  printOption("", "(use with e.g. systemd and daemontools)");
  printOption("--disable-syslog", "Don't log to syslog, only to stdout");
  printOption("", "(use with e.g. systemd)");
  printOption("-u,--uid uid", "Change the process user ID after binding sockets");
  printOption("-v,--verbose", "Enable verbose mode");
  printOption("-V,--version", "Show dnsdist version information and exit");
}
2400
24d5cb00 2401int main(int argc, char** argv)
2402try
2403{
41408d3a
RG
2404 size_t udpBindsCount = 0;
2405 size_t tcpBindsCount = 0;
94721140 2406 rl_attempted_completion_function = my_completion;
2407 rl_completion_append_character = 0;
2408
726ddf60 2409 signal(SIGPIPE, SIG_IGN);
6d01c80c 2410 signal(SIGCHLD, SIG_IGN);
0ca6a67f 2411 openlog("dnsdist", LOG_PID|LOG_NDELAY, LOG_DAEMON);
6d01c80c 2412
0b62ec78 2413#ifdef HAVE_LIBSODIUM
6d01c80c 2414 if (sodium_init() == -1) {
2415 cerr<<"Unable to initialize crypto library"<<endl;
2416 exit(EXIT_FAILURE);
2417 }
7691e7df 2418 g_hashperturb=randombytes_uniform(0xffffffff);
2419 srandom(randombytes_uniform(0xffffffff));
2420#else
2421 {
2422 struct timeval tv;
2423 gettimeofday(&tv, 0);
2424 srandom(tv.tv_sec ^ tv.tv_usec ^ getpid());
2425 g_hashperturb=random();
2426 }
2427
0b62ec78 2428#endif
094b6aff 2429 ComboAddress clientAddress = ComboAddress();
11058bd6 2430 g_cmdLine.config=SYSCONFDIR "/dnsdist.conf";
359bdba5 2431 struct option longopts[]={
8f2d5ec3 2432 {"acl", required_argument, 0, 'a'},
359bdba5
CH
2433 {"check-config", no_argument, 0, 1},
2434 {"client", no_argument, 0, 'c'},
7cc68f53 2435 {"config", required_argument, 0, 'C'},
359bdba5 2436 {"disable-syslog", no_argument, 0, 2},
7cc68f53 2437 {"execute", required_argument, 0, 'e'},
359bdba5
CH
2438 {"gid", required_argument, 0, 'g'},
2439 {"help", no_argument, 0, 'h'},
2440 {"local", required_argument, 0, 'l'},
359bdba5 2441 {"setkey", required_argument, 0, 'k'},
359bdba5
CH
2442 {"supervised", no_argument, 0, 3},
2443 {"uid", required_argument, 0, 'u'},
2444 {"verbose", no_argument, 0, 'v'},
2445 {"version", no_argument, 0, 'V'},
2446 {0,0,0,0}
7cc68f53 2447 };
2448 int longindex=0;
8f2d5ec3 2449 string optstring;
7cc68f53 2450 for(;;) {
359bdba5 2451 int c=getopt_long(argc, argv, "a:cC:e:g:hk:l:u:vV", longopts, &longindex);
7cc68f53 2452 if(c==-1)
2453 break;
2454 switch(c) {
5efcfa63
PL
2455 case 1:
2456 g_cmdLine.checkConfig=true;
2457 break;
bbfaaa6f
PL
2458 case 2:
2459 g_syslog=false;
2460 break;
b7165327
CH
2461 case 3:
2462 g_cmdLine.beSupervised=true;
2463 break;
7cc68f53 2464 case 'C':
2465 g_cmdLine.config=optarg;
2466 break;
2467 case 'c':
2468 g_cmdLine.beClient=true;
2469 break;
7cc68f53 2470 case 'e':
2471 g_cmdLine.command=optarg;
2472 break;
a36ce055
RG
2473 case 'g':
2474 g_cmdLine.gid=optarg;
2475 break;
7cc68f53 2476 case 'h':
6306c282 2477 cout<<"dnsdist "<<VERSION<<endl;
7d3ee2bb 2478 usage();
7cc68f53 2479 cout<<"\n";
2480 exit(EXIT_SUCCESS);
2481 break;
8f2d5ec3 2482 case 'a':
2483 optstring=optarg;
2484 g_ACL.modify([optstring](NetmaskGroup& nmg) { nmg.addMask(optstring); });
2485 break;
ddb14ec9 2486 case 'k':
b4b5edbd 2487#ifdef HAVE_LIBSODIUM
b5521206 2488 if (B64Decode(string(optarg), g_consoleKey) < 0) {
ddb14ec9
PL
2489 cerr<<"Unable to decode key '"<<optarg<<"'."<<endl;
2490 exit(EXIT_FAILURE);
2491 }
b4b5edbd
CH
2492#else
2493 cerr<<"dnsdist has been built without libsodium, -k/--setkey is unsupported."<<endl;
2494 exit(EXIT_FAILURE);
ddb14ec9 2495#endif
b4b5edbd 2496 break;
7cc68f53 2497 case 'l':
6a363878 2498 g_cmdLine.locals.push_back(trim_copy(string(optarg)));
7cc68f53 2499 break;
a36ce055
RG
2500 case 'u':
2501 g_cmdLine.uid=optarg;
2502 break;
7cc68f53 2503 case 'v':
2504 g_verbose=true;
2505 break;
6306c282 2506 case 'V':
d4d796e5
PD
2507#ifdef LUAJIT_VERSION
2508 cout<<"dnsdist "<<VERSION<<" ("<<LUA_RELEASE<<" ["<<LUAJIT_VERSION<<"])"<<endl;
2509#else
2510 cout<<"dnsdist "<<VERSION<<" ("<<LUA_RELEASE<<")"<<endl;
2511#endif
70829d97 2512 cout<<"Enabled features: ";
a227f47d
RG
2513#ifdef HAVE_DNS_OVER_TLS
2514 cout<<"dns-over-tls(";
2515#ifdef HAVE_GNUTLS
3909bf10
CH
2516 cout<<"gnutls";
2517 #ifdef HAVE_LIBSSL
2518 cout<<" ";
2519 #endif
a227f47d
RG
2520#endif
2521#ifdef HAVE_LIBSSL
2522 cout<<"openssl";
2523#endif
2524 cout<<") ";
2525#endif
fbf14b03
RG
2526#ifdef HAVE_DNS_OVER_HTTPS
2527 cout<<"dns-over-https(DOH) ";
2528#endif
70829d97
PL
2529#ifdef HAVE_DNSCRYPT
2530 cout<<"dnscrypt ";
2531#endif
0beaa5c8
RG
2532#ifdef HAVE_EBPF
2533 cout<<"ebpf ";
2534#endif
82a91ddf
CH
2535#ifdef HAVE_FSTRM
2536 cout<<"fstrm ";
2537#endif
f4b1f1fd
RG
2538#ifdef HAVE_LIBCRYPTO
2539 cout<<"ipcipher ";
2540#endif
3909bf10
CH
2541#ifdef HAVE_LIBSODIUM
2542 cout<<"libsodium ";
2543#endif
70829d97
PL
2544#ifdef HAVE_PROTOBUF
2545 cout<<"protobuf ";
2546#endif
2547#ifdef HAVE_RE2
2548 cout<<"re2 ";
2549#endif
0beaa5c8
RG
2550#if defined(HAVE_RECVMMSG) && defined(HAVE_SENDMMSG) && defined(MSG_WAITFORONE)
2551 cout<<"recvmmsg/sendmmsg ";
2552#endif
2553#ifdef HAVE_NET_SNMP
2554 cout<<"snmp ";
2555#endif
70829d97
PL
2556#ifdef HAVE_SYSTEMD
2557 cout<<"systemd";
2558#endif
2559 cout<<endl;
6306c282
PL
2560 exit(EXIT_SUCCESS);
2561 break;
7d3ee2bb
PL
2562 case '?':
2563 //getopt_long printed an error message.
2564 usage();
2565 exit(EXIT_FAILURE);
2566 break;
7cc68f53 2567 }
24d5cb00 2568 }
6ab65223 2569
7cc68f53 2570 argc-=optind;
2571 argv+=optind;
2572 for(auto p = argv; *p; ++p) {
094b6aff
PL
2573 if(g_cmdLine.beClient) {
2574 clientAddress = ComboAddress(*p, 5199);
2575 } else {
2576 g_cmdLine.remotes.push_back(*p);
2577 }
7cc68f53 2578 }
2579
a1b1a29d 2580 ServerPolicy leastOutstandingPol{"leastOutstanding", leastOutstanding, false};
cd29dcb1 2581
e5a14b2b 2582 g_policy.setState(leastOutstandingPol);
7cc68f53 2583 if(g_cmdLine.beClient || !g_cmdLine.command.empty()) {
2584 setupLua(true, g_cmdLine.config);
094b6aff
PL
2585 if (clientAddress != ComboAddress())
2586 g_serverControl = clientAddress;
7cc68f53 2587 doClient(g_serverControl, g_cmdLine.command);
e16fd59c 2588 _exit(EXIT_SUCCESS);
6d01c80c 2589 }
2e72cc0e 2590
8f133915 2591 auto acl = g_ACL.getCopy();
8f2d5ec3 2592 if(acl.empty()) {
2593 for(auto& addr : {"127.0.0.0/8", "10.0.0.0/8", "100.64.0.0/10", "169.254.0.0/16", "192.168.0.0/16", "172.16.0.0/12", "::1/128", "fc00::/7", "fe80::/10"})
2594 acl.addMask(addr);
2595 g_ACL.setState(acl);
2596 }
8f133915 2597
b5521206 2598 auto consoleACL = g_consoleACL.getCopy();
5ceea33e
RG
2599 for (const auto& mask : { "127.0.0.1/8", "::1/128" }) {
2600 consoleACL.addMask(mask);
2601 }
b5521206
RG
2602 g_consoleACL.setState(consoleACL);
2603
5efcfa63
PL
2604 if (g_cmdLine.checkConfig) {
2605 setupLua(true, g_cmdLine.config);
2606 // No exception was thrown
2607 infolog("Configuration '%s' OK!", g_cmdLine.config);
d8c19b98 2608 _exit(EXIT_SUCCESS);
5efcfa63
PL
2609 }
2610
7cc68f53 2611 auto todo=setupLua(false, g_cmdLine.config);
2e72cc0e 2612
636cc544
CHB
2613 auto localPools = g_pools.getCopy();
2614 {
2615 bool precompute = false;
2616 if (g_policy.getLocal()->name == "chashed") {
2617 precompute = true;
2618 } else {
2619 for (const auto& entry: localPools) {
2620 if (entry.second->policy != nullptr && entry.second->policy->name == "chashed") {
2621 precompute = true;
2622 break ;
2623 }
2624 }
2625 }
2626 if (precompute) {
2627 vinfolog("Pre-computing hashes for consistent hash load-balancing policy");
2628 // pre compute hashes
2629 auto backends = g_dstates.getLocal();
2630 for (auto& backend: *backends) {
2631 backend->hash();
2632 }
d58e616a
CHB
2633 }
2634 }
2635
6e9fd124
RG
2636 if (!g_cmdLine.locals.empty()) {
2637 for (auto it = g_frontends.begin(); it != g_frontends.end(); ) {
fbf14b03
RG
2638 /* DoH, DoT and DNSCrypt frontends are separate */
2639 if ((*it)->dohFrontend == nullptr && (*it)->tlsFrontend == nullptr && (*it)->dnscryptCtx == nullptr) {
6e9fd124 2640 it = g_frontends.erase(it);
9f67b883 2641 }
6e9fd124
RG
2642 else {
2643 ++it;
9f67b883 2644 }
9f67b883
RG
2645 }
2646
6e9fd124
RG
2647 for(const auto& loc : g_cmdLine.locals) {
2648 /* UDP */
2649 g_frontends.push_back(std::unique_ptr<ClientState>(new ClientState(ComboAddress(loc, 53), false, false, 0, "", {})));
2650 /* TCP */
2651 g_frontends.push_back(std::unique_ptr<ClientState>(new ClientState(ComboAddress(loc, 53), true, false, 0, "", {})));
87b515ed 2652 }
a36ce055
RG
2653 }
2654
6e9fd124
RG
2655 if (g_frontends.empty()) {
2656 /* UDP */
2657 g_frontends.push_back(std::unique_ptr<ClientState>(new ClientState(ComboAddress("127.0.0.1", 53), false, false, 0, "", {})));
2658 /* TCP */
2659 g_frontends.push_back(std::unique_ptr<ClientState>(new ClientState(ComboAddress("127.0.0.1", 53), true, false, 0, "", {})));
11e1e08b 2660 }
11e1e08b 2661
6e9fd124 2662 g_configurationDone = true;
a227f47d 2663
6e9fd124
RG
2664 for(auto& frontend : g_frontends) {
2665 setUpLocalBind(frontend);
a227f47d 2666
6e9fd124
RG
2667 if (frontend->tcp == false) {
2668 ++udpBindsCount;
a227f47d
RG
2669 }
2670 else {
6e9fd124 2671 ++tcpBindsCount;
a227f47d
RG
2672 }
2673 }
2674
b82a127e 2675 warnlog("dnsdist %s comes with ABSOLUTELY NO WARRANTY. This is free software, and you are welcome to redistribute it according to the terms of the GPL version 2", VERSION);
9c9b4998 2676
b82a127e
RG
2677 vector<string> vec;
2678 std::string acls;
2679 g_ACL.getLocal()->toStringVector(&vec);
2680 for(const auto& s : vec) {
2681 if (!acls.empty())
2682 acls += ", ";
2683 acls += s;
b076b34a 2684 }
b82a127e 2685 infolog("ACL allowing queries from: %s", acls.c_str());
b5521206
RG
2686 vec.clear();
2687 acls.clear();
2688 g_consoleACL.getLocal()->toStringVector(&vec);
2689 for (const auto& entry : vec) {
2690 if (!acls.empty()) {
2691 acls += ", ";
2692 }
2693 acls += entry;
2694 }
2695 infolog("Console ACL allowing connections from: %s", acls.c_str());
6d01c80c 2696
9c9b4998
RG
2697#ifdef HAVE_LIBSODIUM
2698 if (g_consoleEnabled && g_consoleKey.empty()) {
2699 warnlog("Warning, the console has been enabled via 'controlSocket()' but no key has been set with 'setKey()' so all connections will fail until a key has been set");
2700 }
2701#endif
2702
aac59883
RG
2703 uid_t newgid=0;
2704 gid_t newuid=0;
2705
2706 if(!g_cmdLine.gid.empty())
2707 newgid = strToGID(g_cmdLine.gid.c_str());
2708
2709 if(!g_cmdLine.uid.empty())
2710 newuid = strToUID(g_cmdLine.uid.c_str());
2711
2712 dropGroupPrivs(newgid);
2713 dropUserPrivs(newuid);
fdc3ea42
RG
2714 try {
2715 /* we might still have capabilities remaining,
2716 for example if we have been started as root
2717 without --uid or --gid (please don't do that)
2718 or as an unprivileged user with ambient
2719 capabilities like CAP_NET_BIND_SERVICE.
2720 */
2721 dropCapabilities();
2722 }
2723 catch(const std::exception& e) {
2724 warnlog("%s", e.what());
2725 }
aac59883 2726
a36ce055
RG
2727 /* this need to be done _after_ dropping privileges */
2728 g_delay = new DelayPipe<DelayedPacket>();
2729
9f4eb5cc
RG
2730 if (g_snmpAgent) {
2731 g_snmpAgent->run();
2732 }
2733
1f7646c2 2734 g_tcpclientthreads = std::unique_ptr<TCPClientCollection>(new TCPClientCollection(g_maxTCPClientThreads, g_useTCPSinglePipe));
a9bf3ec4 2735
2e72cc0e 2736 for(auto& t : todo)
2737 t();
2738
636cc544 2739 localPools = g_pools.getCopy();
8f4f5186
RG
2740 /* create the default pool no matter what */
2741 createPoolIfNotExists(localPools, "");
7cc68f53 2742 if(g_cmdLine.remotes.size()) {
2743 for(const auto& address : g_cmdLine.remotes) {
c9262563 2744 auto ret=std::make_shared<DownstreamState>(ComboAddress(address, 53));
886e2cf2 2745 addServerToPool(localPools, "", ret);
5d7e6765 2746 if (ret->connected && !ret->threadStarted.test_and_set()) {
2717a92f 2747 ret->tid = thread(responderThread, ret);
7565f4e6 2748 }
ecbe9133 2749 g_dstates.modify([ret](servers_t& servers) { servers.push_back(ret); });
6d01c80c 2750 }
2751 }
886e2cf2 2752 g_pools.setState(localPools);
6d01c80c 2753
a9c2e4ab 2754 if(g_dstates.getLocal()->empty()) {
e73ec7d3 2755 errlog("No downstream servers defined: all packets will get dropped");
2756 // you might define them later, but you need to know
2757 }
2758
41408d3a
RG
2759 checkFileDescriptorsLimits(udpBindsCount, tcpBindsCount);
2760
e5a14b2b 2761 for(auto& dss : g_dstates.getCopy()) { // it is a copy, but the internal shared_ptrs are the real deal
773470ca 2762 if(dss->availability==DownstreamState::Availability::Auto) {
4ab01344 2763 bool newState=upCheck(dss);
a7940c06 2764 warnlog("Marking downstream %s as '%s'", dss->getNameWithAddr(), newState ? "up" : "down");
773470ca 2765 dss->upStatus = newState;
2766 }
2767 }
b076b34a 2768
6e9fd124 2769 for(auto& cs : g_frontends) {
fbf14b03
RG
2770 if (cs->dohFrontend != nullptr) {
2771#ifdef HAVE_DNS_OVER_HTTPS
2772 std::thread t1(dohThread, cs.get());
2b0cb8f8
RG
2773 if (!cs->cpus.empty()) {
2774 mapThreadToCPUList(t1.native_handle(), cs->cpus);
2775 }
fbf14b03
RG
2776 t1.detach();
2777#endif /* HAVE_DNS_OVER_HTTPS */
2778 continue;
2779 }
a36ce055 2780 if (cs->udpFD >= 0) {
6e9fd124 2781 thread t1(udpClientThread, cs.get());
f0e4dcba
RG
2782 if (!cs->cpus.empty()) {
2783 mapThreadToCPUList(t1.native_handle(), cs->cpus);
2784 }
a36ce055 2785 t1.detach();
652a7355 2786 }
a36ce055 2787 else if (cs->tcpFD >= 0) {
6e9fd124 2788 thread t1(tcpAcceptorThread, cs.get());
f0e4dcba
RG
2789 if (!cs->cpus.empty()) {
2790 mapThreadToCPUList(t1.native_handle(), cs->cpus);
2791 }
a36ce055 2792 t1.detach();
726ddf60 2793 }
24d5cb00 2794 }
7730131a 2795
42fae326 2796 thread carbonthread(carbonDumpThread);
2797 carbonthread.detach();
2798
3c115e0f 2799 thread stattid(maintThread);
886e2cf2 2800 stattid.detach();
6d01c80c 2801
886e2cf2
RG
2802 thread healththread(healthChecksThread);
2803
5d4e1ef8
RG
2804 if (!g_secPollSuffix.empty()) {
2805 thread secpollthread(secPollThread);
2806 secpollthread.detach();
2807 }
2808
b82a127e 2809 if(g_cmdLine.beSupervised) {
6ab65223
PL
2810#ifdef HAVE_SYSTEMD
2811 sd_notify(0, "READY=1");
2812#endif
886e2cf2 2813 healththread.join();
773470ca 2814 }
6d01c80c 2815 else {
886e2cf2 2816 healththread.detach();
505ca3d1 2817 doConsole();
3c115e0f 2818 }
9cf811d1 2819 _exit(EXIT_SUCCESS);
3c115e0f 2820
6d01c80c 2821}
3f5c3f1d
PD
2822catch(const LuaContext::ExecutionErrorException& e) {
2823 try {
2824 errlog("Fatal Lua error: %s", e.what());
2825 std::rethrow_if_nested(e);
2010ac95
RG
2826 } catch(const std::exception& ne) {
2827 errlog("Details: %s", ne.what());
3f5c3f1d
PD
2828 }
2829 catch(PDNSException &ae)
2830 {
2831 errlog("Fatal pdns error: %s", ae.reason);
2832 }
2833 _exit(EXIT_FAILURE);
2834}
24d5cb00 2835catch(std::exception &e)
2836{
6d01c80c 2837 errlog("Fatal error: %s", e.what());
4a966472 2838 _exit(EXIT_FAILURE);
24d5cb00 2839}
3f81d239 2840catch(PDNSException &ae)
7730131a 2841{
6d01c80c 2842 errlog("Fatal pdns error: %s", ae.reason);
4a966472 2843 _exit(EXIT_FAILURE);
7730131a 2844}
eb0335ff
MC
2845
2846uint64_t getLatencyCount(const std::string&)
2847{
2848 return g_stats.responses + g_stats.selfAnswered + g_stats.cacheHits;
2849}