]> git.ipfire.org Git - thirdparty/pdns.git/blame - pdns/dnsdist.cc
Add functions to retrieve 'IO wait' and 'steal' metrics on Linux
[thirdparty/pdns.git] / pdns / dnsdist.cc
CommitLineData
24d5cb00 1/*
12471842
PL
2 * This file is part of PowerDNS or dnsdist.
3 * Copyright -- PowerDNS.COM B.V. and its contributors
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of version 2 of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * In addition, for the avoidance of any doubt, permission is granted to
10 * link this program with OpenSSL and to (re)distribute the binaries
11 * produced as the result of such linking.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
21 */
5cc8371b
RG
22
23#include "config.h"
24
25#include <fstream>
26#include <getopt.h>
27#include <grp.h>
2c5db3f7 28#include <limits>
5cc8371b
RG
29#include <netinet/tcp.h>
30#include <pwd.h>
31#include <sys/resource.h>
32#include <unistd.h>
424bdfb1 33
4d39d7f3 34#if defined (__OpenBSD__) || defined(__NetBSD__)
424bdfb1
PD
35#include <readline/readline.h>
36#else
37#include <editline/readline.h>
38#endif
39
6ab65223
PL
40#ifdef HAVE_SYSTEMD
41#include <systemd/sd-daemon.h>
42#endif
43
5cc8371b
RG
44#include "dnsdist.hh"
45#include "dnsdist-cache.hh"
b5521206 46#include "dnsdist-console.hh"
5cc8371b 47#include "dnsdist-ecs.hh"
dd9c8246 48#include "dnsdist-healthchecks.hh"
5cc8371b 49#include "dnsdist-lua.hh"
03b00917 50#include "dnsdist-rings.hh"
5d4e1ef8 51#include "dnsdist-secpoll.hh"
53c57da7 52#include "dnsdist-xpf.hh"
5cc8371b
RG
53
54#include "base64.hh"
55#include "delaypipe.hh"
56#include "dolog.hh"
57#include "dnsname.hh"
e0fd37ec 58#include "dnsparser.hh"
5cc8371b
RG
59#include "ednsoptions.hh"
60#include "gettime.hh"
61#include "lock.hh"
62#include "misc.hh"
63#include "sodcrypto.hh"
64#include "sstuff.hh"
519f5484 65#include "threadname.hh"
5cc8371b 66
a40df301 67/* Known sins:
e48090d1 68
d12cea01 69 Receiver is currently single threaded
e5a14b2b 70 not *that* bad actually, but now that we are thread safe, might want to scale
a40df301 71*/
24d5cb00 72
0940e4eb 73/* the Rulaction plan
74 Set of Rules, if one matches, it leads to an Action
75 Both rules and actions could conceivably be Lua based.
76 On the C++ side, both could be inherited from a class Rule and a class Action,
77 on the Lua side we can't do that. */
78
64e4ebb4 79using std::atomic;
80using std::thread;
7730131a 81bool g_verbose;
b065f701 82
e48090d1 83struct DNSDistStats g_stats;
37a5c2d5
PO
84MetricDefinitionStorage g_metricDefinitions;
85
3b203c83 86uint16_t g_maxOutstanding{std::numeric_limits<uint16_t>::max()};
1ea747c0 87uint32_t g_staleCacheEntriesTTL{0};
bbfaaa6f 88bool g_syslog{true};
0dffe9e3 89bool g_allowEmptyResponse{false};
cffde2fd 90
91GlobalStateHolder<NetmaskGroup> g_ACL;
c9262563 92string g_outputBuffer;
a227f47d 93
a227f47d 94std::vector<std::shared_ptr<TLSFrontend>> g_tlslocals;
fbf14b03 95std::vector<std::shared_ptr<DOHFrontend>> g_dohlocals;
6e9fd124 96std::vector<std::shared_ptr<DNSCryptContext>> g_dnsCryptLocals;
87b515ed
RG
97#ifdef HAVE_EBPF
98shared_ptr<BPFFilter> g_defaultBPFFilter;
8429ad04 99std::vector<std::shared_ptr<DynBPFFilter> > g_dynBPFFilters;
87b515ed 100#endif /* HAVE_EBPF */
6e9fd124 101std::vector<std::unique_ptr<ClientState>> g_frontends;
886e2cf2 102GlobalStateHolder<pools_t> g_pools;
0beaa5c8 103size_t g_udpVectorSize{1};
7c0860e1 104
9f4eb5cc
RG
105bool g_snmpEnabled{false};
106bool g_snmpTrapsEnabled{false};
107DNSDistSNMPAgent* g_snmpAgent{nullptr};
108
726ddf60 109/* UDP: the grand design. Per socket we listen on for incoming queries there is one thread.
7730131a 110 Then we have a bunch of connected sockets for talking to downstream servers.
111 We send directly to those sockets.
24d5cb00 112
7730131a 113 For the return path, per downstream server we have a thread that listens to responses.
24d5cb00 114
7730131a 115 Per socket there is an array of 2^16 states, when we send out a packet downstream, we note
116 there the original requestor and the original id. The new ID is the offset in the array.
117
118 When an answer comes in on a socket, we look up the offset by the id, and lob it to the
119 original requestor.
120
121 IDs are assigned by atomic increments of the socket offset.
122 */
123
4d5959e6
RG
124GlobalStateHolder<vector<DNSDistRuleAction> > g_rulactions;
125GlobalStateHolder<vector<DNSDistResponseRuleAction> > g_resprulactions;
126GlobalStateHolder<vector<DNSDistResponseRuleAction> > g_cachehitresprulactions;
2d4783a8 127GlobalStateHolder<vector<DNSDistResponseRuleAction> > g_selfansweredresprulactions;
4d5959e6 128
df111b53 129Rings g_rings;
786e4d8c 130QueryCount g_qcount;
7c0860e1 131
ecbe9133 132GlobalStateHolder<servers_t> g_dstates;
78ffa782 133GlobalStateHolder<NetmaskTree<DynBlock>> g_dynblockNMG;
71c94675 134GlobalStateHolder<SuffixMatchTree<DynBlock>> g_dynblockSMT;
dd46e5e3 135DNSAction::Action g_dynBlockAction = DNSAction::Action::Drop;
3f6d07a4
RG
136int g_tcpRecvTimeout{2};
137int g_tcpSendTimeout{2};
e0b5e49d 138int g_udpTimeout{2};
3f6d07a4 139
26a3cdb7 140bool g_servFailOnNoPolicy{false};
e72fbfc4 141bool g_truncateTC{false};
53c57da7
RG
142bool g_fixupCase{false};
143bool g_preserveTrailingData{false};
32b86928 144bool g_roundrobinFailOnNoServer{false};
0beaa5c8 145
83fe2c55
RG
146std::set<std::string> g_capabilitiesToRetain;
147
e0fd37ec 148static void truncateTC(char* packet, uint16_t* len, size_t responseSize, unsigned int consumed)
6ad8b29a 149try
150{
e0fd37ec
RG
151 bool hadEDNS = false;
152 uint16_t payloadSize = 0;
153 uint16_t z = 0;
154
155 if (g_addEDNSToSelfGeneratedResponses) {
156 hadEDNS = getEDNSUDPPayloadSizeAndZ(packet, *len, &payloadSize, &z);
157 }
158
2aa2c0aa 159 *len=static_cast<uint16_t>(sizeof(dnsheader)+consumed+DNS_TYPE_SIZE+DNS_CLASS_SIZE);
e7c732b8
RG
160 struct dnsheader* dh = reinterpret_cast<struct dnsheader*>(packet);
161 dh->ancount = dh->arcount = dh->nscount = 0;
e0fd37ec
RG
162
163 if (hadEDNS) {
5e98ccfa 164 addEDNS(dh, *len, responseSize, z & EDNS_HEADER_FLAG_DO, payloadSize, 0);
e0fd37ec 165 }
6ad8b29a 166}
167catch(...)
168{
169 g_stats.truncFail++;
170}
171
7b3865cd 172struct DelayedPacket
173{
174 int fd;
175 string packet;
176 ComboAddress destination;
68f3eb4d 177 ComboAddress origDest;
7b3865cd 178 void operator()()
179 {
20b019fa
RG
180 ssize_t res;
181 if(origDest.sin4.sin_family == 0) {
182 res = sendto(fd, packet.c_str(), packet.size(), 0, (struct sockaddr*)&destination, destination.getSocklen());
183 }
184 else {
185 res = sendfromto(fd, packet.c_str(), packet.size(), 0, origDest, destination);
186 }
187 if (res == -1) {
188 int err = errno;
189 vinfolog("Error sending delayed response to %s: %s", destination.toStringWithPort(), strerror(err));
190 }
7b3865cd 191 }
192};
193
3e425868 194DelayPipe<DelayedPacket>* g_delay = nullptr;
7b3865cd 195
f653b8df 196void doLatencyStats(double udiff)
daacd477 197{
cb167afd
CHB
198 if(udiff < 1000) ++g_stats.latency0_1;
199 else if(udiff < 10000) ++g_stats.latency1_10;
200 else if(udiff < 50000) ++g_stats.latency10_50;
201 else if(udiff < 100000) ++g_stats.latency50_100;
202 else if(udiff < 1000000) ++g_stats.latency100_1000;
203 else ++g_stats.latencySlow;
eb0335ff 204 g_stats.latencySum += udiff / 1000;
be6c318f 205
daacd477 206 auto doAvg = [](double& var, double n, double weight) {
207 var = (weight -1) * var/weight + n/weight;
208 };
209
210 doAvg(g_stats.latencyAvg100, udiff, 100);
211 doAvg(g_stats.latencyAvg1000, udiff, 1000);
212 doAvg(g_stats.latencyAvg10000, udiff, 10000);
213 doAvg(g_stats.latencyAvg1000000, udiff, 1000000);
214}
ca404e94 215
e7c732b8 216bool responseContentMatches(const char* response, const uint16_t responseLen, const DNSName& qname, const uint16_t qtype, const uint16_t qclass, const ComboAddress& remote, unsigned int& consumed)
fcffc585 217{
fcffc585
RG
218 if (responseLen < sizeof(dnsheader)) {
219 return false;
220 }
221
3e425868 222 const struct dnsheader* dh = reinterpret_cast<const struct dnsheader*>(response);
c8c3d4e4 223 if (dh->qdcount == 0) {
ad7ec9a2 224 if ((dh->rcode != RCode::NoError && dh->rcode != RCode::NXDomain) || g_allowEmptyResponse) {
c8c3d4e4
RG
225 return true;
226 }
227 else {
cb167afd 228 ++g_stats.nonCompliantResponses;
c8c3d4e4
RG
229 return false;
230 }
231 }
232
3e425868
RG
233 uint16_t rqtype, rqclass;
234 DNSName rqname;
fcffc585
RG
235 try {
236 rqname=DNSName(response, responseLen, sizeof(dnsheader), false, &rqtype, &rqclass, &consumed);
237 }
3e425868
RG
238 catch(const std::exception& e) {
239 if(responseLen > 0 && static_cast<size_t>(responseLen) > sizeof(dnsheader)) {
fcffc585 240 infolog("Backend %s sent us a response with id %d that did not parse: %s", remote.toStringWithPort(), ntohs(dh->id), e.what());
3e425868 241 }
cb167afd 242 ++g_stats.nonCompliantResponses;
fcffc585
RG
243 return false;
244 }
245
246 if (rqtype != qtype || rqclass != qclass || rqname != qname) {
247 return false;
248 }
249
250 return true;
251}
252
4ab01344 253static void restoreFlags(struct dnsheader* dh, uint16_t origFlags)
fcffc585
RG
254{
255 static const uint16_t rdMask = 1 << FLAGS_RD_OFFSET;
256 static const uint16_t cdMask = 1 << FLAGS_CD_OFFSET;
257 static const uint16_t restoreFlagsMask = UINT16_MAX & ~(rdMask | cdMask);
0f72fd5c
RG
258 uint16_t * flags = getFlagsFromDNSHeader(dh);
259 /* clear the flags we are about to restore */
260 *flags &= restoreFlagsMask;
261 /* only keep the flags we want to restore */
262 origFlags &= ~restoreFlagsMask;
263 /* set the saved flags as they were */
264 *flags |= origFlags;
265}
266
4ab01344 267static bool fixUpQueryTurnedResponse(DNSQuestion& dq, const uint16_t origFlags)
e7c732b8
RG
268{
269 restoreFlags(dq.dh, origFlags);
270
271 return addEDNSToQueryTurnedResponse(dq);
272}
273
3e425868 274static bool fixUpResponse(char** response, uint16_t* responseLen, size_t* responseSize, const DNSName& qname, uint16_t origFlags, bool ednsAdded, bool ecsAdded, std::vector<uint8_t>& rewrittenResponse, uint16_t addRoom, bool* zeroScope)
0f72fd5c 275{
fcffc585
RG
276 if (*responseLen < sizeof(dnsheader)) {
277 return false;
278 }
279
3e425868 280 struct dnsheader* dh = reinterpret_cast<struct dnsheader*>(*response);
c8c3d4e4
RG
281 restoreFlags(dh, origFlags);
282
283 if (*responseLen == sizeof(dnsheader)) {
284 return true;
285 }
286
fcffc585
RG
287 if(g_fixupCase) {
288 string realname = qname.toDNSString();
289 if (*responseLen >= (sizeof(dnsheader) + realname.length())) {
290 memcpy(*response + sizeof(dnsheader), realname.c_str(), realname.length());
291 }
292 }
293
ff73f02b 294 if (ednsAdded || ecsAdded) {
ceae225c 295 uint16_t optStart;
fcffc585
RG
296 size_t optLen = 0;
297 bool last = false;
298
11d5d247
RG
299 const std::string responseStr(*response, *responseLen);
300 int res = locateEDNSOptRR(responseStr, &optStart, &optLen, &last);
fcffc585
RG
301
302 if (res == 0) {
49c33a6c
RG
303 if (zeroScope) { // this finds if an EDNS Client Subnet scope was set, and if it is 0
304 size_t optContentStart = 0;
305 uint16_t optContentLen = 0;
306 /* we need at least 4 bytes after the option length (family: 2, source prefix-length: 1, scope prefix-length: 1) */
307 if (isEDNSOptionInOpt(responseStr, optStart, optLen, EDNSOptionCode::ECS, &optContentStart, &optContentLen) && optContentLen >= 4) {
308 /* see if the EDNS Client Subnet SCOPE PREFIX-LENGTH byte in position 3 is set to 0, which is the only thing
309 we care about. */
310 *zeroScope = responseStr.at(optContentStart + 3) == 0;
311 }
40669042 312 }
9837850d 313
ff73f02b
RG
314 if (ednsAdded) {
315 /* we added the entire OPT RR,
316 therefore we need to remove it entirely */
317 if (last) {
318 /* simply remove the last AR */
319 *responseLen -= optLen;
320 uint16_t arcount = ntohs(dh->arcount);
321 arcount--;
322 dh->arcount = htons(arcount);
323 }
324 else {
325 /* Removing an intermediary RR could lead to compression error */
11d5d247 326 if (rewriteResponseWithoutEDNS(responseStr, rewrittenResponse) == 0) {
ff73f02b
RG
327 *responseLen = rewrittenResponse.size();
328 if (addRoom && (UINT16_MAX - *responseLen) > addRoom) {
329 rewrittenResponse.reserve(*responseLen + addRoom);
330 }
331 *responseSize = rewrittenResponse.capacity();
332 *response = reinterpret_cast<char*>(rewrittenResponse.data());
333 }
334 else {
335 warnlog("Error rewriting content");
336 }
337 }
fcffc585
RG
338 }
339 else {
ff73f02b
RG
340 /* the OPT RR was already present, but without ECS,
341 we need to remove the ECS option if any */
342 if (last) {
343 /* nothing after the OPT RR, we can simply remove the
344 ECS option */
345 size_t existingOptLen = optLen;
11d5d247 346 removeEDNSOptionFromOPT(*response + optStart, &optLen, EDNSOptionCode::ECS);
ff73f02b 347 *responseLen -= (existingOptLen - optLen);
fcffc585
RG
348 }
349 else {
ff73f02b 350 /* Removing an intermediary RR could lead to compression error */
11d5d247 351 if (rewriteResponseWithoutEDNSOption(responseStr, EDNSOptionCode::ECS, rewrittenResponse) == 0) {
ff73f02b
RG
352 *responseLen = rewrittenResponse.size();
353 if (addRoom && (UINT16_MAX - *responseLen) > addRoom) {
354 rewrittenResponse.reserve(*responseLen + addRoom);
355 }
356 *responseSize = rewrittenResponse.capacity();
357 *response = reinterpret_cast<char*>(rewrittenResponse.data());
358 }
359 else {
360 warnlog("Error rewriting content");
361 }
fcffc585
RG
362 }
363 }
364 }
365 }
366
367 return true;
368}
369
#ifdef HAVE_DNSCRYPT
/* Encrypts the response in place when the query came in over DNSCrypt.

   response / responseLen / responseSize: the response buffer, its used length
                 (updated on success) and its total size
   tcp:          forwarded to DNSCryptQuery::encryptResponse()
   dnsCryptQuery: DNSCrypt state of the query; when empty nothing is done
   dh / dhCopy:  if all set, the header is copied to dhCopy before encryption
                 and *dh is made to point to that copy

   Returns false when the encryption failed, meaning the response should be dropped. */
static bool encryptResponse(char* response, uint16_t* responseLen, size_t responseSize, bool tcp, std::shared_ptr<DNSCryptQuery> dnsCryptQuery, dnsheader** dh, dnsheader* dhCopy)
{
  if (!dnsCryptQuery) {
    /* not a DNSCrypt query */
    return true;
  }

  /* save the original header before encrypting it in place */
  if (dh != nullptr && *dh != nullptr && dhCopy != nullptr) {
    memcpy(dhCopy, *dh, sizeof(dnsheader));
    *dh = dhCopy;
  }

  uint16_t encryptedLen = 0;
  const int rc = dnsCryptQuery->encryptResponse(response, *responseLen, responseSize, tcp, &encryptedLen);
  if (rc != 0) {
    /* dropping response */
    vinfolog("Error encrypting the response, dropping.");
    return false;
  }

  *responseLen = encryptedLen;
  return true;
}
#endif /* HAVE_DNSCRYPT */
fcffc585 394
3e425868
RG
395static bool applyRulesToResponse(LocalStateHolder<vector<DNSDistResponseRuleAction> >& localRespRulactions, DNSResponse& dr)
396{
397 DNSResponseAction::Action action=DNSResponseAction::Action::None;
398 std::string ruleresult;
399 for(const auto& lr : *localRespRulactions) {
400 if(lr.d_rule->matches(&dr)) {
401 lr.d_rule->d_matches++;
402 action=(*lr.d_action)(&dr, &ruleresult);
403 switch(action) {
404 case DNSResponseAction::Action::Allow:
405 return true;
406 break;
407 case DNSResponseAction::Action::Drop:
408 return false;
409 break;
410 case DNSResponseAction::Action::HeaderModify:
411 return true;
412 break;
413 case DNSResponseAction::Action::ServFail:
414 dr.dh->rcode = RCode::ServFail;
415 return true;
416 break;
417 /* non-terminal actions follow */
418 case DNSResponseAction::Action::Delay:
419 dr.delayMsec = static_cast<int>(pdns_stou(ruleresult)); // sorry
420 break;
421 case DNSResponseAction::Action::None:
422 break;
423 }
424 }
425 }
426
427 return true;
428}
429
430bool processResponse(char** response, uint16_t* responseLen, size_t* responseSize, LocalStateHolder<vector<DNSDistResponseRuleAction> >& localRespRulactions, DNSResponse& dr, size_t addRoom, std::vector<uint8_t>& rewrittenResponse, bool muted)
431{
432 if (!applyRulesToResponse(localRespRulactions, dr)) {
433 return false;
434 }
435
436 bool zeroScope = false;
437 if (!fixUpResponse(response, responseLen, responseSize, *dr.qname, dr.origFlags, dr.ednsAdded, dr.ecsAdded, rewrittenResponse, addRoom, dr.useZeroScope ? &zeroScope : nullptr)) {
438 return false;
439 }
440
01e23732 441 if (dr.packetCache && !dr.skipCache && *responseLen <= s_maxPacketCacheEntrySize) {
3e425868
RG
442 if (!dr.useZeroScope) {
443 /* if the query was not suitable for zero-scope, for
444 example because it had an existing ECS entry so the hash is
445 not really 'no ECS', so just insert it for the existing subnet
446 since:
447 - we don't have the correct hash for a non-ECS query
448 - inserting with hash computed before the ECS replacement but with
449 the subnet extracted _after_ the replacement would not work.
450 */
451 zeroScope = false;
452 }
453 // if zeroScope, pass the pre-ECS hash-key and do not pass the subnet to the cache
454 dr.packetCache->insert(zeroScope ? dr.cacheKeyNoECS : dr.cacheKey, zeroScope ? boost::none : dr.subnet, dr.origFlags, dr.dnssecOK, *dr.qname, dr.qtype, dr.qclass, *response, *responseLen, dr.tcp, dr.dh->rcode, dr.tempFailureTTL);
455 }
456
457#ifdef HAVE_DNSCRYPT
458 if (!muted) {
459 if (!encryptResponse(*response, responseLen, *responseSize, dr.tcp, dr.dnsCryptQuery, nullptr, nullptr)) {
460 return false;
461 }
462 }
7129b5c4 463#endif /* HAVE_DNSCRYPT */
3e425868
RG
464
465 return true;
466}
467
468static bool sendUDPResponse(int origFD, const char* response, const uint16_t responseLen, const int delayMsec, const ComboAddress& origDest, const ComboAddress& origRemote)
0f72fd5c 469{
fcffc585
RG
470 if(delayMsec && g_delay) {
471 DelayedPacket dp{origFD, string(response,responseLen), origRemote, origDest};
472 g_delay->submit(dp, delayMsec);
473 }
474 else {
20b019fa
RG
475 ssize_t res;
476 if(origDest.sin4.sin_family == 0) {
3e425868 477 res = sendto(origFD, response, responseLen, 0, reinterpret_cast<const struct sockaddr*>(&origRemote), origRemote.getSocklen());
20b019fa
RG
478 }
479 else {
480 res = sendfromto(origFD, response, responseLen, 0, origDest, origRemote);
481 }
482 if (res == -1) {
483 int err = errno;
a702a96c 484 vinfolog("Error sending response to %s: %s", origRemote.toStringWithPort(), stringerror(err));
20b019fa 485 }
fcffc585
RG
486 }
487
488 return true;
489}
490
150105a2 491
fbf14b03 492int pickBackendSocketForSending(std::shared_ptr<DownstreamState>& state)
150105a2 493{
5bdbb83d 494 return state->sockets[state->socketsOffset++ % state->sockets.size()];
150105a2
RG
495}
496
5bdbb83d 497static void pickBackendSocketsReadyForReceiving(const std::shared_ptr<DownstreamState>& state, std::vector<int>& ready)
150105a2 498{
5bdbb83d 499 ready.clear();
150105a2 500
5bdbb83d
RG
501 if (state->sockets.size() == 1) {
502 ready.push_back(state->sockets[0]);
503 return ;
150105a2
RG
504 }
505
5bdbb83d
RG
506 {
507 std::lock_guard<std::mutex> lock(state->socketsLock);
508 state->mplexer->getAvailableFDs(ready, -1);
150105a2 509 }
150105a2
RG
510}
511
4632045b 512// listens on a dedicated socket, lobs answers from downstream servers to original requestors
9b73b71c 513void responderThread(std::shared_ptr<DownstreamState> dss)
66d1c0bf 514try {
519f5484 515 setThreadName("dnsdist/respond");
d8c19b98 516 auto localRespRulactions = g_resprulactions.getLocal();
8179b6d6 517 char packet[s_maxPacketCacheEntrySize + DNSCRYPT_MAX_RESPONSE_PADDING_AND_MAC_SIZE];
b50ee135 518 static_assert(sizeof(packet) <= UINT16_MAX, "Packet size should fit in a uint16_t");
3e425868
RG
519 /* when the answer is encrypted in place, we need to get a copy
520 of the original header before encryption to fill the ring buffer */
521 dnsheader cleartextDH;
ca404e94 522 vector<uint8_t> rewrittenResponse;
7267213a 523
66d1c0bf 524 uint16_t queryId = 0;
5bdbb83d
RG
525 std::vector<int> sockets;
526 sockets.reserve(dss->sockets.size());
527
7730131a 528 for(;;) {
57847d65 529 dnsheader* dh = reinterpret_cast<struct dnsheader*>(packet);
66d1c0bf 530 try {
5bdbb83d
RG
531 pickBackendSocketsReadyForReceiving(dss, sockets);
532 for (const auto& fd : sockets) {
533 ssize_t got = recv(fd, packet, sizeof(packet), 0);
534 char * response = packet;
535 size_t responseSize = sizeof(packet);
ca404e94 536
2aa2c0aa 537 if (got < 0 || static_cast<size_t>(got) < sizeof(dnsheader))
5bdbb83d 538 continue;
4f380af3 539
3e425868 540 uint16_t responseLen = static_cast<uint16_t>(got);
5bdbb83d 541 queryId = dh->id;
b50ee135 542
fbf14b03 543 if(queryId >= dss->idStates.size()) {
5bdbb83d 544 continue;
fbf14b03 545 }
4f380af3 546
5bdbb83d 547 IDState* ids = &dss->idStates[queryId];
a9489723 548 int64_t usageIndicator = ids->usageIndicator;
c9ba8478 549
311f19d5 550 if(!IDState::isInUse(usageIndicator)) {
9bd1a882 551 /* the corresponding state is marked as not in use, meaning that:
a9489723 552 - it was already cleaned up by another thread and the state is gone ;
9bd1a882
RG
553 - we already got a response for this query and this one is a duplicate.
554 Either way, we don't touch it.
555 */
5bdbb83d 556 continue;
9bd1a882 557 }
7267213a 558
9bd1a882 559 /* read the potential DOHUnit state as soon as possible, but don't use it
a9489723 560 until we have confirmed that we own this state by updating usageIndicator */
0956c5c5 561 auto du = ids->du;
5bdbb83d
RG
562 /* setting age to 0 to prevent the maintainer thread from
563 cleaning this IDS while we process the response.
5bdbb83d
RG
564 */
565 ids->age = 0;
a9489723 566 int origFD = ids->origFD;
fcffc585 567
e7c732b8
RG
568 unsigned int consumed = 0;
569 if (!responseContentMatches(response, responseLen, ids->qname, ids->qtype, ids->qclass, dss->remote, consumed)) {
5bdbb83d
RG
570 continue;
571 }
7267213a 572
3c72bc54 573 bool isDoH = du != nullptr;
a9489723
RG
574 /* atomically mark the state as available, but only if it has not been altered
575 in the meantime */
311f19d5 576 if (ids->tryMarkUnused(usageIndicator)) {
9bd1a882
RG
577 /* clear the potential DOHUnit asap, it's ours now
578 and since we just marked the state as unused,
579 someone could overwrite it. */
246ff31f 580 ids->du = nullptr;
71b86bd8
RG
581 /* we only decrement the outstanding counter if the value was not
582 altered in the meantime, which would mean that the state has been actively reused
583 and the other thread has not incremented the outstanding counter, so we don't
584 want it to be decremented twice. */
585 --dss->outstanding; // you'd think an attacker could game this, but we're using connected socket
0956c5c5 586 } else {
9bd1a882 587 /* someone updated the state in the meantime, we can't touch the existing pointer */
0956c5c5 588 du = nullptr;
f2d0c808
RG
589 /* since the state has been updated, we can't safely access it so let's just drop
590 this response */
591 continue;
71b86bd8 592 }
2c5db3f7 593
5bdbb83d 594 if(dh->tc && g_truncateTC) {
e0fd37ec 595 truncateTC(response, &responseLen, responseSize, consumed);
5bdbb83d 596 }
aeb36780 597
5bdbb83d 598 dh->id = ids->origID;
ca404e94 599
5bdbb83d 600 uint16_t addRoom = 0;
d0ae6360
RG
601 DNSResponse dr = makeDNSResponseFromIDState(*ids, dh, sizeof(packet), responseLen, false);
602 if (dr.dnsCryptQuery) {
5bdbb83d
RG
603 addRoom = DNSCRYPT_MAX_RESPONSE_PADDING_AND_MAC_SIZE;
604 }
ca404e94 605
3e425868
RG
606 memcpy(&cleartextDH, dr.dh, sizeof(cleartextDH));
607 if (!processResponse(&response, &responseLen, &responseSize, localRespRulactions, dr, addRoom, rewrittenResponse, ids->cs && ids->cs->muted)) {
608 continue;
5bdbb83d 609 }
886e2cf2 610
5bdbb83d 611 if (ids->cs && !ids->cs->muted) {
0956c5c5 612 if (du) {
fbf14b03
RG
613#ifdef HAVE_DNS_OVER_HTTPS
614 // DoH query
246ff31f 615 du->response = std::string(response, responseLen);
0956c5c5 616 if (send(du->rsock, &du, sizeof(du), 0) != sizeof(du)) {
9bd1a882 617 /* at this point we have the only remaining pointer on this
b6d19fca
RG
618 DOHUnit object since we did set ids->du to nullptr earlier,
619 except if we got the response before the pointer could be
620 released by the frontend */
621 du->release();
fbf14b03
RG
622 }
623#endif /* HAVE_DNS_OVER_HTTPS */
0956c5c5 624 du = nullptr;
fbf14b03
RG
625 }
626 else {
627 ComboAddress empty;
628 empty.sin4.sin_family = 0;
629 /* if ids->destHarvested is false, origDest holds the listening address.
630 We don't want to use that as a source since it could be 0.0.0.0 for example. */
631 sendUDPResponse(origFD, response, responseLen, dr.delayMsec, ids->destHarvested ? ids->origDest : empty, ids->origRemote);
632 }
5bdbb83d 633 }
66d1c0bf 634
cb167afd 635 ++g_stats.responses;
83f7bbdb
RG
636 if (ids->cs) {
637 ++ids->cs->responses;
638 }
7fc95193 639 ++dss->responses;
66d1c0bf 640
5bdbb83d 641 double udiff = ids->sentTime.udiff();
fbf14b03 642 vinfolog("Got answer from %s, relayed to %s%s, took %f usec", dss->remote.toStringWithPort(), ids->origRemote.toStringWithPort(),
3c72bc54 643 isDoH ? " (https)": "", udiff);
66d1c0bf 644
01f6920b
RG
645 struct timespec ts;
646 gettime(&ts);
3e425868 647 g_rings.insertResponse(ts, *dr.remote, *dr.qname, dr.qtype, static_cast<unsigned int>(udiff), static_cast<unsigned int>(got), cleartextDH, dss->remote);
66d1c0bf 648
5d7a33dd 649 switch (cleartextDH.rcode) {
61d10a4d
MH
650 case RCode::NXDomain:
651 ++g_stats.frontendNXDomain;
652 break;
653 case RCode::ServFail:
cb167afd 654 ++g_stats.servfailResponses;
61d10a4d
MH
655 ++g_stats.frontendServFail;
656 break;
657 case RCode::NoError:
658 ++g_stats.frontendNoError;
659 break;
cb167afd 660 }
5bdbb83d 661 dss->latencyUsec = (127.0 * dss->latencyUsec / 128.0) + udiff/128.0;
66d1c0bf 662
5bdbb83d 663 doLatencyStats(udiff);
fcadd56e 664
5bdbb83d
RG
665 rewrittenResponse.clear();
666 }
66d1c0bf 667 }
df560083 668 catch(const std::exception& e){
cd7fa253 669 vinfolog("Got an error in UDP responder thread while parsing a response from %s, id %d: %s", dss->remote.toStringWithPort(), queryId, e.what());
66d1c0bf 670 }
7730131a 671 }
7730131a 672}
66d1c0bf
RG
673catch(const std::exception& e)
674{
675 errlog("UDP responder thread died because of exception: %s", e.what());
66d1c0bf
RG
676}
677catch(const PDNSException& e)
678{
679 errlog("UDP responder thread died because of PowerDNS exception: %s", e.reason);
66d1c0bf
RG
680}
681catch(...)
682{
683 errlog("UDP responder thread died because of an exception: %s", "unknown");
66d1c0bf 684}
24d5cb00 685
5d7e6765 686bool DownstreamState::reconnect()
3c115e0f 687{
5d7e6765
RG
688 std::unique_lock<std::mutex> tl(connectLock, std::try_to_lock);
689 if (!tl.owns_lock()) {
690 /* we are already reconnecting */
691 return false;
692 }
693
b58f08e5 694 connected = false;
5bdbb83d 695 for (auto& fd : sockets) {
150105a2 696 if (fd != -1) {
5d7e6765 697 if (sockets.size() > 1) {
5bdbb83d
RG
698 std::lock_guard<std::mutex> lock(socketsLock);
699 mplexer->removeReadFD(fd);
700 }
150105a2
RG
701 /* shutdown() is needed to wake up recv() in the responderThread */
702 shutdown(fd, SHUT_RDWR);
703 close(fd);
704 fd = -1;
f99e3aaf 705 }
150105a2
RG
706 if (!IsAnyAddress(remote)) {
707 fd = SSocket(remote.sin4.sin_family, SOCK_DGRAM, 0);
708 if (!IsAnyAddress(sourceAddr)) {
709 SSetsockopt(fd, SOL_SOCKET, SO_REUSEADDR, 1);
70b0d0e2
RG
710 if (!sourceItfName.empty()) {
711#ifdef SO_BINDTODEVICE
712 int res = setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, sourceItfName.c_str(), sourceItfName.length());
713 if (res != 0) {
714 infolog("Error setting up the interface on backend socket '%s': %s", remote.toStringWithPort(), stringerror());
715 }
716#endif
717 }
718
150105a2
RG
719 SBind(fd, sourceAddr);
720 }
721 try {
722 SConnect(fd, remote);
5d7e6765 723 if (sockets.size() > 1) {
5bdbb83d
RG
724 std::lock_guard<std::mutex> lock(socketsLock);
725 mplexer->addReadFD(fd, [](int, boost::any) {});
726 }
150105a2
RG
727 connected = true;
728 }
729 catch(const std::runtime_error& error) {
730 infolog("Error connecting to new server with address %s: %s", remote.toStringWithPort(), error.what());
38069e7e
RG
731 connected = false;
732 break;
733 }
7565f4e6 734 }
38069e7e
RG
735 }
736
737 /* if at least one (re-)connection failed, close all sockets */
738 if (!connected) {
5bdbb83d 739 for (auto& fd : sockets) {
38069e7e 740 if (fd != -1) {
5d7e6765
RG
741 if (sockets.size() > 1) {
742 std::lock_guard<std::mutex> lock(socketsLock);
743 mplexer->removeReadFD(fd);
744 }
38069e7e
RG
745 /* shutdown() is needed to wake up recv() in the responderThread */
746 shutdown(fd, SHUT_RDWR);
747 close(fd);
748 fd = -1;
150105a2 749 }
7565f4e6 750 }
b58f08e5 751 }
5d7e6765
RG
752
753 return connected;
b58f08e5 754}
f2caf657
CHB
755void DownstreamState::hash()
756{
d58e616a 757 vinfolog("Computing hashes for id=%s and weight=%d", id, weight);
f2caf657 758 auto w = weight;
d58e616a 759 WriteLock wl(&d_lock);
f2caf657
CHB
760 hashes.clear();
761 while (w > 0) {
762 std::string uuid = boost::str(boost::format("%s-%d") % id % w);
763 unsigned int wshash = burtleCI((const unsigned char*)uuid.c_str(), uuid.size(), g_hashperturb);
764 hashes.insert(wshash);
765 --w;
766 }
767}
768
769void DownstreamState::setId(const boost::uuids::uuid& newId)
770{
771 id = newId;
d58e616a
CHB
772 // compute hashes only if already done
773 if (!hashes.empty()) {
774 hash();
775 }
f2caf657
CHB
776}
777
778void DownstreamState::setWeight(int newWeight)
779{
5a2bbe8c
CHB
780 if (newWeight < 1) {
781 errlog("Error setting server's weight: downstream weight value must be greater than 0.");
782 return ;
783 }
f2caf657 784 weight = newWeight;
d58e616a
CHB
785 if (!hashes.empty()) {
786 hash();
787 }
f2caf657 788}
b58f08e5 789
203b5348 790DownstreamState::DownstreamState(const ComboAddress& remote_, const ComboAddress& sourceAddr_, unsigned int sourceItf_, const std::string& sourceItfName_, size_t numberOfSockets, bool connect=true): sourceItfName(sourceItfName_), remote(remote_), sourceAddr(sourceAddr_), sourceItf(sourceItf_)
b58f08e5 791{
d58e616a 792 pthread_rwlock_init(&d_lock, nullptr);
d61aa945 793 id = getUniqueID();
5d7e6765
RG
794 threadStarted.clear();
795
5bdbb83d
RG
796 mplexer = std::unique_ptr<FDMultiplexer>(FDMultiplexer::getMultiplexerSilent());
797
798 sockets.resize(numberOfSockets);
799 for (auto& fd : sockets) {
150105a2
RG
800 fd = -1;
801 }
802
203b5348 803 if (connect && !IsAnyAddress(remote)) {
b58f08e5 804 reconnect();
f99e3aaf
RG
805 idStates.resize(g_maxOutstanding);
806 sw.start();
807 infolog("Added downstream server %s", remote.toStringWithPort());
fbe2a2e0 808 }
1720247e 809
3c115e0f 810}
811
773470ca 812std::mutex g_luamutex;
3f25bce6 813LuaContext g_lua;
3f25bce6 814
ecbe9133 815GlobalStateHolder<ServerPolicy> g_policy;
773470ca 816
497a6e3a 817shared_ptr<DownstreamState> firstAvailable(const NumberedServerVector& servers, const DNSQuestion* dq)
773470ca 818{
22b2b326 819 for(auto& d : servers) {
da4e7813 820 if(d.second->isUp() && d.second->qps.check())
821 return d.second;
773470ca 822 }
497a6e3a 823 return leastOutstanding(servers, dq);
2c5db3f7 824}
825
d1fba58e 826// get server with least outstanding queries, and within those, with the lowest order, and within those: the fastest
497a6e3a 827shared_ptr<DownstreamState> leastOutstanding(const NumberedServerVector& servers, const DNSQuestion* dq)
c9262563 828{
886e2cf2
RG
829 if (servers.size() == 1 && servers[0].second->isUp()) {
830 return servers[0].second;
831 }
832
d1fba58e 833 vector<pair<tuple<int,int,double>, shared_ptr<DownstreamState>>> poss;
834 /* so you might wonder, why do we go through this trouble? The data on which we sort could change during the sort,
835 which would suck royally and could even lead to crashes. So first we snapshot on what we sort, and then we sort */
478ce172 836 poss.reserve(servers.size());
0e41337b 837 for(auto& d : servers) {
da4e7813 838 if(d.second->isUp()) {
d1fba58e 839 poss.push_back({make_tuple(d.second->outstanding.load(), d.second->order, d.second->latencyUsec), d.second});
c9262563 840 }
841 }
842 if(poss.empty())
843 return shared_ptr<DownstreamState>();
844 nth_element(poss.begin(), poss.begin(), poss.end(), [](const decltype(poss)::value_type& a, const decltype(poss)::value_type& b) { return a.first < b.first; });
845 return poss.begin()->second;
846}
847
497a6e3a 848shared_ptr<DownstreamState> valrandom(unsigned int val, const NumberedServerVector& servers, const DNSQuestion* dq)
c9262563 849{
850 vector<pair<int, shared_ptr<DownstreamState>>> poss;
278403d3
DM
851 int sum = 0;
852 int max = std::numeric_limits<int>::max();
853
22b2b326 854 for(auto& d : servers) { // w=1, w=10 -> 1, 11
da4e7813 855 if(d.second->isUp()) {
278403d3
DM
856 // Don't overflow sum when adding high weights
857 if(d.second->weight > max - sum) {
858 sum = max;
859 } else {
860 sum += d.second->weight;
861 }
862
da4e7813 863 poss.push_back({sum, d.second});
c9262563 864 }
865 }
d2894425
NJ
866
867 // Catch poss & sum are empty to avoid SIGFPE
868 if(poss.empty())
869 return shared_ptr<DownstreamState>();
870
a7f3108c 871 int r = val % sum;
af619119 872 auto p = upper_bound(poss.begin(), poss.end(),r, [](int r_, const decltype(poss)::value_type& a) { return r_ < a.first;});
c9262563 873 if(p==poss.end())
874 return shared_ptr<DownstreamState>();
875 return p->second;
876}
877
497a6e3a 878shared_ptr<DownstreamState> wrandom(const NumberedServerVector& servers, const DNSQuestion* dq)
a7f3108c 879{
497a6e3a 880 return valrandom(random(), servers, dq);
a7f3108c 881}
882
36e763fa 883uint32_t g_hashperturb;
2b4287d4 884double g_consistentHashBalancingFactor = 0;
497a6e3a 885shared_ptr<DownstreamState> whashed(const NumberedServerVector& servers, const DNSQuestion* dq)
a7f3108c 886{
497a6e3a 887 return valrandom(dq->qname->hash(g_hashperturb), servers, dq);
a7f3108c 888}
889
1720247e
CHB
890shared_ptr<DownstreamState> chashed(const NumberedServerVector& servers, const DNSQuestion* dq)
891{
1720247e 892 unsigned int qhash = dq->qname->hash(g_hashperturb);
b712c51e
CHB
893 unsigned int sel = std::numeric_limits<unsigned int>::max();
894 unsigned int min = std::numeric_limits<unsigned int>::max();
895 shared_ptr<DownstreamState> ret = nullptr, first = nullptr;
1720247e 896
2b4287d4 897 double targetLoad = std::numeric_limits<double>::max();
80ee5144 898 if (g_consistentHashBalancingFactor > 0) {
2b4287d4
RG
899 /* we start with one, representing the query we are currently handling */
900 double currentLoad = 1;
901 for (const auto& pair : servers) {
902 currentLoad += pair.second->outstanding;
903 }
904 targetLoad = (currentLoad / servers.size()) * g_consistentHashBalancingFactor;
905 }
906
1720247e 907 for (const auto& d: servers) {
2b4287d4 908 if (d.second->isUp() && d.second->outstanding <= targetLoad) {
93ca495f 909 // make sure hashes have been computed
d58e616a
CHB
910 if (d.second->hashes.empty()) {
911 d.second->hash();
912 }
913 {
914 ReadLock rl(&(d.second->d_lock));
93ca495f
CHB
915 const auto& server = d.second;
916 // we want to keep track of the last hash
b712c51e
CHB
917 if (min > *(server->hashes.begin())) {
918 min = *(server->hashes.begin());
919 first = server;
93ca495f 920 }
b712c51e
CHB
921
922 auto hash_it = server->hashes.lower_bound(qhash);
923 if (hash_it != server->hashes.end()) {
924 if (*hash_it < sel) {
93ca495f
CHB
925 sel = *hash_it;
926 ret = server;
927 }
d58e616a 928 }
1720247e
CHB
929 }
930 }
931 }
93ca495f
CHB
932 if (ret != nullptr) {
933 return ret;
1720247e 934 }
b712c51e
CHB
935 if (first != nullptr) {
936 return first;
1720247e 937 }
93ca495f 938 return shared_ptr<DownstreamState>();
1720247e 939}
a7f3108c 940
497a6e3a 941shared_ptr<DownstreamState> roundrobin(const NumberedServerVector& servers, const DNSQuestion* dq)
5f504638 942{
da4e7813 943 NumberedServerVector poss;
c9262563 944
22b2b326 945 for(auto& d : servers) {
da4e7813 946 if(d.second->isUp()) {
5f504638 947 poss.push_back(d);
c9262563 948 }
5f504638 949 }
c9262563 950
ecbe9133 951 const auto *res=&poss;
32b86928 952 if(poss.empty() && !g_roundrobinFailOnNoServer)
ecbe9133 953 res = &servers;
c9262563 954
955 if(res->empty())
956 return shared_ptr<DownstreamState>();
957
958 static unsigned int counter;
959
da4e7813 960 return (*res)[(counter++) % res->size()].second;
5f504638 961}
962
e6841c9e 963ComboAddress g_serverControl{"127.0.0.1:5199"};
b076b34a 964
886e2cf2
RG
965std::shared_ptr<ServerPool> createPoolIfNotExists(pools_t& pools, const string& poolName)
966{
967 std::shared_ptr<ServerPool> pool;
968 pools_t::iterator it = pools.find(poolName);
969 if (it != pools.end()) {
970 pool = it->second;
971 }
972 else {
8f4f5186
RG
973 if (!poolName.empty())
974 vinfolog("Creating pool %s", poolName);
886e2cf2
RG
975 pool = std::make_shared<ServerPool>();
976 pools.insert(std::pair<std::string,std::shared_ptr<ServerPool> >(poolName, pool));
977 }
978 return pool;
979}
78c8047b 980
742c079a
RG
981void setPoolPolicy(pools_t& pools, const string& poolName, std::shared_ptr<ServerPolicy> policy)
982{
983 std::shared_ptr<ServerPool> pool = createPoolIfNotExists(pools, poolName);
984 if (!poolName.empty()) {
985 vinfolog("Setting pool %s server selection policy to %s", poolName, policy->name);
986 } else {
987 vinfolog("Setting default pool server selection policy to %s", policy->name);
988 }
989 pool->policy = policy;
990}
991
886e2cf2 992void addServerToPool(pools_t& pools, const string& poolName, std::shared_ptr<DownstreamState> server)
22b2b326 993{
886e2cf2 994 std::shared_ptr<ServerPool> pool = createPoolIfNotExists(pools, poolName);
c218b223 995 if (!poolName.empty()) {
8f4f5186 996 vinfolog("Adding server to pool %s", poolName);
c218b223 997 } else {
8f4f5186 998 vinfolog("Adding server to default pool");
c218b223 999 }
a1b1a29d 1000 pool->addServer(server);
886e2cf2
RG
1001}
1002
1003void removeServerFromPool(pools_t& pools, const string& poolName, std::shared_ptr<DownstreamState> server)
1004{
8f4f5186 1005 std::shared_ptr<ServerPool> pool = getPool(pools, poolName);
886e2cf2 1006
c218b223 1007 if (!poolName.empty()) {
8f4f5186 1008 vinfolog("Removing server from pool %s", poolName);
c218b223 1009 }
1010 else {
8f4f5186 1011 vinfolog("Removing server from default pool");
c218b223 1012 }
886e2cf2 1013
a1b1a29d 1014 pool->removeServer(server);
886e2cf2
RG
1015}
1016
1017std::shared_ptr<ServerPool> getPool(const pools_t& pools, const std::string& poolName)
1018{
1019 pools_t::const_iterator it = pools.find(poolName);
1020
1021 if (it == pools.end()) {
1022 throw std::out_of_range("No pool named " + poolName);
1023 }
1024
1025 return it->second;
1026}
1027
a1b1a29d 1028NumberedServerVector getDownstreamCandidates(const pools_t& pools, const std::string& poolName)
886e2cf2
RG
1029{
1030 std::shared_ptr<ServerPool> pool = getPool(pools, poolName);
a1b1a29d 1031 return pool->getServers();
22b2b326 1032}
c9262563 1033
614239d0 1034static void spoofResponseFromString(DNSQuestion& dq, const string& spoofContent)
7791f83a
RG
1035{
1036 string result;
614239d0
RG
1037
1038 std::vector<std::string> addrs;
1039 stringtok(addrs, spoofContent, " ,");
1040
1041 if (addrs.size() == 1) {
1042 try {
1043 ComboAddress spoofAddr(spoofContent);
1044 SpoofAction sa({spoofAddr});
1045 sa(&dq, &result);
1046 }
1047 catch(const PDNSException &e) {
1048 SpoofAction sa(spoofContent); // CNAME then
1049 sa(&dq, &result);
1050 }
1051 } else {
1052 std::vector<ComboAddress> cas;
1053 for (const auto& addr : addrs) {
1054 try {
1055 cas.push_back(ComboAddress(addr));
1056 }
1057 catch (...) {
1058 }
1059 }
1060 SpoofAction sa(cas);
6ca7a40a 1061 sa(&dq, &result);
7791f83a
RG
1062 }
1063}
1064
2a28db86
RG
1065bool processRulesResult(const DNSAction::Action& action, DNSQuestion& dq, std::string& ruleresult, bool& drop)
1066{
1067 switch(action) {
1068 case DNSAction::Action::Allow:
1069 return true;
1070 break;
1071 case DNSAction::Action::Drop:
1072 ++g_stats.ruleDrop;
1073 drop = true;
1074 return true;
1075 break;
1076 case DNSAction::Action::Nxdomain:
1077 dq.dh->rcode = RCode::NXDomain;
1078 dq.dh->qr=true;
1079 ++g_stats.ruleNXDomain;
1080 return true;
1081 break;
1082 case DNSAction::Action::Refused:
1083 dq.dh->rcode = RCode::Refused;
1084 dq.dh->qr=true;
1085 ++g_stats.ruleRefused;
1086 return true;
1087 break;
1088 case DNSAction::Action::ServFail:
1089 dq.dh->rcode = RCode::ServFail;
1090 dq.dh->qr=true;
1091 ++g_stats.ruleServFail;
1092 return true;
1093 break;
1094 case DNSAction::Action::Spoof:
1095 spoofResponseFromString(dq, ruleresult);
1096 return true;
1097 break;
1098 case DNSAction::Action::Truncate:
1099 dq.dh->tc = true;
1100 dq.dh->qr = true;
955b9377
RG
1101 dq.dh->ra = dq.dh->rd;
1102 dq.dh->aa = false;
1103 dq.dh->ad = false;
2a28db86
RG
1104 return true;
1105 break;
1106 case DNSAction::Action::HeaderModify:
1107 return true;
1108 break;
1109 case DNSAction::Action::Pool:
1110 dq.poolname=ruleresult;
1111 return true;
1112 break;
1113 case DNSAction::Action::NoRecurse:
1114 dq.dh->rd = false;
1115 return true;
1116 break;
1117 /* non-terminal actions follow */
1118 case DNSAction::Action::Delay:
1119 dq.delayMsec = static_cast<int>(pdns_stou(ruleresult)); // sorry
1120 break;
1121 case DNSAction::Action::None:
1122 /* fall-through */
1123 case DNSAction::Action::NoOp:
1124 break;
1125 }
1126
1127 /* false means that we don't stop the processing */
1128 return false;
1129}
1130
1131
1132static bool applyRulesToQuery(LocalHolders& holders, DNSQuestion& dq, const struct timespec& now)
e91084ce 1133{
4ab01344 1134 g_rings.insertQuery(now, *dq.remote, *dq.qname, dq.qtype, dq.len, *dq.dh);
e91084ce 1135
786e4d8c 1136 if(g_qcount.enabled) {
348ef1c6 1137 string qname = (*dq.qname).toLogString();
786e4d8c
RS
1138 bool countQuery{true};
1139 if(g_qcount.filter) {
1140 std::lock_guard<std::mutex> lock(g_luamutex);
dd1a3034 1141 std::tie (countQuery, qname) = g_qcount.filter(&dq);
786e4d8c
RS
1142 }
1143
1144 if(countQuery) {
1145 WriteLock wl(&g_qcount.queryLock);
1146 if(!g_qcount.records.count(qname)) {
1147 g_qcount.records[qname] = 0;
1148 }
1149 g_qcount.records[qname]++;
1150 }
1151 }
1152
0beaa5c8 1153 if(auto got = holders.dynNMGBlock->lookup(*dq.remote)) {
701f690b 1154 auto updateBlockStats = [&got]() {
cb167afd 1155 ++g_stats.dynBlocked;
701f690b 1156 got->second.blocks++;
1157 };
1158
e91084ce 1159 if(now < got->second.until) {
7b925432
RG
1160 DNSAction::Action action = got->second.action;
1161 if (action == DNSAction::Action::None) {
1162 action = g_dynBlockAction;
1163 }
477c86a0
RG
1164 switch (action) {
1165 case DNSAction::Action::NoOp:
1166 /* do nothing */
1167 break;
79ee8ff9
RG
1168
1169 case DNSAction::Action::Nxdomain:
1170 vinfolog("Query from %s turned into NXDomain because of dynamic block", dq.remote->toStringWithPort());
1171 updateBlockStats();
1172
1173 dq.dh->rcode = RCode::NXDomain;
1174 dq.dh->qr=true;
1175 return true;
1176
477c86a0 1177 case DNSAction::Action::Refused:
dd46e5e3 1178 vinfolog("Query from %s refused because of dynamic block", dq.remote->toStringWithPort());
701f690b 1179 updateBlockStats();
8477236d 1180
dd46e5e3 1181 dq.dh->rcode = RCode::Refused;
79ee8ff9 1182 dq.dh->qr = true;
dd46e5e3 1183 return true;
477c86a0
RG
1184
1185 case DNSAction::Action::Truncate:
8477236d 1186 if(!dq.tcp) {
701f690b 1187 updateBlockStats();
8477236d 1188 vinfolog("Query from %s truncated because of dynamic block", dq.remote->toStringWithPort());
1189 dq.dh->tc = true;
1190 dq.dh->qr = true;
955b9377
RG
1191 dq.dh->ra = dq.dh->rd;
1192 dq.dh->aa = false;
1193 dq.dh->ad = false;
8477236d 1194 return true;
1195 }
1196 else {
348ef1c6 1197 vinfolog("Query from %s for %s over TCP *not* truncated because of dynamic block", dq.remote->toStringWithPort(), dq.qname->toLogString());
8477236d 1198 }
477c86a0 1199 break;
3d60b39a 1200 case DNSAction::Action::NoRecurse:
1201 updateBlockStats();
1202 vinfolog("Query from %s setting rd=0 because of dynamic block", dq.remote->toStringWithPort());
1203 dq.dh->rd = false;
1204 return true;
477c86a0 1205 default:
701f690b 1206 updateBlockStats();
dd46e5e3
RG
1207 vinfolog("Query from %s dropped because of dynamic block", dq.remote->toStringWithPort());
1208 return false;
1209 }
e91084ce
RG
1210 }
1211 }
1212
0beaa5c8 1213 if(auto got = holders.dynSMTBlock->lookup(*dq.qname)) {
701f690b 1214 auto updateBlockStats = [&got]() {
cb167afd 1215 ++g_stats.dynBlocked;
701f690b 1216 got->blocks++;
1217 };
1218
71c94675 1219 if(now < got->until) {
7b925432
RG
1220 DNSAction::Action action = got->action;
1221 if (action == DNSAction::Action::None) {
1222 action = g_dynBlockAction;
1223 }
477c86a0
RG
1224 switch (action) {
1225 case DNSAction::Action::NoOp:
1226 /* do nothing */
1227 break;
79ee8ff9 1228 case DNSAction::Action::Nxdomain:
348ef1c6 1229 vinfolog("Query from %s for %s turned into NXDomain because of dynamic block", dq.remote->toStringWithPort(), dq.qname->toLogString());
79ee8ff9
RG
1230 updateBlockStats();
1231
1232 dq.dh->rcode = RCode::NXDomain;
1233 dq.dh->qr=true;
1234 return true;
477c86a0 1235 case DNSAction::Action::Refused:
348ef1c6 1236 vinfolog("Query from %s for %s refused because of dynamic block", dq.remote->toStringWithPort(), dq.qname->toLogString());
701f690b 1237 updateBlockStats();
8477236d 1238
dd46e5e3
RG
1239 dq.dh->rcode = RCode::Refused;
1240 dq.dh->qr=true;
1241 return true;
477c86a0 1242 case DNSAction::Action::Truncate:
8477236d 1243 if(!dq.tcp) {
701f690b 1244 updateBlockStats();
8477236d 1245
348ef1c6 1246 vinfolog("Query from %s for %s truncated because of dynamic block", dq.remote->toStringWithPort(), dq.qname->toLogString());
8477236d 1247 dq.dh->tc = true;
1248 dq.dh->qr = true;
955b9377
RG
1249 dq.dh->ra = dq.dh->rd;
1250 dq.dh->aa = false;
1251 dq.dh->ad = false;
8477236d 1252 return true;
1253 }
1254 else {
348ef1c6 1255 vinfolog("Query from %s for %s over TCP *not* truncated because of dynamic block", dq.remote->toStringWithPort(), dq.qname->toLogString());
8477236d 1256 }
477c86a0 1257 break;
3d60b39a 1258 case DNSAction::Action::NoRecurse:
1259 updateBlockStats();
1260 vinfolog("Query from %s setting rd=0 because of dynamic block", dq.remote->toStringWithPort());
1261 dq.dh->rd = false;
1262 return true;
477c86a0 1263 default:
701f690b 1264 updateBlockStats();
348ef1c6 1265 vinfolog("Query from %s for %s dropped because of dynamic block", dq.remote->toStringWithPort(), dq.qname->toLogString());
dd46e5e3
RG
1266 return false;
1267 }
71c94675 1268 }
1269 }
1270
e91084ce
RG
1271 DNSAction::Action action=DNSAction::Action::None;
1272 string ruleresult;
2a28db86 1273 bool drop = false;
0beaa5c8 1274 for(const auto& lr : *holders.rulactions) {
4d5959e6
RG
1275 if(lr.d_rule->matches(&dq)) {
1276 lr.d_rule->d_matches++;
1277 action=(*lr.d_action)(&dq, &ruleresult);
2a28db86 1278 if (processRulesResult(action, dq, ruleresult, drop)) {
3d60b39a 1279 break;
e91084ce
RG
1280 }
1281 }
1282 }
1283
2a28db86
RG
1284 if (drop) {
1285 return false;
1286 }
1287
e91084ce
RG
1288 return true;
1289}
1290
fbf14b03 1291ssize_t udpClientSendRequestToBackend(const std::shared_ptr<DownstreamState>& ss, const int sd, const char* request, const size_t requestLen, bool healthCheck)
fbe2a2e0 1292{
b58f08e5
RG
1293 ssize_t result;
1294
fbe2a2e0 1295 if (ss->sourceItf == 0) {
b58f08e5
RG
1296 result = send(sd, request, requestLen, 0);
1297 }
1298 else {
1299 struct msghdr msgh;
1300 struct iovec iov;
7bec330a 1301 cmsgbuf_aligned cbuf;
a2353842 1302 ComboAddress remote(ss->remote);
7bec330a
OM
1303 fillMSGHdr(&msgh, &iov, &cbuf, sizeof(cbuf), const_cast<char*>(request), requestLen, &remote);
1304 addCMsgSrcAddr(&msgh, &cbuf, &ss->sourceAddr, ss->sourceItf);
b58f08e5 1305 result = sendmsg(sd, &msgh, 0);
fbe2a2e0
RG
1306 }
1307
b58f08e5
RG
1308 if (result == -1) {
1309 int savederrno = errno;
1310 vinfolog("Error sending request to backend %s: %d", ss->remote.toStringWithPort(), savederrno);
1311
1312 /* This might sound silly, but on Linux send() might fail with EINVAL
1b126225
RG
1313 if the interface the socket was bound to doesn't exist anymore.
1314 We don't want to reconnect the real socket if the healthcheck failed,
1315 because it's not using the same socket.
1316 */
1317 if (!healthCheck && (savederrno == EINVAL || savederrno == ENODEV)) {
b58f08e5
RG
1318 ss->reconnect();
1319 }
fbe2a2e0
RG
1320 }
1321
b58f08e5 1322 return result;
fbe2a2e0
RG
1323}
1324
0beaa5c8 1325static bool isUDPQueryAcceptable(ClientState& cs, LocalHolders& holders, const struct msghdr* msgh, const ComboAddress& remote, ComboAddress& dest)
24d5cb00 1326{
0beaa5c8
RG
1327 if (msgh->msg_flags & MSG_TRUNC) {
1328 /* message was too large for our buffer */
1329 vinfolog("Dropping message too large for our buffer");
cb167afd 1330 ++g_stats.nonCompliantQueries;
0beaa5c8
RG
1331 return false;
1332 }
a4652d55 1333
0beaa5c8
RG
1334 if(!holders.acl->match(remote)) {
1335 vinfolog("Query from %s dropped because of ACL", remote.toStringWithPort());
cb167afd 1336 ++g_stats.aclDrops;
0beaa5c8 1337 return false;
2b3eefc3 1338 }
2b3eefc3 1339
0beaa5c8 1340 cs.queries++;
cb167afd 1341 ++g_stats.queries;
2b3eefc3 1342
0beaa5c8
RG
1343 if (HarvestDestinationAddress(msgh, &dest)) {
1344 /* we don't get the port, only the address */
1345 dest.sin4.sin_port = cs.local.sin4.sin_port;
1346 }
1347 else {
1348 dest.sin4.sin_family = 0;
2b3eefc3 1349 }
549d63c9 1350
0beaa5c8
RG
1351 return true;
1352}
1353
4ab01344 1354boost::optional<std::vector<uint8_t>> checkDNSCryptQuery(const ClientState& cs, const char* query, uint16_t& len, std::shared_ptr<DNSCryptQuery>& dnsCryptQuery, time_t now, bool tcp)
0beaa5c8
RG
1355{
1356 if (cs.dnscryptCtx) {
7129b5c4 1357#ifdef HAVE_DNSCRYPT
0beaa5c8
RG
1358 vector<uint8_t> response;
1359 uint16_t decryptedQueryLen = 0;
2b3eefc3 1360
43234e76 1361 dnsCryptQuery = std::make_shared<DNSCryptQuery>(cs.dnscryptCtx);
0beaa5c8 1362
4ab01344 1363 bool decrypted = handleDNSCryptQuery(const_cast<char*>(query), len, dnsCryptQuery, &decryptedQueryLen, tcp, now, response);
0beaa5c8
RG
1364
1365 if (!decrypted) {
1366 if (response.size() > 0) {
4ab01344 1367 return response;
2b3eefc3 1368 }
4ab01344 1369 throw std::runtime_error("Unable to decrypt DNSCrypt query, dropping.");
0beaa5c8 1370 }
2b3eefc3 1371
0beaa5c8 1372 len = decryptedQueryLen;
7129b5c4 1373#endif /* HAVE_DNSCRYPT */
0beaa5c8 1374 }
4ab01344 1375 return boost::none;
0beaa5c8 1376}
2b3eefc3 1377
0beaa5c8
RG
1378bool checkQueryHeaders(const struct dnsheader* dh)
1379{
1380 if (dh->qr) { // don't respond to responses
cb167afd 1381 ++g_stats.nonCompliantQueries;
0beaa5c8
RG
1382 return false;
1383 }
2b3eefc3 1384
0beaa5c8 1385 if (dh->qdcount == 0) {
cb167afd 1386 ++g_stats.emptyQueries;
0beaa5c8
RG
1387 return false;
1388 }
0ba5eecf 1389
0beaa5c8 1390 if (dh->rd) {
cb167afd 1391 ++g_stats.rdQueries;
0beaa5c8 1392 }
e91084ce 1393
0beaa5c8
RG
1394 return true;
1395}
963bef8d 1396
#if defined(HAVE_RECVMMSG) && defined(HAVE_SENDMMSG) && defined(MSG_WAITFORONE)
/* Prepares 'outMsg' so that 'response' is sent to 'remote' as part of a batch of
   messages. If 'dest' is usable (family different from 0), it is attached as the
   source address of the response via a control message stored in 'cbuf'. */
static void queueResponse(const ClientState& cs, const char* response, uint16_t responseLen, const ComboAddress& dest, const ComboAddress& remote, struct mmsghdr& outMsg, struct iovec* iov, cmsgbuf_aligned* cbuf)
{
  outMsg.msg_len = 0;
  fillMSGHdr(&outMsg.msg_hdr, iov, nullptr, 0, const_cast<char*>(response), responseLen, const_cast<ComboAddress*>(&remote));

  const bool destKnown = (dest.sin4.sin_family != 0);
  if (destKnown) {
    addCMsgSrcAddr(&outMsg.msg_hdr, cbuf, &dest, 0);
  }
  else {
    outMsg.msg_hdr.msg_control = nullptr;
  }
}
#endif /* defined(HAVE_RECVMMSG) && defined(HAVE_SENDMMSG) && defined(MSG_WAITFORONE) */
43eeadc1 1411
3e425868
RG
1412/* self-generated responses or cache hits */
1413static bool prepareOutgoingResponse(LocalHolders& holders, ClientState& cs, DNSQuestion& dq, bool cacheHit)
54aaa82b 1414{
3e425868 1415 DNSResponse dr(dq.qname, dq.qtype, dq.qclass, dq.consumed, dq.local, dq.remote, reinterpret_cast<dnsheader*>(dq.dh), dq.size, dq.len, dq.tcp, dq.queryTime);
4ab01344 1416
54aaa82b 1417#ifdef HAVE_PROTOBUF
1418 dr.uniqueId = dq.uniqueId;
1419#endif
1420 dr.qTag = dq.qTag;
3e425868 1421 dr.delayMsec = dq.delayMsec;
54aaa82b 1422
3e425868
RG
1423 if (!applyRulesToResponse(cacheHit ? holders.cacheHitRespRulactions : holders.selfAnsweredRespRulactions, dr)) {
1424 return false;
54aaa82b 1425 }
1426
3e425868
RG
1427 /* in case a rule changed it */
1428 dq.delayMsec = dr.delayMsec;
1429
54aaa82b 1430#ifdef HAVE_DNSCRYPT
7129b5c4 1431 if (!cs.muted) {
3e425868
RG
1432 if (!encryptResponse(reinterpret_cast<char*>(dq.dh), &dq.len, dq.size, dq.tcp, dq.dnsCryptQuery, nullptr, nullptr)) {
1433 return false;
54aaa82b 1434 }
54aaa82b 1435 }
7129b5c4 1436#endif /* HAVE_DNSCRYPT */
54aaa82b 1437
389d903a
RG
1438 if (cacheHit) {
1439 ++g_stats.cacheHits;
1440 }
3e425868 1441
61d10a4d
MH
1442 switch (dr.dh->rcode) {
1443 case RCode::NXDomain:
1444 ++g_stats.frontendNXDomain;
1445 break;
1446 case RCode::ServFail:
1447 ++g_stats.frontendServFail;
1448 break;
1449 case RCode::NoError:
1450 ++g_stats.frontendNoError;
1451 break;
1452 }
3e425868 1453
54aaa82b 1454 doLatencyStats(0); // we're not going to measure this
3e425868 1455 return true;
54aaa82b 1456}
1457
3e425868 1458ProcessQueryResult processQuery(DNSQuestion& dq, ClientState& cs, LocalHolders& holders, std::shared_ptr<DownstreamState>& selectedBackend)
0beaa5c8 1459{
4ab01344 1460 const uint16_t queryId = ntohs(dq.dh->id);
2b3eefc3 1461
0beaa5c8 1462 try {
43234e76
RG
1463 /* we need an accurate ("real") value for the response and
1464 to store into the IDS, but not for insertion into the
1465 rings for example */
43234e76
RG
1466 struct timespec now;
1467 gettime(&now);
2efd427d 1468
2a28db86 1469 if (!applyRulesToQuery(holders, dq, now)) {
3e425868 1470 return ProcessQueryResult::Drop;
0beaa5c8 1471 }
b63add03 1472
0beaa5c8 1473 if(dq.dh->qr) { // something turned it into a response
4ab01344 1474 fixUpQueryTurnedResponse(dq, dq.origFlags);
0beaa5c8 1475
3e425868
RG
1476 if (!prepareOutgoingResponse(holders, cs, dq, false)) {
1477 return ProcessQueryResult::Drop;
22b2b326 1478 }
5f504638 1479
3e425868 1480 ++g_stats.selfAnswered;
7fc95193 1481 ++cs.responses;
3e425868 1482 return ProcessQueryResult::SendAnswer;
0beaa5c8
RG
1483 }
1484
2a28db86 1485 std::shared_ptr<ServerPool> serverPool = getPool(*holders.pools, dq.poolname);
4ab01344 1486 dq.packetCache = serverPool->packetCache;
a1b1a29d 1487 auto policy = *(holders.policy);
0beaa5c8 1488 if (serverPool->policy != nullptr) {
a1b1a29d 1489 policy = *(serverPool->policy);
0beaa5c8 1490 }
a1b1a29d
RG
1491 auto servers = serverPool->getServers();
1492 if (policy.isLua) {
0beaa5c8 1493 std::lock_guard<std::mutex> lock(g_luamutex);
3e425868 1494 selectedBackend = policy.policy(servers, &dq);
a1b1a29d
RG
1495 }
1496 else {
3e425868 1497 selectedBackend = policy.policy(servers, &dq);
0beaa5c8 1498 }
228e4fe8 1499
9837850d 1500 uint16_t cachedResponseSize = dq.size;
3e425868 1501 uint32_t allowExpired = selectedBackend ? 0 : g_staleCacheEntriesTTL;
9837850d 1502
4ab01344
RG
1503 if (dq.packetCache && !dq.skipCache) {
1504 dq.dnssecOK = (getEDNSZ(dq) & EDNS_HEADER_FLAG_DO);
1ef18cab 1505 }
1506
3e425868 1507 if (dq.useECS && ((selectedBackend && selectedBackend->useECS) || (!selectedBackend && serverPool->getECS()))) {
389d903a 1508 // we special case our cache in case a downstream explicitly gave us a universally valid response with a 0 scope
2d0cefab
RG
1509 // we need ECS parsing (parseECS) to be true so we can be sure that the initial incoming query did not have an existing
1510 // ECS option, which would make it unsuitable for the zero-scope feature.
3e425868 1511 if (dq.packetCache && !dq.skipCache && (!selectedBackend || !selectedBackend->disableZeroScope) && dq.packetCache->isECSParsingEnabled()) {
4ab01344 1512 if (dq.packetCache->get(dq, dq.consumed, dq.dh->id, reinterpret_cast<char*>(dq.dh), &cachedResponseSize, &dq.cacheKeyNoECS, dq.subnet, dq.dnssecOK, allowExpired)) {
3e425868
RG
1513 dq.len = cachedResponseSize;
1514
1515 if (!prepareOutgoingResponse(holders, cs, dq, true)) {
1516 return ProcessQueryResult::Drop;
1517 }
1518
1519 return ProcessQueryResult::SendAnswer;
389d903a
RG
1520 }
1521
4ab01344 1522 if (!dq.subnet) {
389d903a 1523 /* there was no existing ECS on the query, enable the zero-scope feature */
4ab01344 1524 dq.useZeroScope = true;
389d903a 1525 }
9837850d 1526 }
389d903a 1527
4ab01344
RG
1528 if (!handleEDNSClientSubnet(dq, &(dq.ednsAdded), &(dq.ecsAdded), g_preserveTrailingData)) {
1529 vinfolog("Dropping query from %s because we couldn't insert the ECS value", dq.remote->toStringWithPort());
3e425868 1530 return ProcessQueryResult::Drop;
886e2cf2 1531 }
0beaa5c8 1532 }
886e2cf2 1533
4ab01344
RG
1534 if (dq.packetCache && !dq.skipCache) {
1535 if (dq.packetCache->get(dq, dq.consumed, dq.dh->id, reinterpret_cast<char*>(dq.dh), &cachedResponseSize, &dq.cacheKey, dq.subnet, dq.dnssecOK, allowExpired)) {
3e425868
RG
1536 dq.len = cachedResponseSize;
1537
1538 if (!prepareOutgoingResponse(holders, cs, dq, true)) {
1539 return ProcessQueryResult::Drop;
1540 }
1541
1542 return ProcessQueryResult::SendAnswer;
886e2cf2 1543 }
cb167afd 1544 ++g_stats.cacheMisses;
0beaa5c8 1545 }
886e2cf2 1546
3e425868 1547 if(!selectedBackend) {
cb167afd 1548 ++g_stats.noPolicy;
26a3cdb7 1549
348ef1c6 1550 vinfolog("%s query for %s|%s from %s, no policy applied", g_servFailOnNoPolicy ? "ServFailed" : "Dropped", dq.qname->toLogString(), QType(dq.qtype).getName(), dq.remote->toStringWithPort());
3e425868 1551 if (g_servFailOnNoPolicy) {
4ab01344 1552 restoreFlags(dq.dh, dq.origFlags);
26a3cdb7 1553
0beaa5c8
RG
1554 dq.dh->rcode = RCode::ServFail;
1555 dq.dh->qr = true;
26a3cdb7 1556
3e425868
RG
1557 if (!prepareOutgoingResponse(holders, cs, dq, false)) {
1558 return ProcessQueryResult::Drop;
1559 }
be6c318f 1560 // no response-only statistics counter to update.
3e425868 1561 return ProcessQueryResult::SendAnswer;
1ea747c0 1562 }
3e425868
RG
1563
1564 return ProcessQueryResult::Drop;
0beaa5c8 1565 }
1ea747c0 1566
3e425868
RG
1567 if (dq.addXPF && selectedBackend->xpfRRCode != 0) {
1568 addXPF(dq, selectedBackend->xpfRRCode, g_preserveTrailingData);
5cc8371b
RG
1569 }
1570
3e425868
RG
1571 selectedBackend->queries++;
1572 return ProcessQueryResult::PassToBackend;
4ab01344
RG
1573 }
1574 catch(const std::exception& e){
1575 vinfolog("Got an error while parsing a %s query from %s, id %d: %s", (dq.tcp ? "TCP" : "UDP"), dq.remote->toStringWithPort(), queryId, e.what());
4ab01344 1576 }
3e425868 1577 return ProcessQueryResult::Drop;
4ab01344
RG
1578}
1579
7bec330a 1580static void processUDPQuery(ClientState& cs, LocalHolders& holders, const struct msghdr* msgh, const ComboAddress& remote, ComboAddress& dest, char* query, uint16_t len, size_t queryBufferSize, struct mmsghdr* responsesVect, unsigned int* queuedResponses, struct iovec* respIOV, cmsgbuf_aligned* respCBuf)
4ab01344
RG
1581{
1582 assert(responsesVect == nullptr || (queuedResponses != nullptr && respIOV != nullptr && respCBuf != nullptr));
1583 uint16_t queryId = 0;
1584
1585 try {
1586 if (!isUDPQueryAcceptable(cs, holders, msgh, remote, dest)) {
1587 return;
1588 }
1589
1590 /* we need an accurate ("real") value for the response and
1591 to store into the IDS, but not for insertion into the
1592 rings for example */
1593 struct timespec queryRealTime;
4ab01344
RG
1594 gettime(&queryRealTime, true);
1595
1596 std::shared_ptr<DNSCryptQuery> dnsCryptQuery = nullptr;
4ab01344
RG
1597 auto dnsCryptResponse = checkDNSCryptQuery(cs, query, len, dnsCryptQuery, queryRealTime.tv_sec, false);
1598 if (dnsCryptResponse) {
1599 sendUDPResponse(cs.udpFD, reinterpret_cast<char*>(dnsCryptResponse->data()), static_cast<uint16_t>(dnsCryptResponse->size()), 0, dest, remote);
1600 return;
1601 }
4ab01344
RG
1602
1603 struct dnsheader* dh = reinterpret_cast<struct dnsheader*>(query);
1604 queryId = ntohs(dh->id);
1605
1606 if (!checkQueryHeaders(dh)) {
1607 return;
1608 }
1609
1610 uint16_t qtype, qclass;
1611 unsigned int consumed = 0;
1612 DNSName qname(query, len, sizeof(dnsheader), false, &qtype, &qclass, &consumed);
1613 DNSQuestion dq(&qname, qtype, qclass, consumed, dest.sin4.sin_family != 0 ? &dest : &cs.local, &remote, dh, queryBufferSize, len, false, &queryRealTime);
1614 dq.dnsCryptQuery = std::move(dnsCryptQuery);
3e425868
RG
1615 std::shared_ptr<DownstreamState> ss{nullptr};
1616 auto result = processQuery(dq, cs, holders, ss);
4ab01344 1617
3e425868
RG
1618 if (result == ProcessQueryResult::Drop) {
1619 return;
1620 }
1621
1622 if (result == ProcessQueryResult::SendAnswer) {
4ab01344 1623#if defined(HAVE_RECVMMSG) && defined(HAVE_SENDMMSG) && defined(MSG_WAITFORONE)
3e425868
RG
1624 if (dq.delayMsec == 0 && responsesVect != nullptr) {
1625 queueResponse(cs, reinterpret_cast<char*>(dq.dh), dq.len, *dq.local, *dq.remote, responsesVect[*queuedResponses], respIOV, respCBuf);
4ab01344
RG
1626 (*queuedResponses)++;
1627 return;
1628 }
1629#endif /* defined(HAVE_RECVMMSG) && defined(HAVE_SENDMMSG) && defined(MSG_WAITFORONE) */
6d192f51
RG
1630 /* we use dest, always, because we don't want to use the listening address to send a response since it could be 0.0.0.0 */
1631 sendUDPResponse(cs.udpFD, reinterpret_cast<char*>(dq.dh), dq.len, dq.delayMsec, dest, *dq.remote);
3e425868
RG
1632 return;
1633 }
4ab01344 1634
3e425868 1635 if (result != ProcessQueryResult::PassToBackend || ss == nullptr) {
4ab01344
RG
1636 return;
1637 }
11e1e08b 1638
0beaa5c8
RG
1639 unsigned int idOffset = (ss->idOffset++) % ss->idStates.size();
1640 IDState* ids = &ss->idStates[idOffset];
1641 ids->age = 0;
0956c5c5 1642 DOHUnit* du = nullptr;
c9ba8478 1643
9bd1a882
RG
1644 /* that means that the state was in use, possibly with an allocated
1645 DOHUnit that we will need to handle, but we can't touch it before
1646 confirming that we now own this state */
311f19d5 1647 if (ids->isInUse()) {
0956c5c5
RG
1648 du = ids->du;
1649 }
c9ba8478 1650
a9489723 1651 /* we atomically replace the value, we now own this state */
311f19d5
RG
1652 if (!ids->markAsUsed()) {
1653 /* the state was not in use.
9bd1a882 1654 we reset 'du' because it might have still been in use when we read it. */
0956c5c5 1655 du = nullptr;
fbf14b03 1656 ++ss->outstanding;
71b86bd8 1657 }
0beaa5c8 1658 else {
9bd1a882
RG
1659 /* we are reusing a state, no change in outstanding but if there was an existing DOHUnit we need
1660 to handle it because it's about to be overwritten. */
a9489723 1661 ids->du = nullptr;
fbf14b03 1662 ++ss->reuseds;
cb167afd 1663 ++g_stats.downstreamTimeouts;
0956c5c5 1664 handleDOHTimeout(du);
0beaa5c8 1665 }
0e41337b 1666
0beaa5c8 1667 ids->cs = &cs;
a9489723 1668 ids->origFD = cs.udpFD;
0beaa5c8 1669 ids->origID = dh->id;
d0ae6360 1670 setIDStateFromDNSQuestion(*ids, dq, std::move(qname));
0beaa5c8
RG
1671
1672 /* If we couldn't harvest the real dest addr, still
1673 write down the listening addr since it will be useful
1674 (especially if it's not an 'any' one).
1675 We need to keep track of which one it is since we may
1676 want to use the real but not the listening addr to reply.
1677 */
1678 if (dest.sin4.sin_family != 0) {
1679 ids->origDest = dest;
1680 ids->destHarvested = true;
1681 }
1682 else {
1683 ids->origDest = cs.local;
1684 ids->destHarvested = false;
1685 }
7129b5c4 1686
0beaa5c8 1687 dh->id = idOffset;
ca404e94 1688
38069e7e 1689 int fd = pickBackendSocketForSending(ss);
150105a2 1690 ssize_t ret = udpClientSendRequestToBackend(ss, fd, query, dq.len);
11e1e08b 1691
0beaa5c8 1692 if(ret < 0) {
fbf14b03 1693 ++ss->sendErrors;
cb167afd 1694 ++g_stats.downstreamSendErrors;
0beaa5c8 1695 }
ca404e94 1696
348ef1c6 1697 vinfolog("Got query for %s|%s from %s, relayed to %s", ids->qname.toLogString(), QType(ids->qtype).getName(), remote.toStringWithPort(), ss->getName());
0beaa5c8
RG
1698 }
1699 catch(const std::exception& e){
1700 vinfolog("Got an error in UDP question thread while parsing a query from %s, id %d: %s", remote.toStringWithPort(), queryId, e.what());
1701 }
1702}
1703
#if defined(HAVE_RECVMMSG) && defined(HAVE_SENDMMSG) && defined(MSG_WAITFORONE)
/* UDP frontend loop handling several messages per syscall: queries are read
   in batches with recvmmsg(), handed one at a time to processUDPQuery(), and
   whatever that call queued in the outgoing vector is flushed with a single
   sendmmsg(). The loop has no exit condition.

   cs:      the frontend whose UDP socket (cs->udpFD) is served
   holders: per-thread local holders, passed through to processUDPQuery() */
static void MultipleMessagesUDPClientThread(ClientState* cs, LocalHolders& holders)
{
  /* everything needed to receive (and possibly answer) one message */
  struct MMReceiver
  {
    char packet[s_maxPacketCacheEntrySize];
    ComboAddress remote;
    ComboAddress dest;
    struct iovec iov;
    /* used by HarvestDestinationAddress */
    cmsgbuf_aligned cbuf;
  };
  const size_t vectSize = g_udpVectorSize;
  /* the actual buffer is larger because:
     - we may have to add EDNS and/or ECS
     - we use it for self-generated responses (from rule or cache)
     but we only accept incoming payloads up to that size
  */
  static_assert(s_udpIncomingBufferSize <= sizeof(MMReceiver::packet), "the incoming buffer size should not be larger than sizeof(MMReceiver::packet)");

  std::unique_ptr<MMReceiver[]> receivers(new MMReceiver[vectSize]);
  std::unique_ptr<struct mmsghdr[]> incoming(new struct mmsghdr[vectSize]);
  std::unique_ptr<struct mmsghdr[]> outgoing(new struct mmsghdr[vectSize]);

  /* initialize the structures needed to receive our messages */
  for (size_t slot = 0; slot < vectSize; ++slot) {
    auto& receiver = receivers[slot];
    receiver.remote.sin4.sin_family = cs->local.sin4.sin_family;
    fillMSGHdr(&incoming[slot].msg_hdr, &receiver.iov, &receiver.cbuf, sizeof(receiver.cbuf), receiver.packet, s_udpIncomingBufferSize, &receiver.remote);
  }

  /* go now */
  while (true) {

    /* reset the IO vector, since it's also used to send the vector of responses
       to avoid having to copy the data around */
    for (size_t slot = 0; slot < vectSize; ++slot) {
      auto& receiver = receivers[slot];
      receiver.iov.iov_base = receiver.packet;
      receiver.iov.iov_len = sizeof(receiver.packet);
    }

    /* block until we have at least one message ready, but return
       as many as possible to save the syscall costs */
    const int msgsGot = recvmmsg(cs->udpFD, incoming.get(), vectSize, MSG_WAITFORONE | MSG_TRUNC, nullptr);

    if (msgsGot <= 0) {
      vinfolog("Getting UDP messages via recvmmsg() failed with: %s", stringerror());
      continue;
    }

    unsigned int msgsToSend = 0;

    /* process the received messages */
    for (int msgIdx = 0; msgIdx < msgsGot; ++msgIdx) {
      auto& receiver = receivers[msgIdx];
      const struct msghdr* msgh = &incoming[msgIdx].msg_hdr;
      const unsigned int got = incoming[msgIdx].msg_len;
      const ComboAddress& remote = receiver.remote;

      /* too short to even hold a DNS header */
      if (static_cast<size_t>(got) < sizeof(struct dnsheader)) {
        ++g_stats.nonCompliantQueries;
        continue;
      }

      processUDPQuery(*cs, holders, msgh, remote, receiver.dest, receiver.packet, static_cast<uint16_t>(got), sizeof(receiver.packet), outgoing.get(), &msgsToSend, &receiver.iov, &receiver.cbuf);
    }

    /* immediate (not delayed or sent to a backend) responses (mostly from a rule, dynamic block
       or the cache) can be sent in batch too */
    if (msgsToSend == 0 || msgsToSend > static_cast<unsigned int>(msgsGot)) {
      continue;
    }

    const int sent = sendmmsg(cs->udpFD, outgoing.get(), msgsToSend, 0);

    if (sent < 0 || static_cast<unsigned int>(sent) != msgsToSend) {
      vinfolog("Error sending responses with sendmmsg() (%d on %u): %s", sent, msgsToSend, stringerror());
    }
  }
}
#endif /* defined(HAVE_RECVMMSG) && defined(HAVE_SENDMMSG) && defined(MSG_WAITFORONE) */
1784
1785// listens to incoming queries, sends out to downstream servers, noting the intended return path
9b73b71c 1786static void udpClientThread(ClientState* cs)
0beaa5c8
RG
1787try
1788{
519f5484 1789 setThreadName("dnsdist/udpClie");
0beaa5c8
RG
1790 LocalHolders holders;
1791
1792#if defined(HAVE_RECVMMSG) && defined(HAVE_SENDMMSG) && defined(MSG_WAITFORONE)
1793 if (g_udpVectorSize > 1) {
1794 MultipleMessagesUDPClientThread(cs, holders);
1795
1796 }
1797 else
1798#endif /* defined(HAVE_RECVMMSG) && defined(HAVE_SENDMMSG) && defined(MSG_WAITFORONE) */
1799 {
8179b6d6 1800 char packet[s_maxPacketCacheEntrySize];
0beaa5c8
RG
1801 /* the actual buffer is larger because:
1802 - we may have to add EDNS and/or ECS
1803 - we use it for self-generated responses (from rule or cache)
1804 but we only accept incoming payloads up to that size
1805 */
1806 static_assert(s_udpIncomingBufferSize <= sizeof(packet), "the incoming buffer size should not be larger than sizeof(MMReceiver::packet)");
1807 struct msghdr msgh;
1808 struct iovec iov;
1809 /* used by HarvestDestinationAddress */
7bec330a 1810 cmsgbuf_aligned cbuf;
0beaa5c8
RG
1811
1812 ComboAddress remote;
1813 ComboAddress dest;
1814 remote.sin4.sin_family = cs->local.sin4.sin_family;
be7dec02 1815 fillMSGHdr(&msgh, &iov, &cbuf, sizeof(cbuf), packet, s_udpIncomingBufferSize, &remote);
0beaa5c8
RG
1816
1817 for(;;) {
1818 ssize_t got = recvmsg(cs->udpFD, &msgh, 0);
1819
1820 if (got < 0 || static_cast<size_t>(got) < sizeof(struct dnsheader)) {
cb167afd 1821 ++g_stats.nonCompliantQueries;
0beaa5c8
RG
1822 continue;
1823 }
1824
be7dec02 1825 processUDPQuery(*cs, holders, &msgh, remote, dest, packet, static_cast<uint16_t>(got), sizeof(packet), nullptr, nullptr, nullptr, nullptr);
0beaa5c8
RG
1826 }
1827 }
24d5cb00 1828}
2b3eefc3 1829catch(const std::exception &e)
a4652d55 1830{
1831 errlog("UDP client thread died because of exception: %s", e.what());
a4652d55 1832}
2b3eefc3 1833catch(const PDNSException &e)
a4652d55 1834{
1835 errlog("UDP client thread died because of PowerDNS exception: %s", e.reason);
a4652d55 1836}
1837catch(...)
1838{
1839 errlog("UDP client thread died because of an exception: %s", "unknown");
a4652d55 1840}
24d5cb00 1841
555970c9
RG
/* Returns a value in [0, 65535] usable as a DNS message ID, drawn from
   libsodium's randombytes_uniform() when available and from random()
   otherwise. */
uint16_t getRandomDNSID()
{
#ifdef HAVE_LIBSODIUM
  const auto id = randombytes_uniform(65536);
#else
  const auto id = random() % 65536;
#endif
  return id;
}
1850
/* Maximum number of TCP client threads: passed to the TCPClientCollection
   constructor in main() and used by checkFileDescriptorsLimits() to estimate
   the number of file descriptors needed. */
uint64_t g_maxTCPClientThreads{10};
/* Number of maintThread() iterations (one per 1-second sleep) between two
   packet cache cleaning passes. */
std::atomic<uint16_t> g_cacheCleaningDelay{60};
/* Percentage used by maintThread() to compute the entry count passed to
   purgeExpired(): maxEntries * (100 - percentage) / 100. */
std::atomic<uint16_t> g_cacheCleaningPercentage{100};
e41f8165 1854
9b73b71c 1855void maintThread()
886e2cf2 1856{
519f5484 1857 setThreadName("dnsdist/main");
886e2cf2
RG
1858 int interval = 1;
1859 size_t counter = 0;
5f3ea719 1860 int32_t secondsToWaitLog = 0;
886e2cf2
RG
1861
1862 for(;;) {
1863 sleep(interval);
1864
069e59db 1865 {
1866 std::lock_guard<std::mutex> lock(g_luamutex);
5f3ea719
PL
1867 auto f = g_lua.readVariable<boost::optional<std::function<void()> > >("maintenance");
1868 if(f) {
1869 try {
1870 (*f)();
1871 secondsToWaitLog = 0;
1872 }
1873 catch(std::exception &e) {
1874 if (secondsToWaitLog <= 0) {
1875 infolog("Error during execution of maintenance function: %s", e.what());
1876 secondsToWaitLog = 61;
1877 }
1878 secondsToWaitLog -= interval;
1879 }
1880 }
069e59db 1881 }
886e2cf2
RG
1882
1883 counter++;
1884 if (counter >= g_cacheCleaningDelay) {
c1b81381
RG
1885 /* keep track, for each cache, of whether we should keep
1886 expired entries */
1887 std::map<std::shared_ptr<DNSDistPacketCache>, bool> caches;
1888
1889 /* gather all caches actually used by at least one pool, and see
1890 if something prevents us from cleaning the expired entries */
a9c2e4ab 1891 auto localPools = g_pools.getLocal();
a9c2e4ab 1892 for (const auto& entry : *localPools) {
c1b81381
RG
1893 auto& pool = entry.second;
1894
1895 auto packetCache = pool->packetCache;
1896 if (!packetCache) {
1897 continue;
1898 }
1899
1900 auto pair = caches.insert({packetCache, false});
1901 auto& iter = pair.first;
1902 /* if we need to keep stale data for this cache (ie, not clear
1903 expired entries when at least one pool using this cache
1904 has all its backends down) */
1905 if (packetCache->keepStaleData() && iter->second == false) {
1906 /* so far all pools had at least one backend up */
1907 if (pool->countServers(true) == 0) {
1908 iter->second = true;
1909 }
886e2cf2
RG
1910 }
1911 }
c1b81381
RG
1912
1913 for (auto pair : caches) {
1914 /* shall we keep expired entries ? */
1915 if (pair.second == true) {
1916 continue;
886e2cf2 1917 }
c1b81381
RG
1918 auto& packetCache = pair.first;
1919 size_t upTo = (packetCache->getMaxEntries()* (100 - g_cacheCleaningPercentage)) / 100;
1920 packetCache->purgeExpired(upTo);
886e2cf2
RG
1921 }
1922 counter = 0;
1923 }
1924
1925 // ponder pruning g_dynblocks of expired entries here
1926 }
886e2cf2
RG
1927}
1928
9b73b71c 1929static void secPollThread()
5d4e1ef8
RG
1930{
1931 setThreadName("dnsdist/secpoll");
1932
1933 for (;;) {
1934 try {
1935 doSecPoll(g_secPollSuffix);
1936 }
1937 catch(...) {
1938 }
1939 sleep(g_secPollInterval);
1940 }
1941}
1942
9b73b71c 1943static void healthChecksThread()
3ae86514 1944{
519f5484 1945 setThreadName("dnsdist/healthC");
77c9bc9a 1946
dd9c8246 1947 static const int interval = 1;
64e4ebb4 1948
3ae86514 1949 for(;;) {
1950 sleep(interval);
773470ca 1951
dd9c8246 1952 if(g_tcpclientthreads->getQueuedCount() > 1 && !g_tcpclientthreads->hasReachedMaxThreads()) {
a9bf3ec4 1953 g_tcpclientthreads->addTCPClientThread();
dd9c8246 1954 }
3ae86514 1955
dd9c8246 1956 auto mplexer = std::shared_ptr<FDMultiplexer>(FDMultiplexer::getMultiplexerSilent());
a9c2e4ab
RG
1957 auto states = g_dstates.getLocal(); // this points to the actual shared_ptrs!
1958 for(auto& dss : *states) {
dd9c8246 1959 if(++dss->lastCheck < dss->checkInterval) {
7c9bf18d 1960 continue;
dd9c8246 1961 }
5d7e6765 1962
dd9c8246 1963 dss->lastCheck = 0;
7565f4e6 1964
dd9c8246
RG
1965 if (dss->availability == DownstreamState::Availability::Auto) {
1966 if (!queueHealthCheck(mplexer, dss)) {
1967 updateHealthCheckResult(dss, false);
2993d58d 1968 }
773470ca 1969 }
b076b34a 1970
773470ca 1971 auto delta = dss->sw.udiffAndSet()/1000000.0;
1972 dss->queryLoad = 1.0*(dss->queries.load() - dss->prev.queries.load())/delta;
1973 dss->dropRate = 1.0*(dss->reuseds.load() - dss->prev.reuseds.load())/delta;
3c115e0f 1974 dss->prev.queries.store(dss->queries.load());
773470ca 1975 dss->prev.reuseds.store(dss->reuseds.load());
3c115e0f 1976
dd9c8246 1977 for (IDState& ids : dss->idStates) { // timeouts
a9489723 1978 int64_t usageIndicator = ids.usageIndicator;
311f19d5
RG
1979 if(IDState::isInUse(usageIndicator) && ids.age++ > g_udpTimeout) {
1980 /* We mark the state as unused as soon as possible
51642fe3
RG
1981 to limit the risk of racing with the
1982 responder thread.
51642fe3 1983 */
0956c5c5
RG
1984 auto oldDU = ids.du;
1985
311f19d5 1986 if (!ids.tryMarkUnused(usageIndicator)) {
71b86bd8
RG
1987 /* this state has been altered in the meantime,
1988 don't go anywhere near it */
1989 continue;
1990 }
fbf14b03 1991 ids.du = nullptr;
9bd1a882 1992 handleDOHTimeout(oldDU);
64e4ebb4 1993 ids.age = 0;
3c115e0f 1994 dss->reuseds++;
1995 --dss->outstanding;
cb167afd 1996 ++g_stats.downstreamTimeouts; // this is an 'actively' discovered timeout
c08a5092 1997 vinfolog("Had a downstream timeout from %s (%s) for query for %s|%s from %s",
1998 dss->remote.toStringWithPort(), dss->name,
348ef1c6 1999 ids.qname.toLogString(), QType(ids.qtype).getName(), ids.origRemote.toStringWithPort());
51642fe3 2000
c2fbeb27 2001 struct timespec ts;
85c7ca75 2002 gettime(&ts);
c2fbeb27 2003
2d11d1b2 2004 struct dnsheader fake;
2005 memset(&fake, 0, sizeof(fake));
2006 fake.id = ids.origID;
c2fbeb27 2007
6d31c8b6 2008 g_rings.insertResponse(ts, ids.origRemote, ids.qname, ids.qtype, std::numeric_limits<unsigned int>::max(), 0, fake, dss->remote);
232f0877 2009 }
3ae86514 2010 }
2011 }
dd9c8246
RG
2012
2013 handleQueuedHealthChecks(mplexer);
3ae86514 2014 }
3ae86514 2015}
2016
c2a42f97 2017static void bindAny(int af, int sock)
2018{
18f8e493 2019 __attribute__((unused)) int one = 1;
c2a42f97 2020
2021#ifdef IP_FREEBIND
2022 if (setsockopt(sock, IPPROTO_IP, IP_FREEBIND, &one, sizeof(one)) < 0)
a702a96c 2023 warnlog("Warning: IP_FREEBIND setsockopt failed: %s", stringerror());
c2a42f97 2024#endif
2025
2026#ifdef IP_BINDANY
2027 if (af == AF_INET)
2028 if (setsockopt(sock, IPPROTO_IP, IP_BINDANY, &one, sizeof(one)) < 0)
a702a96c 2029 warnlog("Warning: IP_BINDANY setsockopt failed: %s", stringerror());
c2a42f97 2030#endif
2031#ifdef IPV6_BINDANY
2032 if (af == AF_INET6)
2033 if (setsockopt(sock, IPPROTO_IPV6, IPV6_BINDANY, &one, sizeof(one)) < 0)
a702a96c 2034 warnlog("Warning: IPV6_BINDANY setsockopt failed: %s", stringerror());
c2a42f97 2035#endif
2036#ifdef SO_BINDANY
2037 if (setsockopt(sock, SOL_SOCKET, SO_BINDANY, &one, sizeof(one)) < 0)
a702a96c 2038 warnlog("Warning: SO_BINDANY setsockopt failed: %s", stringerror());
c2a42f97 2039#endif
2040}
2041
a36ce055
RG
2042static void dropGroupPrivs(gid_t gid)
2043{
2044 if (gid) {
2045 if (setgid(gid) == 0) {
2046 if (setgroups(0, NULL) < 0) {
a702a96c 2047 warnlog("Warning: Unable to drop supplementary gids: %s", stringerror());
a36ce055
RG
2048 }
2049 }
2050 else {
a702a96c 2051 warnlog("Warning: Unable to set group ID to %d: %s", gid, stringerror());
a36ce055
RG
2052 }
2053 }
2054}
2055
2056static void dropUserPrivs(uid_t uid)
2057{
2058 if(uid) {
2059 if(setuid(uid) < 0) {
a702a96c 2060 warnlog("Warning: Unable to set user ID to %d: %s", uid, stringerror());
a36ce055
RG
2061 }
2062 }
2063}
2064
41408d3a
RG
2065static void checkFileDescriptorsLimits(size_t udpBindsCount, size_t tcpBindsCount)
2066{
2067 /* stdin, stdout, stderr */
2068 size_t requiredFDsCount = 3;
a9c2e4ab 2069 auto backends = g_dstates.getLocal();
cd73ceeb
RG
2070 /* UDP sockets to backends */
2071 size_t backendUDPSocketsCount = 0;
a9c2e4ab 2072 for (const auto& backend : *backends) {
5bdbb83d 2073 backendUDPSocketsCount += backend->sockets.size();
cd73ceeb
RG
2074 }
2075 requiredFDsCount += backendUDPSocketsCount;
2076 /* TCP sockets to backends */
a9c2e4ab 2077 requiredFDsCount += (backends->size() * g_maxTCPClientThreads);
9fcd6adb 2078 /* listening sockets */
41408d3a
RG
2079 requiredFDsCount += udpBindsCount;
2080 requiredFDsCount += tcpBindsCount;
2081 /* max TCP connections currently served */
2082 requiredFDsCount += g_maxTCPClientThreads;
f2e29d04 2083 /* max pipes for communicating between TCP acceptors and client threads */
41408d3a 2084 requiredFDsCount += (g_maxTCPClientThreads * 2);
41408d3a 2085 /* max TCP queued connections */
9fcd6adb 2086 requiredFDsCount += g_maxTCPQueuedConnections;
41408d3a
RG
2087 /* DelayPipe pipe */
2088 requiredFDsCount += 2;
2089 /* syslog socket */
2090 requiredFDsCount++;
2091 /* webserver main socket */
2092 requiredFDsCount++;
2093 /* console main socket */
2094 requiredFDsCount++;
2095 /* carbon export */
2096 requiredFDsCount++;
2097 /* history file */
2098 requiredFDsCount++;
2099 struct rlimit rl;
2100 getrlimit(RLIMIT_NOFILE, &rl);
9fcd6adb 2101 if (rl.rlim_cur <= requiredFDsCount) {
41408d3a
RG
2102 warnlog("Warning, this configuration can use more than %d file descriptors, web server and console connections not included, and the current limit is %d.", std::to_string(requiredFDsCount), std::to_string(rl.rlim_cur));
2103#ifdef HAVE_SYSTEMD
2104 warnlog("You can increase this value by using LimitNOFILE= in the systemd unit file or ulimit.");
2105#else
2106 warnlog("You can increase this value by using ulimit.");
2107#endif
2108 }
2109}
c2a42f97 2110
6e9fd124
RG
2111static void setUpLocalBind(std::unique_ptr<ClientState>& cs)
2112{
2113 /* skip some warnings if there is an identical UDP context */
fbf14b03 2114 bool warn = cs->tcp == false || cs->tlsFrontend != nullptr || cs->dohFrontend != nullptr;
6e9fd124
RG
2115 int& fd = cs->tcp == false ? cs->udpFD : cs->tcpFD;
2116 (void) warn;
2117
2118 fd = SSocket(cs->local.sin4.sin_family, cs->tcp == false ? SOCK_DGRAM : SOCK_STREAM, 0);
2119
2120 if (cs->tcp) {
2121 SSetsockopt(fd, SOL_SOCKET, SO_REUSEADDR, 1);
2122#ifdef TCP_DEFER_ACCEPT
2123 SSetsockopt(fd, IPPROTO_TCP, TCP_DEFER_ACCEPT, 1);
2124#endif
2125 if (cs->fastOpenQueueSize > 0) {
2126#ifdef TCP_FASTOPEN
2127 SSetsockopt(fd, IPPROTO_TCP, TCP_FASTOPEN, cs->fastOpenQueueSize);
2128#else
2129 if (warn) {
2130 warnlog("TCP Fast Open has been configured on local address '%s' but is not supported", cs->local.toStringWithPort());
2131 }
2132#endif
2133 }
2134 }
2135
2136 if(cs->local.sin4.sin_family == AF_INET6) {
2137 SSetsockopt(fd, IPPROTO_IPV6, IPV6_V6ONLY, 1);
2138 }
2139
2140 bindAny(cs->local.sin4.sin_family, fd);
2141
2142 if(!cs->tcp && IsAnyAddress(cs->local)) {
2143 int one=1;
2144 setsockopt(fd, IPPROTO_IP, GEN_IP_PKTINFO, &one, sizeof(one)); // linux supports this, so why not - might fail on other systems
2145#ifdef IPV6_RECVPKTINFO
2146 setsockopt(fd, IPPROTO_IPV6, IPV6_RECVPKTINFO, &one, sizeof(one));
2147#endif
2148 }
2149
2150 if (cs->reuseport) {
2151#ifdef SO_REUSEPORT
2152 SSetsockopt(fd, SOL_SOCKET, SO_REUSEPORT, 1);
2153#else
2154 if (warn) {
2155 /* no need to warn again if configured but support is not available, we already did for UDP */
2156 warnlog("SO_REUSEPORT has been configured on local address '%s' but is not supported", cs->local.toStringWithPort());
2157 }
2158#endif
2159 }
2160
90f9fbc0
RG
2161 if (!cs->tcp) {
2162 if (cs->local.isIPv4()) {
2163 try {
2164 setSocketIgnorePMTU(cs->udpFD);
2165 }
2166 catch(const std::exception& e) {
2167 warnlog("Failed to set IP_MTU_DISCOVER on UDP server socket for local address '%s': %s", cs->local.toStringWithPort(), e.what());
2168 }
2169 }
2170 }
2171
6e9fd124
RG
2172 const std::string& itf = cs->interface;
2173 if (!itf.empty()) {
2174#ifdef SO_BINDTODEVICE
2175 int res = setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, itf.c_str(), itf.length());
2176 if (res != 0) {
a702a96c 2177 warnlog("Error setting up the interface on local address '%s': %s", cs->local.toStringWithPort(), stringerror());
6e9fd124
RG
2178 }
2179#else
2180 if (warn) {
2181 warnlog("An interface has been configured on local address '%s' but SO_BINDTODEVICE is not supported", cs->local.toStringWithPort());
2182 }
2183#endif
2184 }
2185
2186#ifdef HAVE_EBPF
2187 if (g_defaultBPFFilter) {
2188 cs->attachFilter(g_defaultBPFFilter);
2189 vinfolog("Attaching default BPF Filter to %s frontend %s", (!cs->tcp ? "UDP" : "TCP"), cs->local.toStringWithPort());
2190 }
2191#endif /* HAVE_EBPF */
2192
2193 if (cs->tlsFrontend != nullptr) {
2194 if (!cs->tlsFrontend->setupTLS()) {
2195 errlog("Error while setting up TLS on local address '%s', exiting", cs->local.toStringWithPort());
2196 _exit(EXIT_FAILURE);
2197 }
2198 }
2199
fbf14b03
RG
2200 if (cs->dohFrontend != nullptr) {
2201 cs->dohFrontend->setup();
2202 }
2203
6e9fd124
RG
2204 SBind(fd, cs->local);
2205
2206 if (cs->tcp) {
fbf14b03 2207 SListen(cs->tcpFD, SOMAXCONN);
6e9fd124
RG
2208 if (cs->tlsFrontend != nullptr) {
2209 warnlog("Listening on %s for TLS", cs->local.toStringWithPort());
2210 }
fbf14b03
RG
2211 else if (cs->dohFrontend != nullptr) {
2212 warnlog("Listening on %s for DoH", cs->local.toStringWithPort());
2213 }
6e9fd124
RG
2214 else if (cs->dnscryptCtx != nullptr) {
2215 warnlog("Listening on %s for DNSCrypt", cs->local.toStringWithPort());
2216 }
2217 else {
2218 warnlog("Listening on %s", cs->local.toStringWithPort());
2219 }
2220 }
2221
2222 cs->ready = true;
2223}
2224
/* Values gathered from the command line by main(). */
struct
{
  /* addresses to listen on (-l,--local), may be given several times */
  std::vector<std::string> locals;
  /* positional arguments, collected when not running as a client */
  std::vector<std::string> remotes;
  /* --check-config: validate the configuration and exit */
  bool checkConfig = false;
  /* -c,--client: run as a console client */
  bool beClient = false;
  /* --supervised */
  bool beSupervised = false;
  /* command passed with -e,--execute */
  std::string command;
  /* configuration file (-C,--config) */
  std::string config;
  /* user to switch to (-u,--uid), as given on the command line */
  std::string uid;
  /* group to switch to (-g,--gid), as given on the command line */
  std::string gid;
} g_cmdLine;
520eb5a0 2237
/* Set to true by main() once the frontends list has been built, right before
   the listening sockets are set up. */
std::atomic<bool> g_configurationDone{false};
520eb5a0 2239
7d3ee2bb
PL
2240static void usage()
2241{
2242 cout<<endl;
b82a127e
RG
2243 cout<<"Syntax: dnsdist [-C,--config file] [-c,--client [IP[:PORT]]]\n";
2244 cout<<"[-e,--execute cmd] [-h,--help] [-l,--local addr]\n";
4406b79b 2245 cout<<"[-v,--verbose] [--check-config] [--version]\n";
7d3ee2bb
PL
2246 cout<<"\n";
2247 cout<<"-a,--acl netmask Add this netmask to the ACL\n";
2248 cout<<"-C,--config file Load configuration from 'file'\n";
2249 cout<<"-c,--client Operate as a client, connect to dnsdist. This reads\n";
2250 cout<<" controlSocket from your configuration file, but also\n";
2251 cout<<" accepts an IP:PORT argument\n";
2252#ifdef HAVE_LIBSODIUM
2253 cout<<"-k,--setkey KEY Use KEY for encrypted communication to dnsdist. This\n";
2254 cout<<" is similar to setting setKey in the configuration file.\n";
17a0ddad
CH
2255 cout<<" NOTE: this will leak this key in your shell's history\n";
2256 cout<<" and in the systems running process list.\n";
7d3ee2bb
PL
2257#endif
2258 cout<<"--check-config Validate the configuration file and exit. The exit-code\n";
2259 cout<<" reflects the validation, 0 is OK, 1 means an error.\n";
2260 cout<<" Any errors are printed as well.\n";
7d3ee2bb
PL
2261 cout<<"-e,--execute cmd Connect to dnsdist and execute 'cmd'\n";
2262 cout<<"-g,--gid gid Change the process group ID after binding sockets\n";
2263 cout<<"-h,--help Display this helpful message\n";
2264 cout<<"-l,--local address Listen on this local address\n";
2265 cout<<"--supervised Don't open a console, I'm supervised\n";
2266 cout<<" (use with e.g. systemd and daemontools)\n";
2267 cout<<"--disable-syslog Don't log to syslog, only to stdout\n";
2268 cout<<" (use with e.g. systemd)\n";
7d3ee2bb
PL
2269 cout<<"-u,--uid uid Change the process user ID after binding sockets\n";
2270 cout<<"-v,--verbose Enable verbose mode\n";
4406b79b 2271 cout<<"-V,--version Show dnsdist version information and exit\n";
7d3ee2bb
PL
2272}
2273
24d5cb00 2274int main(int argc, char** argv)
2275try
2276{
41408d3a
RG
2277 size_t udpBindsCount = 0;
2278 size_t tcpBindsCount = 0;
94721140 2279 rl_attempted_completion_function = my_completion;
2280 rl_completion_append_character = 0;
2281
726ddf60 2282 signal(SIGPIPE, SIG_IGN);
6d01c80c 2283 signal(SIGCHLD, SIG_IGN);
0ca6a67f 2284 openlog("dnsdist", LOG_PID|LOG_NDELAY, LOG_DAEMON);
6d01c80c 2285
0b62ec78 2286#ifdef HAVE_LIBSODIUM
6d01c80c 2287 if (sodium_init() == -1) {
2288 cerr<<"Unable to initialize crypto library"<<endl;
2289 exit(EXIT_FAILURE);
2290 }
7691e7df 2291 g_hashperturb=randombytes_uniform(0xffffffff);
2292 srandom(randombytes_uniform(0xffffffff));
2293#else
2294 {
2295 struct timeval tv;
2296 gettimeofday(&tv, 0);
2297 srandom(tv.tv_sec ^ tv.tv_usec ^ getpid());
2298 g_hashperturb=random();
2299 }
2300
0b62ec78 2301#endif
094b6aff 2302 ComboAddress clientAddress = ComboAddress();
11058bd6 2303 g_cmdLine.config=SYSCONFDIR "/dnsdist.conf";
359bdba5 2304 struct option longopts[]={
8f2d5ec3 2305 {"acl", required_argument, 0, 'a'},
359bdba5
CH
2306 {"check-config", no_argument, 0, 1},
2307 {"client", no_argument, 0, 'c'},
7cc68f53 2308 {"config", required_argument, 0, 'C'},
359bdba5 2309 {"disable-syslog", no_argument, 0, 2},
7cc68f53 2310 {"execute", required_argument, 0, 'e'},
359bdba5
CH
2311 {"gid", required_argument, 0, 'g'},
2312 {"help", no_argument, 0, 'h'},
2313 {"local", required_argument, 0, 'l'},
359bdba5 2314 {"setkey", required_argument, 0, 'k'},
359bdba5
CH
2315 {"supervised", no_argument, 0, 3},
2316 {"uid", required_argument, 0, 'u'},
2317 {"verbose", no_argument, 0, 'v'},
2318 {"version", no_argument, 0, 'V'},
2319 {0,0,0,0}
7cc68f53 2320 };
2321 int longindex=0;
8f2d5ec3 2322 string optstring;
7cc68f53 2323 for(;;) {
359bdba5 2324 int c=getopt_long(argc, argv, "a:cC:e:g:hk:l:u:vV", longopts, &longindex);
7cc68f53 2325 if(c==-1)
2326 break;
2327 switch(c) {
5efcfa63
PL
2328 case 1:
2329 g_cmdLine.checkConfig=true;
2330 break;
bbfaaa6f
PL
2331 case 2:
2332 g_syslog=false;
2333 break;
b7165327
CH
2334 case 3:
2335 g_cmdLine.beSupervised=true;
2336 break;
7cc68f53 2337 case 'C':
2338 g_cmdLine.config=optarg;
2339 break;
2340 case 'c':
2341 g_cmdLine.beClient=true;
2342 break;
7cc68f53 2343 case 'e':
2344 g_cmdLine.command=optarg;
2345 break;
a36ce055
RG
2346 case 'g':
2347 g_cmdLine.gid=optarg;
2348 break;
7cc68f53 2349 case 'h':
6306c282 2350 cout<<"dnsdist "<<VERSION<<endl;
7d3ee2bb 2351 usage();
7cc68f53 2352 cout<<"\n";
2353 exit(EXIT_SUCCESS);
2354 break;
8f2d5ec3 2355 case 'a':
2356 optstring=optarg;
2357 g_ACL.modify([optstring](NetmaskGroup& nmg) { nmg.addMask(optstring); });
2358 break;
ddb14ec9 2359 case 'k':
b4b5edbd 2360#ifdef HAVE_LIBSODIUM
b5521206 2361 if (B64Decode(string(optarg), g_consoleKey) < 0) {
ddb14ec9
PL
2362 cerr<<"Unable to decode key '"<<optarg<<"'."<<endl;
2363 exit(EXIT_FAILURE);
2364 }
b4b5edbd
CH
2365#else
2366 cerr<<"dnsdist has been built without libsodium, -k/--setkey is unsupported."<<endl;
2367 exit(EXIT_FAILURE);
ddb14ec9 2368#endif
b4b5edbd 2369 break;
7cc68f53 2370 case 'l':
6a363878 2371 g_cmdLine.locals.push_back(trim_copy(string(optarg)));
7cc68f53 2372 break;
a36ce055
RG
2373 case 'u':
2374 g_cmdLine.uid=optarg;
2375 break;
7cc68f53 2376 case 'v':
2377 g_verbose=true;
2378 break;
6306c282 2379 case 'V':
d4d796e5
PD
2380#ifdef LUAJIT_VERSION
2381 cout<<"dnsdist "<<VERSION<<" ("<<LUA_RELEASE<<" ["<<LUAJIT_VERSION<<"])"<<endl;
2382#else
2383 cout<<"dnsdist "<<VERSION<<" ("<<LUA_RELEASE<<")"<<endl;
2384#endif
70829d97 2385 cout<<"Enabled features: ";
90fe8ae6
RG
2386#ifdef HAVE_CDB
2387 cout<<"cdb ";
2388#endif
a227f47d
RG
2389#ifdef HAVE_DNS_OVER_TLS
2390 cout<<"dns-over-tls(";
2391#ifdef HAVE_GNUTLS
3909bf10
CH
2392 cout<<"gnutls";
2393 #ifdef HAVE_LIBSSL
2394 cout<<" ";
2395 #endif
a227f47d
RG
2396#endif
2397#ifdef HAVE_LIBSSL
2398 cout<<"openssl";
2399#endif
2400 cout<<") ";
2401#endif
fbf14b03
RG
2402#ifdef HAVE_DNS_OVER_HTTPS
2403 cout<<"dns-over-https(DOH) ";
2404#endif
70829d97
PL
2405#ifdef HAVE_DNSCRYPT
2406 cout<<"dnscrypt ";
2407#endif
0beaa5c8
RG
2408#ifdef HAVE_EBPF
2409 cout<<"ebpf ";
2410#endif
82a91ddf
CH
2411#ifdef HAVE_FSTRM
2412 cout<<"fstrm ";
2413#endif
f4b1f1fd
RG
2414#ifdef HAVE_LIBCRYPTO
2415 cout<<"ipcipher ";
2416#endif
3909bf10
CH
2417#ifdef HAVE_LIBSODIUM
2418 cout<<"libsodium ";
2419#endif
f441962a
RG
2420#ifdef HAVE_LMDB
2421 cout<<"lmdb ";
2422#endif
70829d97
PL
2423#ifdef HAVE_PROTOBUF
2424 cout<<"protobuf ";
2425#endif
2426#ifdef HAVE_RE2
2427 cout<<"re2 ";
2428#endif
0beaa5c8
RG
2429#if defined(HAVE_RECVMMSG) && defined(HAVE_SENDMMSG) && defined(MSG_WAITFORONE)
2430 cout<<"recvmmsg/sendmmsg ";
2431#endif
2432#ifdef HAVE_NET_SNMP
2433 cout<<"snmp ";
2434#endif
70829d97
PL
2435#ifdef HAVE_SYSTEMD
2436 cout<<"systemd";
2437#endif
2438 cout<<endl;
6306c282
PL
2439 exit(EXIT_SUCCESS);
2440 break;
7d3ee2bb
PL
2441 case '?':
2442 //getopt_long printed an error message.
2443 usage();
2444 exit(EXIT_FAILURE);
2445 break;
7cc68f53 2446 }
24d5cb00 2447 }
6ab65223 2448
7cc68f53 2449 argc-=optind;
2450 argv+=optind;
2451 for(auto p = argv; *p; ++p) {
094b6aff
PL
2452 if(g_cmdLine.beClient) {
2453 clientAddress = ComboAddress(*p, 5199);
2454 } else {
2455 g_cmdLine.remotes.push_back(*p);
2456 }
7cc68f53 2457 }
2458
a1b1a29d 2459 ServerPolicy leastOutstandingPol{"leastOutstanding", leastOutstanding, false};
cd29dcb1 2460
e5a14b2b 2461 g_policy.setState(leastOutstandingPol);
7cc68f53 2462 if(g_cmdLine.beClient || !g_cmdLine.command.empty()) {
203b5348 2463 setupLua(true, false, g_cmdLine.config);
094b6aff
PL
2464 if (clientAddress != ComboAddress())
2465 g_serverControl = clientAddress;
7cc68f53 2466 doClient(g_serverControl, g_cmdLine.command);
e16fd59c 2467 _exit(EXIT_SUCCESS);
6d01c80c 2468 }
2e72cc0e 2469
8f133915 2470 auto acl = g_ACL.getCopy();
8f2d5ec3 2471 if(acl.empty()) {
2472 for(auto& addr : {"127.0.0.0/8", "10.0.0.0/8", "100.64.0.0/10", "169.254.0.0/16", "192.168.0.0/16", "172.16.0.0/12", "::1/128", "fc00::/7", "fe80::/10"})
2473 acl.addMask(addr);
2474 g_ACL.setState(acl);
2475 }
8f133915 2476
b5521206 2477 auto consoleACL = g_consoleACL.getCopy();
5ceea33e
RG
2478 for (const auto& mask : { "127.0.0.1/8", "::1/128" }) {
2479 consoleACL.addMask(mask);
2480 }
b5521206
RG
2481 g_consoleACL.setState(consoleACL);
2482
5efcfa63 2483 if (g_cmdLine.checkConfig) {
203b5348 2484 setupLua(false, true, g_cmdLine.config);
5efcfa63
PL
2485 // No exception was thrown
2486 infolog("Configuration '%s' OK!", g_cmdLine.config);
d8c19b98 2487 _exit(EXIT_SUCCESS);
5efcfa63
PL
2488 }
2489
203b5348 2490 auto todo=setupLua(false, false, g_cmdLine.config);
2e72cc0e 2491
636cc544
CHB
2492 auto localPools = g_pools.getCopy();
2493 {
2494 bool precompute = false;
2495 if (g_policy.getLocal()->name == "chashed") {
2496 precompute = true;
2497 } else {
2498 for (const auto& entry: localPools) {
2499 if (entry.second->policy != nullptr && entry.second->policy->name == "chashed") {
2500 precompute = true;
2501 break ;
2502 }
2503 }
2504 }
2505 if (precompute) {
2506 vinfolog("Pre-computing hashes for consistent hash load-balancing policy");
2507 // pre compute hashes
2508 auto backends = g_dstates.getLocal();
2509 for (auto& backend: *backends) {
2510 backend->hash();
2511 }
d58e616a
CHB
2512 }
2513 }
2514
6e9fd124
RG
2515 if (!g_cmdLine.locals.empty()) {
2516 for (auto it = g_frontends.begin(); it != g_frontends.end(); ) {
fbf14b03
RG
2517 /* DoH, DoT and DNSCrypt frontends are separate */
2518 if ((*it)->dohFrontend == nullptr && (*it)->tlsFrontend == nullptr && (*it)->dnscryptCtx == nullptr) {
6e9fd124 2519 it = g_frontends.erase(it);
9f67b883 2520 }
6e9fd124
RG
2521 else {
2522 ++it;
9f67b883 2523 }
9f67b883
RG
2524 }
2525
6e9fd124
RG
2526 for(const auto& loc : g_cmdLine.locals) {
2527 /* UDP */
2528 g_frontends.push_back(std::unique_ptr<ClientState>(new ClientState(ComboAddress(loc, 53), false, false, 0, "", {})));
2529 /* TCP */
2530 g_frontends.push_back(std::unique_ptr<ClientState>(new ClientState(ComboAddress(loc, 53), true, false, 0, "", {})));
87b515ed 2531 }
a36ce055
RG
2532 }
2533
6e9fd124
RG
2534 if (g_frontends.empty()) {
2535 /* UDP */
2536 g_frontends.push_back(std::unique_ptr<ClientState>(new ClientState(ComboAddress("127.0.0.1", 53), false, false, 0, "", {})));
2537 /* TCP */
2538 g_frontends.push_back(std::unique_ptr<ClientState>(new ClientState(ComboAddress("127.0.0.1", 53), true, false, 0, "", {})));
11e1e08b 2539 }
11e1e08b 2540
6e9fd124 2541 g_configurationDone = true;
a227f47d 2542
6e9fd124
RG
2543 for(auto& frontend : g_frontends) {
2544 setUpLocalBind(frontend);
a227f47d 2545
6e9fd124
RG
2546 if (frontend->tcp == false) {
2547 ++udpBindsCount;
a227f47d
RG
2548 }
2549 else {
6e9fd124 2550 ++tcpBindsCount;
a227f47d
RG
2551 }
2552 }
2553
b82a127e 2554 warnlog("dnsdist %s comes with ABSOLUTELY NO WARRANTY. This is free software, and you are welcome to redistribute it according to the terms of the GPL version 2", VERSION);
9c9b4998 2555
b82a127e
RG
2556 vector<string> vec;
2557 std::string acls;
2558 g_ACL.getLocal()->toStringVector(&vec);
2559 for(const auto& s : vec) {
2560 if (!acls.empty())
2561 acls += ", ";
2562 acls += s;
b076b34a 2563 }
b82a127e 2564 infolog("ACL allowing queries from: %s", acls.c_str());
b5521206
RG
2565 vec.clear();
2566 acls.clear();
2567 g_consoleACL.getLocal()->toStringVector(&vec);
2568 for (const auto& entry : vec) {
2569 if (!acls.empty()) {
2570 acls += ", ";
2571 }
2572 acls += entry;
2573 }
2574 infolog("Console ACL allowing connections from: %s", acls.c_str());
6d01c80c 2575
9c9b4998
RG
2576#ifdef HAVE_LIBSODIUM
2577 if (g_consoleEnabled && g_consoleKey.empty()) {
2578 warnlog("Warning, the console has been enabled via 'controlSocket()' but no key has been set with 'setKey()' so all connections will fail until a key has been set");
2579 }
2580#endif
2581
aac59883
RG
2582 uid_t newgid=0;
2583 gid_t newuid=0;
2584
2585 if(!g_cmdLine.gid.empty())
2586 newgid = strToGID(g_cmdLine.gid.c_str());
2587
2588 if(!g_cmdLine.uid.empty())
2589 newuid = strToUID(g_cmdLine.uid.c_str());
2590
2591 dropGroupPrivs(newgid);
2592 dropUserPrivs(newuid);
fdc3ea42
RG
2593 try {
2594 /* we might still have capabilities remaining,
2595 for example if we have been started as root
2596 without --uid or --gid (please don't do that)
2597 or as an unprivileged user with ambient
2598 capabilities like CAP_NET_BIND_SERVICE.
2599 */
83fe2c55 2600 dropCapabilities(g_capabilitiesToRetain);
fdc3ea42
RG
2601 }
2602 catch(const std::exception& e) {
2603 warnlog("%s", e.what());
2604 }
aac59883 2605
a36ce055
RG
2606 /* this needs to be done _after_ dropping privileges */
2607 g_delay = new DelayPipe<DelayedPacket>();
2608
9f4eb5cc
RG
2609 if (g_snmpAgent) {
2610 g_snmpAgent->run();
2611 }
2612
1f7646c2 2613 g_tcpclientthreads = std::unique_ptr<TCPClientCollection>(new TCPClientCollection(g_maxTCPClientThreads, g_useTCPSinglePipe));
a9bf3ec4 2614
2e72cc0e 2615 for(auto& t : todo)
2616 t();
2617
636cc544 2618 localPools = g_pools.getCopy();
8f4f5186
RG
2619 /* create the default pool no matter what */
2620 createPoolIfNotExists(localPools, "");
7cc68f53 2621 if(g_cmdLine.remotes.size()) {
2622 for(const auto& address : g_cmdLine.remotes) {
c9262563 2623 auto ret=std::make_shared<DownstreamState>(ComboAddress(address, 53));
886e2cf2 2624 addServerToPool(localPools, "", ret);
5d7e6765 2625 if (ret->connected && !ret->threadStarted.test_and_set()) {
2717a92f 2626 ret->tid = thread(responderThread, ret);
7565f4e6 2627 }
ecbe9133 2628 g_dstates.modify([ret](servers_t& servers) { servers.push_back(ret); });
6d01c80c 2629 }
2630 }
886e2cf2 2631 g_pools.setState(localPools);
6d01c80c 2632
a9c2e4ab 2633 if(g_dstates.getLocal()->empty()) {
e73ec7d3 2634 errlog("No downstream servers defined: all packets will get dropped");
2635 // you might define them later, but you need to know
2636 }
2637
41408d3a
RG
2638 checkFileDescriptorsLimits(udpBindsCount, tcpBindsCount);
2639
dd9c8246 2640 auto mplexer = std::shared_ptr<FDMultiplexer>(FDMultiplexer::getMultiplexerSilent());
e5a14b2b 2641 for(auto& dss : g_dstates.getCopy()) { // it is a copy, but the internal shared_ptrs are the real deal
dd9c8246
RG
2642 if (dss->availability == DownstreamState::Availability::Auto) {
2643 if (!queueHealthCheck(mplexer, dss, true)) {
2644 dss->upStatus = false;
2645 warnlog("Marking downstream %s as 'down'", dss->getNameWithAddr());
2646 }
773470ca 2647 }
2648 }
dd9c8246 2649 handleQueuedHealthChecks(mplexer, true);
b076b34a 2650
6e9fd124 2651 for(auto& cs : g_frontends) {
fbf14b03
RG
2652 if (cs->dohFrontend != nullptr) {
2653#ifdef HAVE_DNS_OVER_HTTPS
2654 std::thread t1(dohThread, cs.get());
2b0cb8f8
RG
2655 if (!cs->cpus.empty()) {
2656 mapThreadToCPUList(t1.native_handle(), cs->cpus);
2657 }
fbf14b03
RG
2658 t1.detach();
2659#endif /* HAVE_DNS_OVER_HTTPS */
2660 continue;
2661 }
a36ce055 2662 if (cs->udpFD >= 0) {
6e9fd124 2663 thread t1(udpClientThread, cs.get());
f0e4dcba
RG
2664 if (!cs->cpus.empty()) {
2665 mapThreadToCPUList(t1.native_handle(), cs->cpus);
2666 }
a36ce055 2667 t1.detach();
652a7355 2668 }
a36ce055 2669 else if (cs->tcpFD >= 0) {
6e9fd124 2670 thread t1(tcpAcceptorThread, cs.get());
f0e4dcba
RG
2671 if (!cs->cpus.empty()) {
2672 mapThreadToCPUList(t1.native_handle(), cs->cpus);
2673 }
a36ce055 2674 t1.detach();
726ddf60 2675 }
24d5cb00 2676 }
7730131a 2677
42fae326 2678 thread carbonthread(carbonDumpThread);
2679 carbonthread.detach();
2680
3c115e0f 2681 thread stattid(maintThread);
886e2cf2 2682 stattid.detach();
6d01c80c 2683
886e2cf2
RG
2684 thread healththread(healthChecksThread);
2685
5d4e1ef8
RG
2686 if (!g_secPollSuffix.empty()) {
2687 thread secpollthread(secPollThread);
2688 secpollthread.detach();
2689 }
2690
b82a127e 2691 if(g_cmdLine.beSupervised) {
6ab65223
PL
2692#ifdef HAVE_SYSTEMD
2693 sd_notify(0, "READY=1");
2694#endif
886e2cf2 2695 healththread.join();
773470ca 2696 }
6d01c80c 2697 else {
886e2cf2 2698 healththread.detach();
505ca3d1 2699 doConsole();
3c115e0f 2700 }
9cf811d1 2701 _exit(EXIT_SUCCESS);
3c115e0f 2702
6d01c80c 2703}
3f5c3f1d
PD
2704catch(const LuaContext::ExecutionErrorException& e) {
2705 try {
2706 errlog("Fatal Lua error: %s", e.what());
2707 std::rethrow_if_nested(e);
2010ac95
RG
2708 } catch(const std::exception& ne) {
2709 errlog("Details: %s", ne.what());
3f5c3f1d
PD
2710 }
2711 catch(PDNSException &ae)
2712 {
2713 errlog("Fatal pdns error: %s", ae.reason);
2714 }
2715 _exit(EXIT_FAILURE);
2716}
24d5cb00 2717catch(std::exception &e)
2718{
6d01c80c 2719 errlog("Fatal error: %s", e.what());
4a966472 2720 _exit(EXIT_FAILURE);
24d5cb00 2721}
3f81d239 2722catch(PDNSException &ae)
7730131a 2723{
6d01c80c 2724 errlog("Fatal pdns error: %s", ae.reason);
4a966472 2725 _exit(EXIT_FAILURE);
7730131a 2726}
eb0335ff
MC
2727
2728uint64_t getLatencyCount(const std::string&)
2729{
2730 return g_stats.responses + g_stats.selfAnswered + g_stats.cacheHits;
2731}