]> git.ipfire.org Git - thirdparty/pdns.git/blame - pdns/dnsdist.cc
Merge pull request #8096 from mind04/pdns-notify-db-queries
[thirdparty/pdns.git] / pdns / dnsdist.cc
CommitLineData
24d5cb00 1/*
12471842
PL
2 * This file is part of PowerDNS or dnsdist.
3 * Copyright -- PowerDNS.COM B.V. and its contributors
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of version 2 of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * In addition, for the avoidance of any doubt, permission is granted to
10 * link this program with OpenSSL and to (re)distribute the binaries
11 * produced as the result of such linking.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
21 */
5cc8371b
RG
22
23#include "config.h"
24
25#include <fstream>
26#include <getopt.h>
27#include <grp.h>
2c5db3f7 28#include <limits>
5cc8371b
RG
29#include <netinet/tcp.h>
30#include <pwd.h>
31#include <sys/resource.h>
32#include <unistd.h>
424bdfb1 33
4d39d7f3 34#if defined (__OpenBSD__) || defined(__NetBSD__)
424bdfb1
PD
35#include <readline/readline.h>
36#else
37#include <editline/readline.h>
38#endif
39
d2138f20 40#include "dnsdist-systemd.hh"
6ab65223
PL
41#ifdef HAVE_SYSTEMD
42#include <systemd/sd-daemon.h>
43#endif
44
5cc8371b
RG
45#include "dnsdist.hh"
46#include "dnsdist-cache.hh"
b5521206 47#include "dnsdist-console.hh"
5cc8371b 48#include "dnsdist-ecs.hh"
dd9c8246 49#include "dnsdist-healthchecks.hh"
5cc8371b 50#include "dnsdist-lua.hh"
03b00917 51#include "dnsdist-rings.hh"
5d4e1ef8 52#include "dnsdist-secpoll.hh"
53c57da7 53#include "dnsdist-xpf.hh"
5cc8371b
RG
54
55#include "base64.hh"
56#include "delaypipe.hh"
57#include "dolog.hh"
58#include "dnsname.hh"
e0fd37ec 59#include "dnsparser.hh"
5cc8371b
RG
60#include "ednsoptions.hh"
61#include "gettime.hh"
62#include "lock.hh"
63#include "misc.hh"
64#include "sodcrypto.hh"
65#include "sstuff.hh"
519f5484 66#include "threadname.hh"
5cc8371b 67
a40df301 68/* Known sins:
e48090d1 69
d12cea01 70 Receiver is currently single threaded
e5a14b2b 71 not *that* bad actually, but now that we are thread safe, might want to scale
a40df301 72*/
24d5cb00 73
0940e4eb 74/* the Rulaction plan
75 Set of Rules, if one matches, it leads to an Action
76 Both rules and actions could conceivably be Lua based.
77 On the C++ side, both could be inherited from a class Rule and a class Action,
78 on the Lua side we can't do that. */
79
64e4ebb4 80using std::atomic;
81using std::thread;
7730131a 82bool g_verbose;
b065f701 83
e48090d1 84struct DNSDistStats g_stats;
37a5c2d5 85
// Number of IDState slots allocated per backend (the DownstreamState constructor resizes idStates to this value).
3b203c83 86uint16_t g_maxOutstanding{std::numeric_limits<uint16_t>::max()};
1ea747c0 87uint32_t g_staleCacheEntriesTTL{0};
bbfaaa6f 88bool g_syslog{true};
// When true, responseContentMatches() accepts responses without a question section whatever their rcode.
0dffe9e3 89bool g_allowEmptyResponse{false};
cffde2fd 90
91GlobalStateHolder<NetmaskGroup> g_ACL;
c9262563 92string g_outputBuffer;
a227f47d 93
a227f47d 94std::vector<std::shared_ptr<TLSFrontend>> g_tlslocals;
fbf14b03 95std::vector<std::shared_ptr<DOHFrontend>> g_dohlocals;
6e9fd124 96std::vector<std::shared_ptr<DNSCryptContext>> g_dnsCryptLocals;
87b515ed
RG
97#ifdef HAVE_EBPF
98shared_ptr<BPFFilter> g_defaultBPFFilter;
8429ad04 99std::vector<std::shared_ptr<DynBPFFilter> > g_dynBPFFilters;
87b515ed 100#endif /* HAVE_EBPF */
6e9fd124 101std::vector<std::unique_ptr<ClientState>> g_frontends;
886e2cf2 102GlobalStateHolder<pools_t> g_pools;
0beaa5c8 103size_t g_udpVectorSize{1};
7c0860e1 104
9f4eb5cc
RG
105bool g_snmpEnabled{false};
106bool g_snmpTrapsEnabled{false};
107DNSDistSNMPAgent* g_snmpAgent{nullptr};
108
726ddf60 109/* UDP: the grand design. Per socket we listen on for incoming queries there is one thread.
7730131a 110 Then we have a bunch of connected sockets for talking to downstream servers.
111 We send directly to those sockets.
24d5cb00 112
7730131a 113 For the return path, per downstream server we have a thread that listens to responses.
24d5cb00 114
7730131a 115 Per socket there is an array of 2^16 states, when we send out a packet downstream, we note
116 there the original requestor and the original id. The new ID is the offset in the array.
117
118 When an answer comes in on a socket, we look up the offset by the id, and lob it to the
119 original requestor.
120
121 IDs are assigned by atomic increments of the socket offset.
122 */
123
4d5959e6
RG
124GlobalStateHolder<vector<DNSDistRuleAction> > g_rulactions;
125GlobalStateHolder<vector<DNSDistResponseRuleAction> > g_resprulactions;
126GlobalStateHolder<vector<DNSDistResponseRuleAction> > g_cachehitresprulactions;
2d4783a8 127GlobalStateHolder<vector<DNSDistResponseRuleAction> > g_selfansweredresprulactions;
4d5959e6 128
df111b53 129Rings g_rings;
786e4d8c 130QueryCount g_qcount;
7c0860e1 131
ecbe9133 132GlobalStateHolder<servers_t> g_dstates;
78ffa782 133GlobalStateHolder<NetmaskTree<DynBlock>> g_dynblockNMG;
71c94675 134GlobalStateHolder<SuffixMatchTree<DynBlock>> g_dynblockSMT;
dd46e5e3 135DNSAction::Action g_dynBlockAction = DNSAction::Action::Drop;
3f6d07a4
RG
136int g_tcpRecvTimeout{2};
137int g_tcpSendTimeout{2};
e0b5e49d 138int g_udpTimeout{2};
3f6d07a4 139
26a3cdb7 140bool g_servFailOnNoPolicy{false};
// When true, responderThread() strips responses that have the TC bit set down to header + question (see truncateTC()).
e72fbfc4 141bool g_truncateTC{false};
53c57da7
RG
// When true, fixUpResponse() overwrites the question name of the response with the name from the original query.
142bool g_fixupCase{false};
143bool g_preserveTrailingData{false};
32b86928 144bool g_roundrobinFailOnNoServer{false};
0beaa5c8 145
83fe2c55
RG
146std::set<std::string> g_capabilitiesToRetain;
147
e0fd37ec 148static void truncateTC(char* packet, uint16_t* len, size_t responseSize, unsigned int consumed)
6ad8b29a 149try
150{
e0fd37ec
RG
151 bool hadEDNS = false;
152 uint16_t payloadSize = 0;
153 uint16_t z = 0;
154
155 if (g_addEDNSToSelfGeneratedResponses) {
156 hadEDNS = getEDNSUDPPayloadSizeAndZ(packet, *len, &payloadSize, &z);
157 }
158
2aa2c0aa 159 *len=static_cast<uint16_t>(sizeof(dnsheader)+consumed+DNS_TYPE_SIZE+DNS_CLASS_SIZE);
e7c732b8
RG
160 struct dnsheader* dh = reinterpret_cast<struct dnsheader*>(packet);
161 dh->ancount = dh->arcount = dh->nscount = 0;
e0fd37ec
RG
162
163 if (hadEDNS) {
5e98ccfa 164 addEDNS(dh, *len, responseSize, z & EDNS_HEADER_FLAG_DO, payloadSize, 0);
e0fd37ec 165 }
6ad8b29a 166}
167catch(...)
168{
169 g_stats.truncFail++;
170}
171
7b3865cd 172struct DelayedPacket
173{
174 int fd;
175 string packet;
176 ComboAddress destination;
68f3eb4d 177 ComboAddress origDest;
7b3865cd 178 void operator()()
179 {
20b019fa
RG
180 ssize_t res;
181 if(origDest.sin4.sin_family == 0) {
182 res = sendto(fd, packet.c_str(), packet.size(), 0, (struct sockaddr*)&destination, destination.getSocklen());
183 }
184 else {
185 res = sendfromto(fd, packet.c_str(), packet.size(), 0, origDest, destination);
186 }
187 if (res == -1) {
188 int err = errno;
189 vinfolog("Error sending delayed response to %s: %s", destination.toStringWithPort(), strerror(err));
190 }
7b3865cd 191 }
192};
193
// Pipe used to postpone UDP responses; while it is null sendUDPResponse() always sends immediately.
3e425868 194DelayPipe<DelayedPacket>* g_delay = nullptr;
7b3865cd 195
f653b8df 196void doLatencyStats(double udiff)
daacd477 197{
cb167afd
CHB
198 if(udiff < 1000) ++g_stats.latency0_1;
199 else if(udiff < 10000) ++g_stats.latency1_10;
200 else if(udiff < 50000) ++g_stats.latency10_50;
201 else if(udiff < 100000) ++g_stats.latency50_100;
202 else if(udiff < 1000000) ++g_stats.latency100_1000;
203 else ++g_stats.latencySlow;
eb0335ff 204 g_stats.latencySum += udiff / 1000;
be6c318f 205
daacd477 206 auto doAvg = [](double& var, double n, double weight) {
207 var = (weight -1) * var/weight + n/weight;
208 };
209
210 doAvg(g_stats.latencyAvg100, udiff, 100);
211 doAvg(g_stats.latencyAvg1000, udiff, 1000);
212 doAvg(g_stats.latencyAvg10000, udiff, 10000);
213 doAvg(g_stats.latencyAvg1000000, udiff, 1000000);
214}
ca404e94 215
e7c732b8 216bool responseContentMatches(const char* response, const uint16_t responseLen, const DNSName& qname, const uint16_t qtype, const uint16_t qclass, const ComboAddress& remote, unsigned int& consumed)
fcffc585 217{
fcffc585
RG
218 if (responseLen < sizeof(dnsheader)) {
219 return false;
220 }
221
3e425868 222 const struct dnsheader* dh = reinterpret_cast<const struct dnsheader*>(response);
c8c3d4e4 223 if (dh->qdcount == 0) {
ad7ec9a2 224 if ((dh->rcode != RCode::NoError && dh->rcode != RCode::NXDomain) || g_allowEmptyResponse) {
c8c3d4e4
RG
225 return true;
226 }
227 else {
cb167afd 228 ++g_stats.nonCompliantResponses;
c8c3d4e4
RG
229 return false;
230 }
231 }
232
3e425868
RG
233 uint16_t rqtype, rqclass;
234 DNSName rqname;
fcffc585
RG
235 try {
236 rqname=DNSName(response, responseLen, sizeof(dnsheader), false, &rqtype, &rqclass, &consumed);
237 }
3e425868
RG
238 catch(const std::exception& e) {
239 if(responseLen > 0 && static_cast<size_t>(responseLen) > sizeof(dnsheader)) {
fcffc585 240 infolog("Backend %s sent us a response with id %d that did not parse: %s", remote.toStringWithPort(), ntohs(dh->id), e.what());
3e425868 241 }
cb167afd 242 ++g_stats.nonCompliantResponses;
fcffc585
RG
243 return false;
244 }
245
246 if (rqtype != qtype || rqclass != qclass || rqname != qname) {
247 return false;
248 }
249
250 return true;
251}
252
4ab01344 253static void restoreFlags(struct dnsheader* dh, uint16_t origFlags)
fcffc585
RG
254{
255 static const uint16_t rdMask = 1 << FLAGS_RD_OFFSET;
256 static const uint16_t cdMask = 1 << FLAGS_CD_OFFSET;
257 static const uint16_t restoreFlagsMask = UINT16_MAX & ~(rdMask | cdMask);
0f72fd5c
RG
258 uint16_t * flags = getFlagsFromDNSHeader(dh);
259 /* clear the flags we are about to restore */
260 *flags &= restoreFlagsMask;
261 /* only keep the flags we want to restore */
262 origFlags &= ~restoreFlagsMask;
263 /* set the saved flags as they were */
264 *flags |= origFlags;
265}
266
4ab01344 267static bool fixUpQueryTurnedResponse(DNSQuestion& dq, const uint16_t origFlags)
e7c732b8
RG
268{
269 restoreFlags(dq.dh, origFlags);
270
271 return addEDNSToQueryTurnedResponse(dq);
272}
273
3e425868 274static bool fixUpResponse(char** response, uint16_t* responseLen, size_t* responseSize, const DNSName& qname, uint16_t origFlags, bool ednsAdded, bool ecsAdded, std::vector<uint8_t>& rewrittenResponse, uint16_t addRoom, bool* zeroScope)
0f72fd5c 275{
fcffc585
RG
276 if (*responseLen < sizeof(dnsheader)) {
277 return false;
278 }
279
3e425868 280 struct dnsheader* dh = reinterpret_cast<struct dnsheader*>(*response);
c8c3d4e4
RG
281 restoreFlags(dh, origFlags);
282
283 if (*responseLen == sizeof(dnsheader)) {
284 return true;
285 }
286
fcffc585
RG
287 if(g_fixupCase) {
288 string realname = qname.toDNSString();
289 if (*responseLen >= (sizeof(dnsheader) + realname.length())) {
290 memcpy(*response + sizeof(dnsheader), realname.c_str(), realname.length());
291 }
292 }
293
ff73f02b 294 if (ednsAdded || ecsAdded) {
ceae225c 295 uint16_t optStart;
fcffc585
RG
296 size_t optLen = 0;
297 bool last = false;
298
11d5d247
RG
299 const std::string responseStr(*response, *responseLen);
300 int res = locateEDNSOptRR(responseStr, &optStart, &optLen, &last);
fcffc585
RG
301
302 if (res == 0) {
49c33a6c
RG
303 if (zeroScope) { // this finds if an EDNS Client Subnet scope was set, and if it is 0
304 size_t optContentStart = 0;
305 uint16_t optContentLen = 0;
306 /* we need at least 4 bytes after the option length (family: 2, source prefix-length: 1, scope prefix-length: 1) */
307 if (isEDNSOptionInOpt(responseStr, optStart, optLen, EDNSOptionCode::ECS, &optContentStart, &optContentLen) && optContentLen >= 4) {
308 /* see if the EDNS Client Subnet SCOPE PREFIX-LENGTH byte in position 3 is set to 0, which is the only thing
309 we care about. */
310 *zeroScope = responseStr.at(optContentStart + 3) == 0;
311 }
40669042 312 }
9837850d 313
ff73f02b
RG
314 if (ednsAdded) {
315 /* we added the entire OPT RR,
316 therefore we need to remove it entirely */
317 if (last) {
318 /* simply remove the last AR */
319 *responseLen -= optLen;
320 uint16_t arcount = ntohs(dh->arcount);
321 arcount--;
322 dh->arcount = htons(arcount);
323 }
324 else {
325 /* Removing an intermediary RR could lead to compression error */
11d5d247 326 if (rewriteResponseWithoutEDNS(responseStr, rewrittenResponse) == 0) {
ff73f02b
RG
327 *responseLen = rewrittenResponse.size();
328 if (addRoom && (UINT16_MAX - *responseLen) > addRoom) {
329 rewrittenResponse.reserve(*responseLen + addRoom);
330 }
331 *responseSize = rewrittenResponse.capacity();
332 *response = reinterpret_cast<char*>(rewrittenResponse.data());
333 }
334 else {
335 warnlog("Error rewriting content");
336 }
337 }
fcffc585
RG
338 }
339 else {
ff73f02b
RG
340 /* the OPT RR was already present, but without ECS,
341 we need to remove the ECS option if any */
342 if (last) {
343 /* nothing after the OPT RR, we can simply remove the
344 ECS option */
345 size_t existingOptLen = optLen;
11d5d247 346 removeEDNSOptionFromOPT(*response + optStart, &optLen, EDNSOptionCode::ECS);
ff73f02b 347 *responseLen -= (existingOptLen - optLen);
fcffc585
RG
348 }
349 else {
ff73f02b 350 /* Removing an intermediary RR could lead to compression error */
11d5d247 351 if (rewriteResponseWithoutEDNSOption(responseStr, EDNSOptionCode::ECS, rewrittenResponse) == 0) {
ff73f02b
RG
352 *responseLen = rewrittenResponse.size();
353 if (addRoom && (UINT16_MAX - *responseLen) > addRoom) {
354 rewrittenResponse.reserve(*responseLen + addRoom);
355 }
356 *responseSize = rewrittenResponse.capacity();
357 *response = reinterpret_cast<char*>(rewrittenResponse.data());
358 }
359 else {
360 warnlog("Error rewriting content");
361 }
fcffc585
RG
362 }
363 }
364 }
365 }
366
367 return true;
368}
369
#ifdef HAVE_DNSCRYPT
/* Encrypts the response in place when the query came in over DNSCrypt.
   Without a DNSCrypt query this is a no-op returning true.
   If dh, *dh and dhCopy are all non-null, the header is first copied to
   dhCopy and *dh redirected to that copy, since the buffer is about to be
   overwritten.
   On success *responseLen receives the encrypted length; on failure false is
   returned and the response must be dropped. */
static bool encryptResponse(char* response, uint16_t* responseLen, size_t responseSize, bool tcp, std::shared_ptr<DNSCryptQuery> dnsCryptQuery, dnsheader** dh, dnsheader* dhCopy)
{
  if (!dnsCryptQuery) {
    return true;
  }

  /* save the original header before encrypting it in place */
  if (dh != nullptr && *dh != nullptr && dhCopy != nullptr) {
    memcpy(dhCopy, *dh, sizeof(dnsheader));
    *dh = dhCopy;
  }

  uint16_t cipherLen = 0;
  if (dnsCryptQuery->encryptResponse(response, *responseLen, responseSize, tcp, &cipherLen) != 0) {
    /* dropping response */
    vinfolog("Error encrypting the response, dropping.");
    return false;
  }

  *responseLen = cipherLen;
  return true;
}
#endif /* HAVE_DNSCRYPT */
fcffc585 394
3e425868
RG
395static bool applyRulesToResponse(LocalStateHolder<vector<DNSDistResponseRuleAction> >& localRespRulactions, DNSResponse& dr)
396{
397 DNSResponseAction::Action action=DNSResponseAction::Action::None;
398 std::string ruleresult;
399 for(const auto& lr : *localRespRulactions) {
400 if(lr.d_rule->matches(&dr)) {
401 lr.d_rule->d_matches++;
402 action=(*lr.d_action)(&dr, &ruleresult);
403 switch(action) {
404 case DNSResponseAction::Action::Allow:
405 return true;
406 break;
407 case DNSResponseAction::Action::Drop:
408 return false;
409 break;
410 case DNSResponseAction::Action::HeaderModify:
411 return true;
412 break;
413 case DNSResponseAction::Action::ServFail:
414 dr.dh->rcode = RCode::ServFail;
415 return true;
416 break;
417 /* non-terminal actions follow */
418 case DNSResponseAction::Action::Delay:
419 dr.delayMsec = static_cast<int>(pdns_stou(ruleresult)); // sorry
420 break;
421 case DNSResponseAction::Action::None:
422 break;
423 }
424 }
425 }
426
427 return true;
428}
429
430bool processResponse(char** response, uint16_t* responseLen, size_t* responseSize, LocalStateHolder<vector<DNSDistResponseRuleAction> >& localRespRulactions, DNSResponse& dr, size_t addRoom, std::vector<uint8_t>& rewrittenResponse, bool muted)
431{
432 if (!applyRulesToResponse(localRespRulactions, dr)) {
433 return false;
434 }
435
436 bool zeroScope = false;
437 if (!fixUpResponse(response, responseLen, responseSize, *dr.qname, dr.origFlags, dr.ednsAdded, dr.ecsAdded, rewrittenResponse, addRoom, dr.useZeroScope ? &zeroScope : nullptr)) {
438 return false;
439 }
440
01e23732 441 if (dr.packetCache && !dr.skipCache && *responseLen <= s_maxPacketCacheEntrySize) {
3e425868
RG
442 if (!dr.useZeroScope) {
443 /* if the query was not suitable for zero-scope, for
444 example because it had an existing ECS entry so the hash is
445 not really 'no ECS', so just insert it for the existing subnet
446 since:
447 - we don't have the correct hash for a non-ECS query
448 - inserting with hash computed before the ECS replacement but with
449 the subnet extracted _after_ the replacement would not work.
450 */
451 zeroScope = false;
452 }
453 // if zeroScope, pass the pre-ECS hash-key and do not pass the subnet to the cache
454 dr.packetCache->insert(zeroScope ? dr.cacheKeyNoECS : dr.cacheKey, zeroScope ? boost::none : dr.subnet, dr.origFlags, dr.dnssecOK, *dr.qname, dr.qtype, dr.qclass, *response, *responseLen, dr.tcp, dr.dh->rcode, dr.tempFailureTTL);
455 }
456
457#ifdef HAVE_DNSCRYPT
458 if (!muted) {
459 if (!encryptResponse(*response, responseLen, *responseSize, dr.tcp, dr.dnsCryptQuery, nullptr, nullptr)) {
460 return false;
461 }
462 }
7129b5c4 463#endif /* HAVE_DNSCRYPT */
3e425868
RG
464
465 return true;
466}
467
468static bool sendUDPResponse(int origFD, const char* response, const uint16_t responseLen, const int delayMsec, const ComboAddress& origDest, const ComboAddress& origRemote)
0f72fd5c 469{
fcffc585
RG
470 if(delayMsec && g_delay) {
471 DelayedPacket dp{origFD, string(response,responseLen), origRemote, origDest};
472 g_delay->submit(dp, delayMsec);
473 }
474 else {
20b019fa
RG
475 ssize_t res;
476 if(origDest.sin4.sin_family == 0) {
3e425868 477 res = sendto(origFD, response, responseLen, 0, reinterpret_cast<const struct sockaddr*>(&origRemote), origRemote.getSocklen());
20b019fa
RG
478 }
479 else {
480 res = sendfromto(origFD, response, responseLen, 0, origDest, origRemote);
481 }
482 if (res == -1) {
483 int err = errno;
a702a96c 484 vinfolog("Error sending response to %s: %s", origRemote.toStringWithPort(), stringerror(err));
20b019fa 485 }
fcffc585
RG
486 }
487
488 return true;
489}
490
150105a2 491
fbf14b03 492int pickBackendSocketForSending(std::shared_ptr<DownstreamState>& state)
150105a2 493{
5bdbb83d 494 return state->sockets[state->socketsOffset++ % state->sockets.size()];
150105a2
RG
495}
496
5bdbb83d 497static void pickBackendSocketsReadyForReceiving(const std::shared_ptr<DownstreamState>& state, std::vector<int>& ready)
150105a2 498{
5bdbb83d 499 ready.clear();
150105a2 500
5bdbb83d
RG
501 if (state->sockets.size() == 1) {
502 ready.push_back(state->sockets[0]);
503 return ;
150105a2
RG
504 }
505
5bdbb83d
RG
506 {
507 std::lock_guard<std::mutex> lock(state->socketsLock);
508 state->mplexer->getAvailableFDs(ready, -1);
150105a2 509 }
150105a2
RG
510}
511
4632045b 512// listens on a dedicated socket, lobs answers from downstream servers to original requestors
// Loops forever over the UDP sockets of the backend 'dss': each datagram received is matched
// against the IDState slot indexed by the DNS id found in the response, validated, processed
// (rules, fix-ups, cache, encryption) and relayed to the original client over UDP or DoH.
// A std::exception raised while handling responses is logged and the loop goes on; anything
// that escapes the loop is logged by the function-level handlers and ends the thread.
9b73b71c 513void responderThread(std::shared_ptr<DownstreamState> dss)
66d1c0bf 514try {
519f5484 515 setThreadName("dnsdist/respond");
d8c19b98 516 auto localRespRulactions = g_resprulactions.getLocal();
// receive buffer, with room left for the DNSCrypt padding and MAC
8179b6d6 517 char packet[s_maxPacketCacheEntrySize + DNSCRYPT_MAX_RESPONSE_PADDING_AND_MAC_SIZE];
b50ee135 518 static_assert(sizeof(packet) <= UINT16_MAX, "Packet size should fit in a uint16_t");
3e425868
RG
519 /* when the answer is encrypted in place, we need to get a copy
520 of the original header before encryption to fill the ring buffer */
521 dnsheader cleartextDH;
ca404e94 522 vector<uint8_t> rewrittenResponse;
7267213a 523
// declared outside of the loop so that the inner exception handler can log it
66d1c0bf 524 uint16_t queryId = 0;
5bdbb83d
RG
525 std::vector<int> sockets;
526 sockets.reserve(dss->sockets.size());
527
7730131a 528 for(;;) {
57847d65 529 dnsheader* dh = reinterpret_cast<struct dnsheader*>(packet);
66d1c0bf 530 try {
5bdbb83d
RG
531 pickBackendSocketsReadyForReceiving(dss, sockets);
532 for (const auto& fd : sockets) {
533 ssize_t got = recv(fd, packet, sizeof(packet), 0);
534 char * response = packet;
535 size_t responseSize = sizeof(packet);
ca404e94 536
// ignore receive errors and anything too short to hold a DNS header
2aa2c0aa 537 if (got < 0 || static_cast<size_t>(got) < sizeof(dnsheader))
5bdbb83d 538 continue;
4f380af3 539
3e425868 540 uint16_t responseLen = static_cast<uint16_t>(got);
// the id is used as found in the packet, without byte-order conversion, as the index of the state
5bdbb83d 541 queryId = dh->id;
b50ee135 542
fbf14b03 543 if(queryId >= dss->idStates.size()) {
5bdbb83d 544 continue;
fbf14b03 545 }
4f380af3 546
5bdbb83d 547 IDState* ids = &dss->idStates[queryId];
a9489723 548 int64_t usageIndicator = ids->usageIndicator;
c9ba8478 549
311f19d5 550 if(!IDState::isInUse(usageIndicator)) {
9bd1a882 551 /* the corresponding state is marked as not in use, meaning that:
a9489723 552 - it was already cleaned up by another thread and the state is gone ;
9bd1a882
RG
553 - we already got a response for this query and this one is a duplicate.
554 Either way, we don't touch it.
555 */
5bdbb83d 556 continue;
9bd1a882 557 }
7267213a 558
9bd1a882 559 /* read the potential DOHUnit state as soon as possible, but don't use it
a9489723 560 until we have confirmed that we own this state by updating usageIndicator */
0956c5c5 561 auto du = ids->du;
5bdbb83d
RG
562 /* setting age to 0 to prevent the maintainer thread from
563 cleaning this IDS while we process the response.
5bdbb83d
RG
564 */
565 ids->age = 0;
a9489723 566 int origFD = ids->origFD;
fcffc585 567
e7c732b8
RG
568 unsigned int consumed = 0;
569 if (!responseContentMatches(response, responseLen, ids->qname, ids->qtype, ids->qclass, dss->remote, consumed)) {
5bdbb83d
RG
570 continue;
571 }
7267213a 572
// remembered for the log line below, since du is reset once the response has been handed over
3c72bc54 573 bool isDoH = du != nullptr;
a9489723
RG
574 /* atomically mark the state as available, but only if it has not been altered
575 in the meantime */
311f19d5 576 if (ids->tryMarkUnused(usageIndicator)) {
9bd1a882
RG
577 /* clear the potential DOHUnit asap, it's ours now
578 and since we just marked the state as unused,
579 someone could overwrite it. */
246ff31f 580 ids->du = nullptr;
71b86bd8
RG
581 /* we only decrement the outstanding counter if the value was not
582 altered in the meantime, which would mean that the state has been actively reused
583 and the other thread has not incremented the outstanding counter, so we don't
584 want it to be decremented twice. */
585 --dss->outstanding; // you'd think an attacker could game this, but we're using connected socket
0956c5c5 586 } else {
9bd1a882 587 /* someone updated the state in the meantime, we can't touch the existing pointer */
0956c5c5 588 du = nullptr;
f2d0c808
RG
589 /* since the state has been updated, we can't safely access it so let's just drop
590 this response */
591 continue;
71b86bd8 592 }
2c5db3f7 593
5bdbb83d 594 if(dh->tc && g_truncateTC) {
e0fd37ec 595 truncateTC(response, &responseLen, responseSize, consumed);
5bdbb83d 596 }
aeb36780 597
// put back the id the client used in its query
5bdbb83d 598 dh->id = ids->origID;
ca404e94 599
5bdbb83d 600 uint16_t addRoom = 0;
d0ae6360
RG
601 DNSResponse dr = makeDNSResponseFromIDState(*ids, dh, sizeof(packet), responseLen, false);
602 if (dr.dnsCryptQuery) {
5bdbb83d
RG
603 addRoom = DNSCRYPT_MAX_RESPONSE_PADDING_AND_MAC_SIZE;
604 }
ca404e94 605
// keep a copy of the header before processResponse() gets a chance to encrypt the packet
3e425868
RG
606 memcpy(&cleartextDH, dr.dh, sizeof(cleartextDH));
607 if (!processResponse(&response, &responseLen, &responseSize, localRespRulactions, dr, addRoom, rewrittenResponse, ids->cs && ids->cs->muted)) {
608 continue;
5bdbb83d 609 }
886e2cf2 610
// nothing is sent back for a muted frontend, but the statistics below are still updated
5bdbb83d 611 if (ids->cs && !ids->cs->muted) {
0956c5c5 612 if (du) {
fbf14b03
RG
613#ifdef HAVE_DNS_OVER_HTTPS
614 // DoH query
246ff31f 615 du->response = std::string(response, responseLen);
// the pointer value itself is written to du->rsock
0956c5c5 616 if (send(du->rsock, &du, sizeof(du), 0) != sizeof(du)) {
9bd1a882 617 /* at this point we have the only remaining pointer on this
b6d19fca
RG
618 DOHUnit object since we did set ids->du to nullptr earlier,
619 except if we got the response before the pointer could be
620 released by the frontend */
621 du->release();
fbf14b03
RG
622 }
623#endif /* HAVE_DNS_OVER_HTTPS */
0956c5c5 624 du = nullptr;
fbf14b03
RG
625 }
626 else {
627 ComboAddress empty;
628 empty.sin4.sin_family = 0;
629 /* if ids->destHarvested is false, origDest holds the listening address.
630 We don't want to use that as a source since it could be 0.0.0.0 for example. */
631 sendUDPResponse(origFD, response, responseLen, dr.delayMsec, ids->destHarvested ? ids->origDest : empty, ids->origRemote);
632 }
5bdbb83d 633 }
66d1c0bf 634
cb167afd 635 ++g_stats.responses;
83f7bbdb
RG
636 if (ids->cs) {
637 ++ids->cs->responses;
638 }
7fc95193 639 ++dss->responses;
66d1c0bf 640
5bdbb83d 641 double udiff = ids->sentTime.udiff();
fbf14b03 642 vinfolog("Got answer from %s, relayed to %s%s, took %f usec", dss->remote.toStringWithPort(), ids->origRemote.toStringWithPort(),
3c72bc54 643 isDoH ? " (https)": "", udiff);
66d1c0bf 644
01f6920b
RG
645 struct timespec ts;
646 gettime(&ts);
// the ring entry gets the size as received ('got') and the header saved before processing
3e425868 647 g_rings.insertResponse(ts, *dr.remote, *dr.qname, dr.qtype, static_cast<unsigned int>(udiff), static_cast<unsigned int>(got), cleartextDH, dss->remote);
66d1c0bf 648
5d7a33dd 649 switch (cleartextDH.rcode) {
61d10a4d
MH
650 case RCode::NXDomain:
651 ++g_stats.frontendNXDomain;
652 break;
653 case RCode::ServFail:
cb167afd 654 ++g_stats.servfailResponses;
61d10a4d
MH
655 ++g_stats.frontendServFail;
656 break;
657 case RCode::NoError:
658 ++g_stats.frontendNoError;
659 break;
cb167afd 660 }
// moving average of the backend latency, the new sample weighing 1/128
5bdbb83d 661 dss->latencyUsec = (127.0 * dss->latencyUsec / 128.0) + udiff/128.0;
66d1c0bf 662
5bdbb83d 663 doLatencyStats(udiff);
fcadd56e 664
5bdbb83d
RG
665 rewrittenResponse.clear();
666 }
66d1c0bf 667 }
df560083 668 catch(const std::exception& e){
cd7fa253 669 vinfolog("Got an error in UDP responder thread while parsing a response from %s, id %d: %s", dss->remote.toStringWithPort(), queryId, e.what());
66d1c0bf 670 }
7730131a 671 }
7730131a 672}
66d1c0bf
RG
673catch(const std::exception& e)
674{
675 errlog("UDP responder thread died because of exception: %s", e.what());
66d1c0bf
RG
676}
677catch(const PDNSException& e)
678{
679 errlog("UDP responder thread died because of PowerDNS exception: %s", e.reason);
66d1c0bf
RG
680}
681catch(...)
682{
683 errlog("UDP responder thread died because of an exception: %s", "unknown");
66d1c0bf 684}
24d5cb00 685
5d7e6765 686bool DownstreamState::reconnect()
3c115e0f 687{
5d7e6765
RG
688 std::unique_lock<std::mutex> tl(connectLock, std::try_to_lock);
689 if (!tl.owns_lock()) {
690 /* we are already reconnecting */
691 return false;
692 }
693
b58f08e5 694 connected = false;
5bdbb83d 695 for (auto& fd : sockets) {
150105a2 696 if (fd != -1) {
5d7e6765 697 if (sockets.size() > 1) {
5bdbb83d
RG
698 std::lock_guard<std::mutex> lock(socketsLock);
699 mplexer->removeReadFD(fd);
700 }
150105a2
RG
701 /* shutdown() is needed to wake up recv() in the responderThread */
702 shutdown(fd, SHUT_RDWR);
703 close(fd);
704 fd = -1;
f99e3aaf 705 }
150105a2
RG
706 if (!IsAnyAddress(remote)) {
707 fd = SSocket(remote.sin4.sin_family, SOCK_DGRAM, 0);
708 if (!IsAnyAddress(sourceAddr)) {
709 SSetsockopt(fd, SOL_SOCKET, SO_REUSEADDR, 1);
70b0d0e2
RG
710 if (!sourceItfName.empty()) {
711#ifdef SO_BINDTODEVICE
712 int res = setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, sourceItfName.c_str(), sourceItfName.length());
713 if (res != 0) {
714 infolog("Error setting up the interface on backend socket '%s': %s", remote.toStringWithPort(), stringerror());
715 }
716#endif
717 }
718
150105a2
RG
719 SBind(fd, sourceAddr);
720 }
721 try {
722 SConnect(fd, remote);
5d7e6765 723 if (sockets.size() > 1) {
5bdbb83d
RG
724 std::lock_guard<std::mutex> lock(socketsLock);
725 mplexer->addReadFD(fd, [](int, boost::any) {});
726 }
150105a2
RG
727 connected = true;
728 }
729 catch(const std::runtime_error& error) {
730 infolog("Error connecting to new server with address %s: %s", remote.toStringWithPort(), error.what());
38069e7e
RG
731 connected = false;
732 break;
733 }
7565f4e6 734 }
38069e7e
RG
735 }
736
737 /* if at least one (re-)connection failed, close all sockets */
738 if (!connected) {
5bdbb83d 739 for (auto& fd : sockets) {
38069e7e 740 if (fd != -1) {
5d7e6765
RG
741 if (sockets.size() > 1) {
742 std::lock_guard<std::mutex> lock(socketsLock);
743 mplexer->removeReadFD(fd);
744 }
38069e7e
RG
745 /* shutdown() is needed to wake up recv() in the responderThread */
746 shutdown(fd, SHUT_RDWR);
747 close(fd);
748 fd = -1;
150105a2 749 }
7565f4e6 750 }
b58f08e5 751 }
5d7e6765
RG
752
753 return connected;
b58f08e5 754}
f2caf657
CHB
755void DownstreamState::hash()
756{
d58e616a 757 vinfolog("Computing hashes for id=%s and weight=%d", id, weight);
f2caf657 758 auto w = weight;
d58e616a 759 WriteLock wl(&d_lock);
f2caf657
CHB
760 hashes.clear();
761 while (w > 0) {
762 std::string uuid = boost::str(boost::format("%s-%d") % id % w);
763 unsigned int wshash = burtleCI((const unsigned char*)uuid.c_str(), uuid.size(), g_hashperturb);
764 hashes.insert(wshash);
765 --w;
766 }
767}
768
769void DownstreamState::setId(const boost::uuids::uuid& newId)
770{
771 id = newId;
d58e616a
CHB
772 // compute hashes only if already done
773 if (!hashes.empty()) {
774 hash();
775 }
f2caf657
CHB
776}
777
778void DownstreamState::setWeight(int newWeight)
779{
5a2bbe8c
CHB
780 if (newWeight < 1) {
781 errlog("Error setting server's weight: downstream weight value must be greater than 0.");
782 return ;
783 }
f2caf657 784 weight = newWeight;
d58e616a
CHB
785 if (!hashes.empty()) {
786 hash();
787 }
f2caf657 788}
b58f08e5 789
203b5348 790DownstreamState::DownstreamState(const ComboAddress& remote_, const ComboAddress& sourceAddr_, unsigned int sourceItf_, const std::string& sourceItfName_, size_t numberOfSockets, bool connect=true): sourceItfName(sourceItfName_), remote(remote_), sourceAddr(sourceAddr_), sourceItf(sourceItf_)
b58f08e5 791{
d58e616a 792 pthread_rwlock_init(&d_lock, nullptr);
d61aa945 793 id = getUniqueID();
5d7e6765
RG
794 threadStarted.clear();
795
5bdbb83d
RG
796 mplexer = std::unique_ptr<FDMultiplexer>(FDMultiplexer::getMultiplexerSilent());
797
798 sockets.resize(numberOfSockets);
799 for (auto& fd : sockets) {
150105a2
RG
800 fd = -1;
801 }
802
203b5348 803 if (connect && !IsAnyAddress(remote)) {
b58f08e5 804 reconnect();
f99e3aaf
RG
805 idStates.resize(g_maxOutstanding);
806 sw.start();
807 infolog("Added downstream server %s", remote.toStringWithPort());
fbe2a2e0 808 }
1720247e 809
3c115e0f 810}
811
773470ca 812std::mutex g_luamutex;
3f25bce6 813LuaContext g_lua;
3f25bce6 814
ecbe9133 815GlobalStateHolder<ServerPolicy> g_policy;
773470ca 816
497a6e3a 817shared_ptr<DownstreamState> firstAvailable(const NumberedServerVector& servers, const DNSQuestion* dq)
773470ca 818{
22b2b326 819 for(auto& d : servers) {
da4e7813 820 if(d.second->isUp() && d.second->qps.check())
821 return d.second;
773470ca 822 }
497a6e3a 823 return leastOutstanding(servers, dq);
2c5db3f7 824}
825
d1fba58e 826// get server with least outstanding queries, and within those, with the lowest order, and within those: the fastest
497a6e3a 827shared_ptr<DownstreamState> leastOutstanding(const NumberedServerVector& servers, const DNSQuestion* dq)
c9262563 828{
886e2cf2
RG
829 if (servers.size() == 1 && servers[0].second->isUp()) {
830 return servers[0].second;
831 }
832
d1fba58e 833 vector<pair<tuple<int,int,double>, shared_ptr<DownstreamState>>> poss;
834 /* so you might wonder, why do we go through this trouble? The data on which we sort could change during the sort,
835 which would suck royally and could even lead to crashes. So first we snapshot on what we sort, and then we sort */
478ce172 836 poss.reserve(servers.size());
0e41337b 837 for(auto& d : servers) {
da4e7813 838 if(d.second->isUp()) {
d1fba58e 839 poss.push_back({make_tuple(d.second->outstanding.load(), d.second->order, d.second->latencyUsec), d.second});
c9262563 840 }
841 }
842 if(poss.empty())
843 return shared_ptr<DownstreamState>();
844 nth_element(poss.begin(), poss.begin(), poss.end(), [](const decltype(poss)::value_type& a, const decltype(poss)::value_type& b) { return a.first < b.first; });
845 return poss.begin()->second;
846}
847
497a6e3a 848shared_ptr<DownstreamState> valrandom(unsigned int val, const NumberedServerVector& servers, const DNSQuestion* dq)
c9262563 849{
850 vector<pair<int, shared_ptr<DownstreamState>>> poss;
278403d3
DM
851 int sum = 0;
852 int max = std::numeric_limits<int>::max();
853
22b2b326 854 for(auto& d : servers) { // w=1, w=10 -> 1, 11
da4e7813 855 if(d.second->isUp()) {
278403d3
DM
856 // Don't overflow sum when adding high weights
857 if(d.second->weight > max - sum) {
858 sum = max;
859 } else {
860 sum += d.second->weight;
861 }
862
da4e7813 863 poss.push_back({sum, d.second});
c9262563 864 }
865 }
d2894425
NJ
866
867 // Catch poss & sum are empty to avoid SIGFPE
868 if(poss.empty())
869 return shared_ptr<DownstreamState>();
870
a7f3108c 871 int r = val % sum;
af619119 872 auto p = upper_bound(poss.begin(), poss.end(),r, [](int r_, const decltype(poss)::value_type& a) { return r_ < a.first;});
c9262563 873 if(p==poss.end())
874 return shared_ptr<DownstreamState>();
875 return p->second;
876}
877
497a6e3a 878shared_ptr<DownstreamState> wrandom(const NumberedServerVector& servers, const DNSQuestion* dq)
a7f3108c 879{
497a6e3a 880 return valrandom(random(), servers, dq);
a7f3108c 881}
882
/* Value passed to dq->qname->hash() by the hash-based policies (whashed, chashed). */
36e763fa 883uint32_t g_hashperturb;
/* Factor applied by chashed() to the average number of outstanding queries to compute
   the load above which a backend is skipped. The bound is only applied when this is > 0. */
2b4287d4 884double g_consistentHashBalancingFactor = 0;
497a6e3a 885shared_ptr<DownstreamState> whashed(const NumberedServerVector& servers, const DNSQuestion* dq)
a7f3108c 886{
497a6e3a 887 return valrandom(dq->qname->hash(g_hashperturb), servers, dq);
a7f3108c 888}
889
1720247e
CHB
890shared_ptr<DownstreamState> chashed(const NumberedServerVector& servers, const DNSQuestion* dq)
891{
1720247e 892 unsigned int qhash = dq->qname->hash(g_hashperturb);
b712c51e
CHB
893 unsigned int sel = std::numeric_limits<unsigned int>::max();
894 unsigned int min = std::numeric_limits<unsigned int>::max();
895 shared_ptr<DownstreamState> ret = nullptr, first = nullptr;
1720247e 896
2b4287d4 897 double targetLoad = std::numeric_limits<double>::max();
80ee5144 898 if (g_consistentHashBalancingFactor > 0) {
2b4287d4
RG
899 /* we start with one, representing the query we are currently handling */
900 double currentLoad = 1;
901 for (const auto& pair : servers) {
902 currentLoad += pair.second->outstanding;
903 }
904 targetLoad = (currentLoad / servers.size()) * g_consistentHashBalancingFactor;
905 }
906
1720247e 907 for (const auto& d: servers) {
2b4287d4 908 if (d.second->isUp() && d.second->outstanding <= targetLoad) {
93ca495f 909 // make sure hashes have been computed
d58e616a
CHB
910 if (d.second->hashes.empty()) {
911 d.second->hash();
912 }
913 {
914 ReadLock rl(&(d.second->d_lock));
93ca495f
CHB
915 const auto& server = d.second;
916 // we want to keep track of the last hash
b712c51e
CHB
917 if (min > *(server->hashes.begin())) {
918 min = *(server->hashes.begin());
919 first = server;
93ca495f 920 }
b712c51e
CHB
921
922 auto hash_it = server->hashes.lower_bound(qhash);
923 if (hash_it != server->hashes.end()) {
924 if (*hash_it < sel) {
93ca495f
CHB
925 sel = *hash_it;
926 ret = server;
927 }
d58e616a 928 }
1720247e
CHB
929 }
930 }
931 }
93ca495f
CHB
932 if (ret != nullptr) {
933 return ret;
1720247e 934 }
b712c51e
CHB
935 if (first != nullptr) {
936 return first;
1720247e 937 }
93ca495f 938 return shared_ptr<DownstreamState>();
1720247e 939}
a7f3108c 940
497a6e3a 941shared_ptr<DownstreamState> roundrobin(const NumberedServerVector& servers, const DNSQuestion* dq)
5f504638 942{
da4e7813 943 NumberedServerVector poss;
c9262563 944
22b2b326 945 for(auto& d : servers) {
da4e7813 946 if(d.second->isUp()) {
5f504638 947 poss.push_back(d);
c9262563 948 }
5f504638 949 }
c9262563 950
ecbe9133 951 const auto *res=&poss;
32b86928 952 if(poss.empty() && !g_roundrobinFailOnNoServer)
ecbe9133 953 res = &servers;
c9262563 954
955 if(res->empty())
956 return shared_ptr<DownstreamState>();
957
958 static unsigned int counter;
959
da4e7813 960 return (*res)[(counter++) % res->size()].second;
5f504638 961}
962
/* NOTE(review): defaults to 127.0.0.1:5199; presumably the address of the control
   (console) socket -- confirm against the code using it. */
e6841c9e 963ComboAddress g_serverControl{"127.0.0.1:5199"};
b076b34a 964
886e2cf2
RG
965std::shared_ptr<ServerPool> createPoolIfNotExists(pools_t& pools, const string& poolName)
966{
967 std::shared_ptr<ServerPool> pool;
968 pools_t::iterator it = pools.find(poolName);
969 if (it != pools.end()) {
970 pool = it->second;
971 }
972 else {
8f4f5186
RG
973 if (!poolName.empty())
974 vinfolog("Creating pool %s", poolName);
886e2cf2
RG
975 pool = std::make_shared<ServerPool>();
976 pools.insert(std::pair<std::string,std::shared_ptr<ServerPool> >(poolName, pool));
977 }
978 return pool;
979}
78c8047b 980
742c079a
RG
981void setPoolPolicy(pools_t& pools, const string& poolName, std::shared_ptr<ServerPolicy> policy)
982{
983 std::shared_ptr<ServerPool> pool = createPoolIfNotExists(pools, poolName);
984 if (!poolName.empty()) {
985 vinfolog("Setting pool %s server selection policy to %s", poolName, policy->name);
986 } else {
987 vinfolog("Setting default pool server selection policy to %s", policy->name);
988 }
989 pool->policy = policy;
990}
991
886e2cf2 992void addServerToPool(pools_t& pools, const string& poolName, std::shared_ptr<DownstreamState> server)
22b2b326 993{
886e2cf2 994 std::shared_ptr<ServerPool> pool = createPoolIfNotExists(pools, poolName);
c218b223 995 if (!poolName.empty()) {
8f4f5186 996 vinfolog("Adding server to pool %s", poolName);
c218b223 997 } else {
8f4f5186 998 vinfolog("Adding server to default pool");
c218b223 999 }
a1b1a29d 1000 pool->addServer(server);
886e2cf2
RG
1001}
1002
1003void removeServerFromPool(pools_t& pools, const string& poolName, std::shared_ptr<DownstreamState> server)
1004{
8f4f5186 1005 std::shared_ptr<ServerPool> pool = getPool(pools, poolName);
886e2cf2 1006
c218b223 1007 if (!poolName.empty()) {
8f4f5186 1008 vinfolog("Removing server from pool %s", poolName);
c218b223 1009 }
1010 else {
8f4f5186 1011 vinfolog("Removing server from default pool");
c218b223 1012 }
886e2cf2 1013
a1b1a29d 1014 pool->removeServer(server);
886e2cf2
RG
1015}
1016
1017std::shared_ptr<ServerPool> getPool(const pools_t& pools, const std::string& poolName)
1018{
1019 pools_t::const_iterator it = pools.find(poolName);
1020
1021 if (it == pools.end()) {
1022 throw std::out_of_range("No pool named " + poolName);
1023 }
1024
1025 return it->second;
1026}
1027
a1b1a29d 1028NumberedServerVector getDownstreamCandidates(const pools_t& pools, const std::string& poolName)
886e2cf2
RG
1029{
1030 std::shared_ptr<ServerPool> pool = getPool(pools, poolName);
a1b1a29d 1031 return pool->getServers();
22b2b326 1032}
c9262563 1033
202c4ab9 1034static void spoofResponseFromString(DNSQuestion& dq, const string& spoofContent, bool raw)
7791f83a
RG
1035{
1036 string result;
614239d0 1037
202c4ab9
RG
1038 if (raw) {
1039 SpoofAction sa(spoofContent);
1040 sa(&dq, &result);
1041 }
1042 else {
1043 std::vector<std::string> addrs;
1044 stringtok(addrs, spoofContent, " ,");
614239d0 1045
202c4ab9 1046 if (addrs.size() == 1) {
614239d0 1047 try {
202c4ab9
RG
1048 ComboAddress spoofAddr(spoofContent);
1049 SpoofAction sa({spoofAddr});
1050 sa(&dq, &result);
614239d0 1051 }
202c4ab9
RG
1052 catch(const PDNSException &e) {
1053 DNSName cname(spoofContent);
1054 SpoofAction sa(cname); // CNAME then
1055 sa(&dq, &result);
614239d0 1056 }
202c4ab9
RG
1057 } else {
1058 std::vector<ComboAddress> cas;
1059 for (const auto& addr : addrs) {
1060 try {
1061 cas.push_back(ComboAddress(addr));
1062 }
1063 catch (...) {
1064 }
1065 }
1066 SpoofAction sa(cas);
1067 sa(&dq, &result);
614239d0 1068 }
7791f83a
RG
1069 }
1070}
1071
2a28db86
RG
1072bool processRulesResult(const DNSAction::Action& action, DNSQuestion& dq, std::string& ruleresult, bool& drop)
1073{
1074 switch(action) {
1075 case DNSAction::Action::Allow:
1076 return true;
1077 break;
1078 case DNSAction::Action::Drop:
1079 ++g_stats.ruleDrop;
1080 drop = true;
1081 return true;
1082 break;
1083 case DNSAction::Action::Nxdomain:
1084 dq.dh->rcode = RCode::NXDomain;
1085 dq.dh->qr=true;
1086 ++g_stats.ruleNXDomain;
1087 return true;
1088 break;
1089 case DNSAction::Action::Refused:
1090 dq.dh->rcode = RCode::Refused;
1091 dq.dh->qr=true;
1092 ++g_stats.ruleRefused;
1093 return true;
1094 break;
1095 case DNSAction::Action::ServFail:
1096 dq.dh->rcode = RCode::ServFail;
1097 dq.dh->qr=true;
1098 ++g_stats.ruleServFail;
1099 return true;
1100 break;
1101 case DNSAction::Action::Spoof:
202c4ab9
RG
1102 spoofResponseFromString(dq, ruleresult, false);
1103 return true;
1104 break;
1105 case DNSAction::Action::SpoofRaw:
1106 spoofResponseFromString(dq, ruleresult, true);
2a28db86
RG
1107 return true;
1108 break;
1109 case DNSAction::Action::Truncate:
1110 dq.dh->tc = true;
1111 dq.dh->qr = true;
955b9377
RG
1112 dq.dh->ra = dq.dh->rd;
1113 dq.dh->aa = false;
1114 dq.dh->ad = false;
2a28db86
RG
1115 return true;
1116 break;
1117 case DNSAction::Action::HeaderModify:
1118 return true;
1119 break;
1120 case DNSAction::Action::Pool:
1121 dq.poolname=ruleresult;
1122 return true;
1123 break;
1124 case DNSAction::Action::NoRecurse:
1125 dq.dh->rd = false;
1126 return true;
1127 break;
1128 /* non-terminal actions follow */
1129 case DNSAction::Action::Delay:
1130 dq.delayMsec = static_cast<int>(pdns_stou(ruleresult)); // sorry
1131 break;
1132 case DNSAction::Action::None:
1133 /* fall-through */
1134 case DNSAction::Action::NoOp:
1135 break;
1136 }
1137
1138 /* false means that we don't stop the processing */
1139 return false;
1140}
1141
1142
1143static bool applyRulesToQuery(LocalHolders& holders, DNSQuestion& dq, const struct timespec& now)
e91084ce 1144{
4ab01344 1145 g_rings.insertQuery(now, *dq.remote, *dq.qname, dq.qtype, dq.len, *dq.dh);
e91084ce 1146
786e4d8c 1147 if(g_qcount.enabled) {
348ef1c6 1148 string qname = (*dq.qname).toLogString();
786e4d8c
RS
1149 bool countQuery{true};
1150 if(g_qcount.filter) {
1151 std::lock_guard<std::mutex> lock(g_luamutex);
dd1a3034 1152 std::tie (countQuery, qname) = g_qcount.filter(&dq);
786e4d8c
RS
1153 }
1154
1155 if(countQuery) {
1156 WriteLock wl(&g_qcount.queryLock);
1157 if(!g_qcount.records.count(qname)) {
1158 g_qcount.records[qname] = 0;
1159 }
1160 g_qcount.records[qname]++;
1161 }
1162 }
1163
0beaa5c8 1164 if(auto got = holders.dynNMGBlock->lookup(*dq.remote)) {
701f690b 1165 auto updateBlockStats = [&got]() {
cb167afd 1166 ++g_stats.dynBlocked;
701f690b 1167 got->second.blocks++;
1168 };
1169
e91084ce 1170 if(now < got->second.until) {
7b925432
RG
1171 DNSAction::Action action = got->second.action;
1172 if (action == DNSAction::Action::None) {
1173 action = g_dynBlockAction;
1174 }
477c86a0
RG
1175 switch (action) {
1176 case DNSAction::Action::NoOp:
1177 /* do nothing */
1178 break;
79ee8ff9
RG
1179
1180 case DNSAction::Action::Nxdomain:
1181 vinfolog("Query from %s turned into NXDomain because of dynamic block", dq.remote->toStringWithPort());
1182 updateBlockStats();
1183
1184 dq.dh->rcode = RCode::NXDomain;
1185 dq.dh->qr=true;
1186 return true;
1187
477c86a0 1188 case DNSAction::Action::Refused:
dd46e5e3 1189 vinfolog("Query from %s refused because of dynamic block", dq.remote->toStringWithPort());
701f690b 1190 updateBlockStats();
8477236d 1191
dd46e5e3 1192 dq.dh->rcode = RCode::Refused;
79ee8ff9 1193 dq.dh->qr = true;
dd46e5e3 1194 return true;
477c86a0
RG
1195
1196 case DNSAction::Action::Truncate:
8477236d 1197 if(!dq.tcp) {
701f690b 1198 updateBlockStats();
8477236d 1199 vinfolog("Query from %s truncated because of dynamic block", dq.remote->toStringWithPort());
1200 dq.dh->tc = true;
1201 dq.dh->qr = true;
955b9377
RG
1202 dq.dh->ra = dq.dh->rd;
1203 dq.dh->aa = false;
1204 dq.dh->ad = false;
8477236d 1205 return true;
1206 }
1207 else {
348ef1c6 1208 vinfolog("Query from %s for %s over TCP *not* truncated because of dynamic block", dq.remote->toStringWithPort(), dq.qname->toLogString());
8477236d 1209 }
477c86a0 1210 break;
3d60b39a 1211 case DNSAction::Action::NoRecurse:
1212 updateBlockStats();
1213 vinfolog("Query from %s setting rd=0 because of dynamic block", dq.remote->toStringWithPort());
1214 dq.dh->rd = false;
1215 return true;
477c86a0 1216 default:
701f690b 1217 updateBlockStats();
dd46e5e3
RG
1218 vinfolog("Query from %s dropped because of dynamic block", dq.remote->toStringWithPort());
1219 return false;
1220 }
e91084ce
RG
1221 }
1222 }
1223
0beaa5c8 1224 if(auto got = holders.dynSMTBlock->lookup(*dq.qname)) {
701f690b 1225 auto updateBlockStats = [&got]() {
cb167afd 1226 ++g_stats.dynBlocked;
701f690b 1227 got->blocks++;
1228 };
1229
71c94675 1230 if(now < got->until) {
7b925432
RG
1231 DNSAction::Action action = got->action;
1232 if (action == DNSAction::Action::None) {
1233 action = g_dynBlockAction;
1234 }
477c86a0
RG
1235 switch (action) {
1236 case DNSAction::Action::NoOp:
1237 /* do nothing */
1238 break;
79ee8ff9 1239 case DNSAction::Action::Nxdomain:
348ef1c6 1240 vinfolog("Query from %s for %s turned into NXDomain because of dynamic block", dq.remote->toStringWithPort(), dq.qname->toLogString());
79ee8ff9
RG
1241 updateBlockStats();
1242
1243 dq.dh->rcode = RCode::NXDomain;
1244 dq.dh->qr=true;
1245 return true;
477c86a0 1246 case DNSAction::Action::Refused:
348ef1c6 1247 vinfolog("Query from %s for %s refused because of dynamic block", dq.remote->toStringWithPort(), dq.qname->toLogString());
701f690b 1248 updateBlockStats();
8477236d 1249
dd46e5e3
RG
1250 dq.dh->rcode = RCode::Refused;
1251 dq.dh->qr=true;
1252 return true;
477c86a0 1253 case DNSAction::Action::Truncate:
8477236d 1254 if(!dq.tcp) {
701f690b 1255 updateBlockStats();
8477236d 1256
348ef1c6 1257 vinfolog("Query from %s for %s truncated because of dynamic block", dq.remote->toStringWithPort(), dq.qname->toLogString());
8477236d 1258 dq.dh->tc = true;
1259 dq.dh->qr = true;
955b9377
RG
1260 dq.dh->ra = dq.dh->rd;
1261 dq.dh->aa = false;
1262 dq.dh->ad = false;
8477236d 1263 return true;
1264 }
1265 else {
348ef1c6 1266 vinfolog("Query from %s for %s over TCP *not* truncated because of dynamic block", dq.remote->toStringWithPort(), dq.qname->toLogString());
8477236d 1267 }
477c86a0 1268 break;
3d60b39a 1269 case DNSAction::Action::NoRecurse:
1270 updateBlockStats();
1271 vinfolog("Query from %s setting rd=0 because of dynamic block", dq.remote->toStringWithPort());
1272 dq.dh->rd = false;
1273 return true;
477c86a0 1274 default:
701f690b 1275 updateBlockStats();
348ef1c6 1276 vinfolog("Query from %s for %s dropped because of dynamic block", dq.remote->toStringWithPort(), dq.qname->toLogString());
dd46e5e3
RG
1277 return false;
1278 }
71c94675 1279 }
1280 }
1281
e91084ce
RG
1282 DNSAction::Action action=DNSAction::Action::None;
1283 string ruleresult;
2a28db86 1284 bool drop = false;
0beaa5c8 1285 for(const auto& lr : *holders.rulactions) {
4d5959e6
RG
1286 if(lr.d_rule->matches(&dq)) {
1287 lr.d_rule->d_matches++;
1288 action=(*lr.d_action)(&dq, &ruleresult);
2a28db86 1289 if (processRulesResult(action, dq, ruleresult, drop)) {
3d60b39a 1290 break;
e91084ce
RG
1291 }
1292 }
1293 }
1294
2a28db86
RG
1295 if (drop) {
1296 return false;
1297 }
1298
e91084ce
RG
1299 return true;
1300}
1301
fbf14b03 1302ssize_t udpClientSendRequestToBackend(const std::shared_ptr<DownstreamState>& ss, const int sd, const char* request, const size_t requestLen, bool healthCheck)
fbe2a2e0 1303{
b58f08e5
RG
1304 ssize_t result;
1305
fbe2a2e0 1306 if (ss->sourceItf == 0) {
b58f08e5
RG
1307 result = send(sd, request, requestLen, 0);
1308 }
1309 else {
1310 struct msghdr msgh;
1311 struct iovec iov;
7bec330a 1312 cmsgbuf_aligned cbuf;
a2353842 1313 ComboAddress remote(ss->remote);
7bec330a
OM
1314 fillMSGHdr(&msgh, &iov, &cbuf, sizeof(cbuf), const_cast<char*>(request), requestLen, &remote);
1315 addCMsgSrcAddr(&msgh, &cbuf, &ss->sourceAddr, ss->sourceItf);
b58f08e5 1316 result = sendmsg(sd, &msgh, 0);
fbe2a2e0
RG
1317 }
1318
b58f08e5
RG
1319 if (result == -1) {
1320 int savederrno = errno;
1321 vinfolog("Error sending request to backend %s: %d", ss->remote.toStringWithPort(), savederrno);
1322
1323 /* This might sound silly, but on Linux send() might fail with EINVAL
1b126225
RG
1324 if the interface the socket was bound to doesn't exist anymore.
1325 We don't want to reconnect the real socket if the healthcheck failed,
1326 because it's not using the same socket.
1327 */
1328 if (!healthCheck && (savederrno == EINVAL || savederrno == ENODEV)) {
b58f08e5
RG
1329 ss->reconnect();
1330 }
fbe2a2e0
RG
1331 }
1332
b58f08e5 1333 return result;
fbe2a2e0
RG
1334}
1335
0beaa5c8 1336static bool isUDPQueryAcceptable(ClientState& cs, LocalHolders& holders, const struct msghdr* msgh, const ComboAddress& remote, ComboAddress& dest)
24d5cb00 1337{
0beaa5c8
RG
1338 if (msgh->msg_flags & MSG_TRUNC) {
1339 /* message was too large for our buffer */
1340 vinfolog("Dropping message too large for our buffer");
cb167afd 1341 ++g_stats.nonCompliantQueries;
0beaa5c8
RG
1342 return false;
1343 }
a4652d55 1344
0beaa5c8
RG
1345 if(!holders.acl->match(remote)) {
1346 vinfolog("Query from %s dropped because of ACL", remote.toStringWithPort());
cb167afd 1347 ++g_stats.aclDrops;
0beaa5c8 1348 return false;
2b3eefc3 1349 }
2b3eefc3 1350
0beaa5c8 1351 cs.queries++;
cb167afd 1352 ++g_stats.queries;
2b3eefc3 1353
0beaa5c8
RG
1354 if (HarvestDestinationAddress(msgh, &dest)) {
1355 /* we don't get the port, only the address */
1356 dest.sin4.sin_port = cs.local.sin4.sin_port;
1357 }
1358 else {
1359 dest.sin4.sin_family = 0;
2b3eefc3 1360 }
549d63c9 1361
0beaa5c8
RG
1362 return true;
1363}
1364
4ab01344 1365boost::optional<std::vector<uint8_t>> checkDNSCryptQuery(const ClientState& cs, const char* query, uint16_t& len, std::shared_ptr<DNSCryptQuery>& dnsCryptQuery, time_t now, bool tcp)
0beaa5c8
RG
1366{
1367 if (cs.dnscryptCtx) {
7129b5c4 1368#ifdef HAVE_DNSCRYPT
0beaa5c8
RG
1369 vector<uint8_t> response;
1370 uint16_t decryptedQueryLen = 0;
2b3eefc3 1371
43234e76 1372 dnsCryptQuery = std::make_shared<DNSCryptQuery>(cs.dnscryptCtx);
0beaa5c8 1373
4ab01344 1374 bool decrypted = handleDNSCryptQuery(const_cast<char*>(query), len, dnsCryptQuery, &decryptedQueryLen, tcp, now, response);
0beaa5c8
RG
1375
1376 if (!decrypted) {
1377 if (response.size() > 0) {
4ab01344 1378 return response;
2b3eefc3 1379 }
4ab01344 1380 throw std::runtime_error("Unable to decrypt DNSCrypt query, dropping.");
0beaa5c8 1381 }
2b3eefc3 1382
0beaa5c8 1383 len = decryptedQueryLen;
7129b5c4 1384#endif /* HAVE_DNSCRYPT */
0beaa5c8 1385 }
4ab01344 1386 return boost::none;
0beaa5c8 1387}
2b3eefc3 1388
0beaa5c8
RG
1389bool checkQueryHeaders(const struct dnsheader* dh)
1390{
1391 if (dh->qr) { // don't respond to responses
cb167afd 1392 ++g_stats.nonCompliantQueries;
0beaa5c8
RG
1393 return false;
1394 }
2b3eefc3 1395
0beaa5c8 1396 if (dh->qdcount == 0) {
cb167afd 1397 ++g_stats.emptyQueries;
0beaa5c8
RG
1398 return false;
1399 }
0ba5eecf 1400
0beaa5c8 1401 if (dh->rd) {
cb167afd 1402 ++g_stats.rdQueries;
0beaa5c8 1403 }
e91084ce 1404
0beaa5c8
RG
1405 return true;
1406}
963bef8d 1407
#if defined(HAVE_RECVMMSG) && defined(HAVE_SENDMMSG) && defined(MSG_WAITFORONE)
/* Fill 'outMsg' so that 'response' gets sent to 'remote'. When 'dest' holds an
   address (non-zero family) it is added as ancillary data using 'cbuf';
   otherwise the message carries no ancillary data at all. */
static void queueResponse(const ClientState& cs, const char* response, uint16_t responseLen, const ComboAddress& dest, const ComboAddress& remote, struct mmsghdr& outMsg, struct iovec* iov, cmsgbuf_aligned* cbuf)
{
  outMsg.msg_len = 0;
  fillMSGHdr(&outMsg.msg_hdr, iov, nullptr, 0, const_cast<char*>(response), responseLen, const_cast<ComboAddress*>(&remote));

  const bool destinationKnown = (dest.sin4.sin_family != 0);
  if (destinationKnown) {
    addCMsgSrcAddr(&outMsg.msg_hdr, cbuf, &dest, 0);
  }
  else {
    outMsg.msg_hdr.msg_control = nullptr;
  }
}
#endif /* defined(HAVE_RECVMMSG) && defined(HAVE_SENDMMSG) && defined(MSG_WAITFORONE) */
43eeadc1 1422
3e425868
RG
1423/* self-generated responses or cache hits */
1424static bool prepareOutgoingResponse(LocalHolders& holders, ClientState& cs, DNSQuestion& dq, bool cacheHit)
54aaa82b 1425{
3e425868 1426 DNSResponse dr(dq.qname, dq.qtype, dq.qclass, dq.consumed, dq.local, dq.remote, reinterpret_cast<dnsheader*>(dq.dh), dq.size, dq.len, dq.tcp, dq.queryTime);
4ab01344 1427
54aaa82b 1428#ifdef HAVE_PROTOBUF
1429 dr.uniqueId = dq.uniqueId;
1430#endif
1431 dr.qTag = dq.qTag;
3e425868 1432 dr.delayMsec = dq.delayMsec;
54aaa82b 1433
3e425868
RG
1434 if (!applyRulesToResponse(cacheHit ? holders.cacheHitRespRulactions : holders.selfAnsweredRespRulactions, dr)) {
1435 return false;
54aaa82b 1436 }
1437
3e425868
RG
1438 /* in case a rule changed it */
1439 dq.delayMsec = dr.delayMsec;
1440
54aaa82b 1441#ifdef HAVE_DNSCRYPT
7129b5c4 1442 if (!cs.muted) {
3e425868
RG
1443 if (!encryptResponse(reinterpret_cast<char*>(dq.dh), &dq.len, dq.size, dq.tcp, dq.dnsCryptQuery, nullptr, nullptr)) {
1444 return false;
54aaa82b 1445 }
54aaa82b 1446 }
7129b5c4 1447#endif /* HAVE_DNSCRYPT */
54aaa82b 1448
389d903a
RG
1449 if (cacheHit) {
1450 ++g_stats.cacheHits;
1451 }
3e425868 1452
61d10a4d
MH
1453 switch (dr.dh->rcode) {
1454 case RCode::NXDomain:
1455 ++g_stats.frontendNXDomain;
1456 break;
1457 case RCode::ServFail:
1458 ++g_stats.frontendServFail;
1459 break;
1460 case RCode::NoError:
1461 ++g_stats.frontendNoError;
1462 break;
1463 }
3e425868 1464
54aaa82b 1465 doLatencyStats(0); // we're not going to measure this
3e425868 1466 return true;
54aaa82b 1467}
1468
/* Process a parsed query, regardless of the transport it came over.

   The query first goes through applyRulesToQuery(). Then, in that order:
   - if something turned it into a response, the response is finalized and SendAnswer is returned;
   - a backend is selected using the policy of the pool, or the global one if the pool has none;
   - the packet cache of the pool, if any, is looked up and a hit results in SendAnswer;
   - if no backend was selected, the query is answered with ServFail (SendAnswer) when
     g_servFailOnNoPolicy is set, and dropped otherwise;
   - otherwise 'selectedBackend' holds the backend to use and PassToBackend is returned.

   Drop is returned whenever a step decides the query or the answer must not go any further,
   and when a std::exception is caught, after logging it. */
3e425868 1469ProcessQueryResult processQuery(DNSQuestion& dq, ClientState& cs, LocalHolders& holders, std::shared_ptr<DownstreamState>& selectedBackend)
0beaa5c8 1470{
/* read upfront, it is only used when logging an error */
4ab01344 1471 const uint16_t queryId = ntohs(dq.dh->id);
2b3eefc3 1472
0beaa5c8 1473 try {
43234e76
RG
1474 /* we need an accurate ("real") value for the response and
1475 to store into the IDS, but not for insertion into the
1476 rings for example */
43234e76
RG
1477 struct timespec now;
1478 gettime(&now);
2efd427d 1479
2a28db86 1480 if (!applyRulesToQuery(holders, dq, now)) {
3e425868 1481 return ProcessQueryResult::Drop;
0beaa5c8 1482 }
b63add03 1483
0beaa5c8 1484 if(dq.dh->qr) { // something turned it into a response
4ab01344 1485 fixUpQueryTurnedResponse(dq, dq.origFlags);
0beaa5c8 1486
3e425868
RG
1487 if (!prepareOutgoingResponse(holders, cs, dq, false)) {
1488 return ProcessQueryResult::Drop;
22b2b326 1489 }
5f504638 1490
3e425868 1491 ++g_stats.selfAnswered;
7fc95193 1492 ++cs.responses;
3e425868 1493 return ProcessQueryResult::SendAnswer;
0beaa5c8
RG
1494 }
1495
/* getPool() throws std::out_of_range if the pool does not exist, which is caught below */
2a28db86 1496 std::shared_ptr<ServerPool> serverPool = getPool(*holders.pools, dq.poolname);
4ab01344 1497 dq.packetCache = serverPool->packetCache;
/* the policy of the pool, if any, takes precedence over the global one */
a1b1a29d 1498 auto policy = *(holders.policy);
0beaa5c8 1499 if (serverPool->policy != nullptr) {
a1b1a29d 1500 policy = *(serverPool->policy);
0beaa5c8 1501 }
a1b1a29d
RG
1502 auto servers = serverPool->getServers();
/* Lua policies are only called with the Lua mutex held */
1503 if (policy.isLua) {
0beaa5c8 1504 std::lock_guard<std::mutex> lock(g_luamutex);
3e425868 1505 selectedBackend = policy.policy(servers, &dq);
a1b1a29d
RG
1506 }
1507 else {
3e425868 1508 selectedBackend = policy.policy(servers, &dq);
0beaa5c8 1509 }
228e4fe8 1510
9837850d 1511 uint16_t cachedResponseSize = dq.size;
/* g_staleCacheEntriesTTL is only passed to the cache when no backend was selected */
3e425868 1512 uint32_t allowExpired = selectedBackend ? 0 : g_staleCacheEntriesTTL;
9837850d 1513
4ab01344
RG
1514 if (dq.packetCache && !dq.skipCache) {
1515 dq.dnssecOK = (getEDNSZ(dq) & EDNS_HEADER_FLAG_DO);
1ef18cab 1516 }
1517
3e425868 1518 if (dq.useECS && ((selectedBackend && selectedBackend->useECS) || (!selectedBackend && serverPool->getECS()))) {
389d903a 1519 // we special case our cache in case a downstream explicitly gave us a universally valid response with a 0 scope
2d0cefab
RG
1520 // we need ECS parsing (parseECS) to be true so we can be sure that the initial incoming query did not have an existing
1521 // ECS option, which would make it unsuitable for the zero-scope feature.
3e425868 1522 if (dq.packetCache && !dq.skipCache && (!selectedBackend || !selectedBackend->disableZeroScope) && dq.packetCache->isECSParsingEnabled()) {
4ab01344 1523 if (dq.packetCache->get(dq, dq.consumed, dq.dh->id, reinterpret_cast<char*>(dq.dh), &cachedResponseSize, &dq.cacheKeyNoECS, dq.subnet, dq.dnssecOK, allowExpired)) {
3e425868
RG
1524 dq.len = cachedResponseSize;
1525
1526 if (!prepareOutgoingResponse(holders, cs, dq, true)) {
1527 return ProcessQueryResult::Drop;
1528 }
1529
1530 return ProcessQueryResult::SendAnswer;
389d903a
RG
1531 }
1532
4ab01344 1533 if (!dq.subnet) {
389d903a 1534 /* there was no existing ECS on the query, enable the zero-scope feature */
4ab01344 1535 dq.useZeroScope = true;
389d903a 1536 }
9837850d 1537 }
389d903a 1538
be90d6bd 1539 if (!handleEDNSClientSubnet(dq, dq.ednsAdded, dq.ecsAdded, g_preserveTrailingData)) {
4ab01344 1540 vinfolog("Dropping query from %s because we couldn't insert the ECS value", dq.remote->toStringWithPort());
3e425868 1541 return ProcessQueryResult::Drop;
886e2cf2 1542 }
0beaa5c8 1543 }
886e2cf2 1544
/* regular cache lookup, done after the ECS handling above, using dq.cacheKey */
4ab01344
RG
1545 if (dq.packetCache && !dq.skipCache) {
1546 if (dq.packetCache->get(dq, dq.consumed, dq.dh->id, reinterpret_cast<char*>(dq.dh), &cachedResponseSize, &dq.cacheKey, dq.subnet, dq.dnssecOK, allowExpired)) {
3e425868
RG
1547 dq.len = cachedResponseSize;
1548
1549 if (!prepareOutgoingResponse(holders, cs, dq, true)) {
1550 return ProcessQueryResult::Drop;
1551 }
1552
1553 return ProcessQueryResult::SendAnswer;
886e2cf2 1554 }
cb167afd 1555 ++g_stats.cacheMisses;
0beaa5c8 1556 }
886e2cf2 1557
/* no cache hit and no backend: either answer with ServFail or drop the query */
3e425868 1558 if(!selectedBackend) {
cb167afd 1559 ++g_stats.noPolicy;
26a3cdb7 1560
348ef1c6 1561 vinfolog("%s query for %s|%s from %s, no policy applied", g_servFailOnNoPolicy ? "ServFailed" : "Dropped", dq.qname->toLogString(), QType(dq.qtype).getName(), dq.remote->toStringWithPort());
3e425868 1562 if (g_servFailOnNoPolicy) {
4ab01344 1563 restoreFlags(dq.dh, dq.origFlags);
26a3cdb7 1564
0beaa5c8
RG
1565 dq.dh->rcode = RCode::ServFail;
1566 dq.dh->qr = true;
26a3cdb7 1567
3e425868
RG
1568 if (!prepareOutgoingResponse(holders, cs, dq, false)) {
1569 return ProcessQueryResult::Drop;
1570 }
be6c318f 1571 // no response-only statistics counter to update.
3e425868 1572 return ProcessQueryResult::SendAnswer;
1ea747c0 1573 }
3e425868
RG
1574
1575 return ProcessQueryResult::Drop;
0beaa5c8 1576 }
1ea747c0 1577
3e425868
RG
1578 if (dq.addXPF && selectedBackend->xpfRRCode != 0) {
1579 addXPF(dq, selectedBackend->xpfRRCode, g_preserveTrailingData);
5cc8371b
RG
1580 }
1581
3e425868
RG
1582 selectedBackend->queries++;
1583 return ProcessQueryResult::PassToBackend;
4ab01344
RG
1584 }
1585 catch(const std::exception& e){
1586 vinfolog("Got an error while parsing a %s query from %s, id %d: %s", (dq.tcp ? "TCP" : "UDP"), dq.remote->toStringWithPort(), queryId, e.what());
4ab01344 1587 }
/* only reached after an exception has been caught */
3e425868 1588 return ProcessQueryResult::Drop;
4ab01344
RG
1589}
/* Handle one query received over UDP: check it, decrypt it if needed, parse it, run it
   through processQuery() and then either drop it, answer it or relay it to the selected backend.

   When 'responsesVect' is not null, 'queuedResponses', 'respIOV' and 'respCBuf' must be set
   as well (this is asserted). If sending several messages at once is supported, answers
   that are not delayed are then queued into 'responsesVect' instead of being sent right away.

   A std::exception raised along the way is logged and the query is abandoned. */
7bec330a 1591static void processUDPQuery(ClientState& cs, LocalHolders& holders, const struct msghdr* msgh, const ComboAddress& remote, ComboAddress& dest, char* query, uint16_t len, size_t queryBufferSize, struct mmsghdr* responsesVect, unsigned int* queuedResponses, struct iovec* respIOV, cmsgbuf_aligned* respCBuf)
4ab01344
RG
1592{
1593 assert(responsesVect == nullptr || (queuedResponses != nullptr && respIOV != nullptr && respCBuf != nullptr));
/* stays at 0 in the error message if we fail before the header is read */
1594 uint16_t queryId = 0;
1595
1596 try {
1597 if (!isUDPQueryAcceptable(cs, holders, msgh, remote, dest)) {
1598 return;
1599 }
1600
1601 /* we need an accurate ("real") value for the response and
1602 to store into the IDS, but not for insertion into the
1603 rings for example */
1604 struct timespec queryRealTime;
4ab01344
RG
1605 gettime(&queryRealTime, true);
1606
1607 std::shared_ptr<DNSCryptQuery> dnsCryptQuery = nullptr;
4ab01344
RG
1608 auto dnsCryptResponse = checkDNSCryptQuery(cs, query, len, dnsCryptQuery, queryRealTime.tv_sec, false);
/* the DNSCrypt layer produced a response of its own, send it and stop here */
1609 if (dnsCryptResponse) {
1610 sendUDPResponse(cs.udpFD, reinterpret_cast<char*>(dnsCryptResponse->data()), static_cast<uint16_t>(dnsCryptResponse->size()), 0, dest, remote);
1611 return;
1612 }
4ab01344
RG
1613
1614 struct dnsheader* dh = reinterpret_cast<struct dnsheader*>(query);
1615 queryId = ntohs(dh->id);
1616
1617 if (!checkQueryHeaders(dh)) {
1618 return;
1619 }
1620
1621 uint16_t qtype, qclass;
1622 unsigned int consumed = 0;
1623 DNSName qname(query, len, sizeof(dnsheader), false, &qtype, &qclass, &consumed);
/* the local address of the question is the harvested destination if we have one, the listening address otherwise */
1624 DNSQuestion dq(&qname, qtype, qclass, consumed, dest.sin4.sin_family != 0 ? &dest : &cs.local, &remote, dh, queryBufferSize, len, false, &queryRealTime);
1625 dq.dnsCryptQuery = std::move(dnsCryptQuery);
3e425868
RG
1626 std::shared_ptr<DownstreamState> ss{nullptr};
1627 auto result = processQuery(dq, cs, holders, ss);
4ab01344 1628
3e425868
RG
1629 if (result == ProcessQueryResult::Drop) {
1630 return;
1631 }
1632
1633 if (result == ProcessQueryResult::SendAnswer) {
4ab01344 1634#if defined(HAVE_RECVMMSG) && defined(HAVE_SENDMMSG) && defined(MSG_WAITFORONE)
/* only answers that are not delayed can be queued */
3e425868
RG
1635 if (dq.delayMsec == 0 && responsesVect != nullptr) {
1636 queueResponse(cs, reinterpret_cast<char*>(dq.dh), dq.len, *dq.local, *dq.remote, responsesVect[*queuedResponses], respIOV, respCBuf);
4ab01344
RG
1637 (*queuedResponses)++;
1638 return;
1639 }
1640#endif /* defined(HAVE_RECVMMSG) && defined(HAVE_SENDMMSG) && defined(MSG_WAITFORONE) */
6d192f51
RG
1641 /* we use dest, always, because we don't want to use the listening address to send a response since it could be 0.0.0.0 */
1642 sendUDPResponse(cs.udpFD, reinterpret_cast<char*>(dq.dh), dq.len, dq.delayMsec, dest, *dq.remote);
3e425868
RG
1643 return;
1644 }
4ab01344 1645
3e425868 1646 if (result != ProcessQueryResult::PassToBackend || ss == nullptr) {
4ab01344
RG
1647 return;
1648 }
11e1e08b 1649
/* pick the next state slot of the backend, wrapping around the size of the vector of states */
0beaa5c8
RG
1650 unsigned int idOffset = (ss->idOffset++) % ss->idStates.size();
1651 IDState* ids = &ss->idStates[idOffset];
1652 ids->age = 0;
0956c5c5 1653 DOHUnit* du = nullptr;
c9ba8478 1654
9bd1a882
RG
1655 /* that means that the state was in use, possibly with an allocated
1656 DOHUnit that we will need to handle, but we can't touch it before
1657 confirming that we now own this state */
311f19d5 1658 if (ids->isInUse()) {
0956c5c5
RG
1659 du = ids->du;
1660 }
c9ba8478 1661
a9489723 1662 /* we atomically replace the value, we now own this state */
311f19d5
RG
1663 if (!ids->markAsUsed()) {
1664 /* the state was not in use.
9bd1a882 1665 we reset 'du' because it might have still been in use when we read it. */
0956c5c5 1666 du = nullptr;
fbf14b03 1667 ++ss->outstanding;
71b86bd8 1668 }
0beaa5c8 1669 else {
9bd1a882
RG
1670 /* we are reusing a state, no change in outstanding but if there was an existing DOHUnit we need
1671 to handle it because it's about to be overwritten. */
a9489723 1672 ids->du = nullptr;
fbf14b03 1673 ++ss->reuseds;
cb167afd 1674 ++g_stats.downstreamTimeouts;
0956c5c5 1675 handleDOHTimeout(du);
0beaa5c8 1676 }
0e41337b 1677
/* remember what is needed to route the response back to the client, including the ID it used */
0beaa5c8 1678 ids->cs = &cs;
a9489723 1679 ids->origFD = cs.udpFD;
0beaa5c8 1680 ids->origID = dh->id;
d0ae6360 1681 setIDStateFromDNSQuestion(*ids, dq, std::move(qname));
0beaa5c8
RG
1682
1683 /* If we couldn't harvest the real dest addr, still
1684 write down the listening addr since it will be useful
1685 (especially if it's not an 'any' one).
1686 We need to keep track of which one it is since we may
1687 want to use the real but not the listening addr to reply.
1688 */
1689 if (dest.sin4.sin_family != 0) {
1690 ids->origDest = dest;
1691 ids->destHarvested = true;
1692 }
1693 else {
1694 ids->origDest = cs.local;
1695 ids->destHarvested = false;
1696 }
7129b5c4
/* the ID sent to the backend is the index of the state slot, the one from the client was saved in origID above */
1697
0beaa5c8 1698 dh->id = idOffset;
ca404e94 1699
38069e7e 1700 int fd = pickBackendSocketForSending(ss);
150105a2 1701 ssize_t ret = udpClientSendRequestToBackend(ss, fd, query, dq.len);
11e1e08b 1702
/* NOTE(review): on a send error only the counters are updated and the state stays marked as used;
   presumably it is reclaimed later like any query left unanswered -- confirm. */
0beaa5c8 1703 if(ret < 0) {
fbf14b03 1704 ++ss->sendErrors;
cb167afd 1705 ++g_stats.downstreamSendErrors;
0beaa5c8 1706 }
ca404e94 1707
348ef1c6 1708 vinfolog("Got query for %s|%s from %s, relayed to %s", ids->qname.toLogString(), QType(ids->qtype).getName(), remote.toStringWithPort(), ss->getName());
0beaa5c8
RG
1709 }
1710 catch(const std::exception& e){
1711 vinfolog("Got an error in UDP question thread while parsing a query from %s, id %d: %s", remote.toStringWithPort(), queryId, e.what());
1712 }
1713}
1714
#if defined(HAVE_RECVMMSG) && defined(HAVE_SENDMMSG) && defined(MSG_WAITFORONE)
/* UDP frontend loop batching the syscalls: up to g_udpVectorSize queries are read
   with a single recvmmsg() call, each one is passed to processUDPQuery(), and the
   responses that processUDPQuery() queued in 'outMsgVec' are sent with a single
   sendmmsg() call.

   cs:      the frontend whose udpFD is read from and written to
   holders: per-thread local holders, forwarded as-is to processUDPQuery()

   This function never returns. */
static void MultipleMessagesUDPClientThread(ClientState* cs, LocalHolders& holders)
{
  /* everything needed to receive one message, and to send its immediate answer */
  struct MMReceiver
  {
    char packet[s_maxPacketCacheEntrySize];
    ComboAddress remote;
    ComboAddress dest;
    struct iovec iov;
    /* used by HarvestDestinationAddress */
    cmsgbuf_aligned cbuf;
  };
  const size_t vectSize = g_udpVectorSize;
  /* the actual buffer is larger because:
     - we may have to add EDNS and/or ECS
     - we use it for self-generated responses (from rule or cache)
     but we only accept incoming payloads up to that size
  */
  static_assert(s_udpIncomingBufferSize <= sizeof(MMReceiver::packet), "the incoming buffer size should not be larger than sizeof(MMReceiver::packet)");

  auto recvData = std::unique_ptr<MMReceiver[]>(new MMReceiver[vectSize]);
  auto msgVec = std::unique_ptr<struct mmsghdr[]>(new struct mmsghdr[vectSize]);
  auto outMsgVec = std::unique_ptr<struct mmsghdr[]>(new struct mmsghdr[vectSize]);

  /* initialize the structures needed to receive our messages */
  for (size_t idx = 0; idx < vectSize; idx++) {
    recvData[idx].remote.sin4.sin_family = cs->local.sin4.sin_family;
    fillMSGHdr(&msgVec[idx].msg_hdr, &recvData[idx].iov, &recvData[idx].cbuf, sizeof(recvData[idx].cbuf), recvData[idx].packet, s_udpIncomingBufferSize, &recvData[idx].remote);
  }

  /* go now */
  for(;;) {

    /* reset the IO vector, since it's also used to send the vector of responses
       to avoid having to copy the data around.
       The length is reset to the incoming payload limit, not to the size of the
       whole buffer: using sizeof(packet) here would silently lift the limit
       installed by fillMSGHdr() above and make this code path accept larger
       queries than the single-message one. */
    for (size_t idx = 0; idx < vectSize; idx++) {
      recvData[idx].iov.iov_base = recvData[idx].packet;
      recvData[idx].iov.iov_len = s_udpIncomingBufferSize;
    }

    /* block until we have at least one message ready, but return
       as many as possible to save the syscall costs */
    int msgsGot = recvmmsg(cs->udpFD, msgVec.get(), vectSize, MSG_WAITFORONE | MSG_TRUNC, nullptr);

    if (msgsGot <= 0) {
      vinfolog("Getting UDP messages via recvmmsg() failed with: %s", stringerror());
      continue;
    }

    unsigned int msgsToSend = 0;

    /* process the received messages */
    for (int msgIdx = 0; msgIdx < msgsGot; msgIdx++) {
      const struct msghdr* msgh = &msgVec[msgIdx].msg_hdr;
      unsigned int got = msgVec[msgIdx].msg_len;
      const ComboAddress& remote = recvData[msgIdx].remote;

      /* too short to even hold a DNS header */
      if (static_cast<size_t>(got) < sizeof(struct dnsheader)) {
        ++g_stats.nonCompliantQueries;
        continue;
      }

      processUDPQuery(*cs, holders, msgh, remote, recvData[msgIdx].dest, recvData[msgIdx].packet, static_cast<uint16_t>(got), sizeof(recvData[msgIdx].packet), outMsgVec.get(), &msgsToSend, &recvData[msgIdx].iov, &recvData[msgIdx].cbuf);

    }

    /* immediate (not delayed or sent to a backend) responses (mostly from a rule, dynamic block
       or the cache) can be sent in batch too */

    if (msgsToSend > 0 && msgsToSend <= static_cast<unsigned int>(msgsGot)) {
      int sent = sendmmsg(cs->udpFD, outMsgVec.get(), msgsToSend, 0);

      if (sent < 0 || static_cast<unsigned int>(sent) != msgsToSend) {
        vinfolog("Error sending responses with sendmmsg() (%d on %u): %s", sent, msgsToSend, stringerror());
      }
    }

  }
}
#endif /* defined(HAVE_RECVMMSG) && defined(HAVE_SENDMMSG) && defined(MSG_WAITFORONE) */
1795
1796// listens to incoming queries, sends out to downstream servers, noting the intended return path
9b73b71c 1797static void udpClientThread(ClientState* cs)
0beaa5c8
RG
1798try
1799{
519f5484 1800 setThreadName("dnsdist/udpClie");
0beaa5c8
RG
1801 LocalHolders holders;
1802
1803#if defined(HAVE_RECVMMSG) && defined(HAVE_SENDMMSG) && defined(MSG_WAITFORONE)
1804 if (g_udpVectorSize > 1) {
1805 MultipleMessagesUDPClientThread(cs, holders);
1806
1807 }
1808 else
1809#endif /* defined(HAVE_RECVMMSG) && defined(HAVE_SENDMMSG) && defined(MSG_WAITFORONE) */
1810 {
8179b6d6 1811 char packet[s_maxPacketCacheEntrySize];
0beaa5c8
RG
1812 /* the actual buffer is larger because:
1813 - we may have to add EDNS and/or ECS
1814 - we use it for self-generated responses (from rule or cache)
1815 but we only accept incoming payloads up to that size
1816 */
1817 static_assert(s_udpIncomingBufferSize <= sizeof(packet), "the incoming buffer size should not be larger than sizeof(MMReceiver::packet)");
1818 struct msghdr msgh;
1819 struct iovec iov;
1820 /* used by HarvestDestinationAddress */
7bec330a 1821 cmsgbuf_aligned cbuf;
0beaa5c8
RG
1822
1823 ComboAddress remote;
1824 ComboAddress dest;
1825 remote.sin4.sin_family = cs->local.sin4.sin_family;
be7dec02 1826 fillMSGHdr(&msgh, &iov, &cbuf, sizeof(cbuf), packet, s_udpIncomingBufferSize, &remote);
0beaa5c8
RG
1827
1828 for(;;) {
1829 ssize_t got = recvmsg(cs->udpFD, &msgh, 0);
1830
1831 if (got < 0 || static_cast<size_t>(got) < sizeof(struct dnsheader)) {
cb167afd 1832 ++g_stats.nonCompliantQueries;
0beaa5c8
RG
1833 continue;
1834 }
1835
be7dec02 1836 processUDPQuery(*cs, holders, &msgh, remote, dest, packet, static_cast<uint16_t>(got), sizeof(packet), nullptr, nullptr, nullptr, nullptr);
0beaa5c8
RG
1837 }
1838 }
24d5cb00 1839}
2b3eefc3 1840catch(const std::exception &e)
a4652d55 1841{
1842 errlog("UDP client thread died because of exception: %s", e.what());
a4652d55 1843}
2b3eefc3 1844catch(const PDNSException &e)
a4652d55 1845{
1846 errlog("UDP client thread died because of PowerDNS exception: %s", e.reason);
a4652d55 1847}
1848catch(...)
1849{
1850 errlog("UDP client thread died because of an exception: %s", "unknown");
a4652d55 1851}
24d5cb00 1852
555970c9
RG
/* Returns a random value suitable for use as a DNS message ID (0..65535),
   drawn from libsodium when available and from random() otherwise. */
uint16_t getRandomDNSID()
{
#ifdef HAVE_LIBSODIUM
  const auto id = randombytes_uniform(65536);
#else
  const auto id = random() % 65536;
#endif
  return static_cast<uint16_t>(id);
}
1861
/* maximum number of TCP client threads: passed to the TCPClientCollection
   constructor and used to estimate the number of file descriptors needed */
uint64_t g_maxTCPClientThreads{10};
/* number of iterations of the maintenance loop (which sleeps one second per
   iteration) between two passes over the packet caches */
std::atomic<uint16_t> g_cacheCleaningDelay{60};
/* percentage used by the maintenance thread to compute the 'upTo' argument of
   purgeExpired(): maxEntries * (100 - percentage) / 100 */
std::atomic<uint16_t> g_cacheCleaningPercentage{100};
e41f8165 1865
9b73b71c 1866void maintThread()
886e2cf2 1867{
519f5484 1868 setThreadName("dnsdist/main");
886e2cf2
RG
1869 int interval = 1;
1870 size_t counter = 0;
5f3ea719 1871 int32_t secondsToWaitLog = 0;
886e2cf2
RG
1872
1873 for(;;) {
1874 sleep(interval);
1875
069e59db 1876 {
1877 std::lock_guard<std::mutex> lock(g_luamutex);
5f3ea719
PL
1878 auto f = g_lua.readVariable<boost::optional<std::function<void()> > >("maintenance");
1879 if(f) {
1880 try {
1881 (*f)();
1882 secondsToWaitLog = 0;
1883 }
1884 catch(std::exception &e) {
1885 if (secondsToWaitLog <= 0) {
1886 infolog("Error during execution of maintenance function: %s", e.what());
1887 secondsToWaitLog = 61;
1888 }
1889 secondsToWaitLog -= interval;
1890 }
1891 }
069e59db 1892 }
886e2cf2
RG
1893
1894 counter++;
1895 if (counter >= g_cacheCleaningDelay) {
c1b81381
RG
1896 /* keep track, for each cache, of whether we should keep
1897 expired entries */
1898 std::map<std::shared_ptr<DNSDistPacketCache>, bool> caches;
1899
1900 /* gather all caches actually used by at least one pool, and see
1901 if something prevents us from cleaning the expired entries */
a9c2e4ab 1902 auto localPools = g_pools.getLocal();
a9c2e4ab 1903 for (const auto& entry : *localPools) {
c1b81381
RG
1904 auto& pool = entry.second;
1905
1906 auto packetCache = pool->packetCache;
1907 if (!packetCache) {
1908 continue;
1909 }
1910
1911 auto pair = caches.insert({packetCache, false});
1912 auto& iter = pair.first;
1913 /* if we need to keep stale data for this cache (ie, not clear
1914 expired entries when at least one pool using this cache
1915 has all its backends down) */
1916 if (packetCache->keepStaleData() && iter->second == false) {
1917 /* so far all pools had at least one backend up */
1918 if (pool->countServers(true) == 0) {
1919 iter->second = true;
1920 }
886e2cf2
RG
1921 }
1922 }
c1b81381
RG
1923
1924 for (auto pair : caches) {
1925 /* shall we keep expired entries ? */
1926 if (pair.second == true) {
1927 continue;
886e2cf2 1928 }
c1b81381
RG
1929 auto& packetCache = pair.first;
1930 size_t upTo = (packetCache->getMaxEntries()* (100 - g_cacheCleaningPercentage)) / 100;
1931 packetCache->purgeExpired(upTo);
886e2cf2
RG
1932 }
1933 counter = 0;
1934 }
1935
1936 // ponder pruning g_dynblocks of expired entries here
1937 }
886e2cf2
RG
1938}
1939
9b73b71c 1940static void secPollThread()
5d4e1ef8
RG
1941{
1942 setThreadName("dnsdist/secpoll");
1943
1944 for (;;) {
1945 try {
1946 doSecPoll(g_secPollSuffix);
1947 }
1948 catch(...) {
1949 }
1950 sleep(g_secPollInterval);
1951 }
1952}
1953
9b73b71c 1954static void healthChecksThread()
3ae86514 1955{
519f5484 1956 setThreadName("dnsdist/healthC");
77c9bc9a 1957
dd9c8246 1958 static const int interval = 1;
64e4ebb4 1959
3ae86514 1960 for(;;) {
1961 sleep(interval);
773470ca 1962
dd9c8246 1963 if(g_tcpclientthreads->getQueuedCount() > 1 && !g_tcpclientthreads->hasReachedMaxThreads()) {
a9bf3ec4 1964 g_tcpclientthreads->addTCPClientThread();
dd9c8246 1965 }
3ae86514 1966
dd9c8246 1967 auto mplexer = std::shared_ptr<FDMultiplexer>(FDMultiplexer::getMultiplexerSilent());
a9c2e4ab
RG
1968 auto states = g_dstates.getLocal(); // this points to the actual shared_ptrs!
1969 for(auto& dss : *states) {
dd9c8246 1970 if(++dss->lastCheck < dss->checkInterval) {
7c9bf18d 1971 continue;
dd9c8246 1972 }
5d7e6765 1973
dd9c8246 1974 dss->lastCheck = 0;
7565f4e6 1975
dd9c8246
RG
1976 if (dss->availability == DownstreamState::Availability::Auto) {
1977 if (!queueHealthCheck(mplexer, dss)) {
1978 updateHealthCheckResult(dss, false);
2993d58d 1979 }
773470ca 1980 }
b076b34a 1981
773470ca 1982 auto delta = dss->sw.udiffAndSet()/1000000.0;
1983 dss->queryLoad = 1.0*(dss->queries.load() - dss->prev.queries.load())/delta;
1984 dss->dropRate = 1.0*(dss->reuseds.load() - dss->prev.reuseds.load())/delta;
3c115e0f 1985 dss->prev.queries.store(dss->queries.load());
773470ca 1986 dss->prev.reuseds.store(dss->reuseds.load());
3c115e0f 1987
dd9c8246 1988 for (IDState& ids : dss->idStates) { // timeouts
a9489723 1989 int64_t usageIndicator = ids.usageIndicator;
311f19d5
RG
1990 if(IDState::isInUse(usageIndicator) && ids.age++ > g_udpTimeout) {
1991 /* We mark the state as unused as soon as possible
51642fe3
RG
1992 to limit the risk of racing with the
1993 responder thread.
51642fe3 1994 */
0956c5c5
RG
1995 auto oldDU = ids.du;
1996
311f19d5 1997 if (!ids.tryMarkUnused(usageIndicator)) {
71b86bd8
RG
1998 /* this state has been altered in the meantime,
1999 don't go anywhere near it */
2000 continue;
2001 }
fbf14b03 2002 ids.du = nullptr;
9bd1a882 2003 handleDOHTimeout(oldDU);
64e4ebb4 2004 ids.age = 0;
3c115e0f 2005 dss->reuseds++;
2006 --dss->outstanding;
cb167afd 2007 ++g_stats.downstreamTimeouts; // this is an 'actively' discovered timeout
c08a5092 2008 vinfolog("Had a downstream timeout from %s (%s) for query for %s|%s from %s",
2009 dss->remote.toStringWithPort(), dss->name,
348ef1c6 2010 ids.qname.toLogString(), QType(ids.qtype).getName(), ids.origRemote.toStringWithPort());
51642fe3 2011
c2fbeb27 2012 struct timespec ts;
85c7ca75 2013 gettime(&ts);
c2fbeb27 2014
2d11d1b2 2015 struct dnsheader fake;
2016 memset(&fake, 0, sizeof(fake));
2017 fake.id = ids.origID;
c2fbeb27 2018
6d31c8b6 2019 g_rings.insertResponse(ts, ids.origRemote, ids.qname, ids.qtype, std::numeric_limits<unsigned int>::max(), 0, fake, dss->remote);
232f0877 2020 }
3ae86514 2021 }
2022 }
dd9c8246
RG
2023
2024 handleQueuedHealthChecks(mplexer);
3ae86514 2025 }
3ae86514 2026}
2027
c2a42f97 2028static void bindAny(int af, int sock)
2029{
18f8e493 2030 __attribute__((unused)) int one = 1;
c2a42f97 2031
2032#ifdef IP_FREEBIND
2033 if (setsockopt(sock, IPPROTO_IP, IP_FREEBIND, &one, sizeof(one)) < 0)
a702a96c 2034 warnlog("Warning: IP_FREEBIND setsockopt failed: %s", stringerror());
c2a42f97 2035#endif
2036
2037#ifdef IP_BINDANY
2038 if (af == AF_INET)
2039 if (setsockopt(sock, IPPROTO_IP, IP_BINDANY, &one, sizeof(one)) < 0)
a702a96c 2040 warnlog("Warning: IP_BINDANY setsockopt failed: %s", stringerror());
c2a42f97 2041#endif
2042#ifdef IPV6_BINDANY
2043 if (af == AF_INET6)
2044 if (setsockopt(sock, IPPROTO_IPV6, IPV6_BINDANY, &one, sizeof(one)) < 0)
a702a96c 2045 warnlog("Warning: IPV6_BINDANY setsockopt failed: %s", stringerror());
c2a42f97 2046#endif
2047#ifdef SO_BINDANY
2048 if (setsockopt(sock, SOL_SOCKET, SO_BINDANY, &one, sizeof(one)) < 0)
a702a96c 2049 warnlog("Warning: SO_BINDANY setsockopt failed: %s", stringerror());
c2a42f97 2050#endif
2051}
2052
a36ce055
RG
2053static void dropGroupPrivs(gid_t gid)
2054{
2055 if (gid) {
2056 if (setgid(gid) == 0) {
2057 if (setgroups(0, NULL) < 0) {
a702a96c 2058 warnlog("Warning: Unable to drop supplementary gids: %s", stringerror());
a36ce055
RG
2059 }
2060 }
2061 else {
a702a96c 2062 warnlog("Warning: Unable to set group ID to %d: %s", gid, stringerror());
a36ce055
RG
2063 }
2064 }
2065}
2066
2067static void dropUserPrivs(uid_t uid)
2068{
2069 if(uid) {
2070 if(setuid(uid) < 0) {
a702a96c 2071 warnlog("Warning: Unable to set user ID to %d: %s", uid, stringerror());
a36ce055
RG
2072 }
2073 }
2074}
2075
41408d3a
RG
2076static void checkFileDescriptorsLimits(size_t udpBindsCount, size_t tcpBindsCount)
2077{
2078 /* stdin, stdout, stderr */
2079 size_t requiredFDsCount = 3;
a9c2e4ab 2080 auto backends = g_dstates.getLocal();
cd73ceeb
RG
2081 /* UDP sockets to backends */
2082 size_t backendUDPSocketsCount = 0;
a9c2e4ab 2083 for (const auto& backend : *backends) {
5bdbb83d 2084 backendUDPSocketsCount += backend->sockets.size();
cd73ceeb
RG
2085 }
2086 requiredFDsCount += backendUDPSocketsCount;
2087 /* TCP sockets to backends */
a9c2e4ab 2088 requiredFDsCount += (backends->size() * g_maxTCPClientThreads);
9fcd6adb 2089 /* listening sockets */
41408d3a
RG
2090 requiredFDsCount += udpBindsCount;
2091 requiredFDsCount += tcpBindsCount;
2092 /* max TCP connections currently served */
2093 requiredFDsCount += g_maxTCPClientThreads;
f2e29d04 2094 /* max pipes for communicating between TCP acceptors and client threads */
41408d3a 2095 requiredFDsCount += (g_maxTCPClientThreads * 2);
41408d3a 2096 /* max TCP queued connections */
9fcd6adb 2097 requiredFDsCount += g_maxTCPQueuedConnections;
41408d3a
RG
2098 /* DelayPipe pipe */
2099 requiredFDsCount += 2;
2100 /* syslog socket */
2101 requiredFDsCount++;
2102 /* webserver main socket */
2103 requiredFDsCount++;
2104 /* console main socket */
2105 requiredFDsCount++;
2106 /* carbon export */
2107 requiredFDsCount++;
2108 /* history file */
2109 requiredFDsCount++;
2110 struct rlimit rl;
2111 getrlimit(RLIMIT_NOFILE, &rl);
9fcd6adb 2112 if (rl.rlim_cur <= requiredFDsCount) {
41408d3a
RG
2113 warnlog("Warning, this configuration can use more than %d file descriptors, web server and console connections not included, and the current limit is %d.", std::to_string(requiredFDsCount), std::to_string(rl.rlim_cur));
2114#ifdef HAVE_SYSTEMD
2115 warnlog("You can increase this value by using LimitNOFILE= in the systemd unit file or ulimit.");
2116#else
2117 warnlog("You can increase this value by using ulimit.");
2118#endif
2119 }
2120}
c2a42f97 2121
6e9fd124
RG
2122static void setUpLocalBind(std::unique_ptr<ClientState>& cs)
2123{
2124 /* skip some warnings if there is an identical UDP context */
fbf14b03 2125 bool warn = cs->tcp == false || cs->tlsFrontend != nullptr || cs->dohFrontend != nullptr;
6e9fd124
RG
2126 int& fd = cs->tcp == false ? cs->udpFD : cs->tcpFD;
2127 (void) warn;
2128
2129 fd = SSocket(cs->local.sin4.sin_family, cs->tcp == false ? SOCK_DGRAM : SOCK_STREAM, 0);
2130
2131 if (cs->tcp) {
2132 SSetsockopt(fd, SOL_SOCKET, SO_REUSEADDR, 1);
2133#ifdef TCP_DEFER_ACCEPT
2134 SSetsockopt(fd, IPPROTO_TCP, TCP_DEFER_ACCEPT, 1);
2135#endif
2136 if (cs->fastOpenQueueSize > 0) {
2137#ifdef TCP_FASTOPEN
2138 SSetsockopt(fd, IPPROTO_TCP, TCP_FASTOPEN, cs->fastOpenQueueSize);
2139#else
2140 if (warn) {
2141 warnlog("TCP Fast Open has been configured on local address '%s' but is not supported", cs->local.toStringWithPort());
2142 }
2143#endif
2144 }
2145 }
2146
2147 if(cs->local.sin4.sin_family == AF_INET6) {
2148 SSetsockopt(fd, IPPROTO_IPV6, IPV6_V6ONLY, 1);
2149 }
2150
2151 bindAny(cs->local.sin4.sin_family, fd);
2152
2153 if(!cs->tcp && IsAnyAddress(cs->local)) {
2154 int one=1;
2155 setsockopt(fd, IPPROTO_IP, GEN_IP_PKTINFO, &one, sizeof(one)); // linux supports this, so why not - might fail on other systems
2156#ifdef IPV6_RECVPKTINFO
2157 setsockopt(fd, IPPROTO_IPV6, IPV6_RECVPKTINFO, &one, sizeof(one));
2158#endif
2159 }
2160
2161 if (cs->reuseport) {
2162#ifdef SO_REUSEPORT
2163 SSetsockopt(fd, SOL_SOCKET, SO_REUSEPORT, 1);
2164#else
2165 if (warn) {
2166 /* no need to warn again if configured but support is not available, we already did for UDP */
2167 warnlog("SO_REUSEPORT has been configured on local address '%s' but is not supported", cs->local.toStringWithPort());
2168 }
2169#endif
2170 }
2171
90f9fbc0
RG
2172 if (!cs->tcp) {
2173 if (cs->local.isIPv4()) {
2174 try {
2175 setSocketIgnorePMTU(cs->udpFD);
2176 }
2177 catch(const std::exception& e) {
2178 warnlog("Failed to set IP_MTU_DISCOVER on UDP server socket for local address '%s': %s", cs->local.toStringWithPort(), e.what());
2179 }
2180 }
2181 }
2182
6e9fd124
RG
2183 const std::string& itf = cs->interface;
2184 if (!itf.empty()) {
2185#ifdef SO_BINDTODEVICE
2186 int res = setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, itf.c_str(), itf.length());
2187 if (res != 0) {
a702a96c 2188 warnlog("Error setting up the interface on local address '%s': %s", cs->local.toStringWithPort(), stringerror());
6e9fd124
RG
2189 }
2190#else
2191 if (warn) {
2192 warnlog("An interface has been configured on local address '%s' but SO_BINDTODEVICE is not supported", cs->local.toStringWithPort());
2193 }
2194#endif
2195 }
2196
2197#ifdef HAVE_EBPF
2198 if (g_defaultBPFFilter) {
2199 cs->attachFilter(g_defaultBPFFilter);
2200 vinfolog("Attaching default BPF Filter to %s frontend %s", (!cs->tcp ? "UDP" : "TCP"), cs->local.toStringWithPort());
2201 }
2202#endif /* HAVE_EBPF */
2203
2204 if (cs->tlsFrontend != nullptr) {
2205 if (!cs->tlsFrontend->setupTLS()) {
2206 errlog("Error while setting up TLS on local address '%s', exiting", cs->local.toStringWithPort());
2207 _exit(EXIT_FAILURE);
2208 }
2209 }
2210
fbf14b03
RG
2211 if (cs->dohFrontend != nullptr) {
2212 cs->dohFrontend->setup();
2213 }
2214
6e9fd124
RG
2215 SBind(fd, cs->local);
2216
2217 if (cs->tcp) {
fbf14b03 2218 SListen(cs->tcpFD, SOMAXCONN);
6e9fd124
RG
2219 if (cs->tlsFrontend != nullptr) {
2220 warnlog("Listening on %s for TLS", cs->local.toStringWithPort());
2221 }
fbf14b03
RG
2222 else if (cs->dohFrontend != nullptr) {
2223 warnlog("Listening on %s for DoH", cs->local.toStringWithPort());
2224 }
6e9fd124
RG
2225 else if (cs->dnscryptCtx != nullptr) {
2226 warnlog("Listening on %s for DNSCrypt", cs->local.toStringWithPort());
2227 }
2228 else {
2229 warnlog("Listening on %s", cs->local.toStringWithPort());
2230 }
2231 }
2232
2233 cs->ready = true;
2234}
2235
/* Settings gathered from the command line by main() */
struct
{
  /* addresses to listen on, one per -l/--local option */
  vector<string> locals;
  /* positional arguments received when not running as a client */
  vector<string> remotes;
  /* --check-config: only validate the configuration, then exit */
  bool checkConfig{false};
  /* -c/--client: operate as a console client */
  bool beClient{false};
  /* --supervised */
  bool beSupervised{false};
  /* -e/--execute: command to execute via the console */
  string command;
  /* -C/--config: path to the configuration file */
  string config;
  /* -u/--uid: user to switch to, as given on the command line */
  string uid;
  /* -g/--gid: group to switch to, as given on the command line */
  string gid;
} g_cmdLine;
520eb5a0 2248
/* set to true by main() once the list of frontends is final, right before
   the listening sockets are set up */
std::atomic<bool> g_configurationDone{false};
520eb5a0 2250
7d3ee2bb
PL
2251static void usage()
2252{
2253 cout<<endl;
b82a127e
RG
2254 cout<<"Syntax: dnsdist [-C,--config file] [-c,--client [IP[:PORT]]]\n";
2255 cout<<"[-e,--execute cmd] [-h,--help] [-l,--local addr]\n";
4406b79b 2256 cout<<"[-v,--verbose] [--check-config] [--version]\n";
7d3ee2bb
PL
2257 cout<<"\n";
2258 cout<<"-a,--acl netmask Add this netmask to the ACL\n";
2259 cout<<"-C,--config file Load configuration from 'file'\n";
2260 cout<<"-c,--client Operate as a client, connect to dnsdist. This reads\n";
2261 cout<<" controlSocket from your configuration file, but also\n";
2262 cout<<" accepts an IP:PORT argument\n";
2263#ifdef HAVE_LIBSODIUM
2264 cout<<"-k,--setkey KEY Use KEY for encrypted communication to dnsdist. This\n";
2265 cout<<" is similar to setting setKey in the configuration file.\n";
17a0ddad
CH
2266 cout<<" NOTE: this will leak this key in your shell's history\n";
2267 cout<<" and in the systems running process list.\n";
7d3ee2bb
PL
2268#endif
2269 cout<<"--check-config Validate the configuration file and exit. The exit-code\n";
2270 cout<<" reflects the validation, 0 is OK, 1 means an error.\n";
2271 cout<<" Any errors are printed as well.\n";
7d3ee2bb
PL
2272 cout<<"-e,--execute cmd Connect to dnsdist and execute 'cmd'\n";
2273 cout<<"-g,--gid gid Change the process group ID after binding sockets\n";
2274 cout<<"-h,--help Display this helpful message\n";
2275 cout<<"-l,--local address Listen on this local address\n";
2276 cout<<"--supervised Don't open a console, I'm supervised\n";
2277 cout<<" (use with e.g. systemd and daemontools)\n";
2278 cout<<"--disable-syslog Don't log to syslog, only to stdout\n";
2279 cout<<" (use with e.g. systemd)\n";
7d3ee2bb
PL
2280 cout<<"-u,--uid uid Change the process user ID after binding sockets\n";
2281 cout<<"-v,--verbose Enable verbose mode\n";
4406b79b 2282 cout<<"-V,--version Show dnsdist version information and exit\n";
7d3ee2bb
PL
2283}
2284
24d5cb00 2285int main(int argc, char** argv)
2286try
2287{
41408d3a
RG
2288 size_t udpBindsCount = 0;
2289 size_t tcpBindsCount = 0;
94721140 2290 rl_attempted_completion_function = my_completion;
2291 rl_completion_append_character = 0;
2292
726ddf60 2293 signal(SIGPIPE, SIG_IGN);
6d01c80c 2294 signal(SIGCHLD, SIG_IGN);
0ca6a67f 2295 openlog("dnsdist", LOG_PID|LOG_NDELAY, LOG_DAEMON);
6d01c80c 2296
0b62ec78 2297#ifdef HAVE_LIBSODIUM
6d01c80c 2298 if (sodium_init() == -1) {
2299 cerr<<"Unable to initialize crypto library"<<endl;
2300 exit(EXIT_FAILURE);
2301 }
7691e7df 2302 g_hashperturb=randombytes_uniform(0xffffffff);
2303 srandom(randombytes_uniform(0xffffffff));
2304#else
2305 {
2306 struct timeval tv;
2307 gettimeofday(&tv, 0);
2308 srandom(tv.tv_sec ^ tv.tv_usec ^ getpid());
2309 g_hashperturb=random();
2310 }
2311
0b62ec78 2312#endif
094b6aff 2313 ComboAddress clientAddress = ComboAddress();
11058bd6 2314 g_cmdLine.config=SYSCONFDIR "/dnsdist.conf";
359bdba5 2315 struct option longopts[]={
8f2d5ec3 2316 {"acl", required_argument, 0, 'a'},
359bdba5
CH
2317 {"check-config", no_argument, 0, 1},
2318 {"client", no_argument, 0, 'c'},
7cc68f53 2319 {"config", required_argument, 0, 'C'},
359bdba5 2320 {"disable-syslog", no_argument, 0, 2},
7cc68f53 2321 {"execute", required_argument, 0, 'e'},
359bdba5
CH
2322 {"gid", required_argument, 0, 'g'},
2323 {"help", no_argument, 0, 'h'},
2324 {"local", required_argument, 0, 'l'},
359bdba5 2325 {"setkey", required_argument, 0, 'k'},
359bdba5
CH
2326 {"supervised", no_argument, 0, 3},
2327 {"uid", required_argument, 0, 'u'},
2328 {"verbose", no_argument, 0, 'v'},
2329 {"version", no_argument, 0, 'V'},
2330 {0,0,0,0}
7cc68f53 2331 };
2332 int longindex=0;
8f2d5ec3 2333 string optstring;
7cc68f53 2334 for(;;) {
359bdba5 2335 int c=getopt_long(argc, argv, "a:cC:e:g:hk:l:u:vV", longopts, &longindex);
7cc68f53 2336 if(c==-1)
2337 break;
2338 switch(c) {
5efcfa63
PL
2339 case 1:
2340 g_cmdLine.checkConfig=true;
2341 break;
bbfaaa6f
PL
2342 case 2:
2343 g_syslog=false;
2344 break;
b7165327
CH
2345 case 3:
2346 g_cmdLine.beSupervised=true;
2347 break;
7cc68f53 2348 case 'C':
2349 g_cmdLine.config=optarg;
2350 break;
2351 case 'c':
2352 g_cmdLine.beClient=true;
2353 break;
7cc68f53 2354 case 'e':
2355 g_cmdLine.command=optarg;
2356 break;
a36ce055
RG
2357 case 'g':
2358 g_cmdLine.gid=optarg;
2359 break;
7cc68f53 2360 case 'h':
6306c282 2361 cout<<"dnsdist "<<VERSION<<endl;
7d3ee2bb 2362 usage();
7cc68f53 2363 cout<<"\n";
2364 exit(EXIT_SUCCESS);
2365 break;
8f2d5ec3 2366 case 'a':
2367 optstring=optarg;
2368 g_ACL.modify([optstring](NetmaskGroup& nmg) { nmg.addMask(optstring); });
2369 break;
ddb14ec9 2370 case 'k':
b4b5edbd 2371#ifdef HAVE_LIBSODIUM
b5521206 2372 if (B64Decode(string(optarg), g_consoleKey) < 0) {
ddb14ec9
PL
2373 cerr<<"Unable to decode key '"<<optarg<<"'."<<endl;
2374 exit(EXIT_FAILURE);
2375 }
b4b5edbd
CH
2376#else
2377 cerr<<"dnsdist has been built without libsodium, -k/--setkey is unsupported."<<endl;
2378 exit(EXIT_FAILURE);
ddb14ec9 2379#endif
b4b5edbd 2380 break;
7cc68f53 2381 case 'l':
6a363878 2382 g_cmdLine.locals.push_back(trim_copy(string(optarg)));
7cc68f53 2383 break;
a36ce055
RG
2384 case 'u':
2385 g_cmdLine.uid=optarg;
2386 break;
7cc68f53 2387 case 'v':
2388 g_verbose=true;
2389 break;
6306c282 2390 case 'V':
d4d796e5
PD
2391#ifdef LUAJIT_VERSION
2392 cout<<"dnsdist "<<VERSION<<" ("<<LUA_RELEASE<<" ["<<LUAJIT_VERSION<<"])"<<endl;
2393#else
2394 cout<<"dnsdist "<<VERSION<<" ("<<LUA_RELEASE<<")"<<endl;
2395#endif
70829d97 2396 cout<<"Enabled features: ";
90fe8ae6
RG
2397#ifdef HAVE_CDB
2398 cout<<"cdb ";
2399#endif
a227f47d
RG
2400#ifdef HAVE_DNS_OVER_TLS
2401 cout<<"dns-over-tls(";
2402#ifdef HAVE_GNUTLS
3909bf10
CH
2403 cout<<"gnutls";
2404 #ifdef HAVE_LIBSSL
2405 cout<<" ";
2406 #endif
a227f47d
RG
2407#endif
2408#ifdef HAVE_LIBSSL
2409 cout<<"openssl";
2410#endif
2411 cout<<") ";
2412#endif
fbf14b03
RG
2413#ifdef HAVE_DNS_OVER_HTTPS
2414 cout<<"dns-over-https(DOH) ";
2415#endif
70829d97
PL
2416#ifdef HAVE_DNSCRYPT
2417 cout<<"dnscrypt ";
2418#endif
0beaa5c8
RG
2419#ifdef HAVE_EBPF
2420 cout<<"ebpf ";
2421#endif
82a91ddf
CH
2422#ifdef HAVE_FSTRM
2423 cout<<"fstrm ";
2424#endif
f4b1f1fd
RG
2425#ifdef HAVE_LIBCRYPTO
2426 cout<<"ipcipher ";
2427#endif
3909bf10
CH
2428#ifdef HAVE_LIBSODIUM
2429 cout<<"libsodium ";
2430#endif
f441962a
RG
2431#ifdef HAVE_LMDB
2432 cout<<"lmdb ";
2433#endif
70829d97
PL
2434#ifdef HAVE_PROTOBUF
2435 cout<<"protobuf ";
2436#endif
2437#ifdef HAVE_RE2
2438 cout<<"re2 ";
2439#endif
0beaa5c8
RG
2440#if defined(HAVE_RECVMMSG) && defined(HAVE_SENDMMSG) && defined(MSG_WAITFORONE)
2441 cout<<"recvmmsg/sendmmsg ";
2442#endif
2443#ifdef HAVE_NET_SNMP
2444 cout<<"snmp ";
2445#endif
70829d97
PL
2446#ifdef HAVE_SYSTEMD
2447 cout<<"systemd";
2448#endif
2449 cout<<endl;
6306c282
PL
2450 exit(EXIT_SUCCESS);
2451 break;
7d3ee2bb
PL
2452 case '?':
2453 //getopt_long printed an error message.
2454 usage();
2455 exit(EXIT_FAILURE);
2456 break;
7cc68f53 2457 }
24d5cb00 2458 }
6ab65223 2459
7cc68f53 2460 argc-=optind;
2461 argv+=optind;
2462 for(auto p = argv; *p; ++p) {
094b6aff
PL
2463 if(g_cmdLine.beClient) {
2464 clientAddress = ComboAddress(*p, 5199);
2465 } else {
2466 g_cmdLine.remotes.push_back(*p);
2467 }
7cc68f53 2468 }
2469
a1b1a29d 2470 ServerPolicy leastOutstandingPol{"leastOutstanding", leastOutstanding, false};
cd29dcb1 2471
e5a14b2b 2472 g_policy.setState(leastOutstandingPol);
7cc68f53 2473 if(g_cmdLine.beClient || !g_cmdLine.command.empty()) {
203b5348 2474 setupLua(true, false, g_cmdLine.config);
094b6aff
PL
2475 if (clientAddress != ComboAddress())
2476 g_serverControl = clientAddress;
7cc68f53 2477 doClient(g_serverControl, g_cmdLine.command);
e16fd59c 2478 _exit(EXIT_SUCCESS);
6d01c80c 2479 }
2e72cc0e 2480
8f133915 2481 auto acl = g_ACL.getCopy();
8f2d5ec3 2482 if(acl.empty()) {
2483 for(auto& addr : {"127.0.0.0/8", "10.0.0.0/8", "100.64.0.0/10", "169.254.0.0/16", "192.168.0.0/16", "172.16.0.0/12", "::1/128", "fc00::/7", "fe80::/10"})
2484 acl.addMask(addr);
2485 g_ACL.setState(acl);
2486 }
8f133915 2487
b5521206 2488 auto consoleACL = g_consoleACL.getCopy();
5ceea33e
RG
2489 for (const auto& mask : { "127.0.0.1/8", "::1/128" }) {
2490 consoleACL.addMask(mask);
2491 }
b5521206
RG
2492 g_consoleACL.setState(consoleACL);
2493
5efcfa63 2494 if (g_cmdLine.checkConfig) {
203b5348 2495 setupLua(false, true, g_cmdLine.config);
5efcfa63
PL
2496 // No exception was thrown
2497 infolog("Configuration '%s' OK!", g_cmdLine.config);
d8c19b98 2498 _exit(EXIT_SUCCESS);
5efcfa63
PL
2499 }
2500
203b5348 2501 auto todo=setupLua(false, false, g_cmdLine.config);
2e72cc0e 2502
636cc544
CHB
2503 auto localPools = g_pools.getCopy();
2504 {
2505 bool precompute = false;
2506 if (g_policy.getLocal()->name == "chashed") {
2507 precompute = true;
2508 } else {
2509 for (const auto& entry: localPools) {
2510 if (entry.second->policy != nullptr && entry.second->policy->name == "chashed") {
2511 precompute = true;
2512 break ;
2513 }
2514 }
2515 }
2516 if (precompute) {
2517 vinfolog("Pre-computing hashes for consistent hash load-balancing policy");
2518 // pre compute hashes
2519 auto backends = g_dstates.getLocal();
2520 for (auto& backend: *backends) {
2521 backend->hash();
2522 }
d58e616a
CHB
2523 }
2524 }
2525
6e9fd124
RG
2526 if (!g_cmdLine.locals.empty()) {
2527 for (auto it = g_frontends.begin(); it != g_frontends.end(); ) {
fbf14b03
RG
2528 /* DoH, DoT and DNSCrypt frontends are separate */
2529 if ((*it)->dohFrontend == nullptr && (*it)->tlsFrontend == nullptr && (*it)->dnscryptCtx == nullptr) {
6e9fd124 2530 it = g_frontends.erase(it);
9f67b883 2531 }
6e9fd124
RG
2532 else {
2533 ++it;
9f67b883 2534 }
9f67b883
RG
2535 }
2536
6e9fd124
RG
2537 for(const auto& loc : g_cmdLine.locals) {
2538 /* UDP */
2539 g_frontends.push_back(std::unique_ptr<ClientState>(new ClientState(ComboAddress(loc, 53), false, false, 0, "", {})));
2540 /* TCP */
2541 g_frontends.push_back(std::unique_ptr<ClientState>(new ClientState(ComboAddress(loc, 53), true, false, 0, "", {})));
87b515ed 2542 }
a36ce055
RG
2543 }
2544
6e9fd124
RG
2545 if (g_frontends.empty()) {
2546 /* UDP */
2547 g_frontends.push_back(std::unique_ptr<ClientState>(new ClientState(ComboAddress("127.0.0.1", 53), false, false, 0, "", {})));
2548 /* TCP */
2549 g_frontends.push_back(std::unique_ptr<ClientState>(new ClientState(ComboAddress("127.0.0.1", 53), true, false, 0, "", {})));
11e1e08b 2550 }
11e1e08b 2551
6e9fd124 2552 g_configurationDone = true;
a227f47d 2553
6e9fd124
RG
2554 for(auto& frontend : g_frontends) {
2555 setUpLocalBind(frontend);
a227f47d 2556
6e9fd124
RG
2557 if (frontend->tcp == false) {
2558 ++udpBindsCount;
a227f47d
RG
2559 }
2560 else {
6e9fd124 2561 ++tcpBindsCount;
a227f47d
RG
2562 }
2563 }
2564
b82a127e 2565 warnlog("dnsdist %s comes with ABSOLUTELY NO WARRANTY. This is free software, and you are welcome to redistribute it according to the terms of the GPL version 2", VERSION);
9c9b4998 2566
b82a127e
RG
2567 vector<string> vec;
2568 std::string acls;
2569 g_ACL.getLocal()->toStringVector(&vec);
2570 for(const auto& s : vec) {
2571 if (!acls.empty())
2572 acls += ", ";
2573 acls += s;
b076b34a 2574 }
b82a127e 2575 infolog("ACL allowing queries from: %s", acls.c_str());
b5521206
RG
2576 vec.clear();
2577 acls.clear();
2578 g_consoleACL.getLocal()->toStringVector(&vec);
2579 for (const auto& entry : vec) {
2580 if (!acls.empty()) {
2581 acls += ", ";
2582 }
2583 acls += entry;
2584 }
2585 infolog("Console ACL allowing connections from: %s", acls.c_str());
6d01c80c 2586
9c9b4998
RG
2587#ifdef HAVE_LIBSODIUM
2588 if (g_consoleEnabled && g_consoleKey.empty()) {
2589 warnlog("Warning, the console has been enabled via 'controlSocket()' but no key has been set with 'setKey()' so all connections will fail until a key has been set");
2590 }
2591#endif
2592
a65aec1f
PL
2593 uid_t newgid=getegid();
2594 gid_t newuid=geteuid();
aac59883
RG
2595
2596 if(!g_cmdLine.gid.empty())
2597 newgid = strToGID(g_cmdLine.gid.c_str());
2598
2599 if(!g_cmdLine.uid.empty())
2600 newuid = strToUID(g_cmdLine.uid.c_str());
2601
d2138f20
PL
2602 if (getegid() != newgid) {
2603 if (running_in_service_mgr()) {
2604 errlog("--gid/-g set on command-line, but dnsdist was started as a systemd service. Use the 'Group' setting in the systemd unit file to set the group to run as");
2605 _exit(EXIT_FAILURE);
2606 }
a65aec1f 2607 dropGroupPrivs(newgid);
d2138f20
PL
2608 }
2609
2610 if (geteuid() != newuid) {
2611 if (running_in_service_mgr()) {
2612 errlog("--uid/-u set on command-line, but dnsdist was started as a systemd service. Use the 'User' setting in the systemd unit file to set the user to run as");
2613 _exit(EXIT_FAILURE);
2614 }
a65aec1f 2615 dropUserPrivs(newuid);
d2138f20 2616 }
a65aec1f 2617
fdc3ea42
RG
2618 try {
2619 /* we might still have capabilities remaining,
2620 for example if we have been started as root
2621 without --uid or --gid (please don't do that)
2622 or as an unprivileged user with ambient
2623 capabilities like CAP_NET_BIND_SERVICE.
2624 */
83fe2c55 2625 dropCapabilities(g_capabilitiesToRetain);
fdc3ea42
RG
2626 }
2627 catch(const std::exception& e) {
2628 warnlog("%s", e.what());
2629 }
aac59883 2630
a36ce055
RG
2631 /* this needs to be done _after_ dropping privileges */
2632 g_delay = new DelayPipe<DelayedPacket>();
2633
9f4eb5cc
RG
2634 if (g_snmpAgent) {
2635 g_snmpAgent->run();
2636 }
2637
1f7646c2 2638 g_tcpclientthreads = std::unique_ptr<TCPClientCollection>(new TCPClientCollection(g_maxTCPClientThreads, g_useTCPSinglePipe));
a9bf3ec4 2639
2e72cc0e 2640 for(auto& t : todo)
2641 t();
2642
636cc544 2643 localPools = g_pools.getCopy();
8f4f5186
RG
2644 /* create the default pool no matter what */
2645 createPoolIfNotExists(localPools, "");
7cc68f53 2646 if(g_cmdLine.remotes.size()) {
2647 for(const auto& address : g_cmdLine.remotes) {
c9262563 2648 auto ret=std::make_shared<DownstreamState>(ComboAddress(address, 53));
886e2cf2 2649 addServerToPool(localPools, "", ret);
5d7e6765 2650 if (ret->connected && !ret->threadStarted.test_and_set()) {
2717a92f 2651 ret->tid = thread(responderThread, ret);
7565f4e6 2652 }
ecbe9133 2653 g_dstates.modify([ret](servers_t& servers) { servers.push_back(ret); });
6d01c80c 2654 }
2655 }
886e2cf2 2656 g_pools.setState(localPools);
6d01c80c 2657
a9c2e4ab 2658 if(g_dstates.getLocal()->empty()) {
e73ec7d3 2659 errlog("No downstream servers defined: all packets will get dropped");
2660 // you might define them later, but you need to know
2661 }
2662
41408d3a
RG
2663 checkFileDescriptorsLimits(udpBindsCount, tcpBindsCount);
2664
dd9c8246 2665 auto mplexer = std::shared_ptr<FDMultiplexer>(FDMultiplexer::getMultiplexerSilent());
e5a14b2b 2666 for(auto& dss : g_dstates.getCopy()) { // it is a copy, but the internal shared_ptrs are the real deal
dd9c8246
RG
2667 if (dss->availability == DownstreamState::Availability::Auto) {
2668 if (!queueHealthCheck(mplexer, dss, true)) {
2669 dss->upStatus = false;
2670 warnlog("Marking downstream %s as 'down'", dss->getNameWithAddr());
2671 }
773470ca 2672 }
2673 }
dd9c8246 2674 handleQueuedHealthChecks(mplexer, true);
b076b34a 2675
6e9fd124 2676 for(auto& cs : g_frontends) {
fbf14b03
RG
2677 if (cs->dohFrontend != nullptr) {
2678#ifdef HAVE_DNS_OVER_HTTPS
2679 std::thread t1(dohThread, cs.get());
2b0cb8f8
RG
2680 if (!cs->cpus.empty()) {
2681 mapThreadToCPUList(t1.native_handle(), cs->cpus);
2682 }
fbf14b03
RG
2683 t1.detach();
2684#endif /* HAVE_DNS_OVER_HTTPS */
2685 continue;
2686 }
a36ce055 2687 if (cs->udpFD >= 0) {
6e9fd124 2688 thread t1(udpClientThread, cs.get());
f0e4dcba
RG
2689 if (!cs->cpus.empty()) {
2690 mapThreadToCPUList(t1.native_handle(), cs->cpus);
2691 }
a36ce055 2692 t1.detach();
652a7355 2693 }
a36ce055 2694 else if (cs->tcpFD >= 0) {
6e9fd124 2695 thread t1(tcpAcceptorThread, cs.get());
f0e4dcba
RG
2696 if (!cs->cpus.empty()) {
2697 mapThreadToCPUList(t1.native_handle(), cs->cpus);
2698 }
a36ce055 2699 t1.detach();
726ddf60 2700 }
24d5cb00 2701 }
7730131a 2702
42fae326 2703 thread carbonthread(carbonDumpThread);
2704 carbonthread.detach();
2705
3c115e0f 2706 thread stattid(maintThread);
886e2cf2 2707 stattid.detach();
6d01c80c 2708
886e2cf2
RG
2709 thread healththread(healthChecksThread);
2710
5d4e1ef8
RG
2711 if (!g_secPollSuffix.empty()) {
2712 thread secpollthread(secPollThread);
2713 secpollthread.detach();
2714 }
2715
b82a127e 2716 if(g_cmdLine.beSupervised) {
6ab65223
PL
2717#ifdef HAVE_SYSTEMD
2718 sd_notify(0, "READY=1");
2719#endif
886e2cf2 2720 healththread.join();
773470ca 2721 }
6d01c80c 2722 else {
886e2cf2 2723 healththread.detach();
505ca3d1 2724 doConsole();
3c115e0f 2725 }
9cf811d1 2726 _exit(EXIT_SUCCESS);
3c115e0f 2727
6d01c80c 2728}
3f5c3f1d
PD
2729catch(const LuaContext::ExecutionErrorException& e) {
2730 try {
2731 errlog("Fatal Lua error: %s", e.what());
2732 std::rethrow_if_nested(e);
2010ac95
RG
2733 } catch(const std::exception& ne) {
2734 errlog("Details: %s", ne.what());
3f5c3f1d
PD
2735 }
2736 catch(PDNSException &ae)
2737 {
2738 errlog("Fatal pdns error: %s", ae.reason);
2739 }
2740 _exit(EXIT_FAILURE);
2741}
24d5cb00 2742catch(std::exception &e)
2743{
6d01c80c 2744 errlog("Fatal error: %s", e.what());
4a966472 2745 _exit(EXIT_FAILURE);
24d5cb00 2746}
3f81d239 2747catch(PDNSException &ae)
7730131a 2748{
6d01c80c 2749 errlog("Fatal pdns error: %s", ae.reason);
4a966472 2750 _exit(EXIT_FAILURE);
7730131a 2751}
eb0335ff
MC
2752
2753uint64_t getLatencyCount(const std::string&)
2754{
2755 return g_stats.responses + g_stats.selfAnswered + g_stats.cacheHits;
2756}