]> git.ipfire.org Git - thirdparty/pdns.git/blame - pdns/dnsdist.cc
dnsdist: Add initial DNS over TLS support
[thirdparty/pdns.git] / pdns / dnsdist.cc
CommitLineData
24d5cb00 1/*
12471842
PL
2 * This file is part of PowerDNS or dnsdist.
3 * Copyright -- PowerDNS.COM B.V. and its contributors
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of version 2 of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * In addition, for the avoidance of any doubt, permission is granted to
10 * link this program with OpenSSL and to (re)distribute the binaries
11 * produced as the result of such linking.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
21 */
df111b53 22#include "dnsdist.hh"
ca404e94 23#include "dnsdist-ecs.hh"
24d5cb00 24#include "sstuff.hh"
25#include "misc.hh"
24d5cb00 26#include <netinet/tcp.h>
2c5db3f7 27#include <limits>
64e4ebb4 28#include "dolog.hh"
424bdfb1
PD
29
30#if defined (__OpenBSD__)
31#include <readline/readline.h>
32#else
33#include <editline/readline.h>
34#endif
35
3c115e0f 36#include "dnsname.hh"
773470ca 37#include "dnswriter.hh"
6d01c80c 38#include "base64.hh"
3f25bce6 39#include <fstream>
7b3865cd 40#include "delaypipe.hh"
7cc68f53 41#include <unistd.h>
6d01c80c 42#include "sodcrypto.hh"
6bb38cd6 43#include "dnsdist-lua.hh"
a36ce055
RG
44#include <grp.h>
45#include <pwd.h>
0e41337b 46#include "lock.hh"
7cc68f53 47#include <getopt.h>
41408d3a 48#include <sys/resource.h>
886e2cf2 49#include "dnsdist-cache.hh"
85c7ca75 50#include "gettime.hh"
6bb38cd6 51#include "ednsoptions.hh"
3f25bce6 52
6ab65223
PL
53#ifdef HAVE_SYSTEMD
54#include <systemd/sd-daemon.h>
55#endif
56
0beaa5c8
RG
57#ifdef HAVE_PROTOBUF
58thread_local boost::uuids::random_generator t_uuidGenerator;
59#endif
60
/* Known sins:

   The receiver is currently single-threaded.
   That is not *that* bad actually, but now that we are thread safe, we might want to scale.
*/
24d5cb00 66
/* The "rulaction" plan:
   A set of Rules; if one matches, it leads to an Action.
   Both rules and actions could conceivably be Lua based.
   On the C++ side, both could be inherited from a class Rule and a class Action;
   on the Lua side we can't do that. */
72
64e4ebb4 73using std::atomic;
74using std::thread;
7730131a 75bool g_verbose;
b065f701 76
e48090d1 77struct DNSDistStats g_stats;
13fe35db 78uint16_t g_maxOutstanding{10240};
b076b34a 79bool g_console;
9e87dcb8 80bool g_verboseHealthChecks{false};
1ea747c0 81uint32_t g_staleCacheEntriesTTL{0};
bbfaaa6f 82bool g_syslog{true};
cffde2fd 83
84GlobalStateHolder<NetmaskGroup> g_ACL;
c9262563 85string g_outputBuffer;
a227f47d 86
f0e4dcba 87vector<std::tuple<ComboAddress, bool, bool, int, string, std::set<int>>> g_locals;
a227f47d 88std::vector<std::shared_ptr<TLSFrontend>> g_tlslocals;
11e1e08b 89#ifdef HAVE_DNSCRYPT
f0e4dcba 90std::vector<std::tuple<ComboAddress,DnsCryptContext,bool, int, string, std::set<int>>> g_dnsCryptLocals;
11e1e08b 91#endif
87b515ed
RG
92#ifdef HAVE_EBPF
93shared_ptr<BPFFilter> g_defaultBPFFilter;
8429ad04 94std::vector<std::shared_ptr<DynBPFFilter> > g_dynBPFFilters;
87b515ed 95#endif /* HAVE_EBPF */
963bef8d 96vector<ClientState *> g_frontends;
886e2cf2 97GlobalStateHolder<pools_t> g_pools;
0beaa5c8 98size_t g_udpVectorSize{1};
7c0860e1 99
9f4eb5cc
RG
100bool g_snmpEnabled{false};
101bool g_snmpTrapsEnabled{false};
102DNSDistSNMPAgent* g_snmpAgent{nullptr};
103
/* UDP: the grand design. Per socket we listen on for incoming queries there is one thread.
   Then we have a bunch of connected sockets for talking to downstream servers.
   We send directly to those sockets.

   For the return path, per downstream server we have a thread that listens to responses.

   Per socket there is an array of 2^16 states. When we send out a packet downstream, we note
   there the original requestor and the original id. The new ID is the offset in the array.

   When an answer comes in on a socket, we look up the offset by the id, and lob it to the
   original requestor.

   IDs are assigned by atomic increments of the socket offset.
 */
118
/* For our load balancing, we want to support:
   Round-robin
   Round-robin with basic uptime checks
   Send to the least loaded server (least outstanding)
   Send to the first server that is not overloaded
   Hashed weighted random
*/
126
/* Idea:
   Multiple server groups; by default we load balance to the group with no name.
   Each instance is either 'up', 'down' or 'auto', where 'auto' means that dnsdist
   determines if the instance is up or not. Auto should be the default and very very good.

   In addition, to each instance you can attach a QPS object with rate & burst, which will optionally
   limit the number of queries we send there.

   If all downstreams are over QPS, we pick the fastest server. */
136
0940e4eb 137GlobalStateHolder<vector<pair<std::shared_ptr<DNSRule>, std::shared_ptr<DNSAction> > > > g_rulactions;
8146444b 138GlobalStateHolder<vector<pair<std::shared_ptr<DNSRule>, std::shared_ptr<DNSResponseAction> > > > g_resprulactions;
cf48b0ce 139GlobalStateHolder<vector<pair<std::shared_ptr<DNSRule>, std::shared_ptr<DNSResponseAction> > > > g_cachehitresprulactions;
df111b53 140Rings g_rings;
786e4d8c 141QueryCount g_qcount;
7c0860e1 142
ecbe9133 143GlobalStateHolder<servers_t> g_dstates;
78ffa782 144GlobalStateHolder<NetmaskTree<DynBlock>> g_dynblockNMG;
71c94675 145GlobalStateHolder<SuffixMatchTree<DynBlock>> g_dynblockSMT;
dd46e5e3 146DNSAction::Action g_dynBlockAction = DNSAction::Action::Drop;
3f6d07a4
RG
147int g_tcpRecvTimeout{2};
148int g_tcpSendTimeout{2};
e0b5e49d 149int g_udpTimeout{2};
3f6d07a4 150
26a3cdb7 151bool g_servFailOnNoPolicy{false};
e72fbfc4 152bool g_truncateTC{false};
b29edbee 153bool g_fixupCase{0};
0beaa5c8
RG
154
155static const size_t s_udpIncomingBufferSize{1500};
156
b50ee135 157static void truncateTC(const char* packet, uint16_t* len)
6ad8b29a 158try
159{
160 unsigned int consumed;
ca404e94 161 DNSName qname(packet, *len, sizeof(dnsheader), false, 0, 0, &consumed);
a683e8bd 162 *len=(uint16_t) (sizeof(dnsheader)+consumed+DNS_TYPE_SIZE+DNS_CLASS_SIZE);
6ad8b29a 163 struct dnsheader* dh =(struct dnsheader*)packet;
164 dh->ancount = dh->arcount = dh->nscount=0;
165}
166catch(...)
167{
168 g_stats.truncFail++;
169}
170
7b3865cd 171struct DelayedPacket
172{
173 int fd;
174 string packet;
175 ComboAddress destination;
68f3eb4d 176 ComboAddress origDest;
7b3865cd 177 void operator()()
178 {
20b019fa
RG
179 ssize_t res;
180 if(origDest.sin4.sin_family == 0) {
181 res = sendto(fd, packet.c_str(), packet.size(), 0, (struct sockaddr*)&destination, destination.getSocklen());
182 }
183 else {
184 res = sendfromto(fd, packet.c_str(), packet.size(), 0, origDest, destination);
185 }
186 if (res == -1) {
187 int err = errno;
188 vinfolog("Error sending delayed response to %s: %s", destination.toStringWithPort(), strerror(err));
189 }
7b3865cd 190 }
191};
192
a36ce055 193DelayPipe<DelayedPacket> * g_delay = 0;
7b3865cd 194
daacd477 195static void doLatencyAverages(double udiff)
196{
197 auto doAvg = [](double& var, double n, double weight) {
198 var = (weight -1) * var/weight + n/weight;
199 };
200
201 doAvg(g_stats.latencyAvg100, udiff, 100);
202 doAvg(g_stats.latencyAvg1000, udiff, 1000);
203 doAvg(g_stats.latencyAvg10000, udiff, 10000);
204 doAvg(g_stats.latencyAvg1000000, udiff, 1000000);
205}
ca404e94 206
fcffc585
RG
207bool responseContentMatches(const char* response, const uint16_t responseLen, const DNSName& qname, const uint16_t qtype, const uint16_t qclass, const ComboAddress& remote)
208{
209 uint16_t rqtype, rqclass;
210 unsigned int consumed;
211 DNSName rqname;
212 const struct dnsheader* dh = (struct dnsheader*) response;
213
214 if (responseLen < sizeof(dnsheader)) {
215 return false;
216 }
217
c8c3d4e4
RG
218 if (dh->qdcount == 0) {
219 if (dh->rcode != RCode::NoError && dh->rcode != RCode::NXDomain) {
220 return true;
221 }
222 else {
223 g_stats.nonCompliantResponses++;
224 return false;
225 }
226 }
227
fcffc585
RG
228 try {
229 rqname=DNSName(response, responseLen, sizeof(dnsheader), false, &rqtype, &rqclass, &consumed);
230 }
231 catch(std::exception& e) {
232 if(responseLen > (ssize_t)sizeof(dnsheader))
233 infolog("Backend %s sent us a response with id %d that did not parse: %s", remote.toStringWithPort(), ntohs(dh->id), e.what());
234 g_stats.nonCompliantResponses++;
235 return false;
236 }
237
238 if (rqtype != qtype || rqclass != qclass || rqname != qname) {
239 return false;
240 }
241
242 return true;
243}
244
0f72fd5c 245void restoreFlags(struct dnsheader* dh, uint16_t origFlags)
fcffc585
RG
246{
247 static const uint16_t rdMask = 1 << FLAGS_RD_OFFSET;
248 static const uint16_t cdMask = 1 << FLAGS_CD_OFFSET;
249 static const uint16_t restoreFlagsMask = UINT16_MAX & ~(rdMask | cdMask);
0f72fd5c
RG
250 uint16_t * flags = getFlagsFromDNSHeader(dh);
251 /* clear the flags we are about to restore */
252 *flags &= restoreFlagsMask;
253 /* only keep the flags we want to restore */
254 origFlags &= ~restoreFlagsMask;
255 /* set the saved flags as they were */
256 *flags |= origFlags;
257}
258
ff73f02b 259bool fixUpResponse(char** response, uint16_t* responseLen, size_t* responseSize, const DNSName& qname, uint16_t origFlags, bool ednsAdded, bool ecsAdded, std::vector<uint8_t>& rewrittenResponse, uint16_t addRoom)
0f72fd5c 260{
fcffc585
RG
261 struct dnsheader* dh = (struct dnsheader*) *response;
262
263 if (*responseLen < sizeof(dnsheader)) {
264 return false;
265 }
266
c8c3d4e4
RG
267 restoreFlags(dh, origFlags);
268
269 if (*responseLen == sizeof(dnsheader)) {
270 return true;
271 }
272
fcffc585
RG
273 if(g_fixupCase) {
274 string realname = qname.toDNSString();
275 if (*responseLen >= (sizeof(dnsheader) + realname.length())) {
276 memcpy(*response + sizeof(dnsheader), realname.c_str(), realname.length());
277 }
278 }
279
ff73f02b
RG
280 if (ednsAdded || ecsAdded) {
281 char * optStart = NULL;
fcffc585
RG
282 size_t optLen = 0;
283 bool last = false;
284
285 int res = locateEDNSOptRR(*response, *responseLen, &optStart, &optLen, &last);
286
287 if (res == 0) {
ff73f02b
RG
288 if (ednsAdded) {
289 /* we added the entire OPT RR,
290 therefore we need to remove it entirely */
291 if (last) {
292 /* simply remove the last AR */
293 *responseLen -= optLen;
294 uint16_t arcount = ntohs(dh->arcount);
295 arcount--;
296 dh->arcount = htons(arcount);
297 }
298 else {
299 /* Removing an intermediary RR could lead to compression error */
300 if (rewriteResponseWithoutEDNS(*response, *responseLen, rewrittenResponse) == 0) {
301 *responseLen = rewrittenResponse.size();
302 if (addRoom && (UINT16_MAX - *responseLen) > addRoom) {
303 rewrittenResponse.reserve(*responseLen + addRoom);
304 }
305 *responseSize = rewrittenResponse.capacity();
306 *response = reinterpret_cast<char*>(rewrittenResponse.data());
307 }
308 else {
309 warnlog("Error rewriting content");
310 }
311 }
fcffc585
RG
312 }
313 else {
ff73f02b
RG
314 /* the OPT RR was already present, but without ECS,
315 we need to remove the ECS option if any */
316 if (last) {
317 /* nothing after the OPT RR, we can simply remove the
318 ECS option */
319 size_t existingOptLen = optLen;
320 removeEDNSOptionFromOPT(optStart, &optLen, EDNSOptionCode::ECS);
321 *responseLen -= (existingOptLen - optLen);
fcffc585
RG
322 }
323 else {
ff73f02b
RG
324 /* Removing an intermediary RR could lead to compression error */
325 if (rewriteResponseWithoutEDNSOption(*response, *responseLen, EDNSOptionCode::ECS, rewrittenResponse) == 0) {
326 *responseLen = rewrittenResponse.size();
327 if (addRoom && (UINT16_MAX - *responseLen) > addRoom) {
328 rewrittenResponse.reserve(*responseLen + addRoom);
329 }
330 *responseSize = rewrittenResponse.capacity();
331 *response = reinterpret_cast<char*>(rewrittenResponse.data());
332 }
333 else {
334 warnlog("Error rewriting content");
335 }
fcffc585
RG
336 }
337 }
338 }
339 }
340
341 return true;
342}
343
#ifdef HAVE_DNSCRYPT
/* Encrypt the response in place when it answers a DNSCrypt query.
   Since encryption overwrites the header, a clear-text copy is first stored
   in dhCopy and *dh redirected to it (when all three pointers are usable).
   Returns true when there is nothing to encrypt or encryption succeeded,
   false when the response must be dropped. */
bool encryptResponse(char* response, uint16_t* responseLen, size_t responseSize, bool tcp, std::shared_ptr<DnsCryptQuery> dnsCryptQuery, dnsheader** dh, dnsheader* dhCopy)
{
  if (!dnsCryptQuery) {
    return true;
  }

  /* save the original header before encrypting it in place */
  if (dh != nullptr && *dh != nullptr && dhCopy != nullptr) {
    memcpy(dhCopy, *dh, sizeof(dnsheader));
    *dh = dhCopy;
  }

  uint16_t encryptedResponseLen = 0;
  int res = dnsCryptQuery->ctx->encryptResponse(response, *responseLen, responseSize, dnsCryptQuery, tcp, &encryptedResponseLen);
  if (res != 0) {
    /* dropping response */
    vinfolog("Error encrypting the response, dropping.");
    return false;
  }

  *responseLen = encryptedResponseLen;
  return true;
}
#endif
368
0f72fd5c
RG
369static bool sendUDPResponse(int origFD, char* response, uint16_t responseLen, int delayMsec, const ComboAddress& origDest, const ComboAddress& origRemote)
370{
fcffc585
RG
371 if(delayMsec && g_delay) {
372 DelayedPacket dp{origFD, string(response,responseLen), origRemote, origDest};
373 g_delay->submit(dp, delayMsec);
374 }
375 else {
20b019fa
RG
376 ssize_t res;
377 if(origDest.sin4.sin_family == 0) {
378 res = sendto(origFD, response, responseLen, 0, (struct sockaddr*)&origRemote, origRemote.getSocklen());
379 }
380 else {
381 res = sendfromto(origFD, response, responseLen, 0, origDest, origRemote);
382 }
383 if (res == -1) {
384 int err = errno;
385 vinfolog("Error sending response to %s: %s", origRemote.toStringWithPort(), strerror(err));
386 }
fcffc585
RG
387 }
388
389 return true;
390}
391
4632045b 392// listens on a dedicated socket, lobs answers from downstream servers to original requestors
3c115e0f 393void* responderThread(std::shared_ptr<DownstreamState> state)
66d1c0bf 394try {
d8c19b98 395 auto localRespRulactions = g_resprulactions.getLocal();
11e1e08b
RG
396#ifdef HAVE_DNSCRYPT
397 char packet[4096 + DNSCRYPT_MAX_RESPONSE_PADDING_AND_MAC_SIZE];
57847d65
RG
398 /* when the answer is encrypted in place, we need to get a copy
399 of the original header before encryption to fill the ring buffer */
400 dnsheader dhCopy;
11e1e08b 401#else
64e4ebb4 402 char packet[4096];
11e1e08b 403#endif
b50ee135 404 static_assert(sizeof(packet) <= UINT16_MAX, "Packet size should fit in a uint16_t");
ca404e94 405 vector<uint8_t> rewrittenResponse;
7267213a 406
66d1c0bf 407 uint16_t queryId = 0;
7730131a 408 for(;;) {
57847d65 409 dnsheader* dh = reinterpret_cast<struct dnsheader*>(packet);
df560083 410 bool outstandingDecreased = false;
66d1c0bf
RG
411 try {
412 ssize_t got = recv(state->fd, packet, sizeof(packet), 0);
413 char * response = packet;
414 size_t responseSize = sizeof(packet);
ca404e94 415
66d1c0bf
RG
416 if (got < (ssize_t) sizeof(dnsheader))
417 continue;
4f380af3 418
66d1c0bf
RG
419 uint16_t responseLen = (uint16_t) got;
420 queryId = dh->id;
b50ee135 421
66d1c0bf
RG
422 if(queryId >= state->idStates.size())
423 continue;
4f380af3 424
66d1c0bf
RG
425 IDState* ids = &state->idStates[queryId];
426 int origFD = ids->origFD;
c9ba8478 427
66d1c0bf
RG
428 if(origFD < 0) // duplicate
429 continue;
7267213a 430
66d1c0bf
RG
431 /* setting age to 0 to prevent the maintainer thread from
432 cleaning this IDS while we process the response.
433 We have already a copy of the origFD, so it would
434 mostly mess up the outstanding counter.
435 */
436 ids->age = 0;
fcffc585 437
66d1c0bf
RG
438 if (!responseContentMatches(response, responseLen, ids->qname, ids->qtype, ids->qclass, state->remote)) {
439 continue;
440 }
7267213a 441
66d1c0bf 442 --state->outstanding; // you'd think an attacker could game this, but we're using connected socket
df560083 443 outstandingDecreased = true;
2c5db3f7 444
66d1c0bf
RG
445 if(dh->tc && g_truncateTC) {
446 truncateTC(response, &responseLen);
447 }
aeb36780 448
66d1c0bf 449 dh->id = ids->origID;
ca404e94 450
66d1c0bf
RG
451 uint16_t addRoom = 0;
452 DNSResponse dr(&ids->qname, ids->qtype, ids->qclass, &ids->origDest, &ids->origRemote, dh, sizeof(packet), responseLen, false, &ids->sentTime.d_start);
d8c19b98 453#ifdef HAVE_PROTOBUF
66d1c0bf 454 dr.uniqueId = ids->uniqueId;
d8c19b98 455#endif
a76b0d63
RG
456 dr.qTag = ids->qTag;
457
66d1c0bf
RG
458 if (!processResponse(localRespRulactions, dr, &ids->delayMsec)) {
459 continue;
460 }
d8c19b98 461
11e1e08b 462#ifdef HAVE_DNSCRYPT
66d1c0bf
RG
463 if (ids->dnsCryptQuery) {
464 addRoom = DNSCRYPT_MAX_RESPONSE_PADDING_AND_MAC_SIZE;
465 }
11e1e08b 466#endif
66d1c0bf
RG
467 if (!fixUpResponse(&response, &responseLen, &responseSize, ids->qname, ids->origFlags, ids->ednsAdded, ids->ecsAdded, rewrittenResponse, addRoom)) {
468 continue;
469 }
ca404e94 470
66d1c0bf 471 if (ids->packetCache && !ids->skipCache) {
2714396e 472 ids->packetCache->insert(ids->cacheKey, ids->qname, ids->qtype, ids->qclass, response, responseLen, false, dh->rcode);
66d1c0bf 473 }
886e2cf2 474
b5b93e0b 475 if (ids->cs && !ids->cs->muted) {
11e1e08b 476#ifdef HAVE_DNSCRYPT
57847d65 477 if (!encryptResponse(response, &responseLen, responseSize, false, ids->dnsCryptQuery, &dh, &dhCopy)) {
b5b93e0b
RG
478 continue;
479 }
11e1e08b 480#endif
b5b93e0b
RG
481
482 ComboAddress empty;
483 empty.sin4.sin_family = 0;
484 /* if ids->destHarvested is false, origDest holds the listening address.
485 We don't want to use that as a source since it could be 0.0.0.0 for example. */
486 sendUDPResponse(origFD, response, responseLen, ids->delayMsec, ids->destHarvested ? ids->origDest : empty, ids->origRemote);
487 }
66d1c0bf
RG
488
489 g_stats.responses++;
490
491 double udiff = ids->sentTime.udiff();
492 vinfolog("Got answer from %s, relayed to %s, took %f usec", state->remote.toStringWithPort(), ids->origRemote.toStringWithPort(), udiff);
493
494 {
495 struct timespec ts;
496 gettime(&ts);
497 std::lock_guard<std::mutex> lock(g_rings.respMutex);
498 g_rings.respRing.push_back({ts, ids->origRemote, ids->qname, ids->qtype, (unsigned int)udiff, (unsigned int)got, *dh, state->remote});
499 }
500
501 if(dh->rcode == RCode::ServFail)
502 g_stats.servfailResponses++;
503 state->latencyUsec = (127.0 * state->latencyUsec / 128.0) + udiff/128.0;
504
505 if(udiff < 1000) g_stats.latency0_1++;
506 else if(udiff < 10000) g_stats.latency1_10++;
507 else if(udiff < 50000) g_stats.latency10_50++;
508 else if(udiff < 100000) g_stats.latency50_100++;
509 else if(udiff < 1000000) g_stats.latency100_1000++;
510 else g_stats.latencySlow++;
42fae326 511
66d1c0bf 512 doLatencyAverages(udiff);
fcadd56e 513
66d1c0bf 514 if (ids->origFD == origFD) {
11e1e08b 515#ifdef HAVE_DNSCRYPT
0beaa5c8 516 ids->dnsCryptQuery = nullptr;
11e1e08b 517#endif
66d1c0bf 518 ids->origFD = -1;
df560083 519 outstandingDecreased = false;
66d1c0bf 520 }
ca404e94 521
66d1c0bf
RG
522 rewrittenResponse.clear();
523 }
df560083 524 catch(const std::exception& e){
66d1c0bf 525 vinfolog("Got an error in UDP responder thread while parsing a response from %s, id %d: %s", state->remote.toStringWithPort(), queryId, e.what());
df560083
RG
526 if (outstandingDecreased) {
527 /* so an exception was raised after we decreased the outstanding queries counter,
528 but before we could set ids->origFD to -1 (because we also set outstandingDecreased
fe0b6bba 529 to false then), meaning the IDS is still considered active and we will decrease the
df560083
RG
530 counter again on a duplicate, or simply while reaping downstream timeouts, so let's
531 increase it back. */
532 state->outstanding++;
533 }
66d1c0bf 534 }
7730131a 535 }
536 return 0;
537}
66d1c0bf
RG
538catch(const std::exception& e)
539{
540 errlog("UDP responder thread died because of exception: %s", e.what());
541 return 0;
542}
543catch(const PDNSException& e)
544{
545 errlog("UDP responder thread died because of PowerDNS exception: %s", e.reason);
546 return 0;
547}
548catch(...)
549{
550 errlog("UDP responder thread died because of an exception: %s", "unknown");
551 return 0;
552}
24d5cb00 553
b58f08e5 554void DownstreamState::reconnect()
3c115e0f 555{
b58f08e5
RG
556 connected = false;
557 if (fd != -1) {
558 /* shutdown() is needed to wake up recv() in the responderThread */
559 shutdown(fd, SHUT_RDWR);
560 close(fd);
561 fd = -1;
562 }
f99e3aaf
RG
563 if (!IsAnyAddress(remote)) {
564 fd = SSocket(remote.sin4.sin_family, SOCK_DGRAM, 0);
565 if (!IsAnyAddress(sourceAddr)) {
566 SSetsockopt(fd, SOL_SOCKET, SO_REUSEADDR, 1);
567 SBind(fd, sourceAddr);
568 }
7565f4e6
RG
569 try {
570 SConnect(fd, remote);
571 connected = true;
572 }
573 catch(const std::runtime_error& error) {
574 infolog("Error connecting to new server with address %s: %s", remote.toStringWithPort(), error.what());
575 }
b58f08e5
RG
576 }
577}
578
579DownstreamState::DownstreamState(const ComboAddress& remote_, const ComboAddress& sourceAddr_, unsigned int sourceItf_): remote(remote_), sourceAddr(sourceAddr_), sourceItf(sourceItf_)
580{
581 if (!IsAnyAddress(remote)) {
582 reconnect();
f99e3aaf
RG
583 idStates.resize(g_maxOutstanding);
584 sw.start();
585 infolog("Added downstream server %s", remote.toStringWithPort());
fbe2a2e0 586 }
3c115e0f 587}
588
773470ca 589std::mutex g_luamutex;
3f25bce6 590LuaContext g_lua;
3f25bce6 591
ecbe9133 592GlobalStateHolder<ServerPolicy> g_policy;
773470ca 593
497a6e3a 594shared_ptr<DownstreamState> firstAvailable(const NumberedServerVector& servers, const DNSQuestion* dq)
773470ca 595{
22b2b326 596 for(auto& d : servers) {
da4e7813 597 if(d.second->isUp() && d.second->qps.check())
598 return d.second;
773470ca 599 }
497a6e3a 600 return leastOutstanding(servers, dq);
2c5db3f7 601}
602
d1fba58e 603// get server with least outstanding queries, and within those, with the lowest order, and within those: the fastest
497a6e3a 604shared_ptr<DownstreamState> leastOutstanding(const NumberedServerVector& servers, const DNSQuestion* dq)
c9262563 605{
886e2cf2
RG
606 if (servers.size() == 1 && servers[0].second->isUp()) {
607 return servers[0].second;
608 }
609
d1fba58e 610 vector<pair<tuple<int,int,double>, shared_ptr<DownstreamState>>> poss;
611 /* so you might wonder, why do we go through this trouble? The data on which we sort could change during the sort,
612 which would suck royally and could even lead to crashes. So first we snapshot on what we sort, and then we sort */
478ce172 613 poss.reserve(servers.size());
0e41337b 614 for(auto& d : servers) {
da4e7813 615 if(d.second->isUp()) {
d1fba58e 616 poss.push_back({make_tuple(d.second->outstanding.load(), d.second->order, d.second->latencyUsec), d.second});
c9262563 617 }
618 }
619 if(poss.empty())
620 return shared_ptr<DownstreamState>();
621 nth_element(poss.begin(), poss.begin(), poss.end(), [](const decltype(poss)::value_type& a, const decltype(poss)::value_type& b) { return a.first < b.first; });
622 return poss.begin()->second;
623}
624
497a6e3a 625shared_ptr<DownstreamState> valrandom(unsigned int val, const NumberedServerVector& servers, const DNSQuestion* dq)
c9262563 626{
627 vector<pair<int, shared_ptr<DownstreamState>>> poss;
628 int sum=0;
22b2b326 629 for(auto& d : servers) { // w=1, w=10 -> 1, 11
da4e7813 630 if(d.second->isUp()) {
631 sum+=d.second->weight;
632 poss.push_back({sum, d.second});
c9262563 633 }
634 }
d2894425
NJ
635
636 // Catch poss & sum are empty to avoid SIGFPE
637 if(poss.empty())
638 return shared_ptr<DownstreamState>();
639
a7f3108c 640 int r = val % sum;
af619119 641 auto p = upper_bound(poss.begin(), poss.end(),r, [](int r_, const decltype(poss)::value_type& a) { return r_ < a.first;});
c9262563 642 if(p==poss.end())
643 return shared_ptr<DownstreamState>();
644 return p->second;
645}
646
497a6e3a 647shared_ptr<DownstreamState> wrandom(const NumberedServerVector& servers, const DNSQuestion* dq)
a7f3108c 648{
497a6e3a 649 return valrandom(random(), servers, dq);
a7f3108c 650}
651
36e763fa 652uint32_t g_hashperturb;
497a6e3a 653shared_ptr<DownstreamState> whashed(const NumberedServerVector& servers, const DNSQuestion* dq)
a7f3108c 654{
497a6e3a 655 return valrandom(dq->qname->hash(g_hashperturb), servers, dq);
a7f3108c 656}
657
658
497a6e3a 659shared_ptr<DownstreamState> roundrobin(const NumberedServerVector& servers, const DNSQuestion* dq)
5f504638 660{
da4e7813 661 NumberedServerVector poss;
c9262563 662
22b2b326 663 for(auto& d : servers) {
da4e7813 664 if(d.second->isUp()) {
5f504638 665 poss.push_back(d);
c9262563 666 }
5f504638 667 }
c9262563 668
ecbe9133 669 const auto *res=&poss;
5f504638 670 if(poss.empty())
ecbe9133 671 res = &servers;
c9262563 672
673 if(res->empty())
674 return shared_ptr<DownstreamState>();
675
676 static unsigned int counter;
677
da4e7813 678 return (*res)[(counter++) % res->size()].second;
5f504638 679}
680
c7b1d943
PL
681static void writepid(string pidfile) {
682 if (!pidfile.empty()) {
683 // Clean up possible stale file
684 unlink(pidfile.c_str());
685
686 // Write the pidfile
687 ofstream of(pidfile.c_str());
688 if (of) {
689 of << getpid();
690 } else {
691 errlog("Unable to write PID-file to '%s'.", pidfile);
692 }
693 of.close();
694 }
695}
696
b076b34a 697static void daemonize(void)
698{
699 if(fork())
700 _exit(0); // bye bye
c7b1d943
PL
701 /* We are child */
702
b076b34a 703 setsid();
704
705 int i=open("/dev/null",O_RDWR); /* open stdin */
706 if(i < 0)
707 ; // L<<Logger::Critical<<"Unable to open /dev/null: "<<stringerror()<<endl;
708 else {
709 dup2(i,0); /* stdin */
710 dup2(i,1); /* stderr */
711 dup2(i,2); /* stderr */
712 close(i);
713 }
714}
715
e6841c9e 716ComboAddress g_serverControl{"127.0.0.1:5199"};
b076b34a 717
886e2cf2
RG
718std::shared_ptr<ServerPool> createPoolIfNotExists(pools_t& pools, const string& poolName)
719{
720 std::shared_ptr<ServerPool> pool;
721 pools_t::iterator it = pools.find(poolName);
722 if (it != pools.end()) {
723 pool = it->second;
724 }
725 else {
8f4f5186
RG
726 if (!poolName.empty())
727 vinfolog("Creating pool %s", poolName);
886e2cf2
RG
728 pool = std::make_shared<ServerPool>();
729 pools.insert(std::pair<std::string,std::shared_ptr<ServerPool> >(poolName, pool));
730 }
731 return pool;
732}
78c8047b 733
742c079a
RG
734void setPoolPolicy(pools_t& pools, const string& poolName, std::shared_ptr<ServerPolicy> policy)
735{
736 std::shared_ptr<ServerPool> pool = createPoolIfNotExists(pools, poolName);
737 if (!poolName.empty()) {
738 vinfolog("Setting pool %s server selection policy to %s", poolName, policy->name);
739 } else {
740 vinfolog("Setting default pool server selection policy to %s", policy->name);
741 }
742 pool->policy = policy;
743}
744
886e2cf2 745void addServerToPool(pools_t& pools, const string& poolName, std::shared_ptr<DownstreamState> server)
22b2b326 746{
886e2cf2 747 std::shared_ptr<ServerPool> pool = createPoolIfNotExists(pools, poolName);
a683e8bd 748 unsigned int count = (unsigned int) pool->servers.size();
c218b223 749 if (!poolName.empty()) {
8f4f5186 750 vinfolog("Adding server to pool %s", poolName);
c218b223 751 } else {
8f4f5186 752 vinfolog("Adding server to default pool");
c218b223 753 }
886e2cf2 754 pool->servers.push_back(make_pair(++count, server));
d12cd8e9
RG
755 /* we need to reorder based on the server 'order' */
756 std::stable_sort(pool->servers.begin(), pool->servers.end(), [](const std::pair<unsigned int,std::shared_ptr<DownstreamState> >& a, const std::pair<unsigned int,std::shared_ptr<DownstreamState> >& b) {
757 return a.second->order < b.second->order;
758 });
759 /* and now we need to renumber for Lua (custom policies) */
760 size_t idx = 1;
af619119
RG
761 for (auto& serv : pool->servers) {
762 serv.first = idx++;
d12cd8e9 763 }
886e2cf2
RG
764}
765
766void removeServerFromPool(pools_t& pools, const string& poolName, std::shared_ptr<DownstreamState> server)
767{
8f4f5186 768 std::shared_ptr<ServerPool> pool = getPool(pools, poolName);
886e2cf2 769
c218b223 770 if (!poolName.empty()) {
8f4f5186 771 vinfolog("Removing server from pool %s", poolName);
c218b223 772 }
773 else {
8f4f5186 774 vinfolog("Removing server from default pool");
c218b223 775 }
886e2cf2 776
d12cd8e9
RG
777 size_t idx = 1;
778 bool found = false;
779 for (NumberedVector<shared_ptr<DownstreamState> >::iterator it = pool->servers.begin(); it != pool->servers.end();) {
780 if (found) {
781 /* we need to renumber the servers placed
782 after the removed one, for Lua (custom policies) */
783 it->first = idx++;
784 it++;
785 }
786 else if (it->second == server) {
787 it = pool->servers.erase(it);
788 found = true;
789 } else {
790 idx++;
791 it++;
886e2cf2
RG
792 }
793 }
794}
795
796std::shared_ptr<ServerPool> getPool(const pools_t& pools, const std::string& poolName)
797{
798 pools_t::const_iterator it = pools.find(poolName);
799
800 if (it == pools.end()) {
801 throw std::out_of_range("No pool named " + poolName);
802 }
803
804 return it->second;
805}
806
807const NumberedServerVector& getDownstreamCandidates(const pools_t& pools, const std::string& poolName)
808{
809 std::shared_ptr<ServerPool> pool = getPool(pools, poolName);
810 return pool->servers;
22b2b326 811}
c9262563 812
520eb5a0 813// goal in life - if you send us a reasonably normal packet, we'll get Z for you, otherwise 0
814int getEDNSZ(const char* packet, unsigned int len)
d08b1cdf 815try
520eb5a0 816{
817 struct dnsheader* dh =(struct dnsheader*)packet;
818
ca404e94 819 if(ntohs(dh->qdcount) != 1 || dh->ancount!=0 || ntohs(dh->arcount)!=1 || dh->nscount!=0)
520eb5a0 820 return 0;
ca404e94
RG
821
822 if (len <= sizeof(dnsheader))
823 return 0;
824
520eb5a0 825 unsigned int consumed;
ca404e94
RG
826 DNSName qname(packet, len, sizeof(dnsheader), false, 0, 0, &consumed);
827 size_t pos = consumed + DNS_TYPE_SIZE + DNS_CLASS_SIZE;
520eb5a0 828 uint16_t qtype, qclass;
829
ca404e94 830 if (len <= (sizeof(dnsheader)+pos))
520eb5a0 831 return 0;
832
ca404e94
RG
833 DNSName aname(packet, len, sizeof(dnsheader)+pos, true, &qtype, &qclass, &consumed);
834
835 if(qtype!=QType::OPT || sizeof(dnsheader)+pos+consumed+DNS_TYPE_SIZE+DNS_CLASS_SIZE+EDNS_EXTENDED_RCODE_SIZE+EDNS_VERSION_SIZE+1 >= len)
836 return 0;
837
838 uint8_t* z = (uint8_t*)packet+sizeof(dnsheader)+pos+consumed+DNS_TYPE_SIZE+DNS_CLASS_SIZE+EDNS_EXTENDED_RCODE_SIZE+EDNS_VERSION_SIZE;
520eb5a0 839 return 0x100 * (*z) + *(z+1);
840}
d08b1cdf 841catch(...)
842{
843 return 0;
844}
0940e4eb 845
614239d0 846static void spoofResponseFromString(DNSQuestion& dq, const string& spoofContent)
7791f83a
RG
847{
848 string result;
614239d0
RG
849
850 std::vector<std::string> addrs;
851 stringtok(addrs, spoofContent, " ,");
852
853 if (addrs.size() == 1) {
854 try {
855 ComboAddress spoofAddr(spoofContent);
856 SpoofAction sa({spoofAddr});
857 sa(&dq, &result);
858 }
859 catch(const PDNSException &e) {
860 SpoofAction sa(spoofContent); // CNAME then
861 sa(&dq, &result);
862 }
863 } else {
864 std::vector<ComboAddress> cas;
865 for (const auto& addr : addrs) {
866 try {
867 cas.push_back(ComboAddress(addr));
868 }
869 catch (...) {
870 }
871 }
872 SpoofAction sa(cas);
6ca7a40a 873 sa(&dq, &result);
7791f83a
RG
874 }
875}
876
0beaa5c8 877bool processQuery(LocalHolders& holders, DNSQuestion& dq, string& poolname, int* delayMsec, const struct timespec& now)
e91084ce
RG
878{
879 {
880 WriteLock wl(&g_rings.queryLock);
8146444b 881 g_rings.queryRing.push_back({now,*dq.remote,*dq.qname,dq.len,dq.qtype,*dq.dh});
e91084ce
RG
882 }
883
786e4d8c
RS
884 if(g_qcount.enabled) {
885 string qname = (*dq.qname).toString(".");
886 bool countQuery{true};
887 if(g_qcount.filter) {
888 std::lock_guard<std::mutex> lock(g_luamutex);
889 std::tie (countQuery, qname) = g_qcount.filter(dq);
890 }
891
892 if(countQuery) {
893 WriteLock wl(&g_qcount.queryLock);
894 if(!g_qcount.records.count(qname)) {
895 g_qcount.records[qname] = 0;
896 }
897 g_qcount.records[qname]++;
898 }
899 }
900
0beaa5c8 901 if(auto got = holders.dynNMGBlock->lookup(*dq.remote)) {
701f690b 902 auto updateBlockStats = [&got]() {
903 g_stats.dynBlocked++;
904 got->second.blocks++;
905 };
906
e91084ce 907 if(now < got->second.until) {
7b925432
RG
908 DNSAction::Action action = got->second.action;
909 if (action == DNSAction::Action::None) {
910 action = g_dynBlockAction;
911 }
912 if (action == DNSAction::Action::Refused) {
dd46e5e3 913 vinfolog("Query from %s refused because of dynamic block", dq.remote->toStringWithPort());
701f690b 914 updateBlockStats();
8477236d 915
dd46e5e3
RG
916 dq.dh->rcode = RCode::Refused;
917 dq.dh->qr=true;
918 return true;
919 }
8477236d 920 else if (action == DNSAction::Action::Truncate) {
921 if(!dq.tcp) {
701f690b 922 updateBlockStats();
8477236d 923 vinfolog("Query from %s truncated because of dynamic block", dq.remote->toStringWithPort());
924 dq.dh->tc = true;
925 dq.dh->qr = true;
926 return true;
927 }
928 else {
929 vinfolog("Query from %s for %s over TCP *not* truncated because of dynamic block", dq.remote->toStringWithPort(), dq.qname->toString());
930 }
931
c4f5aeff 932 }
dd46e5e3 933 else {
701f690b 934 updateBlockStats();
dd46e5e3
RG
935 vinfolog("Query from %s dropped because of dynamic block", dq.remote->toStringWithPort());
936 return false;
937 }
e91084ce
RG
938 }
939 }
940
0beaa5c8 941 if(auto got = holders.dynSMTBlock->lookup(*dq.qname)) {
701f690b 942 auto updateBlockStats = [&got]() {
943 g_stats.dynBlocked++;
944 got->blocks++;
945 };
946
71c94675 947 if(now < got->until) {
7b925432
RG
948 DNSAction::Action action = got->action;
949 if (action == DNSAction::Action::None) {
950 action = g_dynBlockAction;
951 }
952 if (action == DNSAction::Action::Refused) {
dd46e5e3 953 vinfolog("Query from %s for %s refused because of dynamic block", dq.remote->toStringWithPort(), dq.qname->toString());
701f690b 954 updateBlockStats();
8477236d 955
dd46e5e3
RG
956 dq.dh->rcode = RCode::Refused;
957 dq.dh->qr=true;
958 return true;
959 }
8477236d 960 else if (action == DNSAction::Action::Truncate) {
961 if(!dq.tcp) {
701f690b 962 updateBlockStats();
8477236d 963
964 vinfolog("Query from %s for %s truncated because of dynamic block", dq.remote->toStringWithPort(), dq.qname->toString());
965 dq.dh->tc = true;
966 dq.dh->qr = true;
967 return true;
968 }
969 else {
970 vinfolog("Query from %s for %s over TCP *not* truncated because of dynamic block", dq.remote->toStringWithPort(), dq.qname->toString());
971 }
c4f5aeff 972 }
dd46e5e3 973 else {
701f690b 974 updateBlockStats();
dd46e5e3
RG
975 vinfolog("Query from %s for %s dropped because of dynamic block", dq.remote->toStringWithPort(), dq.qname->toString());
976 return false;
977 }
71c94675 978 }
979 }
980
e91084ce
RG
981 DNSAction::Action action=DNSAction::Action::None;
982 string ruleresult;
0beaa5c8 983 for(const auto& lr : *holders.rulactions) {
e91084ce
RG
984 if(lr.first->matches(&dq)) {
985 lr.first->d_matches++;
986 action=(*lr.second)(&dq, &ruleresult);
987
988 switch(action) {
989 case DNSAction::Action::Allow:
990 return true;
991 break;
992 case DNSAction::Action::Drop:
993 g_stats.ruleDrop++;
994 return false;
995 break;
996 case DNSAction::Action::Nxdomain:
997 dq.dh->rcode = RCode::NXDomain;
998 dq.dh->qr=true;
999 g_stats.ruleNXDomain++;
1000 return true;
1001 break;
dd46e5e3
RG
1002 case DNSAction::Action::Refused:
1003 dq.dh->rcode = RCode::Refused;
1004 dq.dh->qr=true;
1005 g_stats.ruleRefused++;
1006 return true;
1007 break;
e91084ce
RG
1008 case DNSAction::Action::Spoof:
1009 spoofResponseFromString(dq, ruleresult);
1010 return true;
1011 break;
c4f5aeff
RG
1012 case DNSAction::Action::Truncate:
1013 dq.dh->tc = true;
1014 dq.dh->qr = true;
1015 return true;
1016 break;
e91084ce
RG
1017 case DNSAction::Action::HeaderModify:
1018 return true;
1019 break;
1020 case DNSAction::Action::Pool:
1021 poolname=ruleresult;
1022 return true;
1023 break;
1024 /* non-terminal actions follow */
1025 case DNSAction::Action::Delay:
1026 *delayMsec = static_cast<int>(pdns_stou(ruleresult)); // sorry
1027 break;
1028 case DNSAction::Action::None:
1029 break;
1030 }
1031 }
1032 }
1033
1034 return true;
1035}
1036
788c3243 1037bool processResponse(LocalStateHolder<vector<pair<std::shared_ptr<DNSRule>, std::shared_ptr<DNSResponseAction> > > >& localRespRulactions, DNSResponse& dr, int* delayMsec)
8146444b 1038{
788c3243 1039 DNSResponseAction::Action action=DNSResponseAction::Action::None;
8146444b
RG
1040 std::string ruleresult;
1041 for(const auto& lr : *localRespRulactions) {
1042 if(lr.first->matches(&dr)) {
1043 lr.first->d_matches++;
788c3243
RG
1044 action=(*lr.second)(&dr, &ruleresult);
1045 switch(action) {
1046 case DNSResponseAction::Action::Allow:
1047 return true;
1048 break;
1049 case DNSResponseAction::Action::Drop:
1050 return false;
1051 break;
1052 case DNSResponseAction::Action::HeaderModify:
1053 return true;
1054 break;
1055 /* non-terminal actions follow */
1056 case DNSResponseAction::Action::Delay:
1057 *delayMsec = static_cast<int>(pdns_stou(ruleresult)); // sorry
1058 break;
1059 case DNSResponseAction::Action::None:
1060 break;
1061 }
8146444b
RG
1062 }
1063 }
1064
1065 return true;
1066}
1067
fbe2a2e0
RG
1068static ssize_t udpClientSendRequestToBackend(DownstreamState* ss, const int sd, const char* request, const size_t requestLen)
1069{
b58f08e5
RG
1070 ssize_t result;
1071
fbe2a2e0 1072 if (ss->sourceItf == 0) {
b58f08e5
RG
1073 result = send(sd, request, requestLen, 0);
1074 }
1075 else {
1076 struct msghdr msgh;
1077 struct iovec iov;
1078 char cbuf[256];
1079 fillMSGHdr(&msgh, &iov, cbuf, sizeof(cbuf), const_cast<char*>(request), requestLen, &ss->remote);
1080 addCMsgSrcAddr(&msgh, cbuf, &ss->sourceAddr, ss->sourceItf);
1081 result = sendmsg(sd, &msgh, 0);
fbe2a2e0
RG
1082 }
1083
b58f08e5
RG
1084 if (result == -1) {
1085 int savederrno = errno;
1086 vinfolog("Error sending request to backend %s: %d", ss->remote.toStringWithPort(), savederrno);
1087
1088 /* This might sound silly, but on Linux send() might fail with EINVAL
1089 if the interface the socket was bound to doesn't exist anymore. */
1090 if (savederrno == EINVAL) {
1091 ss->reconnect();
1092 }
fbe2a2e0
RG
1093 }
1094
b58f08e5 1095 return result;
fbe2a2e0
RG
1096}
1097
0beaa5c8 1098static bool isUDPQueryAcceptable(ClientState& cs, LocalHolders& holders, const struct msghdr* msgh, const ComboAddress& remote, ComboAddress& dest)
24d5cb00 1099{
0beaa5c8
RG
1100 if (msgh->msg_flags & MSG_TRUNC) {
1101 /* message was too large for our buffer */
1102 vinfolog("Dropping message too large for our buffer");
1103 g_stats.nonCompliantQueries++;
1104 return false;
1105 }
a4652d55 1106
0beaa5c8
RG
1107 if(!holders.acl->match(remote)) {
1108 vinfolog("Query from %s dropped because of ACL", remote.toStringWithPort());
1109 g_stats.aclDrops++;
1110 return false;
2b3eefc3 1111 }
2b3eefc3 1112
0beaa5c8
RG
1113 cs.queries++;
1114 g_stats.queries++;
2b3eefc3 1115
0beaa5c8
RG
1116 if (HarvestDestinationAddress(msgh, &dest)) {
1117 /* we don't get the port, only the address */
1118 dest.sin4.sin_port = cs.local.sin4.sin_port;
1119 }
1120 else {
1121 dest.sin4.sin_family = 0;
2b3eefc3 1122 }
549d63c9 1123
0beaa5c8
RG
1124 return true;
1125}
1126
#ifdef HAVE_DNSCRYPT
/* When the frontend has a DNSCrypt context, hand the received query to
   handleDnsCryptQuery() and, on success, update 'len' with the size of the
   decrypted query. 'dnsCryptQuery' is set to a freshly allocated object.
   If the query could not be decrypted, any response produced by the DNSCrypt
   layer is sent back to the client and false is returned.
   Frontends without a DNSCrypt context always get true and 'len' untouched. */
static bool checkDNSCryptQuery(const ClientState& cs, const char* query, uint16_t& len, std::shared_ptr<DnsCryptQuery>& dnsCryptQuery, const ComboAddress& dest, const ComboAddress& remote)
{
  if (!cs.dnscryptCtx) {
    return true;
  }

  vector<uint8_t> response;
  uint16_t decryptedQueryLen = 0;

  dnsCryptQuery = std::make_shared<DnsCryptQuery>();

  const bool decrypted = handleDnsCryptQuery(cs.dnscryptCtx, const_cast<char*>(query), len, dnsCryptQuery, &decryptedQueryLen, false, response);

  if (decrypted) {
    len = decryptedQueryLen;
    return true;
  }

  if (!response.empty()) {
    sendUDPResponse(cs.udpFD, reinterpret_cast<char*>(response.data()), static_cast<uint16_t>(response.size()), 0, dest, remote);
  }
  return false;
}
#endif /* HAVE_DNSCRYPT */
2b3eefc3 1150
0beaa5c8
RG
1151bool checkQueryHeaders(const struct dnsheader* dh)
1152{
1153 if (dh->qr) { // don't respond to responses
1154 g_stats.nonCompliantQueries++;
1155 return false;
1156 }
2b3eefc3 1157
0beaa5c8
RG
1158 if (dh->qdcount == 0) {
1159 g_stats.emptyQueries++;
1160 return false;
1161 }
0ba5eecf 1162
0beaa5c8
RG
1163 if (dh->rd) {
1164 g_stats.rdQueries++;
1165 }
e91084ce 1166
0beaa5c8
RG
1167 return true;
1168}
963bef8d 1169
0beaa5c8
RG
#if defined(HAVE_RECVMMSG) && defined(HAVE_SENDMMSG) && defined(MSG_WAITFORONE)
/* Prepare 'outMsg' so that 'response' (responseLen bytes) can later be sent
   to 'remote' as part of a batch. 'iov' is pointed to the response buffer.
   When the destination address of the query is known (family != 0) it is
   attached as a control message built in 'cbuf'; otherwise no control data
   is attached. */
static void queueResponse(const ClientState& cs, const char* response, uint16_t responseLen, const ComboAddress& dest, const ComboAddress& remote, struct mmsghdr& outMsg, struct iovec* iov, char* cbuf)
{
  outMsg.msg_len = 0;
  fillMSGHdr(&outMsg.msg_hdr, iov, nullptr, 0, const_cast<char*>(response), responseLen, const_cast<ComboAddress*>(&remote));

  const bool destKnown = dest.sin4.sin_family != 0;
  if (destKnown) {
    addCMsgSrcAddr(&outMsg.msg_hdr, cbuf, &dest, 0);
  }
  else {
    outMsg.msg_hdr.msg_control = nullptr;
  }
}
#endif /* defined(HAVE_RECVMMSG) && defined(HAVE_SENDMMSG) && defined(MSG_WAITFORONE) */
43eeadc1 1184
0beaa5c8
RG
/* Process one query received over UDP on the frontend 'cs'.

   Steps, in order:
   - isUDPQueryAcceptable() (truncation, ACL, destination harvesting);
   - optional DNSCrypt decryption, which may change 'len';
   - checkQueryHeaders(), then parsing of the name, type and class;
   - processQuery(): the query may be dropped or turned into a response,
     in which case it is sent back (or queued) right away;
   - selection of a backend via the pool policy (or the global one),
     done while holding g_luamutex;
   - optional insertion of the EDNS Client Subnet option;
   - packet cache lookup; a hit is passed to the cache-hit response rules
     and answered directly;
   - if no backend was selected, an optional ServFail is generated;
   - otherwise the state needed to route the answer back is stored into an
     IDState slot and the query is sent to the backend with its ID replaced
     by the index of that slot.

   query / len      - buffer holding the query and the number of bytes used
   queryBufferSize  - passed to DNSQuestion as the size of that buffer
   responsesVect    - may be nullptr; when set, immediate (non-delayed)
                      responses are queued into it instead of being sent, and
                      queuedResponses, respIOV and respCBuf must be set too
                      (enforced by the assert below)

   std::exception errors raised while processing are caught and logged. */
1185static void processUDPQuery(ClientState& cs, LocalHolders& holders, const struct msghdr* msgh, const ComboAddress& remote, ComboAddress& dest, char* query, uint16_t len, size_t queryBufferSize, struct mmsghdr* responsesVect, unsigned int* queuedResponses, struct iovec* respIOV, char* respCBuf)
1186{
1187 assert(responsesVect == nullptr || (queuedResponses != nullptr && respIOV != nullptr && respCBuf != nullptr));
1188 uint16_t queryId = 0;
2b3eefc3 1189
0beaa5c8
RG
1190 try {
1191 if (!isUDPQueryAcceptable(cs, holders, msgh, remote, dest)) {
1192 return;
1193 }
7cea4e39 1194
11e1e08b 1195#ifdef HAVE_DNSCRYPT
0beaa5c8 1196 std::shared_ptr<DnsCryptQuery> dnsCryptQuery = nullptr;
11e1e08b 1197
0beaa5c8
RG
1198 if (!checkDNSCryptQuery(cs, query, len, dnsCryptQuery, dest, remote)) {
1199 return;
1200 }
11e1e08b
RG
1201#endif
1202
0beaa5c8
RG
1203 struct dnsheader* dh = reinterpret_cast<struct dnsheader*>(query);
/* remembered only so that the catch block below can log it */
1204 queryId = ntohs(dh->id);
11e1e08b 1205
0beaa5c8
RG
1206 if (!checkQueryHeaders(dh)) {
1207 return;
1208 }
2efd427d 1209
0beaa5c8
RG
/* keep the original flags: they are restored on self-generated responses
   and stored in the IDState for queries relayed to a backend */
1210 const uint16_t * flags = getFlagsFromDNSHeader(dh);
1211 const uint16_t origFlags = *flags;
1212 uint16_t qtype, qclass;
1213 unsigned int consumed = 0;
1214 DNSName qname(query, len, sizeof(dnsheader), false, &qtype, &qclass, &consumed);
/* the harvested destination is used as local address when available,
   the listening address of the frontend otherwise */
1215 DNSQuestion dq(&qname, qtype, qclass, dest.sin4.sin_family != 0 ? &dest : &cs.local, &remote, dh, queryBufferSize, len, false);
11e1e08b 1216
0beaa5c8
RG
1217 string poolname;
1218 int delayMsec = 0;
1219 /* we need an accurate ("real") value for the response and
1220 to store into the IDS, but not for insertion into the
1221 rings for example */
1222 struct timespec realTime;
1223 struct timespec now;
1224 gettime(&now);
1225 gettime(&realTime, true);
1226
1227 if (!processQuery(holders, dq, poolname, &delayMsec, now))
1228 {
1229 return;
1230 }
b63add03 1231
0beaa5c8
RG
1232 if(dq.dh->qr) { // something turned it into a response
1233 g_stats.selfAnswered++;
1234 restoreFlags(dh, origFlags);
1235
1236 if (!cs.muted) {
11e1e08b 1237 char* response = query;
497a6e3a 1238 uint16_t responseLen = dq.len;
11e1e08b 1239
fcffc585 1240#ifdef HAVE_DNSCRYPT
0beaa5c8
RG
1241 if (!encryptResponse(response, &responseLen, dq.size, false, dnsCryptQuery, nullptr, nullptr)) {
1242 return;
1243 }
11e1e08b 1244#endif
0beaa5c8
RG
1245#if defined(HAVE_RECVMMSG) && defined(HAVE_SENDMMSG) && defined(MSG_WAITFORONE)
/* only non-delayed responses can be batched */
1246 if (delayMsec == 0 && responsesVect != nullptr) {
1247 queueResponse(cs, response, responseLen, dest, remote, responsesVect[*queuedResponses], respIOV, respCBuf);
1248 (*queuedResponses)++;
1249 }
1250 else
1251#endif /* defined(HAVE_RECVMMSG) && defined(HAVE_SENDMMSG) && defined(MSG_WAITFORONE) */
1252 {
1253 sendUDPResponse(cs.udpFD, response, responseLen, delayMsec, dest, remote);
b5b93e0b 1254 }
22b2b326 1255 }
5f504638 1256
0beaa5c8
RG
1257 return;
1258 }
1259
/* backend selection: the policy attached to the pool, if any, overrides
   the global one; the policy is called with g_luamutex held */
1260 DownstreamState* ss = nullptr;
1261 std::shared_ptr<ServerPool> serverPool = getPool(*holders.pools, poolname);
1262 std::shared_ptr<DNSDistPacketCache> packetCache = nullptr;
1263 auto policy = holders.policy->policy;
1264 if (serverPool->policy != nullptr) {
1265 policy = serverPool->policy->policy;
1266 }
1267 {
1268 std::lock_guard<std::mutex> lock(g_luamutex);
1269 ss = policy(serverPool->servers, &dq).get();
1270 packetCache = serverPool->packetCache;
1271 }
228e4fe8 1272
0beaa5c8
RG
1273 bool ednsAdded = false;
1274 bool ecsAdded = false;
1275 if (dq.useECS && ss && ss->useECS) {
1276 if (!handleEDNSClientSubnet(query, dq.size, consumed, &dq.len, &(ednsAdded), &(ecsAdded), remote, dq.ecsOverride, dq.ecsPrefixLength)) {
126dcf66 1277 vinfolog("Dropping query from %s because we couldn't insert the ECS value", remote.toStringWithPort());
0beaa5c8 1278 return;
886e2cf2 1279 }
0beaa5c8 1280 }
886e2cf2 1281
0beaa5c8
RG
/* packet cache lookup: g_staleCacheEntriesTTL is passed as 'allowExpired'
   only when no backend could be selected, 0 otherwise */
1282 uint32_t cacheKey = 0;
1283 if (packetCache && !dq.skipCache) {
1284 uint16_t cachedResponseSize = dq.size;
1285 uint32_t allowExpired = ss ? 0 : g_staleCacheEntriesTTL;
1286 if (packetCache->get(dq, consumed, dh->id, query, &cachedResponseSize, &cacheKey, allowExpired)) {
1287 DNSResponse dr(dq.qname, dq.qtype, dq.qclass, dq.local, dq.remote, reinterpret_cast<dnsheader*>(query), dq.size, cachedResponseSize, false, &realTime);
cf48b0ce 1288#ifdef HAVE_PROTOBUF
0beaa5c8 1289 dr.uniqueId = dq.uniqueId;
cf48b0ce 1290#endif
a76b0d63
RG
1291 dr.qTag = dq.qTag;
1292
0beaa5c8
RG
1293 if (!processResponse(holders.cacheHitRespRulactions, dr, &delayMsec)) {
1294 return;
1295 }
cf48b0ce 1296
0beaa5c8 1297 if (!cs.muted) {
fcffc585 1298#ifdef HAVE_DNSCRYPT
0beaa5c8
RG
1299 if (!encryptResponse(query, &cachedResponseSize, dq.size, false, dnsCryptQuery, nullptr, nullptr)) {
1300 return;
1301 }
fcffc585 1302#endif
0beaa5c8
RG
1303#if defined(HAVE_RECVMMSG) && defined(HAVE_SENDMMSG) && defined(MSG_WAITFORONE)
1304 if (delayMsec == 0 && responsesVect != nullptr) {
1305 queueResponse(cs, query, cachedResponseSize, dest, remote, responsesVect[*queuedResponses], respIOV, respCBuf);
1306 (*queuedResponses)++;
1307 }
1308 else
1309#endif /* defined(HAVE_RECVMMSG) && defined(HAVE_SENDMMSG) && defined(MSG_WAITFORONE) */
1310 {
1311 sendUDPResponse(cs.udpFD, query, cachedResponseSize, delayMsec, dest, remote);
b5b93e0b 1312 }
886e2cf2 1313 }
0beaa5c8
RG
1314
1315 g_stats.cacheHits++;
1316 g_stats.latency0_1++; // we're not going to measure this
1317 doLatencyAverages(0); // same
1318 return;
886e2cf2 1319 }
0beaa5c8
RG
1320 g_stats.cacheMisses++;
1321 }
886e2cf2 1322
0beaa5c8
RG
/* no backend available and no usable cache entry: answer ServFail if
   configured to do so, otherwise the query is silently dropped */
1323 if(!ss) {
1324 g_stats.noPolicy++;
26a3cdb7 1325
0beaa5c8
RG
1326 if (g_servFailOnNoPolicy && !cs.muted) {
1327 char* response = query;
1328 uint16_t responseLen = dq.len;
1329 restoreFlags(dh, origFlags);
26a3cdb7 1330
0beaa5c8
RG
1331 dq.dh->rcode = RCode::ServFail;
1332 dq.dh->qr = true;
26a3cdb7
RG
1333
1334#ifdef HAVE_DNSCRYPT
0beaa5c8
RG
1335 if (!encryptResponse(response, &responseLen, dq.size, false, dnsCryptQuery, nullptr, nullptr)) {
1336 return;
1337 }
26a3cdb7 1338#endif
0beaa5c8
RG
1339#if defined(HAVE_RECVMMSG) && defined(HAVE_SENDMMSG) && defined(MSG_WAITFORONE)
1340 if (responsesVect != nullptr) {
1341 queueResponse(cs, response, responseLen, dest, remote, responsesVect[*queuedResponses], respIOV, respCBuf);
1342 (*queuedResponses)++;
1343 }
1344 else
1345#endif /* defined(HAVE_RECVMMSG) && defined(HAVE_SENDMMSG) && defined(MSG_WAITFORONE) */
1346 {
1347 sendUDPResponse(cs.udpFD, response, responseLen, 0, dest, remote);
26a3cdb7 1348 }
1ea747c0 1349 }
b9c3c5df 1350 vinfolog("%s query for %s|%s from %s, no policy applied", g_servFailOnNoPolicy ? "ServFailed" : "Dropped", dq.qname->toString(), QType(dq.qtype).getName(), remote.toStringWithPort());
0beaa5c8
RG
1351 return;
1352 }
1ea747c0 1353
0beaa5c8 1354 ss->queries++;
11e1e08b 1355
0beaa5c8
RG
/* take the next IDState slot of this backend, wrapping around; the index
   of the slot is used below as the ID of the query sent to the backend */
1356 unsigned int idOffset = (ss->idOffset++) % ss->idStates.size();
1357 IDState* ids = &ss->idStates[idOffset];
1358 ids->age = 0;
c9ba8478 1359
0beaa5c8
RG
/* a slot whose origFD is still >= 0 is being reused before an answer
   came back for it: that is accounted for as a downstream timeout */
1360 if(ids->origFD < 0) // if we are reusing, no change in outstanding
1361 ss->outstanding++;
1362 else {
1363 ss->reuseds++;
1364 g_stats.downstreamTimeouts++;
1365 }
0e41337b 1366
0beaa5c8
RG
1367 ids->cs = &cs;
1368 ids->origFD = cs.udpFD;
1369 ids->origID = dh->id;
1370 ids->origRemote = remote;
1371 ids->sentTime.set(realTime);
1372 ids->qname = qname;
1373 ids->qtype = dq.qtype;
1374 ids->qclass = dq.qclass;
1375 ids->delayMsec = delayMsec;
1376 ids->origFlags = origFlags;
1377 ids->cacheKey = cacheKey;
1378 ids->skipCache = dq.skipCache;
1379 ids->packetCache = packetCache;
1380 ids->ednsAdded = ednsAdded;
1381 ids->ecsAdded = ecsAdded;
a76b0d63 1382 ids->qTag = dq.qTag;
0beaa5c8
RG
1383
1384 /* If we couldn't harvest the real dest addr, still
1385 write down the listening addr since it will be useful
1386 (especially if it's not an 'any' one).
1387 We need to keep track of which one it is since we may
1388 want to use the real but not the listening addr to reply.
1389 */
1390 if (dest.sin4.sin_family != 0) {
1391 ids->origDest = dest;
1392 ids->destHarvested = true;
1393 }
1394 else {
1395 ids->origDest = cs.local;
1396 ids->destHarvested = false;
1397 }
11e1e08b 1398#ifdef HAVE_DNSCRYPT
0beaa5c8 1399 ids->dnsCryptQuery = dnsCryptQuery;
d8c19b98
RG
1400#endif
1401#ifdef HAVE_PROTOBUF
0beaa5c8 1402 ids->uniqueId = dq.uniqueId;
11e1e08b 1403#endif
c9ba8478 1404
/* the original ID was saved in ids->origID above */
0beaa5c8 1405 dh->id = idOffset;
ca404e94 1406
0beaa5c8 1407 ssize_t ret = udpClientSendRequestToBackend(ss, ss->fd, query, dq.len);
11e1e08b 1408
0beaa5c8
RG
1409 if(ret < 0) {
1410 ss->sendErrors++;
1411 g_stats.downstreamSendErrors++;
1412 }
ca404e94 1413
0beaa5c8
RG
1414 vinfolog("Got query for %s|%s from %s, relayed to %s", ids->qname.toString(), QType(ids->qtype).getName(), remote.toStringWithPort(), ss->getName());
1415 }
1416 catch(const std::exception& e){
1417 vinfolog("Got an error in UDP question thread while parsing a query from %s, id %d: %s", remote.toStringWithPort(), queryId, e.what());
1418 }
1419}
1420
#if defined(HAVE_RECVMMSG) && defined(HAVE_SENDMMSG) && defined(MSG_WAITFORONE)
/* UDP frontend loop used when g_udpVectorSize > 1: receives up to
   g_udpVectorSize queries per recvmmsg() call, processes each of them with
   processUDPQuery() and sends the responses that could be generated
   immediately (rules, dynamic blocks, cache) with a single sendmmsg() call.
   Never returns. */
static void MultipleMessagesUDPClientThread(ClientState* cs, LocalHolders& holders)
{
  struct MMReceiver
  {
    char packet[4096];
    /* used by HarvestDestinationAddress */
    char cbuf[256];
    ComboAddress remote;
    ComboAddress dest;
    struct iovec iov;
  };
  const size_t vectSize = g_udpVectorSize;
  /* the actual buffer is larger because:
     - we may have to add EDNS and/or ECS
     - we use it for self-generated responses (from rule or cache)
     but we only accept incoming payloads up to that size
  */
  static_assert(s_udpIncomingBufferSize <= sizeof(MMReceiver::packet), "the incoming buffer size should not be larger than sizeof(MMReceiver::packet)");

  auto recvData = std::unique_ptr<MMReceiver[]>(new MMReceiver[vectSize]);
  auto msgVec = std::unique_ptr<struct mmsghdr[]>(new struct mmsghdr[vectSize]);
  auto outMsgVec = std::unique_ptr<struct mmsghdr[]>(new struct mmsghdr[vectSize]);

  /* initialize the structures needed to receive our messages */
  for (size_t idx = 0; idx < vectSize; idx++) {
    recvData[idx].remote.sin4.sin_family = cs->local.sin4.sin_family;
    fillMSGHdr(&msgVec[idx].msg_hdr, &recvData[idx].iov, recvData[idx].cbuf, sizeof(recvData[idx].cbuf), recvData[idx].packet, s_udpIncomingBufferSize, &recvData[idx].remote);
  }

  /* go now */
  for(;;) {

    /* reset the IO vector, since it's also used to send the vector of responses
       to avoid having to copy the data around.
       The length has to be reset to the incoming payload limit, exactly as
       in the initialization above, and not to the size of the whole buffer:
       otherwise the limit would silently change after the first batch and
       the room reserved for EDNS/ECS would be gone. */
    for (size_t idx = 0; idx < vectSize; idx++) {
      recvData[idx].iov.iov_base = recvData[idx].packet;
      recvData[idx].iov.iov_len = s_udpIncomingBufferSize;
    }

    /* block until we have at least one message ready, but return
       as many as possible to save the syscall costs */
    int msgsGot = recvmmsg(cs->udpFD, msgVec.get(), vectSize, MSG_WAITFORONE | MSG_TRUNC, nullptr);

    if (msgsGot <= 0) {
      vinfolog("Getting UDP messages via recvmmsg() failed with: %s", strerror(errno));
      continue;
    }

    unsigned int msgsToSend = 0;

    /* process the received messages */
    for (int msgIdx = 0; msgIdx < msgsGot; msgIdx++) {
      const struct msghdr* msgh = &msgVec[msgIdx].msg_hdr;
      unsigned int got = msgVec[msgIdx].msg_len;
      const ComboAddress& remote = recvData[msgIdx].remote;

      if (got < sizeof(struct dnsheader)) {
        g_stats.nonCompliantQueries++;
        continue;
      }

      /* the whole packet buffer is available to build the response, and the
         receive iovec / control buffer of this slot are reused to send it */
      processUDPQuery(*cs, holders, msgh, remote, recvData[msgIdx].dest, recvData[msgIdx].packet, static_cast<uint16_t>(got), sizeof(recvData[msgIdx].packet), outMsgVec.get(), &msgsToSend, &recvData[msgIdx].iov, recvData[msgIdx].cbuf);

    }

    /* immediate (not delayed or sent to a backend) responses (mostly from a rule, dynamic block
       or the cache) can be sent in batch too */

    if (msgsToSend > 0 && msgsToSend <= static_cast<unsigned int>(msgsGot)) {
      int sent = sendmmsg(cs->udpFD, outMsgVec.get(), msgsToSend, 0);

      if (sent < 0 || static_cast<unsigned int>(sent) != msgsToSend) {
        vinfolog("Error sending responses with sendmmsg() (%d on %u): %s", sent, msgsToSend, strerror(errno));
      }
    }

  }
}
#endif /* defined(HAVE_RECVMMSG) && defined(HAVE_SENDMMSG) && defined(MSG_WAITFORONE) */
1501
1502// listens to incoming queries, sends out to downstream servers, noting the intended return path
1503static void* udpClientThread(ClientState* cs)
1504try
1505{
1506 LocalHolders holders;
1507
1508#if defined(HAVE_RECVMMSG) && defined(HAVE_SENDMMSG) && defined(MSG_WAITFORONE)
1509 if (g_udpVectorSize > 1) {
1510 MultipleMessagesUDPClientThread(cs, holders);
1511
1512 }
1513 else
1514#endif /* defined(HAVE_RECVMMSG) && defined(HAVE_SENDMMSG) && defined(MSG_WAITFORONE) */
1515 {
1516 char packet[4096];
1517 /* the actual buffer is larger because:
1518 - we may have to add EDNS and/or ECS
1519 - we use it for self-generated responses (from rule or cache)
1520 but we only accept incoming payloads up to that size
1521 */
1522 static_assert(s_udpIncomingBufferSize <= sizeof(packet), "the incoming buffer size should not be larger than sizeof(MMReceiver::packet)");
1523 struct msghdr msgh;
1524 struct iovec iov;
1525 /* used by HarvestDestinationAddress */
1526 char cbuf[256];
1527
1528 ComboAddress remote;
1529 ComboAddress dest;
1530 remote.sin4.sin_family = cs->local.sin4.sin_family;
1531 fillMSGHdr(&msgh, &iov, cbuf, sizeof(cbuf), packet, sizeof(packet), &remote);
1532
1533 for(;;) {
1534 ssize_t got = recvmsg(cs->udpFD, &msgh, 0);
1535
1536 if (got < 0 || static_cast<size_t>(got) < sizeof(struct dnsheader)) {
1537 g_stats.nonCompliantQueries++;
1538 continue;
1539 }
1540
1541 processUDPQuery(*cs, holders, &msgh, remote, dest, packet, static_cast<uint16_t>(got), s_udpIncomingBufferSize, nullptr, nullptr, nullptr, nullptr);
1542 }
1543 }
1544
1545 return nullptr;
24d5cb00 1546}
2b3eefc3 1547catch(const std::exception &e)
a4652d55 1548{
1549 errlog("UDP client thread died because of exception: %s", e.what());
0beaa5c8 1550 return nullptr;
a4652d55 1551}
2b3eefc3 1552catch(const PDNSException &e)
a4652d55 1553{
1554 errlog("UDP client thread died because of PowerDNS exception: %s", e.reason);
0beaa5c8 1555 return nullptr;
a4652d55 1556}
1557catch(...)
1558{
1559 errlog("UDP client thread died because of an exception: %s", "unknown");
0beaa5c8 1560 return nullptr;
a4652d55 1561}
24d5cb00 1562
fbe2a2e0 1563static bool upCheck(DownstreamState& ds)
773470ca 1564try
1565{
1566 vector<uint8_t> packet;
de9f7157 1567 DNSPacketWriter dpw(packet, ds.checkName, ds.checkType.getCode(), ds.checkClass);
fc01e0b1
RG
1568 dnsheader * requestHeader = dpw.getHeader();
1569 requestHeader->rd=true;
21830638
RG
1570 if (ds.setCD) {
1571 requestHeader->cd = true;
1572 }
773470ca 1573
fbe2a2e0 1574 Socket sock(ds.remote.sin4.sin_family, SOCK_DGRAM);
773470ca 1575 sock.setNonBlocking();
fbe2a2e0
RG
1576 if (!IsAnyAddress(ds.sourceAddr)) {
1577 sock.setReuseAddr();
1578 sock.bind(ds.sourceAddr);
1579 }
1580 sock.connect(ds.remote);
1581 ssize_t sent = udpClientSendRequestToBackend(&ds, sock.getHandle(), (char*)&packet[0], packet.size());
9e87dcb8
RG
1582 if (sent < 0) {
1583 int ret = errno;
1584 if (g_verboseHealthChecks)
fd23d9bb 1585 infolog("Error while sending a health check query to backend %s: %d", ds.getNameWithAddr(), ret);
fbe2a2e0 1586 return false;
9e87dcb8 1587 }
fbe2a2e0 1588
773470ca 1589 int ret=waitForRWData(sock.getHandle(), true, 1, 0);
9e87dcb8
RG
1590 if(ret < 0 || !ret) { // error, timeout, both are down!
1591 if (ret < 0) {
1592 ret = errno;
1593 if (g_verboseHealthChecks)
fd23d9bb 1594 infolog("Error while waiting for the health check response from backend %s: %d", ds.getNameWithAddr(), ret);
9e87dcb8
RG
1595 }
1596 else {
1597 if (g_verboseHealthChecks)
fd23d9bb 1598 infolog("Timeout while waiting for the health check response from backend %s", ds.getNameWithAddr());
9e87dcb8 1599 }
773470ca 1600 return false;
9e87dcb8
RG
1601 }
1602
773470ca 1603 string reply;
fbe2a2e0 1604 sock.recvFrom(reply, ds.remote);
773470ca 1605
231d3b6b 1606 const dnsheader * responseHeader = (const dnsheader *) reply.c_str();
fc01e0b1 1607
9e87dcb8
RG
1608 if (reply.size() < sizeof(*responseHeader)) {
1609 if (g_verboseHealthChecks)
fd23d9bb 1610 infolog("Invalid health check response of size %d from backend %s, expecting at least %d", reply.size(), ds.getNameWithAddr(), sizeof(*responseHeader));
fc01e0b1 1611 return false;
9e87dcb8 1612 }
fc01e0b1 1613
9e87dcb8
RG
1614 if (responseHeader->id != requestHeader->id) {
1615 if (g_verboseHealthChecks)
fd23d9bb 1616 infolog("Invalid health check response id %d from backend %s, expecting %d", responseHeader->id, ds.getNameWithAddr(), requestHeader->id);
fc01e0b1 1617 return false;
9e87dcb8
RG
1618 }
1619
1620 if (!responseHeader->qr) {
1621 if (g_verboseHealthChecks)
fd23d9bb 1622 infolog("Invalid health check response from backend %s, expecting QR to be set", ds.getNameWithAddr());
fc01e0b1 1623 return false;
9e87dcb8
RG
1624 }
1625
1626 if (responseHeader->rcode == RCode::ServFail) {
1627 if (g_verboseHealthChecks)
fd23d9bb 1628 infolog("Backend %s responded to health check with ServFail", ds.getNameWithAddr());
fc01e0b1 1629 return false;
9e87dcb8
RG
1630 }
1631
1632 if (ds.mustResolve && (responseHeader->rcode == RCode::NXDomain || responseHeader->rcode == RCode::Refused)) {
1633 if (g_verboseHealthChecks)
fd23d9bb 1634 infolog("Backend %s responded to health check with %s while mustResolve is set", ds.getNameWithAddr(), responseHeader->rcode == RCode::NXDomain ? "NXDomain" : "Refused");
a6e02424 1635 return false;
9e87dcb8 1636 }
fc01e0b1 1637
773470ca 1638 // XXX fixme do bunch of checking here etc
1639 return true;
1640}
9e87dcb8
RG
1641catch(const std::exception& e)
1642{
1643 if (g_verboseHealthChecks)
fd23d9bb 1644 infolog("Error checking the health of backend %s: %s", ds.getNameWithAddr(), e.what());
9e87dcb8
RG
1645 return false;
1646}
773470ca 1647catch(...)
1648{
9e87dcb8 1649 if (g_verboseHealthChecks)
fd23d9bb 1650 infolog("Unknown exception while checking the health of backend %s", ds.getNameWithAddr());
773470ca 1651 return false;
1652}
726ddf60 1653
/* NOTE(review): not referenced in this part of the file; presumably the upper
   bound on the number of TCP client threads — confirm against its users. */
6c1ca990 1654uint64_t g_maxTCPClientThreads{10};
/* Number of iterations of the maintenance loop (which sleeps one second per
   iteration) between two purges of the expired packet cache entries. */
886e2cf2 1655std::atomic<uint16_t> g_cacheCleaningDelay{60};
/* Used by the maintenance loop to compute the value passed to purgeExpired():
   maxEntries * (100 - percentage) / 100. */
f65ea0c2 1656std::atomic<uint16_t> g_cacheCleaningPercentage{100};
e41f8165 1657
3c115e0f 1658void* maintThread()
886e2cf2
RG
1659{
1660 int interval = 1;
1661 size_t counter = 0;
5f3ea719 1662 int32_t secondsToWaitLog = 0;
886e2cf2
RG
1663
1664 for(;;) {
1665 sleep(interval);
1666
069e59db 1667 {
1668 std::lock_guard<std::mutex> lock(g_luamutex);
5f3ea719
PL
1669 auto f = g_lua.readVariable<boost::optional<std::function<void()> > >("maintenance");
1670 if(f) {
1671 try {
1672 (*f)();
1673 secondsToWaitLog = 0;
1674 }
1675 catch(std::exception &e) {
1676 if (secondsToWaitLog <= 0) {
1677 infolog("Error during execution of maintenance function: %s", e.what());
1678 secondsToWaitLog = 61;
1679 }
1680 secondsToWaitLog -= interval;
1681 }
1682 }
069e59db 1683 }
886e2cf2
RG
1684
1685 counter++;
1686 if (counter >= g_cacheCleaningDelay) {
1687 const auto localPools = g_pools.getCopy();
e6557ec8 1688 std::shared_ptr<DNSDistPacketCache> packetCache = nullptr;
886e2cf2 1689 for (const auto& entry : localPools) {
e6557ec8
RG
1690 {
1691 std::lock_guard<std::mutex> lock(g_luamutex);
1692 packetCache = entry.second->packetCache;
1693 }
1694 if (packetCache) {
f65ea0c2
RG
1695 size_t upTo = (packetCache->getMaxEntries()* (100 - g_cacheCleaningPercentage)) / 100;
1696 packetCache->purgeExpired(upTo);
886e2cf2
RG
1697 }
1698 }
1699 counter = 0;
1700 }
1701
1702 // ponder pruning g_dynblocks of expired entries here
1703 }
1704 return 0;
1705}
1706
/* Health check loop, never returns in practice.
   Once per second:
   - a TCP client thread is added if more than one connection is queued and
     the maximum number of threads has not been reached;
   - every backend whose availability is 'Auto' is checked with upCheck();
     a backend that is currently up is only marked down once
     currentCheckFailures has reached maxCheckFailures;
   - on a status change the event is logged, the backend socket is connected
     and a responder thread started if the backend was not connected yet,
     and an SNMP trap is sent when enabled;
   - the query load and drop rate of every backend are recomputed;
   - the in-flight queries (IDState) of every backend are aged, and the ones
     older than g_udpTimeout are released and recorded in the response ring. */
1707void* healthChecksThread()
3ae86514 1708{
fcadd56e 1709 int interval = 1;
64e4ebb4 1710
3ae86514 1711 for(;;) {
1712 sleep(interval);
773470ca 1713
ded1985a 1714 if(g_tcpclientthreads->getQueuedCount() > 1 && !g_tcpclientthreads->hasReachedMaxThreads())
a9bf3ec4 1715 g_tcpclientthreads->addTCPClientThread();
3ae86514 1716
e5a14b2b 1717 for(auto& dss : g_dstates.getCopy()) { // this points to the actual shared_ptrs!
773470ca 1718 if(dss->availability==DownstreamState::Availability::Auto) {
2993d58d 1719 bool newState=upCheck(*dss);
/* a successful check clears the count of consecutive failures */
1720 if (newState) {
1721 if (dss->currentCheckFailures != 0) {
1722 dss->currentCheckFailures = 0;
1723 }
1724 }
/* a failed check on a backend that is up: keep it up until the number of
   consecutive failures reaches maxCheckFailures */
1725 else if (!newState && dss->upStatus) {
1726 dss->currentCheckFailures++;
1727 if (dss->currentCheckFailures < dss->maxCheckFailures) {
1728 newState = true;
1729 }
1730 }
1731
1732 if(newState != dss->upStatus) {
1733 warnlog("Marking downstream %s as '%s'", dss->getNameWithAddr(), newState ? "up" : "down");
7565f4e6
RG
1734
/* the backend comes up but its socket is not connected yet: connect it and
   start its responder thread; if that fails the backend stays down */
1735 if (newState && !dss->connected) {
1736 try {
1737 SConnect(dss->fd, dss->remote);
1738 dss->connected = true;
2717a92f 1739 dss->tid = thread(responderThread, dss);
7565f4e6
RG
1740 }
1741 catch(const std::runtime_error& error) {
1742 infolog("Error connecting to new server with address %s: %s", dss->remote.toStringWithPort(), error.what());
b58f08e5
RG
1743 newState = false;
1744 dss->connected = false;
7565f4e6
RG
1745 }
1746 }
1747
2993d58d 1748 dss->upStatus = newState;
1749 dss->currentCheckFailures = 0;
9f4eb5cc
RG
1750 if (g_snmpAgent && g_snmpTrapsEnabled) {
1751 g_snmpAgent->sendBackendStatusChangeTrap(dss);
1752 }
2993d58d 1753 }
773470ca 1754 }
b076b34a 1755
/* udiffAndSet() is divided by one million here, i.e. delta is expected to be
   in seconds if the stopwatch reports microseconds — NOTE(review): confirm.
   The rates are the variation of the counters since the previous pass. */
773470ca 1756 auto delta = dss->sw.udiffAndSet()/1000000.0;
1757 dss->queryLoad = 1.0*(dss->queries.load() - dss->prev.queries.load())/delta;
1758 dss->dropRate = 1.0*(dss->reuseds.load() - dss->prev.reuseds.load())/delta;
3c115e0f 1759 dss->prev.queries.store(dss->queries.load());
773470ca 1760 dss->prev.reuseds.store(dss->reuseds.load());
3c115e0f 1761
773470ca 1762 for(IDState& ids : dss->idStates) { // timeouts
/* the age of every slot in use is incremented on each pass, whether or not
   it has timed out */
e0b5e49d 1763 if(ids.origFD >=0 && ids.age++ > g_udpTimeout) {
51642fe3
RG
1764 /* We set origFD to -1 as soon as possible
1765 to limit the risk of racing with the
1766 responder thread.
1767 The UDP client thread only checks origFD to
1768 know whether outstanding has to be incremented,
1769 so the sooner the better any way since we _will_
1770 decrement it.
1771 */
1772 ids.origFD = -1;
64e4ebb4 1773 ids.age = 0;
3c115e0f 1774 dss->reuseds++;
1775 --dss->outstanding;
51642fe3 1776 g_stats.downstreamTimeouts++; // this is an 'actively' discovered timeout
c08a5092 1777 vinfolog("Had a downstream timeout from %s (%s) for query for %s|%s from %s",
1778 dss->remote.toStringWithPort(), dss->name,
1779 ids.qname.toString(), QType(ids.qtype).getName(), ids.origRemote.toStringWithPort());
51642fe3 1780
c2fbeb27 1781 struct timespec ts;
85c7ca75 1782 gettime(&ts);
c2fbeb27 1783
/* only the ID of the original query is known at this point, the rest of
   the header recorded in the ring is zeroed */
2d11d1b2 1784 struct dnsheader fake;
1785 memset(&fake, 0, sizeof(fake));
1786 fake.id = ids.origID;
c2fbeb27 1787
/* NOTE(review): the fifth field, set to the largest unsigned int, is
   presumably the response time used to flag a timeout — confirm */
1788 std::lock_guard<std::mutex> lock(g_rings.respMutex);
1789 g_rings.respRing.push_back({ts, ids.origRemote, ids.qname, ids.qtype, std::numeric_limits<unsigned int>::max(), 0, fake, dss->remote});
232f0877 1790 }
3ae86514 1791 }
1792 }
3ae86514 1793 }
1794 return 0;
1795}
1796
8c732ebf 1797string g_key;
3ae86514 1798
6d01c80c 1799
1800void controlThread(int fd, ComboAddress local)
1801try
1802{
1803 ComboAddress client;
1804 int sock;
1805 warnlog("Accepting control connections on %s", local.toStringWithPort());
1806 while((sock=SAccept(fd, client)) >= 0) {
506bb661
RG
1807 if (g_logConsoleConnections) {
1808 warnlog("Got control connection from %s", client.toStringWithPort());
1809 }
1810
6d01c80c 1811 thread t(controlClientThread, sock, client);
1812 t.detach();
1813 }
1814}
1815catch(std::exception& e)
1816{
1817 close(fd);
1818 errlog("Control connection died: %s", e.what());
1819}
3f25bce6 1820
b076b34a 1821
3c115e0f 1822
c2a42f97 1823static void bindAny(int af, int sock)
1824{
18f8e493 1825 __attribute__((unused)) int one = 1;
c2a42f97 1826
1827#ifdef IP_FREEBIND
1828 if (setsockopt(sock, IPPROTO_IP, IP_FREEBIND, &one, sizeof(one)) < 0)
1829 warnlog("Warning: IP_FREEBIND setsockopt failed: %s", strerror(errno));
1830#endif
1831
1832#ifdef IP_BINDANY
1833 if (af == AF_INET)
1834 if (setsockopt(sock, IPPROTO_IP, IP_BINDANY, &one, sizeof(one)) < 0)
1835 warnlog("Warning: IP_BINDANY setsockopt failed: %s", strerror(errno));
1836#endif
1837#ifdef IPV6_BINDANY
1838 if (af == AF_INET6)
1839 if (setsockopt(sock, IPPROTO_IPV6, IPV6_BINDANY, &one, sizeof(one)) < 0)
1840 warnlog("Warning: IPV6_BINDANY setsockopt failed: %s", strerror(errno));
1841#endif
1842#ifdef SO_BINDANY
1843 if (setsockopt(sock, SOL_SOCKET, SO_BINDANY, &one, sizeof(one)) < 0)
1844 warnlog("Warning: SO_BINDANY setsockopt failed: %s", strerror(errno));
1845#endif
1846}
1847
a36ce055
RG
1848static void dropGroupPrivs(gid_t gid)
1849{
1850 if (gid) {
1851 if (setgid(gid) == 0) {
1852 if (setgroups(0, NULL) < 0) {
1853 warnlog("Warning: Unable to drop supplementary gids: %s", strerror(errno));
1854 }
1855 }
1856 else {
1857 warnlog("Warning: Unable to set group ID to %d: %s", gid, strerror(errno));
1858 }
1859 }
1860}
1861
1862static void dropUserPrivs(uid_t uid)
1863{
1864 if(uid) {
1865 if(setuid(uid) < 0) {
1866 warnlog("Warning: Unable to set user ID to %d: %s", uid, strerror(errno));
1867 }
1868 }
1869}
1870
41408d3a
RG
1871static void checkFileDescriptorsLimits(size_t udpBindsCount, size_t tcpBindsCount)
1872{
1873 /* stdin, stdout, stderr */
1874 size_t requiredFDsCount = 3;
1875 size_t backendsCount = g_dstates.getCopy().size();
9fcd6adb 1876 /* listening sockets */
41408d3a
RG
1877 requiredFDsCount += udpBindsCount;
1878 requiredFDsCount += tcpBindsCount;
1879 /* max TCP connections currently served */
1880 requiredFDsCount += g_maxTCPClientThreads;
f2e29d04 1881 /* max pipes for communicating between TCP acceptors and client threads */
41408d3a
RG
1882 requiredFDsCount += (g_maxTCPClientThreads * 2);
1883 /* UDP sockets to backends */
1884 requiredFDsCount += backendsCount;
1885 /* TCP sockets to backends */
1886 requiredFDsCount += (backendsCount * g_maxTCPClientThreads);
1887 /* max TCP queued connections */
9fcd6adb 1888 requiredFDsCount += g_maxTCPQueuedConnections;
41408d3a
RG
1889 /* DelayPipe pipe */
1890 requiredFDsCount += 2;
1891 /* syslog socket */
1892 requiredFDsCount++;
1893 /* webserver main socket */
1894 requiredFDsCount++;
1895 /* console main socket */
1896 requiredFDsCount++;
1897 /* carbon export */
1898 requiredFDsCount++;
1899 /* history file */
1900 requiredFDsCount++;
1901 struct rlimit rl;
1902 getrlimit(RLIMIT_NOFILE, &rl);
9fcd6adb 1903 if (rl.rlim_cur <= requiredFDsCount) {
41408d3a
RG
1904 warnlog("Warning, this configuration can use more than %d file descriptors, web server and console connections not included, and the current limit is %d.", std::to_string(requiredFDsCount), std::to_string(rl.rlim_cur));
1905#ifdef HAVE_SYSTEMD
1906 warnlog("You can increase this value by using LimitNOFILE= in the systemd unit file or ulimit.");
1907#else
1908 warnlog("You can increase this value by using ulimit.");
1909#endif
1910 }
1911}
c2a42f97 1912
7cc68f53 1913struct
1914{
1915 vector<string> locals;
1916 vector<string> remotes;
5efcfa63 1917 bool checkConfig{false};
7cc68f53 1918 bool beDaemon{false};
1919 bool beClient{false};
505ca3d1 1920 bool beSupervised{false};
c7b1d943 1921 string pidfile;
7cc68f53 1922 string command;
1923 string config;
a36ce055
RG
1924 string uid;
1925 string gid;
7cc68f53 1926} g_cmdLine;
520eb5a0 1927
e41f8165 1928std::atomic<bool> g_configurationDone{false};
520eb5a0 1929
24d5cb00 1930int main(int argc, char** argv)
1931try
1932{
41408d3a
RG
1933 size_t udpBindsCount = 0;
1934 size_t tcpBindsCount = 0;
94721140 1935 rl_attempted_completion_function = my_completion;
1936 rl_completion_append_character = 0;
1937
726ddf60 1938 signal(SIGPIPE, SIG_IGN);
6d01c80c 1939 signal(SIGCHLD, SIG_IGN);
b076b34a 1940 openlog("dnsdist", LOG_PID, LOG_DAEMON);
a4652d55 1941 g_console=true;
6d01c80c 1942
0b62ec78 1943#ifdef HAVE_LIBSODIUM
6d01c80c 1944 if (sodium_init() == -1) {
1945 cerr<<"Unable to initialize crypto library"<<endl;
1946 exit(EXIT_FAILURE);
1947 }
7691e7df 1948 g_hashperturb=randombytes_uniform(0xffffffff);
1949 srandom(randombytes_uniform(0xffffffff));
1950#else
1951 {
1952 struct timeval tv;
1953 gettimeofday(&tv, 0);
1954 srandom(tv.tv_sec ^ tv.tv_usec ^ getpid());
1955 g_hashperturb=random();
1956 }
1957
0b62ec78 1958#endif
094b6aff 1959 ComboAddress clientAddress = ComboAddress();
11058bd6 1960 g_cmdLine.config=SYSCONFDIR "/dnsdist.conf";
7cc68f53 1961 struct option longopts[]={
8f2d5ec3 1962 {"acl", required_argument, 0, 'a'},
7cc68f53 1963 {"config", required_argument, 0, 'C'},
5efcfa63 1964 {"check-config", 0, 0, 1},
7cc68f53 1965 {"execute", required_argument, 0, 'e'},
4fdb62a4 1966 {"client", 0, 0, 'c'},
a36ce055 1967 {"gid", required_argument, 0, 'g'},
ddb14ec9
PL
1968#ifdef HAVE_LIBSODIUM
1969 {"setkey", required_argument, 0, 'k'},
1970#endif
7cc68f53 1971 {"local", required_argument, 0, 'l'},
98dd1cda 1972 {"daemon", 0, 0, 'd'},
c7b1d943 1973 {"pidfile", required_argument, 0, 'p'},
505ca3d1 1974 {"supervised", 0, 0, 's'},
bbfaaa6f 1975 {"disable-syslog", 0, 0, 2},
a36ce055 1976 {"uid", required_argument, 0, 'u'},
359d7ca4 1977 {"verbose", 0, 0, 'v'},
6306c282 1978 {"version", 0, 0, 'V'},
c7b1d943 1979 {"help", 0, 0, 'h'},
7cc68f53 1980 {0,0,0,0}
1981 };
1982 int longindex=0;
8f2d5ec3 1983 string optstring;
7cc68f53 1984 for(;;) {
ddb14ec9
PL
1985#ifdef HAVE_LIBSODIUM
1986 int c=getopt_long(argc, argv, "a:hcde:C:k:l:vp:g:u:V", longopts, &longindex);
1987#else
6306c282 1988 int c=getopt_long(argc, argv, "a:hcde:C:l:vp:g:u:V", longopts, &longindex);
ddb14ec9 1989#endif
7cc68f53 1990 if(c==-1)
1991 break;
1992 switch(c) {
5efcfa63
PL
1993 case 1:
1994 g_cmdLine.checkConfig=true;
1995 break;
bbfaaa6f
PL
1996 case 2:
1997 g_syslog=false;
1998 break;
7cc68f53 1999 case 'C':
2000 g_cmdLine.config=optarg;
2001 break;
2002 case 'c':
2003 g_cmdLine.beClient=true;
2004 break;
2005 case 'd':
98dd1cda 2006 g_cmdLine.beDaemon=true;
7cc68f53 2007 break;
2008 case 'e':
2009 g_cmdLine.command=optarg;
2010 break;
a36ce055
RG
2011 case 'g':
2012 g_cmdLine.gid=optarg;
2013 break;
7cc68f53 2014 case 'h':
6306c282
PL
2015 cout<<"dnsdist "<<VERSION<<endl;
2016 cout<<endl;
094b6aff 2017 cout<<"Syntax: dnsdist [-C,--config file] [-c,--client [IP[:PORT]]] [-d,--daemon]\n";
c7b1d943 2018 cout<<"[-p,--pidfile file] [-e,--execute cmd] [-h,--help] [-l,--local addr]\n";
c68aae2e 2019 cout<<"[-v,--verbose] [--check-config]\n";
7cc68f53 2020 cout<<"\n";
8f2d5ec3 2021 cout<<"-a,--acl netmask Add this netmask to the ACL\n";
7cc68f53 2022 cout<<"-C,--config file Load configuration from 'file'\n";
094b6aff
PL
2023 cout<<"-c,--client Operate as a client, connect to dnsdist. This reads\n";
2024 cout<<" controlSocket from your configuration file, but also\n";
2025 cout<<" accepts an IP:PORT argument\n";
ddb14ec9
PL
2026#ifdef HAVE_LIBSODIUM
2027 cout<<"-k,--setkey KEY Use KEY for encrypted communication to dnsdist. This\n";
2028 cout<<" is similar to setting setKey in the configuration file.\n";
2029 cout<<" NOTE: this will leak this key in your shell's history!\n";
2030#endif
c68aae2e
PL
2031 cout<<"--check-config Validate the configuration file and exit. The exit-code\n";
2032 cout<<" reflects the validation, 0 is OK, 1 means an error.\n";
2033 cout<<" Any errors are printed as well.\n";
98dd1cda 2034 cout<<"-d,--daemon Operate as a daemon\n";
7cc68f53 2035 cout<<"-e,--execute cmd Connect to dnsdist and execute 'cmd'\n";
a36ce055 2036 cout<<"-g,--gid gid Change the process group ID after binding sockets\n";
7cc68f53 2037 cout<<"-h,--help Display this helpful message\n";
2038 cout<<"-l,--local address Listen on this local address\n";
505ca3d1
PL
2039 cout<<"--supervised Don't open a console, I'm supervised\n";
2040 cout<<" (use with e.g. systemd and daemontools)\n";
bbfaaa6f
PL
2041 cout<<"--disable-syslog Don't log to syslog, only to stdout\n";
2042 cout<<" (use with e.g. systemd)\n";
c7b1d943 2043 cout<<"-p,--pidfile file Write a pidfile, works only with --daemon\n";
a36ce055 2044 cout<<"-u,--uid uid Change the process user ID after binding sockets\n";
359d7ca4 2045 cout<<"-v,--verbose Enable verbose mode\n";
7cc68f53 2046 cout<<"\n";
2047 exit(EXIT_SUCCESS);
2048 break;
8f2d5ec3 2049 case 'a':
2050 optstring=optarg;
2051 g_ACL.modify([optstring](NetmaskGroup& nmg) { nmg.addMask(optstring); });
2052 break;
ddb14ec9
PL
2053#ifdef HAVE_LIBSODIUM
2054 case 'k':
6f6b4d69 2055 if (B64Decode(string(optarg), g_key) < 0) {
ddb14ec9
PL
2056 cerr<<"Unable to decode key '"<<optarg<<"'."<<endl;
2057 exit(EXIT_FAILURE);
2058 }
2059 break;
2060#endif
7cc68f53 2061 case 'l':
6a363878 2062 g_cmdLine.locals.push_back(trim_copy(string(optarg)));
7cc68f53 2063 break;
c7b1d943
PL
2064 case 'p':
2065 g_cmdLine.pidfile=optarg;
2cd28293 2066 break;
505ca3d1
PL
2067 case 's':
2068 g_cmdLine.beSupervised=true;
2cd28293 2069 break;
a36ce055
RG
2070 case 'u':
2071 g_cmdLine.uid=optarg;
2072 break;
7cc68f53 2073 case 'v':
2074 g_verbose=true;
2075 break;
6306c282 2076 case 'V':
d4d796e5
PD
2077#ifdef LUAJIT_VERSION
2078 cout<<"dnsdist "<<VERSION<<" ("<<LUA_RELEASE<<" ["<<LUAJIT_VERSION<<"])"<<endl;
2079#else
2080 cout<<"dnsdist "<<VERSION<<" ("<<LUA_RELEASE<<")"<<endl;
2081#endif
70829d97 2082 cout<<"Enabled features: ";
a227f47d
RG
2083#ifdef HAVE_DNS_OVER_TLS
2084 cout<<"dns-over-tls(";
2085#ifdef HAVE_GNUTLS
2086 cout<<"gnutls ";
2087#endif
2088#ifdef HAVE_LIBSSL
2089 cout<<"openssl";
2090#endif
2091 cout<<") ";
2092#endif
70829d97
PL
2093#ifdef HAVE_DNSCRYPT
2094 cout<<"dnscrypt ";
2095#endif
0beaa5c8
RG
2096#ifdef HAVE_EBPF
2097 cout<<"ebpf ";
2098#endif
70829d97
PL
2099#ifdef HAVE_LIBSODIUM
2100 cout<<"libsodium ";
2101#endif
2102#ifdef HAVE_PROTOBUF
2103 cout<<"protobuf ";
2104#endif
2105#ifdef HAVE_RE2
2106 cout<<"re2 ";
2107#endif
0beaa5c8
RG
2108#if defined(HAVE_RECVMMSG) && defined(HAVE_SENDMMSG) && defined(MSG_WAITFORONE)
2109 cout<<"recvmmsg/sendmmsg ";
2110#endif
2111#ifdef HAVE_NET_SNMP
2112 cout<<"snmp ";
2113#endif
70829d97
PL
2114#ifdef HAVE_SYSTEMD
2115 cout<<"systemd";
2116#endif
2117 cout<<endl;
6306c282
PL
2118 exit(EXIT_SUCCESS);
2119 break;
7cc68f53 2120 }
24d5cb00 2121 }
6ab65223 2122
7cc68f53 2123 argc-=optind;
2124 argv+=optind;
2125 for(auto p = argv; *p; ++p) {
094b6aff
PL
2126 if(g_cmdLine.beClient) {
2127 clientAddress = ComboAddress(*p, 5199);
2128 } else {
2129 g_cmdLine.remotes.push_back(*p);
2130 }
7cc68f53 2131 }
2132
cd29dcb1 2133 ServerPolicy leastOutstandingPol{"leastOutstanding", leastOutstanding};
2134
e5a14b2b 2135 g_policy.setState(leastOutstandingPol);
7cc68f53 2136 if(g_cmdLine.beClient || !g_cmdLine.command.empty()) {
2137 setupLua(true, g_cmdLine.config);
094b6aff
PL
2138 if (clientAddress != ComboAddress())
2139 g_serverControl = clientAddress;
7cc68f53 2140 doClient(g_serverControl, g_cmdLine.command);
e16fd59c 2141 _exit(EXIT_SUCCESS);
6d01c80c 2142 }
2e72cc0e 2143
8f133915 2144 auto acl = g_ACL.getCopy();
8f2d5ec3 2145 if(acl.empty()) {
2146 for(auto& addr : {"127.0.0.0/8", "10.0.0.0/8", "100.64.0.0/10", "169.254.0.0/16", "192.168.0.0/16", "172.16.0.0/12", "::1/128", "fc00::/7", "fe80::/10"})
2147 acl.addMask(addr);
2148 g_ACL.setState(acl);
2149 }
8f133915 2150
5efcfa63
PL
2151 if (g_cmdLine.checkConfig) {
2152 setupLua(true, g_cmdLine.config);
2153 // No exception was thrown
2154 infolog("Configuration '%s' OK!", g_cmdLine.config);
d8c19b98 2155 _exit(EXIT_SUCCESS);
5efcfa63
PL
2156 }
2157
7cc68f53 2158 auto todo=setupLua(false, g_cmdLine.config);
2e72cc0e 2159
7cc68f53 2160 if(g_cmdLine.locals.size()) {
2e72cc0e 2161 g_locals.clear();
7cc68f53 2162 for(auto loc : g_cmdLine.locals)
f0e4dcba 2163 g_locals.push_back(std::make_tuple(ComboAddress(loc, 53), true, false, 0, "", std::set<int>()));
2e72cc0e 2164 }
2165
2166 if(g_locals.empty())
f0e4dcba 2167 g_locals.push_back(std::make_tuple(ComboAddress("127.0.0.1", 53), true, false, 0, "", std::set<int>()));
2e72cc0e 2168
e41f8165
RG
2169 g_configurationDone = true;
2170
2e72cc0e 2171 vector<ClientState*> toLaunch;
2172 for(const auto& local : g_locals) {
2173 ClientState* cs = new ClientState;
4c34246d 2174 cs->local= std::get<0>(local);
2e72cc0e 2175 cs->udpFD = SSocket(cs->local.sin4.sin_family, SOCK_DGRAM, 0);
2176 if(cs->local.sin4.sin_family == AF_INET6) {
2177 SSetsockopt(cs->udpFD, IPPROTO_IPV6, IPV6_V6ONLY, 1);
2178 }
7cc68f53 2179 //if(g_vm.count("bind-non-local"))
4c34246d 2180 bindAny(cs->local.sin4.sin_family, cs->udpFD);
549d63c9 2181
915b0c39
AT
2182 // if (!setSocketTimestamps(cs->udpFD))
2183 // L<<Logger::Warning<<"Unable to enable timestamp reporting for socket"<<endl;
2184
549d63c9 2185
4c34246d 2186 if(IsAnyAddress(cs->local)) {
549d63c9 2187 int one=1;
2188 setsockopt(cs->udpFD, IPPROTO_IP, GEN_IP_PKTINFO, &one, sizeof(one)); // linux supports this, so why not - might fail on other systems
d98d3951 2189#ifdef IPV6_RECVPKTINFO
11e1e08b 2190 setsockopt(cs->udpFD, IPPROTO_IPV6, IPV6_RECVPKTINFO, &one, sizeof(one));
d98d3951 2191#endif
549d63c9 2192 }
9f67b883 2193
4c34246d 2194 if (std::get<2>(local)) {
5f084036 2195#ifdef SO_REUSEPORT
4c34246d 2196 SSetsockopt(cs->udpFD, SOL_SOCKET, SO_REUSEPORT, 1);
5f084036
RG
2197#else
2198 warnlog("SO_REUSEPORT has been configured on local address '%s' but is not supported", std::get<0>(local).toStringWithPort());
4c34246d 2199#endif
5f084036 2200 }
549d63c9 2201
9f67b883
RG
2202 const std::string& itf = std::get<4>(local);
2203 if (!itf.empty()) {
2204#ifdef SO_BINDTODEVICE
2205 int res = setsockopt(cs->udpFD, SOL_SOCKET, SO_BINDTODEVICE, itf.c_str(), itf.length());
2206 if (res != 0) {
2207 warnlog("Error setting up the interface on local address '%s': %s", std::get<0>(local).toStringWithPort(), strerror(errno));
2208 }
2209#else
2210 warnlog("An interface has been configured on local address '%s' but SO_BINDTODEVICE is not supported", std::get<0>(local).toStringWithPort());
2211#endif
2212 }
2213
87b515ed
RG
2214#ifdef HAVE_EBPF
2215 if (g_defaultBPFFilter) {
8429ad04 2216 cs->attachFilter(g_defaultBPFFilter);
87b515ed
RG
2217 vinfolog("Attaching default BPF Filter to UDP frontend %s", cs->local.toStringWithPort());
2218 }
2219#endif /* HAVE_EBPF */
2220
f0e4dcba
RG
2221 cs->cpus = std::get<5>(local);
2222
11e1e08b 2223 SBind(cs->udpFD, cs->local);
2e72cc0e 2224 toLaunch.push_back(cs);
963bef8d 2225 g_frontends.push_back(cs);
41408d3a 2226 udpBindsCount++;
2e72cc0e 2227 }
2228
a36ce055 2229 for(const auto& local : g_locals) {
4c34246d
RG
2230 if(!std::get<1>(local)) { // no TCP/IP
2231 warnlog("Not providing TCP/IP service on local address '%s'", std::get<0>(local).toStringWithPort());
a36ce055
RG
2232 continue;
2233 }
963bef8d 2234 ClientState* cs = new ClientState;
4c34246d 2235 cs->local= std::get<0>(local);
a36ce055
RG
2236
2237 cs->tcpFD = SSocket(cs->local.sin4.sin_family, SOCK_STREAM, 0);
2238
2239 SSetsockopt(cs->tcpFD, SOL_SOCKET, SO_REUSEADDR, 1);
2240#ifdef TCP_DEFER_ACCEPT
2241 SSetsockopt(cs->tcpFD, SOL_TCP,TCP_DEFER_ACCEPT, 1);
2242#endif
9e284f31 2243 if (std::get<3>(local) > 0) {
5f084036 2244#ifdef TCP_FASTOPEN
1654ece3 2245 SSetsockopt(cs->tcpFD, IPPROTO_TCP, TCP_FASTOPEN, std::get<3>(local));
5f084036
RG
2246#else
2247 warnlog("TCP Fast Open has been configured on local address '%s' but is not supported", std::get<0>(local).toStringWithPort());
a36ce055 2248#endif
5f084036 2249 }
a36ce055
RG
2250 if(cs->local.sin4.sin_family == AF_INET6) {
2251 SSetsockopt(cs->tcpFD, IPPROTO_IPV6, IPV6_V6ONLY, 1);
2252 }
4c34246d 2253#ifdef SO_REUSEPORT
5f084036 2254 /* no need to warn again if configured but support is not available, we already did for UDP */
4c34246d
RG
2255 if (std::get<2>(local)) {
2256 SSetsockopt(cs->tcpFD, SOL_SOCKET, SO_REUSEPORT, 1);
2257 }
2258#endif
9f67b883
RG
2259
2260 const std::string& itf = std::get<4>(local);
2261 if (!itf.empty()) {
2262#ifdef SO_BINDTODEVICE
2263 int res = setsockopt(cs->tcpFD, SOL_SOCKET, SO_BINDTODEVICE, itf.c_str(), itf.length());
2264 if (res != 0) {
2265 warnlog("Error setting up the interface on local address '%s': %s", std::get<0>(local).toStringWithPort(), strerror(errno));
2266 }
2267#else
2268 warnlog("An interface has been configured on local address '%s' but SO_BINDTODEVICE is not supported", std::get<0>(local).toStringWithPort());
2269#endif
2270 }
2271
87b515ed
RG
2272#ifdef HAVE_EBPF
2273 if (g_defaultBPFFilter) {
8429ad04 2274 cs->attachFilter(g_defaultBPFFilter);
87b515ed
RG
2275 vinfolog("Attaching default BPF Filter to TCP frontend %s", cs->local.toStringWithPort());
2276 }
2277#endif /* HAVE_EBPF */
2278
a36ce055
RG
2279 // if(g_vm.count("bind-non-local"))
2280 bindAny(cs->local.sin4.sin_family, cs->tcpFD);
2281 SBind(cs->tcpFD, cs->local);
2282 SListen(cs->tcpFD, 64);
f0e4dcba 2283 warnlog("Listening on %s", cs->local.toStringWithPort());
a36ce055
RG
2284
2285 toLaunch.push_back(cs);
963bef8d 2286 g_frontends.push_back(cs);
41408d3a 2287 tcpBindsCount++;
a36ce055
RG
2288 }
2289
11e1e08b
RG
2290#ifdef HAVE_DNSCRYPT
2291 for(auto& dcLocal : g_dnsCryptLocals) {
2292 ClientState* cs = new ClientState;
4c34246d
RG
2293 cs->local = std::get<0>(dcLocal);
2294 cs->dnscryptCtx = &(std::get<1>(dcLocal));
11e1e08b
RG
2295 cs->udpFD = SSocket(cs->local.sin4.sin_family, SOCK_DGRAM, 0);
2296 if(cs->local.sin4.sin_family == AF_INET6) {
2297 SSetsockopt(cs->udpFD, IPPROTO_IPV6, IPV6_V6ONLY, 1);
2298 }
2299 bindAny(cs->local.sin4.sin_family, cs->udpFD);
2300 if(IsAnyAddress(cs->local)) {
2301 int one=1;
2302 setsockopt(cs->udpFD, IPPROTO_IP, GEN_IP_PKTINFO, &one, sizeof(one)); // linux supports this, so why not - might fail on other systems
2303#ifdef IPV6_RECVPKTINFO
2304 setsockopt(cs->udpFD, IPPROTO_IPV6, IPV6_RECVPKTINFO, &one, sizeof(one));
05510903
RG
2305#endif
2306 }
2307 if (std::get<2>(dcLocal)) {
2308#ifdef SO_REUSEPORT
2309 SSetsockopt(cs->udpFD, SOL_SOCKET, SO_REUSEPORT, 1);
2310#else
2311 warnlog("SO_REUSEPORT has been configured on local address '%s' but is not supported", std::get<0>(dcLocal).toStringWithPort());
11e1e08b
RG
2312#endif
2313 }
9f67b883
RG
2314
2315 const std::string& itf = std::get<4>(dcLocal);
2316 if (!itf.empty()) {
2317#ifdef SO_BINDTODEVICE
2318 int res = setsockopt(cs->udpFD, SOL_SOCKET, SO_BINDTODEVICE, itf.c_str(), itf.length());
2319 if (res != 0) {
2320 warnlog("Error setting up the interface on local address '%s': %s", std::get<0>(dcLocal).toStringWithPort(), strerror(errno));
2321 }
2322#else
2323 warnlog("An interface has been configured on local address '%s' but SO_BINDTODEVICE is not supported", std::get<0>(dcLocal).toStringWithPort());
2324#endif
2325 }
2326
87b515ed
RG
2327#ifdef HAVE_EBPF
2328 if (g_defaultBPFFilter) {
8429ad04 2329 cs->attachFilter(g_defaultBPFFilter);
87b515ed
RG
2330 vinfolog("Attaching default BPF Filter to UDP DNSCrypt frontend %s", cs->local.toStringWithPort());
2331 }
2332#endif /* HAVE_EBPF */
11e1e08b
RG
2333 SBind(cs->udpFD, cs->local);
2334 toLaunch.push_back(cs);
2335 g_frontends.push_back(cs);
41408d3a 2336 udpBindsCount++;
11e1e08b
RG
2337
2338 cs = new ClientState;
4c34246d
RG
2339 cs->local = std::get<0>(dcLocal);
2340 cs->dnscryptCtx = &(std::get<1>(dcLocal));
11e1e08b
RG
2341 cs->tcpFD = SSocket(cs->local.sin4.sin_family, SOCK_STREAM, 0);
2342 SSetsockopt(cs->tcpFD, SOL_SOCKET, SO_REUSEADDR, 1);
2343#ifdef TCP_DEFER_ACCEPT
2344 SSetsockopt(cs->tcpFD, SOL_TCP,TCP_DEFER_ACCEPT, 1);
4c34246d 2345#endif
9e284f31 2346 if (std::get<3>(dcLocal) > 0) {
5f084036 2347#ifdef TCP_FASTOPEN
1654ece3 2348 SSetsockopt(cs->tcpFD, IPPROTO_TCP, TCP_FASTOPEN, std::get<3>(dcLocal));
5f084036
RG
2349#else
2350 warnlog("TCP Fast Open has been configured on local address '%s' but is not supported", std::get<0>(dcLocal).toStringWithPort());
9e284f31 2351#endif
5f084036 2352 }
9f67b883 2353
4c34246d 2354#ifdef SO_REUSEPORT
05510903 2355 /* no need to warn again if configured but support is not available, we already did for UDP */
4c34246d
RG
2356 if (std::get<2>(dcLocal)) {
2357 SSetsockopt(cs->tcpFD, SOL_SOCKET, SO_REUSEPORT, 1);
2358 }
11e1e08b 2359#endif
9f67b883
RG
2360
2361 if (!itf.empty()) {
2362#ifdef SO_BINDTODEVICE
2363 int res = setsockopt(cs->tcpFD, SOL_SOCKET, SO_BINDTODEVICE, itf.c_str(), itf.length());
2364 if (res != 0) {
2365 warnlog("Error setting up the interface on local address '%s': %s", std::get<0>(dcLocal).toStringWithPort(), strerror(errno));
2366 }
2367#else
2368 warnlog("An interface has been configured on local address '%s' but SO_BINDTODEVICE is not supported", std::get<0>(dcLocal).toStringWithPort());
2369#endif
2370 }
2371
11e1e08b
RG
2372 if(cs->local.sin4.sin_family == AF_INET6) {
2373 SSetsockopt(cs->tcpFD, IPPROTO_IPV6, IPV6_V6ONLY, 1);
2374 }
87b515ed
RG
2375#ifdef HAVE_EBPF
2376 if (g_defaultBPFFilter) {
8429ad04 2377 cs->attachFilter(g_defaultBPFFilter);
87b515ed
RG
2378 vinfolog("Attaching default BPF Filter to TCP DNSCrypt frontend %s", cs->local.toStringWithPort());
2379 }
2380#endif /* HAVE_EBPF */
f0e4dcba
RG
2381
2382 cs->cpus = std::get<5>(dcLocal);
2383
11e1e08b
RG
2384 bindAny(cs->local.sin4.sin_family, cs->tcpFD);
2385 SBind(cs->tcpFD, cs->local);
2386 SListen(cs->tcpFD, 64);
2387 warnlog("Listening on %s", cs->local.toStringWithPort());
2388 toLaunch.push_back(cs);
2389 g_frontends.push_back(cs);
41408d3a 2390 tcpBindsCount++;
11e1e08b
RG
2391 }
2392#endif
2393
a227f47d
RG
2394 for(auto& frontend : g_tlslocals) {
2395 ClientState* cs = new ClientState;
2396 cs->local = frontend->d_addr;
2397 cs->tcpFD = SSocket(cs->local.sin4.sin_family, SOCK_STREAM, 0);
2398 SSetsockopt(cs->tcpFD, SOL_SOCKET, SO_REUSEADDR, 1);
2399#ifdef TCP_DEFER_ACCEPT
2400 SSetsockopt(cs->tcpFD, SOL_TCP,TCP_DEFER_ACCEPT, 1);
2401#endif
2402 if (frontend->d_tcpFastOpenQueueSize > 0) {
2403#ifdef TCP_FASTOPEN
2404 SSetsockopt(cs->tcpFD, SOL_TCP, TCP_FASTOPEN, frontend->d_tcpFastOpenQueueSize);
2405#else
2406 warnlog("TCP Fast Open has been configured on local address '%s' but is not supported", cs->local.toStringWithPort());
2407#endif
2408 }
2409 if (frontend->d_reusePort) {
2410#ifdef SO_REUSEPORT
2411 SSetsockopt(cs->tcpFD, SOL_SOCKET, SO_REUSEPORT, 1);
2412#else
2413 warnlog("SO_REUSEPORT has been configured on local address '%s' but is not supported", cs.local.toStringWithPort());
2414#endif
2415 }
2416 if(cs->local.sin4.sin_family == AF_INET6) {
2417 SSetsockopt(cs->tcpFD, IPPROTO_IPV6, IPV6_V6ONLY, 1);
2418 }
2419
2420 if (!frontend->d_interface.empty()) {
2421#ifdef SO_BINDTODEVICE
2422 int res = setsockopt(cs->tcpFD, SOL_SOCKET, SO_BINDTODEVICE, frontend->d_interface.c_str(), frontend->d_interface.length());
2423 if (res != 0) {
2424 warnlog("Error setting up the interface on local address '%s': %s", cs->local.toStringWithPort(), strerror(errno));
2425 }
2426#else
2427 warnlog("An interface has been configured on local address '%s' but SO_BINDTODEVICE is not supported", cs->local.toStringWithPort());
2428#endif
2429 }
2430
2431 cs->cpus = frontend->d_cpus;
2432
2433 bindAny(cs->local.sin4.sin_family, cs->tcpFD);
2434 if (frontend->setupTLS()) {
2435 cs->tlsFrontend = frontend;
2436 SBind(cs->tcpFD, cs->local);
2437 SListen(cs->tcpFD, 64);
2438 warnlog("Listening on %s for TLS", cs->local.toStringWithPort());
2439 toLaunch.push_back(cs);
2440 g_frontends.push_back(cs);
2441 tcpBindsCount++;
2442 }
2443 else {
2444 delete cs;
2445 errlog("Error while setting up TLS on local address '%s', exiting", cs->local.toStringWithPort());
2446 _exit(EXIT_FAILURE);
2447 }
2448 }
2449
7cc68f53 2450 if(g_cmdLine.beDaemon) {
a4652d55 2451 g_console=false;
b076b34a 2452 daemonize();
c7b1d943 2453 writepid(g_cmdLine.pidfile);
a4652d55 2454 }
b076b34a 2455 else {
64e4ebb4 2456 vinfolog("Running in the foreground");
43ac546d 2457 warnlog("dnsdist %s comes with ABSOLUTELY NO WARRANTY. This is free software, and you are welcome to redistribute it according to the terms of the GPL version 2", VERSION);
3fa28388
RG
2458 vector<string> vec;
2459 std::string acls;
2460 g_ACL.getCopy().toStringVector(&vec);
2461 for(const auto& s : vec) {
2462 if (!acls.empty())
2463 acls += ", ";
2464 acls += s;
2465 }
2466 infolog("ACL allowing queries from: %s", acls.c_str());
b076b34a 2467 }
6d01c80c 2468
aac59883
RG
2469 uid_t newgid=0;
2470 gid_t newuid=0;
2471
2472 if(!g_cmdLine.gid.empty())
2473 newgid = strToGID(g_cmdLine.gid.c_str());
2474
2475 if(!g_cmdLine.uid.empty())
2476 newuid = strToUID(g_cmdLine.uid.c_str());
2477
2478 dropGroupPrivs(newgid);
2479 dropUserPrivs(newuid);
2480
a36ce055
RG
2481 /* this need to be done _after_ dropping privileges */
2482 g_delay = new DelayPipe<DelayedPacket>();
2483
9f4eb5cc
RG
2484 if (g_snmpAgent) {
2485 g_snmpAgent->run();
2486 }
2487
edbda1ad 2488 g_tcpclientthreads = std::make_shared<TCPClientCollection>(g_maxTCPClientThreads, g_useTCPSinglePipe);
a9bf3ec4 2489
2e72cc0e 2490 for(auto& t : todo)
2491 t();
2492
886e2cf2 2493 auto localPools = g_pools.getCopy();
8f4f5186
RG
2494 /* create the default pool no matter what */
2495 createPoolIfNotExists(localPools, "");
7cc68f53 2496 if(g_cmdLine.remotes.size()) {
2497 for(const auto& address : g_cmdLine.remotes) {
c9262563 2498 auto ret=std::make_shared<DownstreamState>(ComboAddress(address, 53));
886e2cf2 2499 addServerToPool(localPools, "", ret);
7565f4e6 2500 if (ret->connected) {
2717a92f 2501 ret->tid = thread(responderThread, ret);
7565f4e6 2502 }
ecbe9133 2503 g_dstates.modify([ret](servers_t& servers) { servers.push_back(ret); });
6d01c80c 2504 }
2505 }
886e2cf2 2506 g_pools.setState(localPools);
6d01c80c 2507
e73ec7d3 2508 if(g_dstates.getCopy().empty()) {
2509 errlog("No downstream servers defined: all packets will get dropped");
2510 // you might define them later, but you need to know
2511 }
2512
41408d3a
RG
2513 checkFileDescriptorsLimits(udpBindsCount, tcpBindsCount);
2514
e5a14b2b 2515 for(auto& dss : g_dstates.getCopy()) { // it is a copy, but the internal shared_ptrs are the real deal
773470ca 2516 if(dss->availability==DownstreamState::Availability::Auto) {
fbe2a2e0 2517 bool newState=upCheck(*dss);
a7940c06 2518 warnlog("Marking downstream %s as '%s'", dss->getNameWithAddr(), newState ? "up" : "down");
773470ca 2519 dss->upStatus = newState;
2520 }
2521 }
b076b34a 2522
2e72cc0e 2523 for(auto& cs : toLaunch) {
a36ce055
RG
2524 if (cs->udpFD >= 0) {
2525 thread t1(udpClientThread, cs);
f0e4dcba
RG
2526 if (!cs->cpus.empty()) {
2527 mapThreadToCPUList(t1.native_handle(), cs->cpus);
2528 }
a36ce055 2529 t1.detach();
652a7355 2530 }
a36ce055
RG
2531 else if (cs->tcpFD >= 0) {
2532 thread t1(tcpAcceptorThread, cs);
f0e4dcba
RG
2533 if (!cs->cpus.empty()) {
2534 mapThreadToCPUList(t1.native_handle(), cs->cpus);
2535 }
a36ce055 2536 t1.detach();
726ddf60 2537 }
24d5cb00 2538 }
7730131a 2539
42fae326 2540 thread carbonthread(carbonDumpThread);
2541 carbonthread.detach();
2542
3c115e0f 2543 thread stattid(maintThread);
886e2cf2 2544 stattid.detach();
6d01c80c 2545
886e2cf2
RG
2546 thread healththread(healthChecksThread);
2547
505ca3d1 2548 if(g_cmdLine.beDaemon || g_cmdLine.beSupervised) {
6ab65223
PL
2549#ifdef HAVE_SYSTEMD
2550 sd_notify(0, "READY=1");
2551#endif
886e2cf2 2552 healththread.join();
773470ca 2553 }
6d01c80c 2554 else {
886e2cf2 2555 healththread.detach();
505ca3d1 2556 doConsole();
3c115e0f 2557 }
9cf811d1 2558 _exit(EXIT_SUCCESS);
3c115e0f 2559
6d01c80c 2560}
3f5c3f1d
PD
2561catch(const LuaContext::ExecutionErrorException& e) {
2562 try {
2563 errlog("Fatal Lua error: %s", e.what());
2564 std::rethrow_if_nested(e);
2010ac95
RG
2565 } catch(const std::exception& ne) {
2566 errlog("Details: %s", ne.what());
3f5c3f1d
PD
2567 }
2568 catch(PDNSException &ae)
2569 {
2570 errlog("Fatal pdns error: %s", ae.reason);
2571 }
2572 _exit(EXIT_FAILURE);
2573}
24d5cb00 2574catch(std::exception &e)
2575{
6d01c80c 2576 errlog("Fatal error: %s", e.what());
4a966472 2577 _exit(EXIT_FAILURE);
24d5cb00 2578}
3f81d239 2579catch(PDNSException &ae)
7730131a 2580{
6d01c80c 2581 errlog("Fatal pdns error: %s", ae.reason);
4a966472 2582 _exit(EXIT_FAILURE);
7730131a 2583}