]> git.ipfire.org Git - thirdparty/pdns.git/blob - pdns/dnsdist.cc
Merge pull request #9134 from omoerbeek/secpoll-cleanup
[thirdparty/pdns.git] / pdns / dnsdist.cc
1 /*
2 * This file is part of PowerDNS or dnsdist.
3 * Copyright -- PowerDNS.COM B.V. and its contributors
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of version 2 of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * In addition, for the avoidance of any doubt, permission is granted to
10 * link this program with OpenSSL and to (re)distribute the binaries
11 * produced as the result of such linking.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
21 */
22
23 #include "config.h"
24
25 #include <fstream>
26 #include <getopt.h>
27 #include <grp.h>
28 #include <limits>
29 #include <netinet/tcp.h>
30 #include <pwd.h>
31 #include <sys/resource.h>
32 #include <unistd.h>
33
34 #if defined (__OpenBSD__) || defined(__NetBSD__)
35 #include <readline/readline.h>
36 #else
37 #include <editline/readline.h>
38 #endif
39
40 #include "dnsdist-systemd.hh"
41 #ifdef HAVE_SYSTEMD
42 #include <systemd/sd-daemon.h>
43 #endif
44
45 #include "dnsdist.hh"
46 #include "dnsdist-cache.hh"
47 #include "dnsdist-console.hh"
48 #include "dnsdist-ecs.hh"
49 #include "dnsdist-healthchecks.hh"
50 #include "dnsdist-lua.hh"
51 #include "dnsdist-proxy-protocol.hh"
52 #include "dnsdist-rings.hh"
53 #include "dnsdist-secpoll.hh"
54 #include "dnsdist-xpf.hh"
55
56 #include "base64.hh"
57 #include "delaypipe.hh"
58 #include "dolog.hh"
59 #include "dnsname.hh"
60 #include "dnsparser.hh"
61 #include "ednsoptions.hh"
62 #include "gettime.hh"
63 #include "lock.hh"
64 #include "misc.hh"
65 #include "sodcrypto.hh"
66 #include "sstuff.hh"
67 #include "threadname.hh"
68
69 /* Known sins:
70
71 Receiver is currently single threaded
72 not *that* bad actually, but now that we are thread safe, might want to scale
73 */
74
75 /* the Rulaction plan
76 Set of Rules, if one matches, it leads to an Action
77 Both rules and actions could conceivably be Lua based.
78 On the C++ side, both could be inherited from a class Rule and a class Action,
79 on the Lua side we can't do that. */
80
81 using std::atomic;
82 using std::thread;
83 bool g_verbose;
84
85 struct DNSDistStats g_stats;
86
87 uint16_t g_maxOutstanding{std::numeric_limits<uint16_t>::max()};
88 uint32_t g_staleCacheEntriesTTL{0};
89 bool g_syslog{true};
90 bool g_allowEmptyResponse{false};
91
92 GlobalStateHolder<NetmaskGroup> g_ACL;
93 string g_outputBuffer;
94
95 std::vector<std::shared_ptr<TLSFrontend>> g_tlslocals;
96 std::vector<std::shared_ptr<DOHFrontend>> g_dohlocals;
97 std::vector<std::shared_ptr<DNSCryptContext>> g_dnsCryptLocals;
98 #ifdef HAVE_EBPF
99 shared_ptr<BPFFilter> g_defaultBPFFilter;
100 std::vector<std::shared_ptr<DynBPFFilter> > g_dynBPFFilters;
101 #endif /* HAVE_EBPF */
102 std::vector<std::unique_ptr<ClientState>> g_frontends;
103 GlobalStateHolder<pools_t> g_pools;
104 size_t g_udpVectorSize{1};
105
106 /* UDP: the grand design. Per socket we listen on for incoming queries there is one thread.
107 Then we have a bunch of connected sockets for talking to downstream servers.
108 We send directly to those sockets.
109
110 For the return path, per downstream server we have a thread that listens to responses.
111
112 Per socket there is an array of 2^16 states, when we send out a packet downstream, we note
113 there the original requestor and the original id. The new ID is the offset in the array.
114
115 When an answer comes in on a socket, we look up the offset by the id, and lob it to the
116 original requestor.
117
118 IDs are assigned by atomic increments of the socket offset.
119 */
120
121 GlobalStateHolder<vector<DNSDistRuleAction> > g_rulactions;
122 GlobalStateHolder<vector<DNSDistResponseRuleAction> > g_resprulactions;
123 GlobalStateHolder<vector<DNSDistResponseRuleAction> > g_cachehitresprulactions;
124 GlobalStateHolder<vector<DNSDistResponseRuleAction> > g_selfansweredresprulactions;
125
126 Rings g_rings;
127 QueryCount g_qcount;
128
129 GlobalStateHolder<servers_t> g_dstates;
130 GlobalStateHolder<NetmaskTree<DynBlock>> g_dynblockNMG;
131 GlobalStateHolder<SuffixMatchTree<DynBlock>> g_dynblockSMT;
132 DNSAction::Action g_dynBlockAction = DNSAction::Action::Drop;
133 int g_tcpRecvTimeout{2};
134 int g_tcpSendTimeout{2};
135 int g_udpTimeout{2};
136
137 bool g_servFailOnNoPolicy{false};
138 bool g_truncateTC{false};
139 bool g_fixupCase{false};
140 bool g_preserveTrailingData{false};
141
142 std::set<std::string> g_capabilitiesToRetain;
143
144 static void truncateTC(char* packet, uint16_t* len, size_t responseSize, unsigned int consumed)
145 try
146 {
147 bool hadEDNS = false;
148 uint16_t payloadSize = 0;
149 uint16_t z = 0;
150
151 if (g_addEDNSToSelfGeneratedResponses) {
152 hadEDNS = getEDNSUDPPayloadSizeAndZ(packet, *len, &payloadSize, &z);
153 }
154
155 *len=static_cast<uint16_t>(sizeof(dnsheader)+consumed+DNS_TYPE_SIZE+DNS_CLASS_SIZE);
156 struct dnsheader* dh = reinterpret_cast<struct dnsheader*>(packet);
157 dh->ancount = dh->arcount = dh->nscount = 0;
158
159 if (hadEDNS) {
160 addEDNS(dh, *len, responseSize, z & EDNS_HEADER_FLAG_DO, payloadSize, 0);
161 }
162 }
163 catch(...)
164 {
165 g_stats.truncFail++;
166 }
167
168 struct DelayedPacket
169 {
170 int fd;
171 string packet;
172 ComboAddress destination;
173 ComboAddress origDest;
174 void operator()()
175 {
176 ssize_t res;
177 if(origDest.sin4.sin_family == 0) {
178 res = sendto(fd, packet.c_str(), packet.size(), 0, (struct sockaddr*)&destination, destination.getSocklen());
179 }
180 else {
181 res = sendfromto(fd, packet.c_str(), packet.size(), 0, origDest, destination);
182 }
183 if (res == -1) {
184 int err = errno;
185 vinfolog("Error sending delayed response to %s: %s", destination.toStringWithPort(), strerror(err));
186 }
187 }
188 };
189
190 DelayPipe<DelayedPacket>* g_delay = nullptr;
191
192 std::string DNSQuestion::getTrailingData() const
193 {
194 const char* message = reinterpret_cast<const char*>(this->dh);
195 const uint16_t messageLen = getDNSPacketLength(message, this->len);
196 return std::string(message + messageLen, this->len - messageLen);
197 }
198
199 bool DNSQuestion::setTrailingData(const std::string& tail)
200 {
201 char* message = reinterpret_cast<char*>(this->dh);
202 const uint16_t messageLen = getDNSPacketLength(message, this->len);
203 const uint16_t tailLen = tail.size();
204 if (tailLen > (this->size - messageLen)) {
205 return false;
206 }
207
208 /* Update length and copy data from the Lua string. */
209 this->len = messageLen + tailLen;
210 if(tailLen > 0) {
211 tail.copy(message + messageLen, tailLen);
212 }
213 return true;
214 }
215
216 void doLatencyStats(double udiff)
217 {
218 if(udiff < 1000) ++g_stats.latency0_1;
219 else if(udiff < 10000) ++g_stats.latency1_10;
220 else if(udiff < 50000) ++g_stats.latency10_50;
221 else if(udiff < 100000) ++g_stats.latency50_100;
222 else if(udiff < 1000000) ++g_stats.latency100_1000;
223 else ++g_stats.latencySlow;
224 g_stats.latencySum += udiff / 1000;
225
226 auto doAvg = [](double& var, double n, double weight) {
227 var = (weight -1) * var/weight + n/weight;
228 };
229
230 doAvg(g_stats.latencyAvg100, udiff, 100);
231 doAvg(g_stats.latencyAvg1000, udiff, 1000);
232 doAvg(g_stats.latencyAvg10000, udiff, 10000);
233 doAvg(g_stats.latencyAvg1000000, udiff, 1000000);
234 }
235
236 bool responseContentMatches(const char* response, const uint16_t responseLen, const DNSName& qname, const uint16_t qtype, const uint16_t qclass, const ComboAddress& remote, unsigned int& consumed)
237 {
238 if (responseLen < sizeof(dnsheader)) {
239 return false;
240 }
241
242 const struct dnsheader* dh = reinterpret_cast<const struct dnsheader*>(response);
243 if (dh->qr == 0) {
244 ++g_stats.nonCompliantResponses;
245 return false;
246 }
247
248 if (dh->qdcount == 0) {
249 if ((dh->rcode != RCode::NoError && dh->rcode != RCode::NXDomain) || g_allowEmptyResponse) {
250 return true;
251 }
252 else {
253 ++g_stats.nonCompliantResponses;
254 return false;
255 }
256 }
257
258 uint16_t rqtype, rqclass;
259 DNSName rqname;
260 try {
261 rqname=DNSName(response, responseLen, sizeof(dnsheader), false, &rqtype, &rqclass, &consumed);
262 }
263 catch(const std::exception& e) {
264 if(responseLen > 0 && static_cast<size_t>(responseLen) > sizeof(dnsheader)) {
265 infolog("Backend %s sent us a response with id %d that did not parse: %s", remote.toStringWithPort(), ntohs(dh->id), e.what());
266 }
267 ++g_stats.nonCompliantResponses;
268 return false;
269 }
270
271 if (rqtype != qtype || rqclass != qclass || rqname != qname) {
272 return false;
273 }
274
275 return true;
276 }
277
278 static void restoreFlags(struct dnsheader* dh, uint16_t origFlags)
279 {
280 static const uint16_t rdMask = 1 << FLAGS_RD_OFFSET;
281 static const uint16_t cdMask = 1 << FLAGS_CD_OFFSET;
282 static const uint16_t restoreFlagsMask = UINT16_MAX & ~(rdMask | cdMask);
283 uint16_t * flags = getFlagsFromDNSHeader(dh);
284 /* clear the flags we are about to restore */
285 *flags &= restoreFlagsMask;
286 /* only keep the flags we want to restore */
287 origFlags &= ~restoreFlagsMask;
288 /* set the saved flags as they were */
289 *flags |= origFlags;
290 }
291
292 static bool fixUpQueryTurnedResponse(DNSQuestion& dq, const uint16_t origFlags)
293 {
294 restoreFlags(dq.dh, origFlags);
295
296 return addEDNSToQueryTurnedResponse(dq);
297 }
298
299 static bool fixUpResponse(char** response, uint16_t* responseLen, size_t* responseSize, const DNSName& qname, uint16_t origFlags, bool ednsAdded, bool ecsAdded, std::vector<uint8_t>& rewrittenResponse, uint16_t addRoom, bool* zeroScope)
300 {
301 if (*responseLen < sizeof(dnsheader)) {
302 return false;
303 }
304
305 struct dnsheader* dh = reinterpret_cast<struct dnsheader*>(*response);
306 restoreFlags(dh, origFlags);
307
308 if (*responseLen == sizeof(dnsheader)) {
309 return true;
310 }
311
312 if(g_fixupCase) {
313 string realname = qname.toDNSString();
314 if (*responseLen >= (sizeof(dnsheader) + realname.length())) {
315 memcpy(*response + sizeof(dnsheader), realname.c_str(), realname.length());
316 }
317 }
318
319 if (ednsAdded || ecsAdded) {
320 uint16_t optStart;
321 size_t optLen = 0;
322 bool last = false;
323
324 const std::string responseStr(*response, *responseLen);
325 int res = locateEDNSOptRR(responseStr, &optStart, &optLen, &last);
326
327 if (res == 0) {
328 if (zeroScope) { // this finds if an EDNS Client Subnet scope was set, and if it is 0
329 size_t optContentStart = 0;
330 uint16_t optContentLen = 0;
331 /* we need at least 4 bytes after the option length (family: 2, source prefix-length: 1, scope prefix-length: 1) */
332 if (isEDNSOptionInOpt(responseStr, optStart, optLen, EDNSOptionCode::ECS, &optContentStart, &optContentLen) && optContentLen >= 4) {
333 /* see if the EDNS Client Subnet SCOPE PREFIX-LENGTH byte in position 3 is set to 0, which is the only thing
334 we care about. */
335 *zeroScope = responseStr.at(optContentStart + 3) == 0;
336 }
337 }
338
339 if (ednsAdded) {
340 /* we added the entire OPT RR,
341 therefore we need to remove it entirely */
342 if (last) {
343 /* simply remove the last AR */
344 *responseLen -= optLen;
345 uint16_t arcount = ntohs(dh->arcount);
346 arcount--;
347 dh->arcount = htons(arcount);
348 }
349 else {
350 /* Removing an intermediary RR could lead to compression error */
351 if (rewriteResponseWithoutEDNS(responseStr, rewrittenResponse) == 0) {
352 *responseLen = rewrittenResponse.size();
353 if (addRoom && (UINT16_MAX - *responseLen) > addRoom) {
354 rewrittenResponse.reserve(*responseLen + addRoom);
355 }
356 *responseSize = rewrittenResponse.capacity();
357 *response = reinterpret_cast<char*>(rewrittenResponse.data());
358 }
359 else {
360 warnlog("Error rewriting content");
361 }
362 }
363 }
364 else {
365 /* the OPT RR was already present, but without ECS,
366 we need to remove the ECS option if any */
367 if (last) {
368 /* nothing after the OPT RR, we can simply remove the
369 ECS option */
370 size_t existingOptLen = optLen;
371 removeEDNSOptionFromOPT(*response + optStart, &optLen, EDNSOptionCode::ECS);
372 *responseLen -= (existingOptLen - optLen);
373 }
374 else {
375 /* Removing an intermediary RR could lead to compression error */
376 if (rewriteResponseWithoutEDNSOption(responseStr, EDNSOptionCode::ECS, rewrittenResponse) == 0) {
377 *responseLen = rewrittenResponse.size();
378 if (addRoom && (UINT16_MAX - *responseLen) > addRoom) {
379 rewrittenResponse.reserve(*responseLen + addRoom);
380 }
381 *responseSize = rewrittenResponse.capacity();
382 *response = reinterpret_cast<char*>(rewrittenResponse.data());
383 }
384 else {
385 warnlog("Error rewriting content");
386 }
387 }
388 }
389 }
390 }
391
392 return true;
393 }
394
395 #ifdef HAVE_DNSCRYPT
396 static bool encryptResponse(char* response, uint16_t* responseLen, size_t responseSize, bool tcp, std::shared_ptr<DNSCryptQuery> dnsCryptQuery, dnsheader** dh, dnsheader* dhCopy)
397 {
398 if (dnsCryptQuery) {
399 uint16_t encryptedResponseLen = 0;
400
401 /* save the original header before encrypting it in place */
402 if (dh != nullptr && *dh != nullptr && dhCopy != nullptr) {
403 memcpy(dhCopy, *dh, sizeof(dnsheader));
404 *dh = dhCopy;
405 }
406
407 int res = dnsCryptQuery->encryptResponse(response, *responseLen, responseSize, tcp, &encryptedResponseLen);
408 if (res == 0) {
409 *responseLen = encryptedResponseLen;
410 } else {
411 /* dropping response */
412 vinfolog("Error encrypting the response, dropping.");
413 return false;
414 }
415 }
416 return true;
417 }
418 #endif /* HAVE_DNSCRYPT */
419
420 static bool applyRulesToResponse(LocalStateHolder<vector<DNSDistResponseRuleAction> >& localRespRulactions, DNSResponse& dr)
421 {
422 DNSResponseAction::Action action=DNSResponseAction::Action::None;
423 std::string ruleresult;
424 for(const auto& lr : *localRespRulactions) {
425 if(lr.d_rule->matches(&dr)) {
426 lr.d_rule->d_matches++;
427 action=(*lr.d_action)(&dr, &ruleresult);
428 switch(action) {
429 case DNSResponseAction::Action::Allow:
430 return true;
431 break;
432 case DNSResponseAction::Action::Drop:
433 return false;
434 break;
435 case DNSResponseAction::Action::HeaderModify:
436 return true;
437 break;
438 case DNSResponseAction::Action::ServFail:
439 dr.dh->rcode = RCode::ServFail;
440 return true;
441 break;
442 /* non-terminal actions follow */
443 case DNSResponseAction::Action::Delay:
444 dr.delayMsec = static_cast<int>(pdns_stou(ruleresult)); // sorry
445 break;
446 case DNSResponseAction::Action::None:
447 break;
448 }
449 }
450 }
451
452 return true;
453 }
454
455 bool processResponse(char** response, uint16_t* responseLen, size_t* responseSize, LocalStateHolder<vector<DNSDistResponseRuleAction> >& localRespRulactions, DNSResponse& dr, size_t addRoom, std::vector<uint8_t>& rewrittenResponse, bool muted)
456 {
457 if (!applyRulesToResponse(localRespRulactions, dr)) {
458 return false;
459 }
460
461 bool zeroScope = false;
462 if (!fixUpResponse(response, responseLen, responseSize, *dr.qname, dr.origFlags, dr.ednsAdded, dr.ecsAdded, rewrittenResponse, addRoom, dr.useZeroScope ? &zeroScope : nullptr)) {
463 return false;
464 }
465
466 if (dr.packetCache && !dr.skipCache && *responseLen <= s_maxPacketCacheEntrySize) {
467 if (!dr.useZeroScope) {
468 /* if the query was not suitable for zero-scope, for
469 example because it had an existing ECS entry so the hash is
470 not really 'no ECS', so just insert it for the existing subnet
471 since:
472 - we don't have the correct hash for a non-ECS query
473 - inserting with hash computed before the ECS replacement but with
474 the subnet extracted _after_ the replacement would not work.
475 */
476 zeroScope = false;
477 }
478 // if zeroScope, pass the pre-ECS hash-key and do not pass the subnet to the cache
479 dr.packetCache->insert(zeroScope ? dr.cacheKeyNoECS : dr.cacheKey, zeroScope ? boost::none : dr.subnet, dr.origFlags, dr.dnssecOK, *dr.qname, dr.qtype, dr.qclass, *response, *responseLen, dr.tcp, dr.dh->rcode, dr.tempFailureTTL);
480 }
481
482 #ifdef HAVE_DNSCRYPT
483 if (!muted) {
484 if (!encryptResponse(*response, responseLen, *responseSize, dr.tcp, dr.dnsCryptQuery, nullptr, nullptr)) {
485 return false;
486 }
487 }
488 #endif /* HAVE_DNSCRYPT */
489
490 return true;
491 }
492
493 static bool sendUDPResponse(int origFD, const char* response, const uint16_t responseLen, const int delayMsec, const ComboAddress& origDest, const ComboAddress& origRemote)
494 {
495 if(delayMsec && g_delay) {
496 DelayedPacket dp{origFD, string(response,responseLen), origRemote, origDest};
497 g_delay->submit(dp, delayMsec);
498 }
499 else {
500 ssize_t res;
501 if(origDest.sin4.sin_family == 0) {
502 res = sendto(origFD, response, responseLen, 0, reinterpret_cast<const struct sockaddr*>(&origRemote), origRemote.getSocklen());
503 }
504 else {
505 res = sendfromto(origFD, response, responseLen, 0, origDest, origRemote);
506 }
507 if (res == -1) {
508 int err = errno;
509 vinfolog("Error sending response to %s: %s", origRemote.toStringWithPort(), stringerror(err));
510 }
511 }
512
513 return true;
514 }
515
516
517 int pickBackendSocketForSending(std::shared_ptr<DownstreamState>& state)
518 {
519 return state->sockets[state->socketsOffset++ % state->sockets.size()];
520 }
521
522 static void pickBackendSocketsReadyForReceiving(const std::shared_ptr<DownstreamState>& state, std::vector<int>& ready)
523 {
524 ready.clear();
525
526 if (state->sockets.size() == 1) {
527 ready.push_back(state->sockets[0]);
528 return ;
529 }
530
531 {
532 std::lock_guard<std::mutex> lock(state->socketsLock);
533 state->mplexer->getAvailableFDs(ready, -1);
534 }
535 }
536
537 // listens on a dedicated socket, lobs answers from downstream servers to original requestors
538 void responderThread(std::shared_ptr<DownstreamState> dss)
539 try {
540 setThreadName("dnsdist/respond");
541 auto localRespRulactions = g_resprulactions.getLocal();
542 char packet[s_maxPacketCacheEntrySize + DNSCRYPT_MAX_RESPONSE_PADDING_AND_MAC_SIZE];
543 static_assert(sizeof(packet) <= UINT16_MAX, "Packet size should fit in a uint16_t");
544 /* when the answer is encrypted in place, we need to get a copy
545 of the original header before encryption to fill the ring buffer */
546 dnsheader cleartextDH;
547 vector<uint8_t> rewrittenResponse;
548
549 uint16_t queryId = 0;
550 std::vector<int> sockets;
551 sockets.reserve(dss->sockets.size());
552
553 for(;;) {
554 dnsheader* dh = reinterpret_cast<struct dnsheader*>(packet);
555 try {
556 pickBackendSocketsReadyForReceiving(dss, sockets);
557 for (const auto& fd : sockets) {
558 ssize_t got = recv(fd, packet, sizeof(packet), 0);
559 char * response = packet;
560 size_t responseSize = sizeof(packet);
561
562 if (got < 0 || static_cast<size_t>(got) < sizeof(dnsheader))
563 continue;
564
565 uint16_t responseLen = static_cast<uint16_t>(got);
566 queryId = dh->id;
567
568 if(queryId >= dss->idStates.size()) {
569 continue;
570 }
571
572 IDState* ids = &dss->idStates[queryId];
573 int64_t usageIndicator = ids->usageIndicator;
574
575 if(!IDState::isInUse(usageIndicator)) {
576 /* the corresponding state is marked as not in use, meaning that:
577 - it was already cleaned up by another thread and the state is gone ;
578 - we already got a response for this query and this one is a duplicate.
579 Either way, we don't touch it.
580 */
581 continue;
582 }
583
584 /* read the potential DOHUnit state as soon as possible, but don't use it
585 until we have confirmed that we own this state by updating usageIndicator */
586 auto du = ids->du;
587 /* setting age to 0 to prevent the maintainer thread from
588 cleaning this IDS while we process the response.
589 */
590 ids->age = 0;
591 int origFD = ids->origFD;
592
593 unsigned int consumed = 0;
594 if (!responseContentMatches(response, responseLen, ids->qname, ids->qtype, ids->qclass, dss->remote, consumed)) {
595 continue;
596 }
597
598 bool isDoH = du != nullptr;
599 /* atomically mark the state as available, but only if it has not been altered
600 in the meantime */
601 if (ids->tryMarkUnused(usageIndicator)) {
602 /* clear the potential DOHUnit asap, it's ours now
603 and since we just marked the state as unused,
604 someone could overwrite it. */
605 ids->du = nullptr;
606 /* we only decrement the outstanding counter if the value was not
607 altered in the meantime, which would mean that the state has been actively reused
608 and the other thread has not incremented the outstanding counter, so we don't
609 want it to be decremented twice. */
610 --dss->outstanding; // you'd think an attacker could game this, but we're using connected socket
611 } else {
612 /* someone updated the state in the meantime, we can't touch the existing pointer */
613 du = nullptr;
614 /* since the state has been updated, we can't safely access it so let's just drop
615 this response */
616 continue;
617 }
618
619 if(dh->tc && g_truncateTC) {
620 truncateTC(response, &responseLen, responseSize, consumed);
621 }
622
623 dh->id = ids->origID;
624
625 uint16_t addRoom = 0;
626 DNSResponse dr = makeDNSResponseFromIDState(*ids, dh, sizeof(packet), responseLen, false);
627 if (dr.dnsCryptQuery) {
628 addRoom = DNSCRYPT_MAX_RESPONSE_PADDING_AND_MAC_SIZE;
629 }
630
631 memcpy(&cleartextDH, dr.dh, sizeof(cleartextDH));
632 if (!processResponse(&response, &responseLen, &responseSize, localRespRulactions, dr, addRoom, rewrittenResponse, ids->cs && ids->cs->muted)) {
633 continue;
634 }
635
636 if (ids->cs && !ids->cs->muted) {
637 if (du) {
638 #ifdef HAVE_DNS_OVER_HTTPS
639 // DoH query
640 du->response = std::string(response, responseLen);
641 if (send(du->rsock, &du, sizeof(du), 0) != sizeof(du)) {
642 /* at this point we have the only remaining pointer on this
643 DOHUnit object since we did set ids->du to nullptr earlier,
644 except if we got the response before the pointer could be
645 released by the frontend */
646 du->release();
647 }
648 #endif /* HAVE_DNS_OVER_HTTPS */
649 du = nullptr;
650 }
651 else {
652 ComboAddress empty;
653 empty.sin4.sin_family = 0;
654 /* if ids->destHarvested is false, origDest holds the listening address.
655 We don't want to use that as a source since it could be 0.0.0.0 for example. */
656 sendUDPResponse(origFD, response, responseLen, dr.delayMsec, ids->destHarvested ? ids->origDest : empty, ids->origRemote);
657 }
658 }
659
660 ++g_stats.responses;
661 if (ids->cs) {
662 ++ids->cs->responses;
663 }
664 ++dss->responses;
665
666 double udiff = ids->sentTime.udiff();
667 vinfolog("Got answer from %s, relayed to %s%s, took %f usec", dss->remote.toStringWithPort(), ids->origRemote.toStringWithPort(),
668 isDoH ? " (https)": "", udiff);
669
670 struct timespec ts;
671 gettime(&ts);
672 g_rings.insertResponse(ts, *dr.remote, *dr.qname, dr.qtype, static_cast<unsigned int>(udiff), static_cast<unsigned int>(got), cleartextDH, dss->remote);
673
674 switch (cleartextDH.rcode) {
675 case RCode::NXDomain:
676 ++g_stats.frontendNXDomain;
677 break;
678 case RCode::ServFail:
679 ++g_stats.servfailResponses;
680 ++g_stats.frontendServFail;
681 break;
682 case RCode::NoError:
683 ++g_stats.frontendNoError;
684 break;
685 }
686 dss->latencyUsec = (127.0 * dss->latencyUsec / 128.0) + udiff/128.0;
687
688 doLatencyStats(udiff);
689
690 rewrittenResponse.clear();
691 }
692 }
693 catch(const std::exception& e){
694 vinfolog("Got an error in UDP responder thread while parsing a response from %s, id %d: %s", dss->remote.toStringWithPort(), queryId, e.what());
695 }
696 }
697 }
698 catch(const std::exception& e)
699 {
700 errlog("UDP responder thread died because of exception: %s", e.what());
701 }
702 catch(const PDNSException& e)
703 {
704 errlog("UDP responder thread died because of PowerDNS exception: %s", e.reason);
705 }
706 catch(...)
707 {
708 errlog("UDP responder thread died because of an exception: %s", "unknown");
709 }
710
711 std::mutex g_luamutex;
712 LuaContext g_lua;
713 ComboAddress g_serverControl{"127.0.0.1:5199"};
714
715
716 static void spoofResponseFromString(DNSQuestion& dq, const string& spoofContent, bool raw)
717 {
718 string result;
719
720 if (raw) {
721 SpoofAction sa(spoofContent);
722 sa(&dq, &result);
723 }
724 else {
725 std::vector<std::string> addrs;
726 stringtok(addrs, spoofContent, " ,");
727
728 if (addrs.size() == 1) {
729 try {
730 ComboAddress spoofAddr(spoofContent);
731 SpoofAction sa({spoofAddr});
732 sa(&dq, &result);
733 }
734 catch(const PDNSException &e) {
735 DNSName cname(spoofContent);
736 SpoofAction sa(cname); // CNAME then
737 sa(&dq, &result);
738 }
739 } else {
740 std::vector<ComboAddress> cas;
741 for (const auto& addr : addrs) {
742 try {
743 cas.push_back(ComboAddress(addr));
744 }
745 catch (...) {
746 }
747 }
748 SpoofAction sa(cas);
749 sa(&dq, &result);
750 }
751 }
752 }
753
754 bool processRulesResult(const DNSAction::Action& action, DNSQuestion& dq, std::string& ruleresult, bool& drop)
755 {
756 switch(action) {
757 case DNSAction::Action::Allow:
758 return true;
759 break;
760 case DNSAction::Action::Drop:
761 ++g_stats.ruleDrop;
762 drop = true;
763 return true;
764 break;
765 case DNSAction::Action::Nxdomain:
766 dq.dh->rcode = RCode::NXDomain;
767 dq.dh->qr=true;
768 ++g_stats.ruleNXDomain;
769 return true;
770 break;
771 case DNSAction::Action::Refused:
772 dq.dh->rcode = RCode::Refused;
773 dq.dh->qr=true;
774 ++g_stats.ruleRefused;
775 return true;
776 break;
777 case DNSAction::Action::ServFail:
778 dq.dh->rcode = RCode::ServFail;
779 dq.dh->qr=true;
780 ++g_stats.ruleServFail;
781 return true;
782 break;
783 case DNSAction::Action::Spoof:
784 spoofResponseFromString(dq, ruleresult, false);
785 return true;
786 break;
787 case DNSAction::Action::SpoofRaw:
788 spoofResponseFromString(dq, ruleresult, true);
789 return true;
790 break;
791 case DNSAction::Action::Truncate:
792 dq.dh->tc = true;
793 dq.dh->qr = true;
794 dq.dh->ra = dq.dh->rd;
795 dq.dh->aa = false;
796 dq.dh->ad = false;
797 return true;
798 break;
799 case DNSAction::Action::HeaderModify:
800 return true;
801 break;
802 case DNSAction::Action::Pool:
803 dq.poolname=ruleresult;
804 return true;
805 break;
806 case DNSAction::Action::NoRecurse:
807 dq.dh->rd = false;
808 return true;
809 break;
810 /* non-terminal actions follow */
811 case DNSAction::Action::Delay:
812 dq.delayMsec = static_cast<int>(pdns_stou(ruleresult)); // sorry
813 break;
814 case DNSAction::Action::None:
815 /* fall-through */
816 case DNSAction::Action::NoOp:
817 break;
818 }
819
820 /* false means that we don't stop the processing */
821 return false;
822 }
823
824
825 static bool applyRulesToQuery(LocalHolders& holders, DNSQuestion& dq, const struct timespec& now)
826 {
827 g_rings.insertQuery(now, *dq.remote, *dq.qname, dq.qtype, dq.len, *dq.dh);
828
829 if(g_qcount.enabled) {
830 string qname = (*dq.qname).toLogString();
831 bool countQuery{true};
832 if(g_qcount.filter) {
833 std::lock_guard<std::mutex> lock(g_luamutex);
834 std::tie (countQuery, qname) = g_qcount.filter(&dq);
835 }
836
837 if(countQuery) {
838 WriteLock wl(&g_qcount.queryLock);
839 if(!g_qcount.records.count(qname)) {
840 g_qcount.records[qname] = 0;
841 }
842 g_qcount.records[qname]++;
843 }
844 }
845
846 if(auto got = holders.dynNMGBlock->lookup(*dq.remote)) {
847 auto updateBlockStats = [&got]() {
848 ++g_stats.dynBlocked;
849 got->second.blocks++;
850 };
851
852 if(now < got->second.until) {
853 DNSAction::Action action = got->second.action;
854 if (action == DNSAction::Action::None) {
855 action = g_dynBlockAction;
856 }
857 switch (action) {
858 case DNSAction::Action::NoOp:
859 /* do nothing */
860 break;
861
862 case DNSAction::Action::Nxdomain:
863 vinfolog("Query from %s turned into NXDomain because of dynamic block", dq.remote->toStringWithPort());
864 updateBlockStats();
865
866 dq.dh->rcode = RCode::NXDomain;
867 dq.dh->qr=true;
868 return true;
869
870 case DNSAction::Action::Refused:
871 vinfolog("Query from %s refused because of dynamic block", dq.remote->toStringWithPort());
872 updateBlockStats();
873
874 dq.dh->rcode = RCode::Refused;
875 dq.dh->qr = true;
876 return true;
877
878 case DNSAction::Action::Truncate:
879 if(!dq.tcp) {
880 updateBlockStats();
881 vinfolog("Query from %s truncated because of dynamic block", dq.remote->toStringWithPort());
882 dq.dh->tc = true;
883 dq.dh->qr = true;
884 dq.dh->ra = dq.dh->rd;
885 dq.dh->aa = false;
886 dq.dh->ad = false;
887 return true;
888 }
889 else {
890 vinfolog("Query from %s for %s over TCP *not* truncated because of dynamic block", dq.remote->toStringWithPort(), dq.qname->toLogString());
891 }
892 break;
893 case DNSAction::Action::NoRecurse:
894 updateBlockStats();
895 vinfolog("Query from %s setting rd=0 because of dynamic block", dq.remote->toStringWithPort());
896 dq.dh->rd = false;
897 return true;
898 default:
899 updateBlockStats();
900 vinfolog("Query from %s dropped because of dynamic block", dq.remote->toStringWithPort());
901 return false;
902 }
903 }
904 }
905
906 if(auto got = holders.dynSMTBlock->lookup(*dq.qname)) {
907 auto updateBlockStats = [&got]() {
908 ++g_stats.dynBlocked;
909 got->blocks++;
910 };
911
912 if(now < got->until) {
913 DNSAction::Action action = got->action;
914 if (action == DNSAction::Action::None) {
915 action = g_dynBlockAction;
916 }
917 switch (action) {
918 case DNSAction::Action::NoOp:
919 /* do nothing */
920 break;
921 case DNSAction::Action::Nxdomain:
922 vinfolog("Query from %s for %s turned into NXDomain because of dynamic block", dq.remote->toStringWithPort(), dq.qname->toLogString());
923 updateBlockStats();
924
925 dq.dh->rcode = RCode::NXDomain;
926 dq.dh->qr=true;
927 return true;
928 case DNSAction::Action::Refused:
929 vinfolog("Query from %s for %s refused because of dynamic block", dq.remote->toStringWithPort(), dq.qname->toLogString());
930 updateBlockStats();
931
932 dq.dh->rcode = RCode::Refused;
933 dq.dh->qr=true;
934 return true;
935 case DNSAction::Action::Truncate:
936 if(!dq.tcp) {
937 updateBlockStats();
938
939 vinfolog("Query from %s for %s truncated because of dynamic block", dq.remote->toStringWithPort(), dq.qname->toLogString());
940 dq.dh->tc = true;
941 dq.dh->qr = true;
942 dq.dh->ra = dq.dh->rd;
943 dq.dh->aa = false;
944 dq.dh->ad = false;
945 return true;
946 }
947 else {
948 vinfolog("Query from %s for %s over TCP *not* truncated because of dynamic block", dq.remote->toStringWithPort(), dq.qname->toLogString());
949 }
950 break;
951 case DNSAction::Action::NoRecurse:
952 updateBlockStats();
953 vinfolog("Query from %s setting rd=0 because of dynamic block", dq.remote->toStringWithPort());
954 dq.dh->rd = false;
955 return true;
956 default:
957 updateBlockStats();
958 vinfolog("Query from %s for %s dropped because of dynamic block", dq.remote->toStringWithPort(), dq.qname->toLogString());
959 return false;
960 }
961 }
962 }
963
964 DNSAction::Action action=DNSAction::Action::None;
965 string ruleresult;
966 bool drop = false;
967 for(const auto& lr : *holders.rulactions) {
968 if(lr.d_rule->matches(&dq)) {
969 lr.d_rule->d_matches++;
970 action=(*lr.d_action)(&dq, &ruleresult);
971 if (processRulesResult(action, dq, ruleresult, drop)) {
972 break;
973 }
974 }
975 }
976
977 if (drop) {
978 return false;
979 }
980
981 return true;
982 }
983
984 ssize_t udpClientSendRequestToBackend(const std::shared_ptr<DownstreamState>& ss, const int sd, const char* request, const size_t requestLen, bool healthCheck)
985 {
986 ssize_t result;
987
988 if (ss->sourceItf == 0) {
989 result = send(sd, request, requestLen, 0);
990 }
991 else {
992 struct msghdr msgh;
993 struct iovec iov;
994 cmsgbuf_aligned cbuf;
995 ComboAddress remote(ss->remote);
996 fillMSGHdr(&msgh, &iov, &cbuf, sizeof(cbuf), const_cast<char*>(request), requestLen, &remote);
997 addCMsgSrcAddr(&msgh, &cbuf, &ss->sourceAddr, ss->sourceItf);
998 result = sendmsg(sd, &msgh, 0);
999 }
1000
1001 if (result == -1) {
1002 int savederrno = errno;
1003 vinfolog("Error sending request to backend %s: %d", ss->remote.toStringWithPort(), savederrno);
1004
1005 /* This might sound silly, but on Linux send() might fail with EINVAL
1006 if the interface the socket was bound to doesn't exist anymore.
1007 We don't want to reconnect the real socket if the healthcheck failed,
1008 because it's not using the same socket.
1009 */
1010 if (!healthCheck && (savederrno == EINVAL || savederrno == ENODEV)) {
1011 ss->reconnect();
1012 }
1013 }
1014
1015 return result;
1016 }
1017
1018 static bool isUDPQueryAcceptable(ClientState& cs, LocalHolders& holders, const struct msghdr* msgh, const ComboAddress& remote, ComboAddress& dest)
1019 {
1020 if (msgh->msg_flags & MSG_TRUNC) {
1021 /* message was too large for our buffer */
1022 vinfolog("Dropping message too large for our buffer");
1023 ++g_stats.nonCompliantQueries;
1024 return false;
1025 }
1026
1027 if(!holders.acl->match(remote)) {
1028 vinfolog("Query from %s dropped because of ACL", remote.toStringWithPort());
1029 ++g_stats.aclDrops;
1030 return false;
1031 }
1032
1033 cs.queries++;
1034 ++g_stats.queries;
1035
1036 if (HarvestDestinationAddress(msgh, &dest)) {
1037 /* we don't get the port, only the address */
1038 dest.sin4.sin_port = cs.local.sin4.sin_port;
1039 }
1040 else {
1041 dest.sin4.sin_family = 0;
1042 }
1043
1044 return true;
1045 }
1046
1047 boost::optional<std::vector<uint8_t>> checkDNSCryptQuery(const ClientState& cs, const char* query, uint16_t& len, std::shared_ptr<DNSCryptQuery>& dnsCryptQuery, time_t now, bool tcp)
1048 {
1049 if (cs.dnscryptCtx) {
1050 #ifdef HAVE_DNSCRYPT
1051 vector<uint8_t> response;
1052 uint16_t decryptedQueryLen = 0;
1053
1054 dnsCryptQuery = std::make_shared<DNSCryptQuery>(cs.dnscryptCtx);
1055
1056 bool decrypted = handleDNSCryptQuery(const_cast<char*>(query), len, dnsCryptQuery, &decryptedQueryLen, tcp, now, response);
1057
1058 if (!decrypted) {
1059 if (response.size() > 0) {
1060 return response;
1061 }
1062 throw std::runtime_error("Unable to decrypt DNSCrypt query, dropping.");
1063 }
1064
1065 len = decryptedQueryLen;
1066 #endif /* HAVE_DNSCRYPT */
1067 }
1068 return boost::none;
1069 }
1070
1071 bool checkQueryHeaders(const struct dnsheader* dh)
1072 {
1073 if (dh->qr) { // don't respond to responses
1074 ++g_stats.nonCompliantQueries;
1075 return false;
1076 }
1077
1078 if (dh->qdcount == 0) {
1079 ++g_stats.emptyQueries;
1080 return false;
1081 }
1082
1083 if (dh->rd) {
1084 ++g_stats.rdQueries;
1085 }
1086
1087 return true;
1088 }
1089
#if defined(HAVE_RECVMMSG) && defined(HAVE_SENDMMSG) && defined(MSG_WAITFORONE)
/* Prepares 'outMsg' so that 'response' can later be sent to 'remote' as part
   of a batch. When 'dest' is known (family != 0) it is attached as the source
   address of the response, otherwise no control message is set. */
static void queueResponse(const ClientState& cs, const char* response, uint16_t responseLen, const ComboAddress& dest, const ComboAddress& remote, struct mmsghdr& outMsg, struct iovec* iov, cmsgbuf_aligned* cbuf)
{
  outMsg.msg_len = 0;

  struct msghdr& hdr = outMsg.msg_hdr;
  fillMSGHdr(&hdr, iov, nullptr, 0, const_cast<char*>(response), responseLen, const_cast<ComboAddress*>(&remote));

  if (dest.sin4.sin_family != 0) {
    addCMsgSrcAddr(&hdr, cbuf, &dest, 0);
    return;
  }

  /* unknown destination address, no ancillary data */
  hdr.msg_control = nullptr;
}
#endif /* defined(HAVE_RECVMMSG) && defined(HAVE_SENDMMSG) && defined(MSG_WAITFORONE) */
1104
1105 /* self-generated responses or cache hits */
1106 static bool prepareOutgoingResponse(LocalHolders& holders, ClientState& cs, DNSQuestion& dq, bool cacheHit)
1107 {
1108 DNSResponse dr(dq.qname, dq.qtype, dq.qclass, dq.consumed, dq.local, dq.remote, reinterpret_cast<dnsheader*>(dq.dh), dq.size, dq.len, dq.tcp, dq.queryTime);
1109
1110 #ifdef HAVE_PROTOBUF
1111 dr.uniqueId = dq.uniqueId;
1112 #endif
1113 dr.qTag = dq.qTag;
1114 dr.delayMsec = dq.delayMsec;
1115
1116 if (!applyRulesToResponse(cacheHit ? holders.cacheHitRespRulactions : holders.selfAnsweredRespRulactions, dr)) {
1117 return false;
1118 }
1119
1120 /* in case a rule changed it */
1121 dq.delayMsec = dr.delayMsec;
1122
1123 #ifdef HAVE_DNSCRYPT
1124 if (!cs.muted) {
1125 if (!encryptResponse(reinterpret_cast<char*>(dq.dh), &dq.len, dq.size, dq.tcp, dq.dnsCryptQuery, nullptr, nullptr)) {
1126 return false;
1127 }
1128 }
1129 #endif /* HAVE_DNSCRYPT */
1130
1131 if (cacheHit) {
1132 ++g_stats.cacheHits;
1133 }
1134
1135 switch (dr.dh->rcode) {
1136 case RCode::NXDomain:
1137 ++g_stats.frontendNXDomain;
1138 break;
1139 case RCode::ServFail:
1140 ++g_stats.frontendServFail;
1141 break;
1142 case RCode::NoError:
1143 ++g_stats.frontendNoError;
1144 break;
1145 }
1146
1147 doLatencyStats(0); // we're not going to measure this
1148 return true;
1149 }
1150
/* Runs a client query through the query rules, the packet cache and the
   server selection policy.
   Returns:
   - ProcessQueryResult::Drop when the query should be discarded:
     applyRulesToQuery() or prepareOutgoingResponse() returned false, the ECS
     value could not be inserted, no backend was selected and
     g_servFailOnNoPolicy is not set, or a std::exception was caught
   - ProcessQueryResult::SendAnswer when the buffer pointed to by dq.dh now
     holds a response of dq.len bytes (query turned into a response by a
     rule, cache hit, or ServFail generated because no backend was selected)
   - ProcessQueryResult::PassToBackend when the query should be relayed to
     'selectedBackend', which has been set by the selection policy. */
ProcessQueryResult processQuery(DNSQuestion& dq, ClientState& cs, LocalHolders& holders, std::shared_ptr<DownstreamState>& selectedBackend)
{
  /* saved up-front so that it can be logged from the exception handler */
  const uint16_t queryId = ntohs(dq.dh->id);

  try {
    /* we need an accurate ("real") value for the response and
       to store into the IDS, but not for insertion into the
       rings for example */
    struct timespec now;
    gettime(&now);

    if (!applyRulesToQuery(holders, dq, now)) {
      return ProcessQueryResult::Drop;
    }

    if(dq.dh->qr) { // something turned it into a response
      fixUpQueryTurnedResponse(dq, dq.origFlags);

      if (!prepareOutgoingResponse(holders, cs, dq, false)) {
        return ProcessQueryResult::Drop;
      }

      ++g_stats.selfAnswered;
      ++cs.responses;
      return ProcessQueryResult::SendAnswer;
    }

    /* pick the pool, its cache, and the policy to use: the pool-specific
       policy, if any, takes precedence over the global one */
    std::shared_ptr<ServerPool> serverPool = getPool(*holders.pools, dq.poolname);
    dq.packetCache = serverPool->packetCache;
    auto policy = *(holders.policy);
    if (serverPool->policy != nullptr) {
      policy = *(serverPool->policy);
    }
    auto servers = serverPool->getServers();
    selectedBackend = getSelectedBackendFromPolicy(policy, servers, dq);

    /* in/out parameter of the cache lookups below: size of the buffer on input */
    uint16_t cachedResponseSize = dq.size;
    /* expired cache entries are only accepted when no backend has been selected */
    uint32_t allowExpired = selectedBackend ? 0 : g_staleCacheEntriesTTL;

    if (dq.packetCache && !dq.skipCache) {
      dq.dnssecOK = (getEDNSZ(dq) & EDNS_HEADER_FLAG_DO);
    }

    if (dq.useECS && ((selectedBackend && selectedBackend->useECS) || (!selectedBackend && serverPool->getECS()))) {
      // we special case our cache in case a downstream explicitly gave us a universally valid response with a 0 scope
      // we need ECS parsing (parseECS) to be true so we can be sure that the initial incoming query did not have an existing
      // ECS option, which would make it unsuitable for the zero-scope feature.
      if (dq.packetCache && !dq.skipCache && (!selectedBackend || !selectedBackend->disableZeroScope) && dq.packetCache->isECSParsingEnabled()) {
        /* first lookup, done before our ECS option is added to the query */
        if (dq.packetCache->get(dq, dq.consumed, dq.dh->id, reinterpret_cast<char*>(dq.dh), &cachedResponseSize, &dq.cacheKeyNoECS, dq.subnet, dq.dnssecOK, allowExpired)) {
          dq.len = cachedResponseSize;

          if (!prepareOutgoingResponse(holders, cs, dq, true)) {
            return ProcessQueryResult::Drop;
          }

          return ProcessQueryResult::SendAnswer;
        }

        if (!dq.subnet) {
          /* there was no existing ECS on the query, enable the zero-scope feature */
          dq.useZeroScope = true;
        }
      }

      if (!handleEDNSClientSubnet(dq, dq.ednsAdded, dq.ecsAdded, g_preserveTrailingData)) {
        vinfolog("Dropping query from %s because we couldn't insert the ECS value", dq.remote->toStringWithPort());
        return ProcessQueryResult::Drop;
      }
    }

    /* regular cache lookup, done after the ECS handling above */
    if (dq.packetCache && !dq.skipCache) {
      if (dq.packetCache->get(dq, dq.consumed, dq.dh->id, reinterpret_cast<char*>(dq.dh), &cachedResponseSize, &dq.cacheKey, dq.subnet, dq.dnssecOK, allowExpired)) {
        dq.len = cachedResponseSize;

        if (!prepareOutgoingResponse(holders, cs, dq, true)) {
          return ProcessQueryResult::Drop;
        }

        return ProcessQueryResult::SendAnswer;
      }
      ++g_stats.cacheMisses;
    }

    if(!selectedBackend) {
      /* the policy did not return any backend: either answer with a
         ServFail or drop the query, depending on g_servFailOnNoPolicy */
      ++g_stats.noPolicy;

      vinfolog("%s query for %s|%s from %s, no policy applied", g_servFailOnNoPolicy ? "ServFailed" : "Dropped", dq.qname->toLogString(), QType(dq.qtype).getName(), dq.remote->toStringWithPort());
      if (g_servFailOnNoPolicy) {
        restoreFlags(dq.dh, dq.origFlags);

        dq.dh->rcode = RCode::ServFail;
        dq.dh->qr = true;

        if (!prepareOutgoingResponse(holders, cs, dq, false)) {
          return ProcessQueryResult::Drop;
        }
        // no response-only statistics counter to update.
        return ProcessQueryResult::SendAnswer;
      }

      return ProcessQueryResult::Drop;
    }

    if (dq.addXPF && selectedBackend->xpfRRCode != 0) {
      addXPF(dq, selectedBackend->xpfRRCode, g_preserveTrailingData);
    }

    selectedBackend->queries++;
    return ProcessQueryResult::PassToBackend;
  }
  catch(const std::exception& e){
    vinfolog("Got an error while parsing a %s query from %s, id %d: %s", (dq.tcp ? "TCP" : "UDP"), dq.remote->toStringWithPort(), queryId, e.what());
  }
  /* only reached after an exception has been caught */
  return ProcessQueryResult::Drop;
}
1266
/* Handles one UDP datagram received on the frontend 'cs'.
   'query' points to a buffer of 'queryBufferSize' bytes holding a query of
   'len' bytes, received from 'remote' with the message header 'msgh'.
   'dest' is filled with the destination address of the query, if it can be
   harvested.
   When 'responsesVect' is not null, 'queuedResponses', 'respIOV' and
   'respCBuf' must not be null either (asserted below) and immediate responses
   are queued into 'responsesVect' instead of being sent right away, if
   recvmmsg() / sendmmsg() support has been compiled in.
   Depending on the result of processQuery() the query is dropped, answered
   directly, or relayed to the selected backend after its state has been
   recorded in one of the backend's IDState slots.
   std::exception are caught and logged, they do not propagate to the caller. */
static void processUDPQuery(ClientState& cs, LocalHolders& holders, const struct msghdr* msgh, const ComboAddress& remote, ComboAddress& dest, char* query, uint16_t len, size_t queryBufferSize, struct mmsghdr* responsesVect, unsigned int* queuedResponses, struct iovec* respIOV, cmsgbuf_aligned* respCBuf)
{
  assert(responsesVect == nullptr || (queuedResponses != nullptr && respIOV != nullptr && respCBuf != nullptr));
  /* only used for logging from the exception handler, stays at 0 until the header has been read */
  uint16_t queryId = 0;

  try {
    if (!isUDPQueryAcceptable(cs, holders, msgh, remote, dest)) {
      return;
    }

    /* we need an accurate ("real") value for the response and
       to store into the IDS, but not for insertion into the
       rings for example */
    struct timespec queryRealTime;
    gettime(&queryRealTime, true);

    /* may update 'len' to the size of the decrypted query */
    std::shared_ptr<DNSCryptQuery> dnsCryptQuery = nullptr;
    auto dnsCryptResponse = checkDNSCryptQuery(cs, query, len, dnsCryptQuery, queryRealTime.tv_sec, false);
    if (dnsCryptResponse) {
      sendUDPResponse(cs.udpFD, reinterpret_cast<char*>(dnsCryptResponse->data()), static_cast<uint16_t>(dnsCryptResponse->size()), 0, dest, remote);
      return;
    }

    struct dnsheader* dh = reinterpret_cast<struct dnsheader*>(query);
    queryId = ntohs(dh->id);

    if (!checkQueryHeaders(dh)) {
      return;
    }

    uint16_t qtype, qclass;
    unsigned int consumed = 0;
    DNSName qname(query, len, sizeof(dnsheader), false, &qtype, &qclass, &consumed);
    /* the local address is the harvested destination if we have one, the listening address otherwise */
    DNSQuestion dq(&qname, qtype, qclass, consumed, dest.sin4.sin_family != 0 ? &dest : &cs.local, &remote, dh, queryBufferSize, len, false, &queryRealTime);
    dq.dnsCryptQuery = std::move(dnsCryptQuery);
    std::shared_ptr<DownstreamState> ss{nullptr};
    auto result = processQuery(dq, cs, holders, ss);

    if (result == ProcessQueryResult::Drop) {
      return;
    }

    if (result == ProcessQueryResult::SendAnswer) {
#if defined(HAVE_RECVMMSG) && defined(HAVE_SENDMMSG) && defined(MSG_WAITFORONE)
      /* non-delayed responses can be queued to be sent in batch by the caller */
      if (dq.delayMsec == 0 && responsesVect != nullptr) {
        queueResponse(cs, reinterpret_cast<char*>(dq.dh), dq.len, *dq.local, *dq.remote, responsesVect[*queuedResponses], respIOV, respCBuf);
        (*queuedResponses)++;
        return;
      }
#endif /* defined(HAVE_RECVMMSG) && defined(HAVE_SENDMMSG) && defined(MSG_WAITFORONE) */
      /* we use dest, always, because we don't want to use the listening address to send a response since it could be 0.0.0.0 */
      sendUDPResponse(cs.udpFD, reinterpret_cast<char*>(dq.dh), dq.len, dq.delayMsec, dest, *dq.remote);
      return;
    }

    if (result != ProcessQueryResult::PassToBackend || ss == nullptr) {
      return;
    }

    /* pick the next IDState slot of the backend, wrapping around when
       the end of the idStates vector is reached */
    unsigned int idOffset = (ss->idOffset++) % ss->idStates.size();
    IDState* ids = &ss->idStates[idOffset];
    ids->age = 0;
    DOHUnit* du = nullptr;

    /* that means that the state was in use, possibly with an allocated
       DOHUnit that we will need to handle, but we can't touch it before
       confirming that we now own this state */
    if (ids->isInUse()) {
      du = ids->du;
    }

    /* we atomically replace the value, we now own this state */
    if (!ids->markAsUsed()) {
      /* the state was not in use.
         we reset 'du' because it might have still been in use when we read it. */
      du = nullptr;
      ++ss->outstanding;
    }
    else {
      /* we are reusing a state, no change in outstanding but if there was an existing DOHUnit we need
         to handle it because it's about to be overwritten. */
      ids->du = nullptr;
      ++ss->reuseds;
      ++g_stats.downstreamTimeouts;
      handleDOHTimeout(du);
    }

    /* record what will be needed to send the response back to the client */
    ids->cs = &cs;
    ids->origFD = cs.udpFD;
    ids->origID = dh->id;
    setIDStateFromDNSQuestion(*ids, dq, std::move(qname));

    /* If we couldn't harvest the real dest addr, still
       write down the listening addr since it will be useful
       (especially if it's not an 'any' one).
       We need to keep track of which one it is since we may
       want to use the real but not the listening addr to reply.
    */
    if (dest.sin4.sin_family != 0) {
      ids->origDest = dest;
      ids->destHarvested = true;
    }
    else {
      ids->origDest = cs.local;
      ids->destHarvested = false;
    }

    /* the ID sent to the backend is the index of the IDState slot, the
       original one having been saved in origID above */
    dh->id = idOffset;

    if (ss->useProxyProtocol) {
      addProxyProtocol(dq);
    }

    int fd = pickBackendSocketForSending(ss);
    ssize_t ret = udpClientSendRequestToBackend(ss, fd, query, dq.len);

    if(ret < 0) {
      /* NOTE(review): the IDState slot stays marked as used on a send error,
         it looks like it is only reclaimed by a later timeout or reuse - confirm */
      ++ss->sendErrors;
      ++g_stats.downstreamSendErrors;
    }

    vinfolog("Got query for %s|%s from %s, relayed to %s", ids->qname.toLogString(), QType(ids->qtype).getName(), remote.toStringWithPort(), ss->getName());
  }
  catch(const std::exception& e){
    vinfolog("Got an error in UDP question thread while parsing a query from %s, id %d: %s", remote.toStringWithPort(), queryId, e.what());
  }
}
1394
#if defined(HAVE_RECVMMSG) && defined(HAVE_SENDMMSG) && defined(MSG_WAITFORONE)
/* Main loop of a UDP frontend thread when several datagrams are received
   (recvmmsg()) and sent (sendmmsg()) per system call.
   Up to g_udpVectorSize datagrams are read at once, each of them is handed to
   processUDPQuery(), and the responses that can be sent right away are
   queued then sent in a single batch.
   This function never returns. */
static void MultipleMessagesUDPClientThread(ClientState* cs, LocalHolders& holders)
{
  struct MMReceiver
  {
    char packet[s_maxPacketCacheEntrySize];
    ComboAddress remote;
    ComboAddress dest;
    struct iovec iov;
    /* used by HarvestDestinationAddress */
    cmsgbuf_aligned cbuf;
  };
  const size_t vectSize = g_udpVectorSize;
  /* the actual buffer is larger because:
     - we may have to add EDNS and/or ECS
     - we use it for self-generated responses (from rule or cache)
     but we only accept incoming payloads up to that size
  */
  static_assert(s_udpIncomingBufferSize <= sizeof(MMReceiver::packet), "the incoming buffer size should not be larger than sizeof(MMReceiver::packet)");

  /* maximum size of an incoming payload: the whole buffer for DNSCrypt
     frontends, s_udpIncomingBufferSize otherwise. Computed once so that
     the setup below and the per-iteration reset cannot disagree. */
  const size_t incomingBufferSize = cs->dnscryptCtx ? sizeof(MMReceiver::packet) : s_udpIncomingBufferSize;

  auto recvData = std::unique_ptr<MMReceiver[]>(new MMReceiver[vectSize]);
  auto msgVec = std::unique_ptr<struct mmsghdr[]>(new struct mmsghdr[vectSize]);
  auto outMsgVec = std::unique_ptr<struct mmsghdr[]>(new struct mmsghdr[vectSize]);

  /* initialize the structures needed to receive our messages */
  for (size_t idx = 0; idx < vectSize; idx++) {
    recvData[idx].remote.sin4.sin_family = cs->local.sin4.sin_family;
    fillMSGHdr(&msgVec[idx].msg_hdr, &recvData[idx].iov, &recvData[idx].cbuf, sizeof(recvData[idx].cbuf), recvData[idx].packet, incomingBufferSize, &recvData[idx].remote);
  }

  /* go now */
  for(;;) {

    /* reset the IO vector, since it's also used to send the vector of responses
       to avoid having to copy the data around.
       The length is reset to the size we accept for incoming payloads, not
       to the size of the whole buffer: otherwise the limit set up above would
       be lost before the very first recvmmsg() call and oversized queries
       would no longer be flagged as truncated, unlike what is done in the
       single-message code path. */
    for (size_t idx = 0; idx < vectSize; idx++) {
      recvData[idx].iov.iov_base = recvData[idx].packet;
      recvData[idx].iov.iov_len = incomingBufferSize;
    }

    /* block until we have at least one message ready, but return
       as many as possible to save the syscall costs */
    int msgsGot = recvmmsg(cs->udpFD, msgVec.get(), vectSize, MSG_WAITFORONE | MSG_TRUNC, nullptr);

    if (msgsGot <= 0) {
      vinfolog("Getting UDP messages via recvmmsg() failed with: %s", stringerror());
      continue;
    }

    /* number of responses queued into outMsgVec by processUDPQuery() */
    unsigned int msgsToSend = 0;

    /* process the received messages */
    for (int msgIdx = 0; msgIdx < msgsGot; msgIdx++) {
      const struct msghdr* msgh = &msgVec[msgIdx].msg_hdr;
      unsigned int got = msgVec[msgIdx].msg_len;
      const ComboAddress& remote = recvData[msgIdx].remote;

      if (static_cast<size_t>(got) < sizeof(struct dnsheader)) {
        /* too short to even hold a DNS header */
        ++g_stats.nonCompliantQueries;
        continue;
      }

      processUDPQuery(*cs, holders, msgh, remote, recvData[msgIdx].dest, recvData[msgIdx].packet, static_cast<uint16_t>(got), sizeof(recvData[msgIdx].packet), outMsgVec.get(), &msgsToSend, &recvData[msgIdx].iov, &recvData[msgIdx].cbuf);

    }

    /* immediate (not delayed or sent to a backend) responses (mostly from a rule, dynamic block
       or the cache) can be sent in batch too */

    if (msgsToSend > 0 && msgsToSend <= static_cast<unsigned int>(msgsGot)) {
      int sent = sendmmsg(cs->udpFD, outMsgVec.get(), msgsToSend, 0);

      if (sent < 0 || static_cast<unsigned int>(sent) != msgsToSend) {
        vinfolog("Error sending responses with sendmmsg() (%d on %u): %s", sent, msgsToSend, stringerror());
      }
    }

  }
}
#endif /* defined(HAVE_RECVMMSG) && defined(HAVE_SENDMMSG) && defined(MSG_WAITFORONE) */
1475
1476 // listens to incoming queries, sends out to downstream servers, noting the intended return path
1477 static void udpClientThread(ClientState* cs)
1478 try
1479 {
1480 setThreadName("dnsdist/udpClie");
1481 LocalHolders holders;
1482
1483 #if defined(HAVE_RECVMMSG) && defined(HAVE_SENDMMSG) && defined(MSG_WAITFORONE)
1484 if (g_udpVectorSize > 1) {
1485 MultipleMessagesUDPClientThread(cs, holders);
1486
1487 }
1488 else
1489 #endif /* defined(HAVE_RECVMMSG) && defined(HAVE_SENDMMSG) && defined(MSG_WAITFORONE) */
1490 {
1491 char packet[s_maxPacketCacheEntrySize];
1492 /* the actual buffer is larger because:
1493 - we may have to add EDNS and/or ECS
1494 - we use it for self-generated responses (from rule or cache)
1495 but we only accept incoming payloads up to that size
1496 */
1497 static_assert(s_udpIncomingBufferSize <= sizeof(packet), "the incoming buffer size should not be larger than sizeof(MMReceiver::packet)");
1498 struct msghdr msgh;
1499 struct iovec iov;
1500 /* used by HarvestDestinationAddress */
1501 cmsgbuf_aligned cbuf;
1502
1503 ComboAddress remote;
1504 ComboAddress dest;
1505 remote.sin4.sin_family = cs->local.sin4.sin_family;
1506 fillMSGHdr(&msgh, &iov, &cbuf, sizeof(cbuf), packet, cs->dnscryptCtx ? sizeof(packet) : s_udpIncomingBufferSize, &remote);
1507
1508 for(;;) {
1509 ssize_t got = recvmsg(cs->udpFD, &msgh, 0);
1510
1511 if (got < 0 || static_cast<size_t>(got) < sizeof(struct dnsheader)) {
1512 ++g_stats.nonCompliantQueries;
1513 continue;
1514 }
1515
1516 processUDPQuery(*cs, holders, &msgh, remote, dest, packet, static_cast<uint16_t>(got), sizeof(packet), nullptr, nullptr, nullptr, nullptr);
1517 }
1518 }
1519 }
1520 catch(const std::exception &e)
1521 {
1522 errlog("UDP client thread died because of exception: %s", e.what());
1523 }
1524 catch(const PDNSException &e)
1525 {
1526 errlog("UDP client thread died because of PowerDNS exception: %s", e.reason);
1527 }
1528 catch(...)
1529 {
1530 errlog("UDP client thread died because of an exception: %s", "unknown");
1531 }
1532
/* Returns a random 16-bit value, suitable as a DNS message ID, obtained from
   libsodium's randombytes_uniform() when available and from random() otherwise. */
uint16_t getRandomDNSID()
{
#ifdef HAVE_LIBSODIUM
  const auto id = randombytes_uniform(65536);
#else
  const auto id = random() % 65536;
#endif
  /* always in the [0, 65535] range, so the conversion is lossless */
  return static_cast<uint16_t>(id);
}
1541
/* presumably the maximum number of TCP client threads; used by
   checkFileDescriptorsLimits() to estimate how many file descriptors
   the TCP side can need */
uint64_t g_maxTCPClientThreads{10};
/* number of iterations of the maintenance thread loop (which sleeps for one
   second per iteration) between two cleanups of the packet caches */
std::atomic<uint16_t> g_cacheCleaningDelay{60};
/* used by the maintenance thread to compute the 'upTo' value passed to
   purgeExpired(): maxEntries * (100 - percentage) / 100 */
std::atomic<uint16_t> g_cacheCleaningPercentage{100};
1545
1546 static void maintThread()
1547 {
1548 setThreadName("dnsdist/main");
1549 int interval = 1;
1550 size_t counter = 0;
1551 int32_t secondsToWaitLog = 0;
1552
1553 for(;;) {
1554 sleep(interval);
1555
1556 {
1557 std::lock_guard<std::mutex> lock(g_luamutex);
1558 auto f = g_lua.readVariable<boost::optional<std::function<void()> > >("maintenance");
1559 if(f) {
1560 try {
1561 (*f)();
1562 secondsToWaitLog = 0;
1563 }
1564 catch(std::exception &e) {
1565 if (secondsToWaitLog <= 0) {
1566 infolog("Error during execution of maintenance function: %s", e.what());
1567 secondsToWaitLog = 61;
1568 }
1569 secondsToWaitLog -= interval;
1570 }
1571 }
1572 }
1573
1574 counter++;
1575 if (counter >= g_cacheCleaningDelay) {
1576 /* keep track, for each cache, of whether we should keep
1577 expired entries */
1578 std::map<std::shared_ptr<DNSDistPacketCache>, bool> caches;
1579
1580 /* gather all caches actually used by at least one pool, and see
1581 if something prevents us from cleaning the expired entries */
1582 auto localPools = g_pools.getLocal();
1583 for (const auto& entry : *localPools) {
1584 auto& pool = entry.second;
1585
1586 auto packetCache = pool->packetCache;
1587 if (!packetCache) {
1588 continue;
1589 }
1590
1591 auto pair = caches.insert({packetCache, false});
1592 auto& iter = pair.first;
1593 /* if we need to keep stale data for this cache (ie, not clear
1594 expired entries when at least one pool using this cache
1595 has all its backends down) */
1596 if (packetCache->keepStaleData() && iter->second == false) {
1597 /* so far all pools had at least one backend up */
1598 if (pool->countServers(true) == 0) {
1599 iter->second = true;
1600 }
1601 }
1602 }
1603
1604 for (auto pair : caches) {
1605 /* shall we keep expired entries ? */
1606 if (pair.second == true) {
1607 continue;
1608 }
1609 auto& packetCache = pair.first;
1610 size_t upTo = (packetCache->getMaxEntries()* (100 - g_cacheCleaningPercentage)) / 100;
1611 packetCache->purgeExpired(upTo);
1612 }
1613 counter = 0;
1614 }
1615
1616 // ponder pruning g_dynblocks of expired entries here
1617 }
1618 }
1619
1620 static void secPollThread()
1621 {
1622 setThreadName("dnsdist/secpoll");
1623
1624 for (;;) {
1625 try {
1626 doSecPoll(g_secPollSuffix);
1627 }
1628 catch(...) {
1629 }
1630 sleep(g_secPollInterval);
1631 }
1632 }
1633
/* Health-checks thread, never returns. Every second it:
   - adds a TCP client thread if more than one connection is queued and the
     maximum number of threads has not been reached
   - for every backend whose check interval has elapsed: queues a health
     check (for backends in 'Auto' availability mode), refreshes the query
     load and drop rate, and expires the UDP states (IDState) that have been
     in use for longer than g_udpTimeout
   - processes the health checks that have been queued. */
static void healthChecksThread()
{
  setThreadName("dnsdist/healthC");

  static const int interval = 1;

  for(;;) {
    sleep(interval);

    if(g_tcpclientthreads->getQueuedCount() > 1 && !g_tcpclientthreads->hasReachedMaxThreads()) {
      g_tcpclientthreads->addTCPClientThread();
    }

    /* a new multiplexer is used for every round of health checks */
    auto mplexer = std::shared_ptr<FDMultiplexer>(FDMultiplexer::getMultiplexerSilent());
    auto states = g_dstates.getLocal(); // this points to the actual shared_ptrs!
    for(auto& dss : *states) {
      /* lastCheck counts the iterations since the last check of this backend */
      if(++dss->lastCheck < dss->checkInterval) {
        continue;
      }

      dss->lastCheck = 0;

      if (dss->availability == DownstreamState::Availability::Auto) {
        if (!queueHealthCheck(mplexer, dss)) {
          /* the check could not even be queued, record it as failed */
          updateHealthCheckResult(dss, false);
        }
      }

      /* presumably the time elapsed since the previous computation, in seconds - TODO confirm.
         NOTE(review): nothing here guards against a delta of 0 */
      auto delta = dss->sw.udiffAndSet()/1000000.0;
      dss->queryLoad = 1.0*(dss->queries.load() - dss->prev.queries.load())/delta;
      dss->dropRate = 1.0*(dss->reuseds.load() - dss->prev.reuseds.load())/delta;
      dss->prev.queries.store(dss->queries.load());
      dss->prev.reuseds.store(dss->reuseds.load());

      for (IDState& ids : dss->idStates) { // timeouts
        /* snapshot of the indicator, handed to tryMarkUnused() below */
        int64_t usageIndicator = ids.usageIndicator;
        if(IDState::isInUse(usageIndicator) && ids.age++ > g_udpTimeout) {
          /* We mark the state as unused as soon as possible
             to limit the risk of racing with the
             responder thread.
          */
          auto oldDU = ids.du;

          if (!ids.tryMarkUnused(usageIndicator)) {
            /* this state has been altered in the meantime,
               don't go anywhere near it */
            continue;
          }
          ids.du = nullptr;
          handleDOHTimeout(oldDU);
          ids.age = 0;
          dss->reuseds++;
          --dss->outstanding;
          ++g_stats.downstreamTimeouts; // this is an 'actively' discovered timeout
          vinfolog("Had a downstream timeout from %s (%s) for query for %s|%s from %s",
                   dss->remote.toStringWithPort(), dss->getName(),
                   ids.qname.toLogString(), QType(ids.qtype).getName(), ids.origRemote.toStringWithPort());

          struct timespec ts;
          gettime(&ts);

          /* the timeout is inserted into the response rings with a zeroed
             header only carrying the original ID, and the maximum unsigned
             int value in place of a measured duration */
          struct dnsheader fake;
          memset(&fake, 0, sizeof(fake));
          fake.id = ids.origID;

          g_rings.insertResponse(ts, ids.origRemote, ids.qname, ids.qtype, std::numeric_limits<unsigned int>::max(), 0, fake, dss->remote);
        }
      }
    }

    handleQueuedHealthChecks(mplexer);
  }
}
1707
1708 static void bindAny(int af, int sock)
1709 {
1710 __attribute__((unused)) int one = 1;
1711
1712 #ifdef IP_FREEBIND
1713 if (setsockopt(sock, IPPROTO_IP, IP_FREEBIND, &one, sizeof(one)) < 0)
1714 warnlog("Warning: IP_FREEBIND setsockopt failed: %s", stringerror());
1715 #endif
1716
1717 #ifdef IP_BINDANY
1718 if (af == AF_INET)
1719 if (setsockopt(sock, IPPROTO_IP, IP_BINDANY, &one, sizeof(one)) < 0)
1720 warnlog("Warning: IP_BINDANY setsockopt failed: %s", stringerror());
1721 #endif
1722 #ifdef IPV6_BINDANY
1723 if (af == AF_INET6)
1724 if (setsockopt(sock, IPPROTO_IPV6, IPV6_BINDANY, &one, sizeof(one)) < 0)
1725 warnlog("Warning: IPV6_BINDANY setsockopt failed: %s", stringerror());
1726 #endif
1727 #ifdef SO_BINDANY
1728 if (setsockopt(sock, SOL_SOCKET, SO_BINDANY, &one, sizeof(one)) < 0)
1729 warnlog("Warning: SO_BINDANY setsockopt failed: %s", stringerror());
1730 #endif
1731 }
1732
1733 static void dropGroupPrivs(gid_t gid)
1734 {
1735 if (gid) {
1736 if (setgid(gid) == 0) {
1737 if (setgroups(0, NULL) < 0) {
1738 warnlog("Warning: Unable to drop supplementary gids: %s", stringerror());
1739 }
1740 }
1741 else {
1742 warnlog("Warning: Unable to set group ID to %d: %s", gid, stringerror());
1743 }
1744 }
1745 }
1746
1747 static void dropUserPrivs(uid_t uid)
1748 {
1749 if(uid) {
1750 if(setuid(uid) < 0) {
1751 warnlog("Warning: Unable to set user ID to %d: %s", uid, stringerror());
1752 }
1753 }
1754 }
1755
1756 static void checkFileDescriptorsLimits(size_t udpBindsCount, size_t tcpBindsCount)
1757 {
1758 /* stdin, stdout, stderr */
1759 size_t requiredFDsCount = 3;
1760 auto backends = g_dstates.getLocal();
1761 /* UDP sockets to backends */
1762 size_t backendUDPSocketsCount = 0;
1763 for (const auto& backend : *backends) {
1764 backendUDPSocketsCount += backend->sockets.size();
1765 }
1766 requiredFDsCount += backendUDPSocketsCount;
1767 /* TCP sockets to backends */
1768 requiredFDsCount += (backends->size() * g_maxTCPClientThreads);
1769 /* listening sockets */
1770 requiredFDsCount += udpBindsCount;
1771 requiredFDsCount += tcpBindsCount;
1772 /* max TCP connections currently served */
1773 requiredFDsCount += g_maxTCPClientThreads;
1774 /* max pipes for communicating between TCP acceptors and client threads */
1775 requiredFDsCount += (g_maxTCPClientThreads * 2);
1776 /* max TCP queued connections */
1777 requiredFDsCount += g_maxTCPQueuedConnections;
1778 /* DelayPipe pipe */
1779 requiredFDsCount += 2;
1780 /* syslog socket */
1781 requiredFDsCount++;
1782 /* webserver main socket */
1783 requiredFDsCount++;
1784 /* console main socket */
1785 requiredFDsCount++;
1786 /* carbon export */
1787 requiredFDsCount++;
1788 /* history file */
1789 requiredFDsCount++;
1790 struct rlimit rl;
1791 getrlimit(RLIMIT_NOFILE, &rl);
1792 if (rl.rlim_cur <= requiredFDsCount) {
1793 warnlog("Warning, this configuration can use more than %d file descriptors, web server and console connections not included, and the current limit is %d.", std::to_string(requiredFDsCount), std::to_string(rl.rlim_cur));
1794 #ifdef HAVE_SYSTEMD
1795 warnlog("You can increase this value by using LimitNOFILE= in the systemd unit file or ulimit.");
1796 #else
1797 warnlog("You can increase this value by using ulimit.");
1798 #endif
1799 }
1800 }
1801
/* Creates, configures and binds the listening socket of the frontend 'cs':
   a UDP socket stored in cs->udpFD when cs->tcp is false, a TCP socket stored
   in cs->tcpFD (and put in listening mode) otherwise.
   Also sets up the TLS and DoH frontends attached to 'cs', if any, and exits
   the process if the TLS setup fails.
   cs->ready is set to true once everything is done. */
static void setUpLocalBind(std::unique_ptr<ClientState>& cs)
{
  /* skip some warnings if there is an identical UDP context */
  bool warn = cs->tcp == false || cs->tlsFrontend != nullptr || cs->dohFrontend != nullptr;
  /* reference to the descriptor we are setting up, so that the socket created below is stored into 'cs' */
  int& fd = cs->tcp == false ? cs->udpFD : cs->tcpFD;
  /* 'warn' is only read in blocks that may be compiled out, see the #else branches below */
  (void) warn;

  fd = SSocket(cs->local.sin4.sin_family, cs->tcp == false ? SOCK_DGRAM : SOCK_STREAM, 0);

  if (cs->tcp) {
    SSetsockopt(fd, SOL_SOCKET, SO_REUSEADDR, 1);
#ifdef TCP_DEFER_ACCEPT
    SSetsockopt(fd, IPPROTO_TCP, TCP_DEFER_ACCEPT, 1);
#endif
    if (cs->fastOpenQueueSize > 0) {
#ifdef TCP_FASTOPEN
      SSetsockopt(fd, IPPROTO_TCP, TCP_FASTOPEN, cs->fastOpenQueueSize);
#else
      if (warn) {
        warnlog("TCP Fast Open has been configured on local address '%s' but is not supported", cs->local.toStringWithPort());
      }
#endif
    }
  }

  if(cs->local.sin4.sin_family == AF_INET6) {
    SSetsockopt(fd, IPPROTO_IPV6, IPV6_V6ONLY, 1);
  }

  bindAny(cs->local.sin4.sin_family, fd);

  /* UDP socket bound to an 'any' address: ask for the packet info, return values are deliberately ignored */
  if(!cs->tcp && IsAnyAddress(cs->local)) {
    int one=1;
    setsockopt(fd, IPPROTO_IP, GEN_IP_PKTINFO, &one, sizeof(one)); // linux supports this, so why not - might fail on other systems
#ifdef IPV6_RECVPKTINFO
    setsockopt(fd, IPPROTO_IPV6, IPV6_RECVPKTINFO, &one, sizeof(one));
#endif
  }

  if (cs->reuseport) {
#ifdef SO_REUSEPORT
    SSetsockopt(fd, SOL_SOCKET, SO_REUSEPORT, 1);
#else
    if (warn) {
      /* no need to warn again if configured but support is not available, we already did for UDP */
      warnlog("SO_REUSEPORT has been configured on local address '%s' but is not supported", cs->local.toStringWithPort());
    }
#endif
  }

  /* Only set this on IPv4 UDP sockets.
     Don't set it for DNSCrypt binds. DNSCrypt pads queries for privacy
     purposes, so we do receive large, sometimes fragmented datagrams. */
  if (!cs->tcp && cs->local.isIPv4() && !cs->dnscryptCtx) {
    try {
      setSocketIgnorePMTU(cs->udpFD);
    }
    catch(const std::exception& e) {
      /* not fatal, we only warn about it */
      warnlog("Failed to set IP_MTU_DISCOVER on UDP server socket for local address '%s': %s", cs->local.toStringWithPort(), e.what());
    }
  }

  const std::string& itf = cs->interface;
  if (!itf.empty()) {
#ifdef SO_BINDTODEVICE
    int res = setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, itf.c_str(), itf.length());
    if (res != 0) {
      warnlog("Error setting up the interface on local address '%s': %s", cs->local.toStringWithPort(), stringerror());
    }
#else
    if (warn) {
      warnlog("An interface has been configured on local address '%s' but SO_BINDTODEVICE is not supported", cs->local.toStringWithPort());
    }
#endif
  }

#ifdef HAVE_EBPF
  if (g_defaultBPFFilter) {
    cs->attachFilter(g_defaultBPFFilter);
    vinfolog("Attaching default BPF Filter to %s frontend %s", (!cs->tcp ? "UDP" : "TCP"), cs->local.toStringWithPort());
  }
#endif /* HAVE_EBPF */

  if (cs->tlsFrontend != nullptr) {
    if (!cs->tlsFrontend->setupTLS()) {
      /* a TLS frontend that cannot be set up is fatal */
      errlog("Error while setting up TLS on local address '%s', exiting", cs->local.toStringWithPort());
      _exit(EXIT_FAILURE);
    }
  }

  if (cs->dohFrontend != nullptr) {
    cs->dohFrontend->setup();
  }

  /* the socket is only bound once all the options have been set */
  SBind(fd, cs->local);

  if (cs->tcp) {
    SListen(cs->tcpFD, cs->tcpListenQueueSize);

    if (cs->tlsFrontend != nullptr) {
      warnlog("Listening on %s for TLS", cs->local.toStringWithPort());
    }
    else if (cs->dohFrontend != nullptr) {
      warnlog("Listening on %s for DoH", cs->local.toStringWithPort());
    }
    else if (cs->dnscryptCtx != nullptr) {
      warnlog("Listening on %s for DNSCrypt", cs->local.toStringWithPort());
    }
    else {
      warnlog("Listening on %s", cs->local.toStringWithPort());
    }
  }

  cs->ready = true;
}
1917
/* Settings collected from the command line by main(). */
struct
{
  /* addresses passed with -l/--local */
  std::vector<std::string> locals;
  /* positional arguments given when not running as a client */
  std::vector<std::string> remotes;
  /* --check-config was passed */
  bool checkConfig = false;
  /* -c/--client was passed */
  bool beClient = false;
  /* --supervised was passed */
  bool beSupervised = false;
  /* command passed with -e/--execute */
  std::string command;
  /* configuration file, -C/--config */
  std::string config;
  /* value passed with -u/--uid */
  std::string uid;
  /* value passed with -g/--gid */
  std::string gid;
} g_cmdLine;
1930
/* Starts out false; main() sets it to true once the list of frontends
   is final, right before the listening sockets are set up. */
std::atomic<bool> g_configurationDone(false);
1932
/* Write the command-line help to standard output.
   The description of -k/--setkey is only included when dnsdist has been
   built with libsodium support (HAVE_LIBSODIUM). */
static void usage()
{
  /* leading blank line, flushed right away */
  std::cout << std::endl;

  /* synopsis, followed by the first options */
  std::cout <<
    "Syntax: dnsdist [-C,--config file] [-c,--client [IP[:PORT]]]\n"
    "[-e,--execute cmd] [-h,--help] [-l,--local addr]\n"
    "[-v,--verbose] [--check-config] [--version]\n"
    "\n"
    "-a,--acl netmask Add this netmask to the ACL\n"
    "-C,--config file Load configuration from 'file'\n"
    "-c,--client Operate as a client, connect to dnsdist. This reads\n"
    " controlSocket from your configuration file, but also\n"
    " accepts an IP:PORT argument\n";

#ifdef HAVE_LIBSODIUM
  std::cout <<
    "-k,--setkey KEY Use KEY for encrypted communication to dnsdist. This\n"
    " is similar to setting setKey in the configuration file.\n"
    " NOTE: this will leak this key in your shell's history\n"
    " and in the systems running process list.\n";
#endif

  /* remaining options */
  std::cout <<
    "--check-config Validate the configuration file and exit. The exit-code\n"
    " reflects the validation, 0 is OK, 1 means an error.\n"
    " Any errors are printed as well.\n"
    "-e,--execute cmd Connect to dnsdist and execute 'cmd'\n"
    "-g,--gid gid Change the process group ID after binding sockets\n"
    "-h,--help Display this helpful message\n"
    "-l,--local address Listen on this local address\n"
    "--supervised Don't open a console, I'm supervised\n"
    " (use with e.g. systemd and daemontools)\n"
    "--disable-syslog Don't log to syslog, only to stdout\n"
    " (use with e.g. systemd)\n"
    "-u,--uid uid Change the process user ID after binding sockets\n"
    "-v,--verbose Enable verbose mode\n"
    "-V,--version Show dnsdist version information and exit\n";
}
1966
1967 int main(int argc, char** argv)
1968 try
1969 {
1970 size_t udpBindsCount = 0;
1971 size_t tcpBindsCount = 0;
1972 rl_attempted_completion_function = my_completion;
1973 rl_completion_append_character = 0;
1974
1975 signal(SIGPIPE, SIG_IGN);
1976 signal(SIGCHLD, SIG_IGN);
1977 openlog("dnsdist", LOG_PID|LOG_NDELAY, LOG_DAEMON);
1978
1979 #ifdef HAVE_LIBSODIUM
1980 if (sodium_init() == -1) {
1981 cerr<<"Unable to initialize crypto library"<<endl;
1982 exit(EXIT_FAILURE);
1983 }
1984 g_hashperturb=randombytes_uniform(0xffffffff);
1985 srandom(randombytes_uniform(0xffffffff));
1986 #else
1987 {
1988 struct timeval tv;
1989 gettimeofday(&tv, 0);
1990 srandom(tv.tv_sec ^ tv.tv_usec ^ getpid());
1991 g_hashperturb=random();
1992 }
1993
1994 #endif
1995 ComboAddress clientAddress = ComboAddress();
1996 g_cmdLine.config=SYSCONFDIR "/dnsdist.conf";
1997 struct option longopts[]={
1998 {"acl", required_argument, 0, 'a'},
1999 {"check-config", no_argument, 0, 1},
2000 {"client", no_argument, 0, 'c'},
2001 {"config", required_argument, 0, 'C'},
2002 {"disable-syslog", no_argument, 0, 2},
2003 {"execute", required_argument, 0, 'e'},
2004 {"gid", required_argument, 0, 'g'},
2005 {"help", no_argument, 0, 'h'},
2006 {"local", required_argument, 0, 'l'},
2007 {"setkey", required_argument, 0, 'k'},
2008 {"supervised", no_argument, 0, 3},
2009 {"uid", required_argument, 0, 'u'},
2010 {"verbose", no_argument, 0, 'v'},
2011 {"version", no_argument, 0, 'V'},
2012 {0,0,0,0}
2013 };
2014 int longindex=0;
2015 string optstring;
2016 for(;;) {
2017 int c=getopt_long(argc, argv, "a:cC:e:g:hk:l:u:vV", longopts, &longindex);
2018 if(c==-1)
2019 break;
2020 switch(c) {
2021 case 1:
2022 g_cmdLine.checkConfig=true;
2023 break;
2024 case 2:
2025 g_syslog=false;
2026 break;
2027 case 3:
2028 g_cmdLine.beSupervised=true;
2029 break;
2030 case 'C':
2031 g_cmdLine.config=optarg;
2032 break;
2033 case 'c':
2034 g_cmdLine.beClient=true;
2035 break;
2036 case 'e':
2037 g_cmdLine.command=optarg;
2038 break;
2039 case 'g':
2040 g_cmdLine.gid=optarg;
2041 break;
2042 case 'h':
2043 cout<<"dnsdist "<<VERSION<<endl;
2044 usage();
2045 cout<<"\n";
2046 exit(EXIT_SUCCESS);
2047 break;
2048 case 'a':
2049 optstring=optarg;
2050 g_ACL.modify([optstring](NetmaskGroup& nmg) { nmg.addMask(optstring); });
2051 break;
2052 case 'k':
2053 #ifdef HAVE_LIBSODIUM
2054 if (B64Decode(string(optarg), g_consoleKey) < 0) {
2055 cerr<<"Unable to decode key '"<<optarg<<"'."<<endl;
2056 exit(EXIT_FAILURE);
2057 }
2058 #else
2059 cerr<<"dnsdist has been built without libsodium, -k/--setkey is unsupported."<<endl;
2060 exit(EXIT_FAILURE);
2061 #endif
2062 break;
2063 case 'l':
2064 g_cmdLine.locals.push_back(trim_copy(string(optarg)));
2065 break;
2066 case 'u':
2067 g_cmdLine.uid=optarg;
2068 break;
2069 case 'v':
2070 g_verbose=true;
2071 break;
2072 case 'V':
2073 #ifdef LUAJIT_VERSION
2074 cout<<"dnsdist "<<VERSION<<" ("<<LUA_RELEASE<<" ["<<LUAJIT_VERSION<<"])"<<endl;
2075 #else
2076 cout<<"dnsdist "<<VERSION<<" ("<<LUA_RELEASE<<")"<<endl;
2077 #endif
2078 cout<<"Enabled features: ";
2079 #ifdef HAVE_CDB
2080 cout<<"cdb ";
2081 #endif
2082 #ifdef HAVE_DNS_OVER_TLS
2083 cout<<"dns-over-tls(";
2084 #ifdef HAVE_GNUTLS
2085 cout<<"gnutls";
2086 #ifdef HAVE_LIBSSL
2087 cout<<" ";
2088 #endif
2089 #endif
2090 #ifdef HAVE_LIBSSL
2091 cout<<"openssl";
2092 #endif
2093 cout<<") ";
2094 #endif
2095 #ifdef HAVE_DNS_OVER_HTTPS
2096 cout<<"dns-over-https(DOH) ";
2097 #endif
2098 #ifdef HAVE_DNSCRYPT
2099 cout<<"dnscrypt ";
2100 #endif
2101 #ifdef HAVE_EBPF
2102 cout<<"ebpf ";
2103 #endif
2104 #ifdef HAVE_FSTRM
2105 cout<<"fstrm ";
2106 #endif
2107 #ifdef HAVE_LIBCRYPTO
2108 cout<<"ipcipher ";
2109 #endif
2110 #ifdef HAVE_LIBSODIUM
2111 cout<<"libsodium ";
2112 #endif
2113 #ifdef HAVE_LMDB
2114 cout<<"lmdb ";
2115 #endif
2116 #ifdef HAVE_PROTOBUF
2117 cout<<"protobuf ";
2118 #endif
2119 #ifdef HAVE_RE2
2120 cout<<"re2 ";
2121 #endif
2122 #if defined(HAVE_RECVMMSG) && defined(HAVE_SENDMMSG) && defined(MSG_WAITFORONE)
2123 cout<<"recvmmsg/sendmmsg ";
2124 #endif
2125 #ifdef HAVE_NET_SNMP
2126 cout<<"snmp ";
2127 #endif
2128 #ifdef HAVE_SYSTEMD
2129 cout<<"systemd";
2130 #endif
2131 cout<<endl;
2132 exit(EXIT_SUCCESS);
2133 break;
2134 case '?':
2135 //getopt_long printed an error message.
2136 usage();
2137 exit(EXIT_FAILURE);
2138 break;
2139 }
2140 }
2141
2142 argc-=optind;
2143 argv+=optind;
2144 for(auto p = argv; *p; ++p) {
2145 if(g_cmdLine.beClient) {
2146 clientAddress = ComboAddress(*p, 5199);
2147 } else {
2148 g_cmdLine.remotes.push_back(*p);
2149 }
2150 }
2151
2152 ServerPolicy leastOutstandingPol{"leastOutstanding", leastOutstanding, false};
2153
2154 g_policy.setState(leastOutstandingPol);
2155 if(g_cmdLine.beClient || !g_cmdLine.command.empty()) {
2156 setupLua(true, false, g_cmdLine.config);
2157 if (clientAddress != ComboAddress())
2158 g_serverControl = clientAddress;
2159 doClient(g_serverControl, g_cmdLine.command);
2160 _exit(EXIT_SUCCESS);
2161 }
2162
2163 auto acl = g_ACL.getCopy();
2164 if(acl.empty()) {
2165 for(auto& addr : {"127.0.0.0/8", "10.0.0.0/8", "100.64.0.0/10", "169.254.0.0/16", "192.168.0.0/16", "172.16.0.0/12", "::1/128", "fc00::/7", "fe80::/10"})
2166 acl.addMask(addr);
2167 g_ACL.setState(acl);
2168 }
2169
2170 auto consoleACL = g_consoleACL.getCopy();
2171 for (const auto& mask : { "127.0.0.1/8", "::1/128" }) {
2172 consoleACL.addMask(mask);
2173 }
2174 g_consoleACL.setState(consoleACL);
2175
2176 if (g_cmdLine.checkConfig) {
2177 setupLua(false, true, g_cmdLine.config);
2178 // No exception was thrown
2179 infolog("Configuration '%s' OK!", g_cmdLine.config);
2180 _exit(EXIT_SUCCESS);
2181 }
2182
2183 auto todo=setupLua(false, false, g_cmdLine.config);
2184
2185 auto localPools = g_pools.getCopy();
2186 {
2187 bool precompute = false;
2188 if (g_policy.getLocal()->name == "chashed") {
2189 precompute = true;
2190 } else {
2191 for (const auto& entry: localPools) {
2192 if (entry.second->policy != nullptr && entry.second->policy->name == "chashed") {
2193 precompute = true;
2194 break ;
2195 }
2196 }
2197 }
2198 if (precompute) {
2199 vinfolog("Pre-computing hashes for consistent hash load-balancing policy");
2200 // pre compute hashes
2201 auto backends = g_dstates.getLocal();
2202 for (auto& backend: *backends) {
2203 if (backend->weight < 100) {
2204 vinfolog("Warning, the backend '%s' has a very low weight (%d), which will not yield a good distribution of queries with the 'chashed' policy. Please consider raising it to at least '100'.", backend->getName(), backend->weight);
2205 }
2206
2207 backend->hash();
2208 }
2209 }
2210 }
2211
2212 if (!g_cmdLine.locals.empty()) {
2213 for (auto it = g_frontends.begin(); it != g_frontends.end(); ) {
2214 /* DoH, DoT and DNSCrypt frontends are separate */
2215 if ((*it)->dohFrontend == nullptr && (*it)->tlsFrontend == nullptr && (*it)->dnscryptCtx == nullptr) {
2216 it = g_frontends.erase(it);
2217 }
2218 else {
2219 ++it;
2220 }
2221 }
2222
2223 for(const auto& loc : g_cmdLine.locals) {
2224 /* UDP */
2225 g_frontends.push_back(std::unique_ptr<ClientState>(new ClientState(ComboAddress(loc, 53), false, false, 0, "", {})));
2226 /* TCP */
2227 g_frontends.push_back(std::unique_ptr<ClientState>(new ClientState(ComboAddress(loc, 53), true, false, 0, "", {})));
2228 }
2229 }
2230
2231 if (g_frontends.empty()) {
2232 /* UDP */
2233 g_frontends.push_back(std::unique_ptr<ClientState>(new ClientState(ComboAddress("127.0.0.1", 53), false, false, 0, "", {})));
2234 /* TCP */
2235 g_frontends.push_back(std::unique_ptr<ClientState>(new ClientState(ComboAddress("127.0.0.1", 53), true, false, 0, "", {})));
2236 }
2237
2238 g_configurationDone = true;
2239
2240 for(auto& frontend : g_frontends) {
2241 setUpLocalBind(frontend);
2242
2243 if (frontend->tcp == false) {
2244 ++udpBindsCount;
2245 }
2246 else {
2247 ++tcpBindsCount;
2248 }
2249 }
2250
2251 warnlog("dnsdist %s comes with ABSOLUTELY NO WARRANTY. This is free software, and you are welcome to redistribute it according to the terms of the GPL version 2", VERSION);
2252
2253 vector<string> vec;
2254 std::string acls;
2255 g_ACL.getLocal()->toStringVector(&vec);
2256 for(const auto& s : vec) {
2257 if (!acls.empty())
2258 acls += ", ";
2259 acls += s;
2260 }
2261 infolog("ACL allowing queries from: %s", acls.c_str());
2262 vec.clear();
2263 acls.clear();
2264 g_consoleACL.getLocal()->toStringVector(&vec);
2265 for (const auto& entry : vec) {
2266 if (!acls.empty()) {
2267 acls += ", ";
2268 }
2269 acls += entry;
2270 }
2271 infolog("Console ACL allowing connections from: %s", acls.c_str());
2272
2273 #ifdef HAVE_LIBSODIUM
2274 if (g_consoleEnabled && g_consoleKey.empty()) {
2275 warnlog("Warning, the console has been enabled via 'controlSocket()' but no key has been set with 'setKey()' so all connections will fail until a key has been set");
2276 }
2277 #endif
2278
2279 uid_t newgid=getegid();
2280 gid_t newuid=geteuid();
2281
2282 if(!g_cmdLine.gid.empty())
2283 newgid = strToGID(g_cmdLine.gid.c_str());
2284
2285 if(!g_cmdLine.uid.empty())
2286 newuid = strToUID(g_cmdLine.uid.c_str());
2287
2288 if (getegid() != newgid) {
2289 if (running_in_service_mgr()) {
2290 errlog("--gid/-g set on command-line, but dnsdist was started as a systemd service. Use the 'Group' setting in the systemd unit file to set the group to run as");
2291 _exit(EXIT_FAILURE);
2292 }
2293 dropGroupPrivs(newgid);
2294 }
2295
2296 if (geteuid() != newuid) {
2297 if (running_in_service_mgr()) {
2298 errlog("--uid/-u set on command-line, but dnsdist was started as a systemd service. Use the 'User' setting in the systemd unit file to set the user to run as");
2299 _exit(EXIT_FAILURE);
2300 }
2301 dropUserPrivs(newuid);
2302 }
2303
2304 try {
2305 /* we might still have capabilities remaining,
2306 for example if we have been started as root
2307 without --uid or --gid (please don't do that)
2308 or as an unprivileged user with ambient
2309 capabilities like CAP_NET_BIND_SERVICE.
2310 */
2311 dropCapabilities(g_capabilitiesToRetain);
2312 }
2313 catch(const std::exception& e) {
2314 warnlog("%s", e.what());
2315 }
2316
2317 /* this need to be done _after_ dropping privileges */
2318 g_delay = new DelayPipe<DelayedPacket>();
2319
2320 if (g_snmpAgent) {
2321 g_snmpAgent->run();
2322 }
2323
2324 g_tcpclientthreads = std::unique_ptr<TCPClientCollection>(new TCPClientCollection(g_maxTCPClientThreads, g_useTCPSinglePipe));
2325
2326 for(auto& t : todo)
2327 t();
2328
2329 localPools = g_pools.getCopy();
2330 /* create the default pool no matter what */
2331 createPoolIfNotExists(localPools, "");
2332 if(g_cmdLine.remotes.size()) {
2333 for(const auto& address : g_cmdLine.remotes) {
2334 auto ret=std::make_shared<DownstreamState>(ComboAddress(address, 53));
2335 addServerToPool(localPools, "", ret);
2336 if (ret->connected && !ret->threadStarted.test_and_set()) {
2337 ret->tid = thread(responderThread, ret);
2338 }
2339 g_dstates.modify([ret](servers_t& servers) { servers.push_back(ret); });
2340 }
2341 }
2342 g_pools.setState(localPools);
2343
2344 if(g_dstates.getLocal()->empty()) {
2345 errlog("No downstream servers defined: all packets will get dropped");
2346 // you might define them later, but you need to know
2347 }
2348
2349 checkFileDescriptorsLimits(udpBindsCount, tcpBindsCount);
2350
2351 auto mplexer = std::shared_ptr<FDMultiplexer>(FDMultiplexer::getMultiplexerSilent());
2352 for(auto& dss : g_dstates.getCopy()) { // it is a copy, but the internal shared_ptrs are the real deal
2353 if (dss->availability == DownstreamState::Availability::Auto) {
2354 if (!queueHealthCheck(mplexer, dss, true)) {
2355 dss->upStatus = false;
2356 warnlog("Marking downstream %s as 'down'", dss->getNameWithAddr());
2357 }
2358 }
2359 }
2360 handleQueuedHealthChecks(mplexer, true);
2361
2362 for(auto& cs : g_frontends) {
2363 if (cs->dohFrontend != nullptr) {
2364 #ifdef HAVE_DNS_OVER_HTTPS
2365 std::thread t1(dohThread, cs.get());
2366 if (!cs->cpus.empty()) {
2367 mapThreadToCPUList(t1.native_handle(), cs->cpus);
2368 }
2369 t1.detach();
2370 #endif /* HAVE_DNS_OVER_HTTPS */
2371 continue;
2372 }
2373 if (cs->udpFD >= 0) {
2374 thread t1(udpClientThread, cs.get());
2375 if (!cs->cpus.empty()) {
2376 mapThreadToCPUList(t1.native_handle(), cs->cpus);
2377 }
2378 t1.detach();
2379 }
2380 else if (cs->tcpFD >= 0) {
2381 thread t1(tcpAcceptorThread, cs.get());
2382 if (!cs->cpus.empty()) {
2383 mapThreadToCPUList(t1.native_handle(), cs->cpus);
2384 }
2385 t1.detach();
2386 }
2387 }
2388
2389 thread carbonthread(carbonDumpThread);
2390 carbonthread.detach();
2391
2392 thread stattid(maintThread);
2393 stattid.detach();
2394
2395 thread healththread(healthChecksThread);
2396
2397 if (!g_secPollSuffix.empty()) {
2398 thread secpollthread(secPollThread);
2399 secpollthread.detach();
2400 }
2401
2402 if(g_cmdLine.beSupervised) {
2403 #ifdef HAVE_SYSTEMD
2404 sd_notify(0, "READY=1");
2405 #endif
2406 healththread.join();
2407 }
2408 else {
2409 healththread.detach();
2410 doConsole();
2411 }
2412 _exit(EXIT_SUCCESS);
2413
2414 }
2415 catch(const LuaContext::ExecutionErrorException& e) {
2416 try {
2417 errlog("Fatal Lua error: %s", e.what());
2418 std::rethrow_if_nested(e);
2419 } catch(const std::exception& ne) {
2420 errlog("Details: %s", ne.what());
2421 }
2422 catch(PDNSException &ae)
2423 {
2424 errlog("Fatal pdns error: %s", ae.reason);
2425 }
2426 _exit(EXIT_FAILURE);
2427 }
2428 catch(std::exception &e)
2429 {
2430 errlog("Fatal error: %s", e.what());
2431 _exit(EXIT_FAILURE);
2432 }
2433 catch(PDNSException &ae)
2434 {
2435 errlog("Fatal pdns error: %s", ae.reason);
2436 _exit(EXIT_FAILURE);
2437 }
2438
2439 uint64_t getLatencyCount(const std::string&)
2440 {
2441 return g_stats.responses + g_stats.selfAnswered + g_stats.cacheHits;
2442 }