]> git.ipfire.org Git - thirdparty/pdns.git/blame - pdns/iputils.cc
Ignore Path MTU Discovery on UDP server socket
[thirdparty/pdns.git] / pdns / iputils.cc
CommitLineData
12471842
PL
1/*
2 * This file is part of PowerDNS or dnsdist.
3 * Copyright -- PowerDNS.COM B.V. and its contributors
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of version 2 of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * In addition, for the avoidance of any doubt, permission is granted to
10 * link this program with OpenSSL and to (re)distribute the binaries
11 * produced as the result of such linking.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
21 */
870a0fe4
AT
22#ifdef HAVE_CONFIG_H
23#include "config.h"
24#endif
002c970a 25#include "iputils.hh"
22cf1fda 26#include <sys/socket.h>
27
002c970a 28/** these functions provide a very lightweight wrapper to the Berkeley sockets API. Errors -> exceptions! */
29
30static void RuntimeError(const boost::format& fmt)
31{
32 throw runtime_error(fmt.str());
33}
34
73ba5999
CHB
35static void NetworkErr(const boost::format& fmt)
36{
37 throw NetworkError(fmt.str());
38}
002c970a 39
40int SSocket(int family, int type, int flags)
41{
42 int ret = socket(family, type, flags);
43 if(ret < 0)
44 RuntimeError(boost::format("creating socket of type %d: %s") % family % strerror(errno));
45 return ret;
46}
47
48int SConnect(int sockfd, const ComboAddress& remote)
49{
bdf8277d 50 int ret = connect(sockfd, reinterpret_cast<const struct sockaddr*>(&remote), remote.getSocklen());
ccb4b5e2
PD
51 if(ret < 0) {
52 int savederrno = errno;
53 RuntimeError(boost::format("connecting socket to %s: %s") % remote.toStringWithPort() % strerror(savederrno));
54 }
002c970a 55 return ret;
56}
57
51959320
RG
58int SConnectWithTimeout(int sockfd, const ComboAddress& remote, int timeout)
59{
bdf8277d 60 int ret = connect(sockfd, reinterpret_cast<const struct sockaddr*>(&remote), remote.getSocklen());
51959320
RG
61 if(ret < 0) {
62 int savederrno = errno;
63 if (savederrno == EINPROGRESS) {
399fd947 64 if (timeout <= 0) {
bdf8277d 65 return savederrno;
399fd947
RG
66 }
67
51959320
RG
68 /* we wait until the connection has been established */
69 bool error = false;
70 bool disconnected = false;
71 int res = waitForRWData(sockfd, false, timeout, 0, &error, &disconnected);
72 if (res == 1) {
73 if (error) {
74 savederrno = 0;
75 socklen_t errlen = sizeof(savederrno);
76 if (getsockopt(sockfd, SOL_SOCKET, SO_ERROR, (void *)&savederrno, &errlen) == 0) {
73ba5999 77 NetworkErr(boost::format("connecting to %s failed: %s") % remote.toStringWithPort() % string(strerror(savederrno)));
51959320
RG
78 }
79 else {
73ba5999 80 NetworkErr(boost::format("connecting to %s failed") % remote.toStringWithPort());
51959320
RG
81 }
82 }
83 if (disconnected) {
73ba5999 84 NetworkErr(boost::format("%s closed the connection") % remote.toStringWithPort());
51959320
RG
85 }
86 return 0;
87 }
88 else if (res == 0) {
73ba5999 89 NetworkErr(boost::format("timeout while connecting to %s") % remote.toStringWithPort());
51959320
RG
90 } else if (res < 0) {
91 savederrno = errno;
73ba5999 92 NetworkErr(boost::format("waiting to connect to %s: %s") % remote.toStringWithPort() % string(strerror(savederrno)));
51959320
RG
93 }
94 }
95 else {
73ba5999 96 NetworkErr(boost::format("connecting to %s: %s") % remote.toStringWithPort() % string(strerror(savederrno)));
51959320
RG
97 }
98 }
99
bdf8277d 100 return 0;
51959320
RG
101}
102
002c970a 103int SBind(int sockfd, const ComboAddress& local)
104{
105 int ret = bind(sockfd, (struct sockaddr*)&local, local.getSocklen());
ccb4b5e2
PD
106 if(ret < 0) {
107 int savederrno = errno;
108 RuntimeError(boost::format("binding socket to %s: %s") % local.toStringWithPort() % strerror(savederrno));
109 }
002c970a 110 return ret;
111}
112
113int SAccept(int sockfd, ComboAddress& remote)
114{
115 socklen_t remlen = remote.getSocklen();
116
117 int ret = accept(sockfd, (struct sockaddr*)&remote, &remlen);
118 if(ret < 0)
119 RuntimeError(boost::format("accepting new connection on socket: %s") % strerror(errno));
120 return ret;
121}
122
123int SListen(int sockfd, int limit)
124{
125 int ret = listen(sockfd, limit);
126 if(ret < 0)
127 RuntimeError(boost::format("setting socket to listen: %s") % strerror(errno));
128 return ret;
129}
130
131int SSetsockopt(int sockfd, int level, int opname, int value)
132{
133 int ret = setsockopt(sockfd, level, opname, &value, sizeof(value));
134 if(ret < 0)
135 RuntimeError(boost::format("setsockopt for level %d and opname %d to %d failed: %s") % level % opname % value % strerror(errno));
136 return ret;
137}
138
90f9fbc0
RG
139void setSocketIgnorePMTU(int sockfd)
140{
141#ifdef IP_PMTUDISC_OMIT
142 /* Linux 3.15+ has IP_PMTUDISC_OMIT, which discards PMTU information to prevent
143 poisoning, but still allows fragmentation if the packet size exceeds the
144 outgoing interface MTU, which is good.
145 */
146 try {
147 SSetsockopt(sockfd, IPPROTO_IP, IP_MTU_DISCOVER, IP_PMTUDISC_OMIT);
148 return;
149 }
150 catch(const std::exception& e) {
151 /* failed, let's try IP_PMTUDISC_DONT instead */
152 }
153#endif /* IP_PMTUDISC_OMIT */
154
155 /* IP_PMTUDISC_DONT disables Path MTU discovery */
156 SSetsockopt(sockfd, IPPROTO_IP, IP_MTU_DISCOVER, IP_PMTUDISC_DONT);
157}
002c970a 158
/* Look through the control messages of 'msgh' for a SOL_SOCKET timestamp
   (SO_TIMESTAMP or SCM_TIMESTAMP) whose length matches a struct timeval.
   Copies the first match into '*tv' and returns true; returns false when
   nothing matches or when SO_TIMESTAMP is not available at build time. */
bool HarvestTimestamp(struct msghdr* msgh, struct timeval* tv)
{
#ifdef SO_TIMESTAMP
  struct cmsghdr* hdr = CMSG_FIRSTHDR(msgh);
  while (hdr != nullptr) {
    const bool isTimestamp = (hdr->cmsg_level == SOL_SOCKET) &&
      (hdr->cmsg_type == SO_TIMESTAMP || hdr->cmsg_type == SCM_TIMESTAMP);
    /* the length check guards the memcpy() below against a short payload */
    if (isTimestamp && hdr->cmsg_len == CMSG_LEN(sizeof(*tv))) {
      memcpy(tv, CMSG_DATA(hdr), sizeof(*tv));
      return true;
    }
    hdr = CMSG_NXTHDR(msgh, hdr);
  }
#endif
  return false;
}
2b3eefc3 173bool HarvestDestinationAddress(const struct msghdr* msgh, ComboAddress* destination)
3e3f0358 174{
d38e2ba9 175 destination->reset();
4d39d7f3
TIH
176#ifdef __NetBSD__
177 struct cmsghdr* cmsg;
178#else
2b3eefc3 179 const struct cmsghdr* cmsg;
4d39d7f3 180#endif
2b3eefc3 181 for (cmsg = CMSG_FIRSTHDR(msgh); cmsg != NULL; cmsg = CMSG_NXTHDR(const_cast<struct msghdr*>(msgh), const_cast<struct cmsghdr*>(cmsg))) {
3e3f0358 182#if defined(IP_PKTINFO)
183 if ((cmsg->cmsg_level == IPPROTO_IP) && (cmsg->cmsg_type == IP_PKTINFO)) {
184 struct in_pktinfo *i = (struct in_pktinfo *) CMSG_DATA(cmsg);
185 destination->sin4.sin_addr = i->ipi_addr;
186 destination->sin4.sin_family = AF_INET;
187 return true;
188 }
189#elif defined(IP_RECVDSTADDR)
190 if ((cmsg->cmsg_level == IPPROTO_IP) && (cmsg->cmsg_type == IP_RECVDSTADDR)) {
191 struct in_addr *i = (struct in_addr *) CMSG_DATA(cmsg);
192 destination->sin4.sin_addr = *i;
193 destination->sin4.sin_family = AF_INET;
194 return true;
195 }
196#endif
197
198 if ((cmsg->cmsg_level == IPPROTO_IPV6) && (cmsg->cmsg_type == IPV6_PKTINFO)) {
199 struct in6_pktinfo *i = (struct in6_pktinfo *) CMSG_DATA(cmsg);
200 destination->sin6.sin6_addr = i->ipi6_addr;
201 destination->sin4.sin_family = AF_INET6;
202 return true;
203 }
204 }
205 return false;
206}
207
208bool IsAnyAddress(const ComboAddress& addr)
209{
210 if(addr.sin4.sin_family == AF_INET)
211 return addr.sin4.sin_addr.s_addr == 0;
212 else if(addr.sin4.sin_family == AF_INET6)
213 return !memcmp(&addr.sin6.sin6_addr, &in6addr_any, sizeof(addr.sin6.sin6_addr));
214
215 return false;
216}
217
a683e8bd 218ssize_t sendfromto(int sock, const char* data, size_t len, int flags, const ComboAddress& from, const ComboAddress& to)
3e3f0358 219{
220 struct msghdr msgh;
221 struct iovec iov;
222 char cbuf[256];
223
224 /* Set up iov and msgh structures. */
225 memset(&msgh, 0, sizeof(struct msghdr));
226 iov.iov_base = (void*)data;
227 iov.iov_len = len;
228 msgh.msg_iov = &iov;
229 msgh.msg_iovlen = 1;
230 msgh.msg_name = (struct sockaddr*)&to;
231 msgh.msg_namelen = to.getSocklen();
232
233 if(from.sin4.sin_family) {
fbe2a2e0 234 addCMsgSrcAddr(&msgh, cbuf, &from, 0);
3e3f0358 235 }
579cae19
PD
236 else {
237 msgh.msg_control=NULL;
238 }
3e3f0358 239 return sendmsg(sock, &msgh, flags);
240}
b71b60ee 241
242// be careful: when using this for receive purposes, make sure addr->sin4.sin_family is set appropriately so getSocklen works!
243// be careful: when using this function for *send* purposes, be sure to set cbufsize to 0!
579cae19 244// be careful: if you don't call addCMsgSrcAddr after fillMSGHdr, make sure to set msg_control to NULL
b71b60ee 245void fillMSGHdr(struct msghdr* msgh, struct iovec* iov, char* cbuf, size_t cbufsize, char* data, size_t datalen, ComboAddress* addr)
246{
247 iov->iov_base = data;
248 iov->iov_len = datalen;
249
250 memset(msgh, 0, sizeof(struct msghdr));
251
252 msgh->msg_control = cbuf;
253 msgh->msg_controllen = cbufsize;
254 msgh->msg_name = addr;
255 msgh->msg_namelen = addr->getSocklen();
256 msgh->msg_iov = iov;
257 msgh->msg_iovlen = 1;
258 msgh->msg_flags = 0;
259}
22779196 260
5b6099b2 261// warning: various parts of PowerDNS assume 'truncate' will never throw
262void ComboAddress::truncate(unsigned int bits) noexcept
22779196 263{
264 uint8_t* start;
265 int len=4;
266 if(sin4.sin_family==AF_INET) {
ecd43f08 267 if(bits >= 32)
22779196 268 return;
269 start = (uint8_t*)&sin4.sin_addr.s_addr;
270 len=4;
271 }
272 else {
ecd43f08 273 if(bits >= 128)
22779196 274 return;
275 start = (uint8_t*)&sin6.sin6_addr.s6_addr;
276 len=16;
277 }
278
22779196 279 auto tozero= len*8 - bits; // if set to 22, this will clear 1 byte, as it should
280
281 memset(start + len - tozero/8, 0, tozero/8); // blot out the whole bytes on the right
282
283 auto bitsleft=tozero % 8; // 2 bits left to clear
284
285 // a b c d, to truncate to 22 bits, we just zeroed 'd' and need to zero 2 bits from c
286 // so and by '11111100', which is ~((1<<2)-1) = ~3
287 uint8_t* place = start + len - 1 - tozero/8;
22779196 288 *place &= (~((1<<bitsleft)-1));
289}
f9f9592e 290
d0ae6360 291size_t sendMsgWithTimeout(int fd, const char* buffer, size_t len, int idleTimeout, const ComboAddress* dest, const ComboAddress* local, unsigned int localItf, int totalTimeout, int flags)
fbe2a2e0 292{
d0ae6360
RG
293 int remainingTime = totalTimeout;
294 time_t start = 0;
295 if (totalTimeout) {
296 start = time(nullptr);
297 }
298
fbe2a2e0
RG
299 struct msghdr msgh;
300 struct iovec iov;
301 char cbuf[256];
d0ae6360
RG
302
303 /* Set up iov and msgh structures. */
304 memset(&msgh, 0, sizeof(struct msghdr));
305 msgh.msg_control = nullptr;
306 msgh.msg_controllen = 0;
307 if (dest) {
308 msgh.msg_name = reinterpret_cast<void*>(const_cast<ComboAddress*>(dest));
309 msgh.msg_namelen = dest->getSocklen();
310 }
311 else {
312 msgh.msg_name = nullptr;
313 msgh.msg_namelen = 0;
314 }
315
316 msgh.msg_flags = 0;
317
318 if (localItf != 0 && local) {
319 addCMsgSrcAddr(&msgh, cbuf, local, localItf);
320 }
321
d0ae6360
RG
322 iov.iov_base = reinterpret_cast<void*>(const_cast<char*>(buffer));
323 iov.iov_len = len;
324 msgh.msg_iov = &iov;
325 msgh.msg_iovlen = 1;
326 msgh.msg_flags = 0;
327
328 size_t sent = 0;
fbe2a2e0 329 bool firstTry = true;
fbe2a2e0
RG
330
331 do {
fbe2a2e0 332
d0ae6360
RG
333#ifdef MSG_FASTOPEN
334 if (flags & MSG_FASTOPEN && firstTry == false) {
335 flags &= ~MSG_FASTOPEN;
336 }
337#endif /* MSG_FASTOPEN */
338
339 ssize_t res = sendmsg(fd, &msgh, flags);
fbe2a2e0 340
d0ae6360
RG
341 if (res > 0) {
342 size_t written = static_cast<size_t>(res);
343 sent += written;
344
345 if (sent == len) {
346 return sent;
347 }
348
349 /* partial write */
350 iov.iov_len -= written;
351 iov.iov_base = reinterpret_cast<void*>(reinterpret_cast<char*>(iov.iov_base) + written);
352 written = 0;
353 }
354 else if (res == -1) {
355 if (errno == EINTR) {
356 continue;
357 }
2bd6a088 358 else if (errno == EAGAIN || errno == EWOULDBLOCK || errno == EINPROGRESS || errno == ENOTCONN) {
d0ae6360
RG
359 /* EINPROGRESS might happen with non blocking socket,
360 especially with TCP Fast Open */
361 if (totalTimeout <= 0 && idleTimeout <= 0) {
362 return sent;
fbe2a2e0 363 }
d0ae6360
RG
364
365 if (firstTry) {
366 int res = waitForRWData(fd, false, (totalTimeout == 0 || idleTimeout <= remainingTime) ? idleTimeout : remainingTime, 0);
367 if (res > 0) {
368 /* there is room available */
369 firstTry = false;
370 }
371 else if (res == 0) {
372 throw runtime_error("Timeout while waiting to write data");
373 } else {
374 throw runtime_error("Error while waiting for room to write data");
375 }
376 }
377 else {
fbe2a2e0 378 throw runtime_error("Timeout while waiting to write data");
fbe2a2e0
RG
379 }
380 }
381 else {
d0ae6360 382 unixDie("failed in sendMsgWithTimeout");
fbe2a2e0
RG
383 }
384 }
d0ae6360
RG
385 if (totalTimeout) {
386 time_t now = time(nullptr);
387 int elapsed = now - start;
388 if (elapsed >= remainingTime) {
389 throw runtime_error("Timeout while sending data");
390 }
391 start = now;
392 remainingTime -= elapsed;
fbe2a2e0
RG
393 }
394 }
395 while (firstTry);
396
397 return 0;
398}
399
f9f9592e 400template class NetmaskTree<bool>;
fbe2a2e0 401
17bca36a
RG
402bool sendSizeAndMsgWithTimeout(int sock, uint16_t bufferLen, const char* buffer, int idleTimeout, const ComboAddress* dest, const ComboAddress* local, unsigned int localItf, int totalTimeout, int flags)
403{
404 uint16_t size = htons(bufferLen);
405 char cbuf[256];
406 struct msghdr msgh;
407 struct iovec iov[2];
408 int remainingTime = totalTimeout;
409 time_t start = 0;
410 if (totalTimeout) {
411 start = time(NULL);
412 }
413
414 /* Set up iov and msgh structures. */
415 memset(&msgh, 0, sizeof(struct msghdr));
416 msgh.msg_control = nullptr;
417 msgh.msg_controllen = 0;
418 if (dest) {
419 msgh.msg_name = reinterpret_cast<void*>(const_cast<ComboAddress*>(dest));
420 msgh.msg_namelen = dest->getSocklen();
421 }
422 else {
423 msgh.msg_name = nullptr;
424 msgh.msg_namelen = 0;
425 }
426
427 msgh.msg_flags = 0;
428
429 if (localItf != 0 && local) {
430 addCMsgSrcAddr(&msgh, cbuf, local, localItf);
431 }
432
433 iov[0].iov_base = &size;
434 iov[0].iov_len = sizeof(size);
435 iov[1].iov_base = reinterpret_cast<void*>(const_cast<char*>(buffer));
436 iov[1].iov_len = bufferLen;
437
438 size_t pos = 0;
439 size_t sent = 0;
440 size_t nbElements = sizeof(iov)/sizeof(*iov);
441 while (true) {
442 msgh.msg_iov = &iov[pos];
443 msgh.msg_iovlen = nbElements - pos;
444
445 ssize_t res = sendmsg(sock, &msgh, flags);
446 if (res > 0) {
447 size_t written = static_cast<size_t>(res);
448 sent += written;
449
450 if (sent == (sizeof(size) + bufferLen)) {
451 return true;
452 }
453 /* partial write, we need to keep only the (parts of) elements
454 that have not been written.
455 */
456 do {
457 if (written < iov[pos].iov_len) {
458 iov[pos].iov_len -= written;
d9442969 459 iov[pos].iov_base = reinterpret_cast<void*>(reinterpret_cast<char*>(iov[pos].iov_base) + written);
17bca36a
RG
460 written = 0;
461 }
462 else {
463 written -= iov[pos].iov_len;
464 iov[pos].iov_len = 0;
465 pos++;
466 }
467 }
468 while (written > 0 && pos < nbElements);
469 }
470 else if (res == -1) {
471 if (errno == EINTR) {
472 continue;
473 }
474 else if (errno == EAGAIN || errno == EWOULDBLOCK || errno == EINPROGRESS) {
475 /* EINPROGRESS might happen with non blocking socket,
476 especially with TCP Fast Open */
477 int ret = waitForRWData(sock, false, (totalTimeout == 0 || idleTimeout <= remainingTime) ? idleTimeout : remainingTime, 0);
478 if (ret > 0) {
479 /* there is room available */
480 }
481 else if (ret == 0) {
482 throw runtime_error("Timeout while waiting to send data");
483 } else {
484 throw runtime_error("Error while waiting for room to send data");
485 }
486 }
487 else {
488 unixDie("failed in sendSizeAndMsgWithTimeout");
489 }
490 }
491 if (totalTimeout) {
492 time_t now = time(NULL);
493 int elapsed = now - start;
494 if (elapsed >= remainingTime) {
495 throw runtime_error("Timeout while sending data");
496 }
497 start = now;
498 remainingTime -= elapsed;
499 }
500 }
501
502 return false;
503}
840ed663
RG
504
/* requires a non-blocking socket.
   On Linux, we could use MSG_DONTWAIT on a blocking socket
   but this is not portable.

   Peeks at one byte without consuming it:
   - data is waiting, or there is none yet (EAGAIN / EWOULDBLOCK): usable;
   - the other end has closed the socket, or any other error: not usable.
   The probe is simply repeated when it gets interrupted (EINTR).
*/
bool isTCPSocketUsable(int sock)
{
  for (;;) {
    char probe = '\0';
    const ssize_t peeked = recv(sock, &probe, sizeof(probe), MSG_PEEK);

    if (peeked > 0) {
      /* socket is usable, some data is even waiting to be read */
      return true;
    }
    if (peeked == 0) {
      /* other end has closed the socket */
      return false;
    }

    const int err = errno;
    if (err == EINTR) {
      /* interrupted before anything could be learned, ask again */
      continue;
    }

    /* no data waiting means the socket is usable; anything else
       (could be ECONNRESET, ENOTCONN, EPIPE, ...) means it is not */
    return err == EAGAIN || err == EWOULDBLOCK;
  }
}