/*
 * BIRD source tree: sysdep/unix/io.c
 * (captured from the upstream git web viewer; branch merge 'int-new-rpki-squashed' into int-new)
 */
1 /*
2 * BIRD Internet Routing Daemon -- Unix I/O
3 *
4 * (c) 1998--2004 Martin Mares <mj@ucw.cz>
5 * (c) 2004 Ondrej Filip <feela@network.cz>
6 *
7 * Can be freely distributed and used under the terms of the GNU GPL.
8 */
9
10 /* Unfortunately, some glibc versions hide parts of RFC 3542 API
11 if _GNU_SOURCE is not defined. */
12 #ifndef _GNU_SOURCE
13 #define _GNU_SOURCE
14 #endif
15
16 #include <stdio.h>
17 #include <stdlib.h>
18 #include <time.h>
19 #include <sys/time.h>
20 #include <sys/types.h>
21 #include <sys/socket.h>
22 #include <sys/uio.h>
23 #include <sys/un.h>
24 #include <poll.h>
25 #include <unistd.h>
26 #include <fcntl.h>
27 #include <errno.h>
28 #include <net/if.h>
29 #include <netinet/in.h>
30 #include <netinet/tcp.h>
31 #include <netinet/udp.h>
32 #include <netinet/icmp6.h>
33
34 #include "nest/bird.h"
35 #include "lib/lists.h"
36 #include "lib/resource.h"
37 #include "sysdep/unix/timer.h"
38 #include "lib/socket.h"
39 #include "lib/event.h"
40 #include "lib/string.h"
41 #include "nest/iface.h"
42
43 #include "sysdep/unix/unix.h"
44 #include CONFIG_INCLUDE_SYSIO_H
45
46 /* Maximum number of calls of tx handler for one socket in one
47 * poll iteration. Should be small enough to not monopolize CPU by
48 * one protocol instance.
49 */
50 #define MAX_STEPS 4
51
52 /* Maximum number of calls of rx handler for all sockets in one poll
53 iteration. RX callbacks are often much more costly so we limit
54 this to gen small latencies */
55 #define MAX_RX_STEPS 4
56
57 /*
58 * Tracked Files
59 */
60
/* A tracked file: a resource wrapper around a stdio stream so the file is
 * closed automatically when its owning pool is freed (see rf_free()). */
struct rfile {
  resource r;   /* common resource header; must stay first — code casts resource* to rfile* */
  FILE *f;      /* the wrapped stdio stream */
};
65
66 static void
67 rf_free(resource *r)
68 {
69 struct rfile *a = (struct rfile *) r;
70
71 fclose(a->f);
72 }
73
74 static void
75 rf_dump(resource *r)
76 {
77 struct rfile *a = (struct rfile *) r;
78
79 debug("(FILE *%p)\n", a->f);
80 }
81
/* Resource class descriptor for tracked files. Slot order follows
 * struct resclass: name, instance size, free hook, dump hook, then two
 * optional hooks left NULL (presumably lookup/memsize — confirm against
 * lib/resource.h). */
static struct resclass rf_class = {
  "FILE",
  sizeof(struct rfile),
  rf_free,
  rf_dump,
  NULL,
  NULL
};
90
91 void *
92 tracked_fopen(pool *p, char *name, char *mode)
93 {
94 FILE *f = fopen(name, mode);
95
96 if (f)
97 {
98 struct rfile *r = ralloc(p, &rf_class);
99 r->f = f;
100 }
101 return f;
102 }
103
104 /**
105 * DOC: Timers
106 *
107 * Timers are resources which represent a wish of a module to call
108 * a function at the specified time. The platform dependent code
109 * doesn't guarantee exact timing, only that a timer function
110 * won't be called before the requested time.
111 *
112 * In BIRD, time is represented by values of the &bird_clock_t type
113 * which are integral numbers interpreted as a relative number of seconds since
114 * some fixed time point in past. The current time can be read
115 * from variable @now with reasonable accuracy and is monotonic. There is also
116 * a current 'absolute' time in variable @now_real reported by OS.
117 *
118 * Each timer is described by a &timer structure containing a pointer
119 * to the handler function (@hook), data private to this function (@data),
120 * time the function should be called at (@expires, 0 for inactive timers),
121 * for the other fields see |timer.h|.
122 */
123
124 #define NEAR_TIMER_LIMIT 4
125
126 static list near_timers, far_timers;
127 static bird_clock_t first_far_timer = TIME_INFINITY;
128
129 /* now must be different from 0, because 0 is a special value in timer->expires */
130 bird_clock_t now = 1, now_real, boot_time;
131
132 static void
133 update_times_plain(void)
134 {
135 bird_clock_t new_time = time(NULL);
136 int delta = new_time - now_real;
137
138 if ((delta >= 0) && (delta < 60))
139 now += delta;
140 else if (now_real != 0)
141 log(L_WARN "Time jump, delta %d s", delta);
142
143 now_real = new_time;
144 }
145
146 static void
147 update_times_gettime(void)
148 {
149 struct timespec ts;
150 int rv;
151
152 rv = clock_gettime(CLOCK_MONOTONIC, &ts);
153 if (rv != 0)
154 die("clock_gettime: %m");
155
156 if (ts.tv_sec != now) {
157 if (ts.tv_sec < now)
158 log(L_ERR "Monotonic timer is broken");
159
160 now = ts.tv_sec;
161 now_real = time(NULL);
162 }
163 }
164
165 static int clock_monotonic_available;
166
167 static inline void
168 update_times(void)
169 {
170 if (clock_monotonic_available)
171 update_times_gettime();
172 else
173 update_times_plain();
174 }
175
176 static inline void
177 init_times(void)
178 {
179 struct timespec ts;
180 clock_monotonic_available = (clock_gettime(CLOCK_MONOTONIC, &ts) == 0);
181 if (!clock_monotonic_available)
182 log(L_WARN "Monotonic timer is missing");
183 }
184
185
186 static void
187 tm_free(resource *r)
188 {
189 timer *t = (timer *) r;
190
191 tm_stop(t);
192 }
193
194 static void
195 tm_dump(resource *r)
196 {
197 timer *t = (timer *) r;
198
199 debug("(code %p, data %p, ", t->hook, t->data);
200 if (t->randomize)
201 debug("rand %d, ", t->randomize);
202 if (t->recurrent)
203 debug("recur %d, ", t->recurrent);
204 if (t->expires)
205 debug("expires in %d sec)\n", t->expires - now);
206 else
207 debug("inactive)\n");
208 }
209
/* Resource class descriptor for timers. Slot order follows struct resclass:
 * name, instance size, free hook, dump hook, then two optional hooks left
 * NULL (presumably lookup/memsize — confirm against lib/resource.h). */
static struct resclass tm_class = {
  "Timer",
  sizeof(timer),
  tm_free,
  tm_dump,
  NULL,
  NULL
};
218
219 /**
220 * tm_new - create a timer
221 * @p: pool
222 *
223 * This function creates a new timer resource and returns
224 * a pointer to it. To use the timer, you need to fill in
225 * the structure fields and call tm_start() to start timing.
226 */
227 timer *
228 tm_new(pool *p)
229 {
230 timer *t = ralloc(p, &tm_class);
231 return t;
232 }
233
/*
 * Insert timer @t into the near_timers list, which is kept sorted by
 * ascending expiry time. Walk forward past all timers expiring earlier,
 * then insert before the first later one (or at the tail).
 */
static inline void
tm_insert_near(timer *t)
{
  node *n = HEAD(near_timers);

  /* n->next == NULL marks the tail sentinel of the embedded list */
  while (n->next && (SKIP_BACK(timer, n, n)->expires < t->expires))
    n = n->next;
  insert_node(&t->n, n->prev);   /* link in just before node n */
}
243
244 /**
245 * tm_start - start a timer
246 * @t: timer
247 * @after: number of seconds the timer should be run after
248 *
249 * This function schedules the hook function of the timer to
250 * be called after @after seconds. If the timer has been already
251 * started, it's @expire time is replaced by the new value.
252 *
253 * You can have set the @randomize field of @t, the timeout
254 * will be increased by a random number of seconds chosen
255 * uniformly from range 0 .. @randomize.
256 *
257 * You can call tm_start() from the handler function of the timer
258 * to request another run of the timer. Also, you can set the @recurrent
259 * field to have the timer re-added automatically with the same timeout.
260 */
void
tm_start(timer *t, unsigned after)
{
  bird_clock_t when;

  /* Optional jitter, uniform in 0 .. t->randomize seconds */
  if (t->randomize)
    after += random() % (t->randomize + 1);
  when = now + after;
  if (t->expires == when)
    return;			/* already scheduled for exactly this second */
  if (t->expires)
    rem_node(&t->n);		/* unlink from whichever timer list holds it */
  t->expires = when;
  if (after <= NEAR_TIMER_LIMIT)
    tm_insert_near(t);		/* short timeouts go to the sorted near list */
  else
    {
      /* Far timers are kept unsorted; only the earliest expiry is tracked.
	 first_far_timer starts at TIME_INFINITY and is never set to 0,
	 so the !first_far_timer test is merely defensive. */
      if (!first_far_timer || first_far_timer > when)
	first_far_timer = when;
      add_tail(&far_timers, &t->n);
    }
}
283
284 /**
285 * tm_stop - stop a timer
286 * @t: timer
287 *
288 * This function stops a timer. If the timer is already stopped,
289 * nothing happens.
290 */
291 void
292 tm_stop(timer *t)
293 {
294 if (t->expires)
295 {
296 rem_node(&t->n);
297 t->expires = 0;
298 }
299 }
300
301 static void
302 tm_dump_them(char *name, list *l)
303 {
304 node *n;
305 timer *t;
306
307 debug("%s timers:\n", name);
308 WALK_LIST(n, *l)
309 {
310 t = SKIP_BACK(timer, n, n);
311 debug("%p ", t);
312 tm_dump(&t->r);
313 }
314 debug("\n");
315 }
316
/* Debug dump of both timer queues (near = sorted, far = unsorted). */
void
tm_dump_all(void)
{
  tm_dump_them("Near", &near_timers);
  tm_dump_them("Far", &far_timers);
}
323
324 static inline time_t
325 tm_first_shot(void)
326 {
327 time_t x = first_far_timer;
328
329 if (!EMPTY_LIST(near_timers))
330 {
331 timer *t = SKIP_BACK(timer, n, HEAD(near_timers));
332 if (t->expires < x)
333 x = t->expires;
334 }
335 return x;
336 }
337
338 void io_log_event(void *hook, void *data);
339
/*
 * Fire all expired timers. If the earliest far timer may now be near,
 * first migrate far timers expiring within NEAR_TIMER_LIMIT into the
 * sorted near list and recompute first_far_timer. Then pop expired
 * timers off the near list, re-arm recurrent ones and run their hooks.
 */
static void
tm_shot(void)
{
  timer *t;
  node *n, *m;

  if (first_far_timer <= now)
    {
      bird_clock_t limit = now + NEAR_TIMER_LIMIT;
      first_far_timer = TIME_INFINITY;
      n = HEAD(far_timers);
      while (m = n->next)	/* assignment intended: m saves the successor before n may be unlinked */
	{
	  t = SKIP_BACK(timer, n, n);
	  if (t->expires <= limit)
	    {
	      rem_node(n);
	      tm_insert_near(t);
	    }
	  else if (t->expires < first_far_timer)
	    first_far_timer = t->expires;
	  n = m;
	}
    }
  while ((n = HEAD(near_timers)) -> next)	/* loop while the near list is non-empty */
    {
      int delay;
      t = SKIP_BACK(timer, n, n);
      if (t->expires > now)
	break;			/* near list is sorted; everything behind is in the future */
      rem_node(n);
      delay = t->expires - now;	/* <= 0 here; how late this shot is (negated) */
      t->expires = 0;
      if (t->recurrent)
	{
	  /* Re-arm for the next period, adjusted by this run's lateness */
	  int i = t->recurrent - delay;
	  if (i < 0)
	    i = 0;
	  tm_start(t, i);
	}
      io_log_event(t->hook, t->data);
      t->hook(t);		/* hook may restart, stop or even free the timer */
    }
}
384
385 /**
386 * tm_parse_datetime - parse a date and time
387 * @x: datetime string
388 *
389 * tm_parse_datetime() takes a textual representation of
390 * a date and time (dd-mm-yyyy hh:mm:ss)
391 * and converts it to the corresponding value of type &bird_clock_t.
392 */
393 bird_clock_t
394 tm_parse_datetime(char *x)
395 {
396 struct tm tm;
397 int n;
398 time_t t;
399
400 if (sscanf(x, "%d-%d-%d %d:%d:%d%n", &tm.tm_mday, &tm.tm_mon, &tm.tm_year, &tm.tm_hour, &tm.tm_min, &tm.tm_sec, &n) != 6 || x[n])
401 return tm_parse_date(x);
402 tm.tm_mon--;
403 tm.tm_year -= 1900;
404 t = mktime(&tm);
405 if (t == (time_t) -1)
406 return 0;
407 return t;
408 }
409 /**
410 * tm_parse_date - parse a date
411 * @x: date string
412 *
413 * tm_parse_date() takes a textual representation of a date (dd-mm-yyyy)
414 * and converts it to the corresponding value of type &bird_clock_t.
415 */
416 bird_clock_t
417 tm_parse_date(char *x)
418 {
419 struct tm tm;
420 int n;
421 time_t t;
422
423 if (sscanf(x, "%d-%d-%d%n", &tm.tm_mday, &tm.tm_mon, &tm.tm_year, &n) != 3 || x[n])
424 return 0;
425 tm.tm_mon--;
426 tm.tm_year -= 1900;
427 tm.tm_hour = tm.tm_min = tm.tm_sec = 0;
428 t = mktime(&tm);
429 if (t == (time_t) -1)
430 return 0;
431 return t;
432 }
433
/*
 * Short relative-time format into @x: "HH:MM" for ages under 20 hours,
 * "MmmDD" (e.g. "Jan01") under ~a year (360 days), otherwise the year.
 * @tm is the broken-down local time of the stamp, @delta its age in seconds.
 */
static void
tm_format_reltime(char *x, struct tm *tm, bird_clock_t delta)
{
  static char *month_names[12] = { "Jan", "Feb", "Mar", "Apr", "May", "Jun",
				   "Jul", "Aug", "Sep", "Oct", "Nov", "Dec" };

  if (delta < 20*3600)
    bsprintf(x, "%02d:%02d", tm->tm_hour, tm->tm_min);
  else if (delta < 360*86400)
    bsprintf(x, "%s%02d", month_names[tm->tm_mon], tm->tm_mday);
  else
    bsprintf(x, "%d", tm->tm_year+1900);
}
447
448 #include "conf/conf.h"
449
450 /**
451 * tm_format_datetime - convert date and time to textual representation
452 * @x: destination buffer of size %TM_DATETIME_BUFFER_SIZE
453 * @fmt_spec: specification of resulting textual representation of the time
454 * @t: time
455 *
456 * This function formats the given relative time value @t to a textual
457 * date/time representation (dd-mm-yyyy hh:mm:ss) in real time.
458 */
void
tm_format_datetime(char *x, struct timeformat *fmt_spec, bird_clock_t t)
{
  const char *fmt_used;
  struct tm *tm;
  bird_clock_t delta = now - t;	/* age of the stamp on the monotonic scale */
  t = now_real - delta;		/* map it onto the wall clock */
  tm = localtime(&t);

  /* No fmt1 configured: use the compact relative format */
  if (fmt_spec->fmt1 == NULL)
    return tm_format_reltime(x, tm, delta);

  /* fmt1 for recent stamps (or always when limit == 0), fmt2 for old ones */
  if ((fmt_spec->limit == 0) || (delta < fmt_spec->limit))
    fmt_used = fmt_spec->fmt1;
  else
    fmt_used = fmt_spec->fmt2;

  /* strftime() returns 0 both for an empty format and when the output
     does not fit, hence the fmt_used[0] qualifier */
  int rv = strftime(x, TM_DATETIME_BUFFER_SIZE, fmt_used, tm);
  if (((rv == 0) && fmt_used[0]) || (rv == TM_DATETIME_BUFFER_SIZE))
    strcpy(x, "<too-long>");
}
480
481
482 /**
483 * DOC: Sockets
484 *
485 * Socket resources represent network connections. Their data structure (&socket)
486 * contains a lot of fields defining the exact type of the socket, the local and
487 * remote addresses and ports, pointers to socket buffers and finally pointers to
488 * hook functions to be called when new data have arrived to the receive buffer
489 * (@rx_hook), when the contents of the transmit buffer have been transmitted
490 * (@tx_hook) and when an error or connection close occurs (@err_hook).
491 *
492 * Freeing of sockets from inside socket hooks is perfectly safe.
493 */
494
495 #ifndef SOL_IP
496 #define SOL_IP IPPROTO_IP
497 #endif
498
499 #ifndef SOL_IPV6
500 #define SOL_IPV6 IPPROTO_IPV6
501 #endif
502
503 #ifndef SOL_ICMPV6
504 #define SOL_ICMPV6 IPPROTO_ICMPV6
505 #endif
506
507
508 /*
509 * Sockaddr helper functions
510 */
511
512 static inline int UNUSED sockaddr_length(int af)
513 { return (af == AF_INET) ? sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6); }
514
515 static inline void
516 sockaddr_fill4(struct sockaddr_in *sa, ip_addr a, uint port)
517 {
518 memset(sa, 0, sizeof(struct sockaddr_in));
519 #ifdef HAVE_SIN_LEN
520 sa->sin_len = sizeof(struct sockaddr_in);
521 #endif
522 sa->sin_family = AF_INET;
523 sa->sin_port = htons(port);
524 sa->sin_addr = ipa_to_in4(a);
525 }
526
527 static inline void
528 sockaddr_fill6(struct sockaddr_in6 *sa, ip_addr a, struct iface *ifa, uint port)
529 {
530 memset(sa, 0, sizeof(struct sockaddr_in6));
531 #ifdef SIN6_LEN
532 sa->sin6_len = sizeof(struct sockaddr_in6);
533 #endif
534 sa->sin6_family = AF_INET6;
535 sa->sin6_port = htons(port);
536 sa->sin6_flowinfo = 0;
537 sa->sin6_addr = ipa_to_in6(a);
538
539 if (ifa && ipa_is_link_local(a))
540 sa->sin6_scope_id = ifa->index;
541 }
542
543 void
544 sockaddr_fill(sockaddr *sa, int af, ip_addr a, struct iface *ifa, uint port)
545 {
546 if (af == AF_INET)
547 sockaddr_fill4((struct sockaddr_in *) sa, a, port);
548 else if (af == AF_INET6)
549 sockaddr_fill6((struct sockaddr_in6 *) sa, a, ifa, port);
550 else
551 bug("Unknown AF");
552 }
553
554 static inline void
555 sockaddr_read4(struct sockaddr_in *sa, ip_addr *a, uint *port)
556 {
557 *port = ntohs(sa->sin_port);
558 *a = ipa_from_in4(sa->sin_addr);
559 }
560
561 static inline void
562 sockaddr_read6(struct sockaddr_in6 *sa, ip_addr *a, struct iface **ifa, uint *port)
563 {
564 *port = ntohs(sa->sin6_port);
565 *a = ipa_from_in6(sa->sin6_addr);
566
567 if (ifa && ipa_is_link_local(*a))
568 *ifa = if_find_by_index(sa->sin6_scope_id);
569 }
570
571 int
572 sockaddr_read(sockaddr *sa, int af, ip_addr *a, struct iface **ifa, uint *port)
573 {
574 if (sa->sa.sa_family != af)
575 goto fail;
576
577 if (af == AF_INET)
578 sockaddr_read4((struct sockaddr_in *) sa, a, port);
579 else if (af == AF_INET6)
580 sockaddr_read6((struct sockaddr_in6 *) sa, a, ifa, port);
581 else
582 goto fail;
583
584 return 0;
585
586 fail:
587 *a = IPA_NONE;
588 *port = 0;
589 return -1;
590 }
591
592
593 /*
594 * IPv6 multicast syscalls
595 */
596
597 /* Fortunately standardized in RFC 3493 */
598
599 #define INIT_MREQ6(maddr,ifa) \
600 { .ipv6mr_multiaddr = ipa_to_in6(maddr), .ipv6mr_interface = ifa->index }
601
/*
 * Configure an IPv6 socket for multicast TX: outgoing interface, hop
 * limit, and loopback disabled (n == 0). Returns 0 on success; on
 * failure ERR() presumably records the option name in s->err and
 * returns -1 — confirm against the ERR macro definition.
 */
static inline int
sk_setup_multicast6(sock *s)
{
  int index = s->iface->index;	/* requires s->iface to be set by the caller */
  int ttl = s->ttl;
  int n = 0;			/* 0 = do not loop our multicasts back to us */

  if (setsockopt(s->fd, SOL_IPV6, IPV6_MULTICAST_IF, &index, sizeof(index)) < 0)
    ERR("IPV6_MULTICAST_IF");

  if (setsockopt(s->fd, SOL_IPV6, IPV6_MULTICAST_HOPS, &ttl, sizeof(ttl)) < 0)
    ERR("IPV6_MULTICAST_HOPS");

  if (setsockopt(s->fd, SOL_IPV6, IPV6_MULTICAST_LOOP, &n, sizeof(n)) < 0)
    ERR("IPV6_MULTICAST_LOOP");

  return 0;
}
620
/* Join IPv6 multicast group @maddr on the socket's interface.
 * Returns 0 on success, -1 via ERR() on setsockopt failure. */
static inline int
sk_join_group6(sock *s, ip_addr maddr)
{
  struct ipv6_mreq mr = INIT_MREQ6(maddr, s->iface);

  if (setsockopt(s->fd, SOL_IPV6, IPV6_JOIN_GROUP, &mr, sizeof(mr)) < 0)
    ERR("IPV6_JOIN_GROUP");

  return 0;
}
631
/* Leave IPv6 multicast group @maddr on the socket's interface.
 * Returns 0 on success, -1 via ERR() on setsockopt failure. */
static inline int
sk_leave_group6(sock *s, ip_addr maddr)
{
  struct ipv6_mreq mr = INIT_MREQ6(maddr, s->iface);

  if (setsockopt(s->fd, SOL_IPV6, IPV6_LEAVE_GROUP, &mr, sizeof(mr)) < 0)
    ERR("IPV6_LEAVE_GROUP");

  return 0;
}
642
643
644 /*
645 * IPv6 packet control messages
646 */
647
648 /* Also standardized, in RFC 3542 */
649
650 /*
651 * RFC 2292 uses IPV6_PKTINFO for both the socket option and the cmsg
652 * type, RFC 3542 changed the socket option to IPV6_RECVPKTINFO. If we
653 * don't have IPV6_RECVPKTINFO we suppose the OS implements the older
654 * RFC and we use IPV6_PKTINFO.
655 */
656 #ifndef IPV6_RECVPKTINFO
657 #define IPV6_RECVPKTINFO IPV6_PKTINFO
658 #endif
659 /*
660 * Same goes for IPV6_HOPLIMIT -> IPV6_RECVHOPLIMIT.
661 */
662 #ifndef IPV6_RECVHOPLIMIT
663 #define IPV6_RECVHOPLIMIT IPV6_HOPLIMIT
664 #endif
665
666
667 #define CMSG6_SPACE_PKTINFO CMSG_SPACE(sizeof(struct in6_pktinfo))
668 #define CMSG6_SPACE_TTL CMSG_SPACE(sizeof(int))
669
/* Ask the kernel to deliver IPV6_PKTINFO ancillary data (destination
 * address + arriving interface) with every received packet (RFC 3542). */
static inline int
sk_request_cmsg6_pktinfo(sock *s)
{
  int y = 1;

  if (setsockopt(s->fd, SOL_IPV6, IPV6_RECVPKTINFO, &y, sizeof(y)) < 0)
    ERR("IPV6_RECVPKTINFO");

  return 0;
}
680
/* Ask the kernel to deliver the received hop limit as ancillary data
 * (used e.g. for TTL-security checks). */
static inline int
sk_request_cmsg6_ttl(sock *s)
{
  int y = 1;

  if (setsockopt(s->fd, SOL_IPV6, IPV6_RECVHOPLIMIT, &y, sizeof(y)) < 0)
    ERR("IPV6_RECVHOPLIMIT");

  return 0;
}
691
692 static inline void
693 sk_process_cmsg6_pktinfo(sock *s, struct cmsghdr *cm)
694 {
695 if (cm->cmsg_type == IPV6_PKTINFO)
696 {
697 struct in6_pktinfo *pi = (struct in6_pktinfo *) CMSG_DATA(cm);
698 s->laddr = ipa_from_in6(pi->ipi6_addr);
699 s->lifindex = pi->ipi6_ifindex;
700 }
701 }
702
703 static inline void
704 sk_process_cmsg6_ttl(sock *s, struct cmsghdr *cm)
705 {
706 if (cm->cmsg_type == IPV6_HOPLIMIT)
707 s->rcv_ttl = * (int *) CMSG_DATA(cm);
708 }
709
/*
 * Attach an IPV6_PKTINFO control message to an outgoing msghdr, forcing
 * the source address (s->saddr) and, if set, the outgoing interface.
 * @cbuf/@cbuflen is caller-provided control-message storage.
 */
static inline void
sk_prepare_cmsgs6(sock *s, struct msghdr *msg, void *cbuf, size_t cbuflen)
{
  struct cmsghdr *cm;
  struct in6_pktinfo *pi;
  int controllen = 0;

  /* msg_control must be set before CMSG_FIRSTHDR() can be used */
  msg->msg_control = cbuf;
  msg->msg_controllen = cbuflen;

  cm = CMSG_FIRSTHDR(msg);
  cm->cmsg_level = SOL_IPV6;
  cm->cmsg_type = IPV6_PKTINFO;
  cm->cmsg_len = CMSG_LEN(sizeof(*pi));
  controllen += CMSG_SPACE(sizeof(*pi));

  pi = (struct in6_pktinfo *) CMSG_DATA(cm);
  pi->ipi6_ifindex = s->iface ? s->iface->index : 0;	/* 0 = let kernel choose */
  pi->ipi6_addr = ipa_to_in6(s->saddr);

  /* Shrink controllen to the space actually used */
  msg->msg_controllen = controllen;
}
732
733
734 /*
735 * Miscellaneous socket syscalls
736 */
737
/* Set the outgoing TTL on an IPv4 socket; 0 on success, -1 via ERR(). */
static inline int
sk_set_ttl4(sock *s, int ttl)
{
  if (setsockopt(s->fd, SOL_IP, IP_TTL, &ttl, sizeof(ttl)) < 0)
    ERR("IP_TTL");

  return 0;
}
746
/* Set the outgoing hop limit on an IPv6 socket; 0 on success, -1 via ERR(). */
static inline int
sk_set_ttl6(sock *s, int ttl)
{
  if (setsockopt(s->fd, SOL_IPV6, IPV6_UNICAST_HOPS, &ttl, sizeof(ttl)) < 0)
    ERR("IPV6_UNICAST_HOPS");

  return 0;
}
755
/* Set the IPv4 TOS byte on outgoing packets; 0 on success, -1 via ERR(). */
static inline int
sk_set_tos4(sock *s, int tos)
{
  if (setsockopt(s->fd, SOL_IP, IP_TOS, &tos, sizeof(tos)) < 0)
    ERR("IP_TOS");

  return 0;
}
764
/* Set the IPv6 traffic class on outgoing packets; 0 on success, -1 via ERR(). */
static inline int
sk_set_tos6(sock *s, int tos)
{
  if (setsockopt(s->fd, SOL_IPV6, IPV6_TCLASS, &tos, sizeof(tos)) < 0)
    ERR("IPV6_TCLASS");

  return 0;
}
773
/*
 * Request an ephemeral port from the "high" range (BSD IP_PORTRANGE /
 * IPV6_PORTRANGE). The option is optional — on systems without it this
 * function compiles to a no-op returning 0; where present, a setsockopt
 * failure returns -1 via ERR().
 */
static inline int
sk_set_high_port(sock *s UNUSED)
{
  /* Port range setting is optional, ignore it if not supported */

#ifdef IP_PORTRANGE
  if (sk_is_ipv4(s))
  {
    int range = IP_PORTRANGE_HIGH;
    if (setsockopt(s->fd, SOL_IP, IP_PORTRANGE, &range, sizeof(range)) < 0)
      ERR("IP_PORTRANGE");
  }
#endif

#ifdef IPV6_PORTRANGE
  if (sk_is_ipv6(s))
  {
    int range = IPV6_PORTRANGE_HIGH;
    if (setsockopt(s->fd, SOL_IPV6, IPV6_PORTRANGE, &range, sizeof(range)) < 0)
      ERR("IPV6_PORTRANGE");
  }
#endif

  return 0;
}
799
800 static inline byte *
801 sk_skip_ip_header(byte *pkt, int *len)
802 {
803 if ((*len < 20) || ((*pkt & 0xf0) != 0x40))
804 return NULL;
805
806 int hlen = (*pkt & 0x0f) * 4;
807 if ((hlen < 20) || (hlen > *len))
808 return NULL;
809
810 *len -= hlen;
811 return pkt + hlen;
812 }
813
814 byte *
815 sk_rx_buffer(sock *s, int *len)
816 {
817 if (sk_is_ipv4(s) && (s->type == SK_IP))
818 return sk_skip_ip_header(s->rbuf, len);
819 else
820 return s->rbuf;
821 }
822
823
824 /*
825 * Public socket functions
826 */
827
828 /**
829 * sk_setup_multicast - enable multicast for given socket
830 * @s: socket
831 *
832 * Prepare transmission of multicast packets for given datagram socket.
833 * The socket must have defined @iface.
834 *
835 * Result: 0 for success, -1 for an error.
836 */
837
838 int
839 sk_setup_multicast(sock *s)
840 {
841 ASSERT(s->iface);
842
843 if (sk_is_ipv4(s))
844 return sk_setup_multicast4(s);
845 else
846 return sk_setup_multicast6(s);
847 }
848
849 /**
850 * sk_join_group - join multicast group for given socket
851 * @s: socket
852 * @maddr: multicast address
853 *
854 * Join multicast group for given datagram socket and associated interface.
855 * The socket must have defined @iface.
856 *
857 * Result: 0 for success, -1 for an error.
858 */
859
860 int
861 sk_join_group(sock *s, ip_addr maddr)
862 {
863 if (sk_is_ipv4(s))
864 return sk_join_group4(s, maddr);
865 else
866 return sk_join_group6(s, maddr);
867 }
868
869 /**
870 * sk_leave_group - leave multicast group for given socket
871 * @s: socket
872 * @maddr: multicast address
873 *
874 * Leave multicast group for given datagram socket and associated interface.
875 * The socket must have defined @iface.
876 *
877 * Result: 0 for success, -1 for an error.
878 */
879
880 int
881 sk_leave_group(sock *s, ip_addr maddr)
882 {
883 if (sk_is_ipv4(s))
884 return sk_leave_group4(s, maddr);
885 else
886 return sk_leave_group6(s, maddr);
887 }
888
889 /**
890 * sk_setup_broadcast - enable broadcast for given socket
891 * @s: socket
892 *
893 * Allow reception and transmission of broadcast packets for given datagram
894 * socket. The socket must have defined @iface. For transmission, packets should
895 * be send to @brd address of @iface.
896 *
897 * Result: 0 for success, -1 for an error.
898 */
899
int
sk_setup_broadcast(sock *s)
{
  int y = 1;

  if (setsockopt(s->fd, SOL_SOCKET, SO_BROADCAST, &y, sizeof(y)) < 0)
    ERR("SO_BROADCAST");	/* ERR presumably sets s->err and returns -1 */

  return 0;
}
910
911 /**
912 * sk_set_ttl - set transmit TTL for given socket
913 * @s: socket
914 * @ttl: TTL value
915 *
916 * Set TTL for already opened connections when TTL was not set before. Useful
917 * for accepted connections when different ones should have different TTL.
918 *
919 * Result: 0 for success, -1 for an error.
920 */
921
922 int
923 sk_set_ttl(sock *s, int ttl)
924 {
925 s->ttl = ttl;
926
927 if (sk_is_ipv4(s))
928 return sk_set_ttl4(s, ttl);
929 else
930 return sk_set_ttl6(s, ttl);
931 }
932
933 /**
934 * sk_set_min_ttl - set minimal accepted TTL for given socket
935 * @s: socket
936 * @ttl: TTL value
937 *
938 * Set minimal accepted TTL for given socket. Can be used for TTL security.
939 * implementations.
940 *
941 * Result: 0 for success, -1 for an error.
942 */
943
944 int
945 sk_set_min_ttl(sock *s, int ttl)
946 {
947 if (sk_is_ipv4(s))
948 return sk_set_min_ttl4(s, ttl);
949 else
950 return sk_set_min_ttl6(s, ttl);
951 }
952
953 #if 0
954 /**
955 * sk_set_md5_auth - add / remove MD5 security association for given socket
956 * @s: socket
957 * @local: IP address of local side
958 * @remote: IP address of remote side
959 * @ifa: Interface for link-local IP address
960 * @passwd: Password used for MD5 authentication
961 * @setkey: Update also system SA/SP database
962 *
963 * In TCP MD5 handling code in kernel, there is a set of security associations
964 * used for choosing password and other authentication parameters according to
965 * the local and remote address. This function is useful for listening socket,
966 * for active sockets it may be enough to set s->password field.
967 *
968 * When called with passwd != NULL, the new pair is added,
969 * When called with passwd == NULL, the existing pair is removed.
970 *
971 * Note that while in Linux, the MD5 SAs are specific to socket, in BSD they are
972 * stored in global SA/SP database (but the behavior also must be enabled on
973 * per-socket basis). In case of multiple sockets to the same neighbor, the
974 * socket-specific state must be configured for each socket while global state
975 * just once per src-dst pair. The @setkey argument controls whether the global
976 * state (SA/SP database) is also updated.
977 *
978 * Result: 0 for success, -1 for an error.
979 */
980
981 int
982 sk_set_md5_auth(sock *s, ip_addr local, ip_addr remote, struct iface *ifa, char *passwd, int setkey)
983 { DUMMY; }
984 #endif
985
986 /**
987 * sk_set_ipv6_checksum - specify IPv6 checksum offset for given socket
988 * @s: socket
989 * @offset: offset
990 *
991 * Specify IPv6 checksum field offset for given raw IPv6 socket. After that, the
992 * kernel will automatically fill it for outgoing packets and check it for
993 * incoming packets. Should not be used on ICMPv6 sockets, where the position is
994 * known to the kernel.
995 *
996 * Result: 0 for success, -1 for an error.
997 */
998
int
sk_set_ipv6_checksum(sock *s, int offset)
{
  if (setsockopt(s->fd, SOL_IPV6, IPV6_CHECKSUM, &offset, sizeof(offset)) < 0)
    ERR("IPV6_CHECKSUM");	/* ERR presumably sets s->err and returns -1 */

  return 0;
}
1007
/*
 * Restrict an ICMPv6 socket to exactly two message types @p1 and @p2,
 * blocking everything else (RFC 3542 ICMP6_FILTER).
 * Returns 0 on success, -1 via ERR() on failure.
 */
int
sk_set_icmp6_filter(sock *s, int p1, int p2)
{
  /* a bit of lame interface, but it is here only for Radv */
  struct icmp6_filter f;

  ICMP6_FILTER_SETBLOCKALL(&f);
  ICMP6_FILTER_SETPASS(p1, &f);
  ICMP6_FILTER_SETPASS(p2, &f);

  if (setsockopt(s->fd, SOL_ICMPV6, ICMP6_FILTER, &f, sizeof(f)) < 0)
    ERR("ICMP6_FILTER");

  return 0;
}
1023
/* Log the socket's stored error (s->err) under prefix @p; %#m presumably
 * appends strerror(errno) — confirm against BIRD's bvsnprintf. */
void
sk_log_error(sock *s, const char *p)
{
  log(L_ERR "%s: Socket error: %s%#m", p, s->err);
}
1029
1030
1031 /*
1032 * Actual struct birdsock code
1033 */
1034
1035 static list sock_list;
1036 static struct birdsock *current_sock;
1037 static struct birdsock *stored_sock;
1038
1039 static inline sock *
1040 sk_next(sock *s)
1041 {
1042 if (!s->n.next->next)
1043 return NULL;
1044 else
1045 return SKIP_BACK(sock, n, s->n.next);
1046 }
1047
/* Lazily allocate RX/TX buffers (sized rbsize/tbsize) and reset the
 * read/transmit positions to the buffer starts. */
static void
sk_alloc_bufs(sock *s)
{
  if (!s->rbuf && s->rbsize)
    s->rbuf = s->rbuf_alloc = xmalloc(s->rbsize);
  s->rpos = s->rbuf;		/* next byte to receive into */
  if (!s->tbuf && s->tbsize)
    s->tbuf = s->tbuf_alloc = xmalloc(s->tbsize);
  s->tpos = s->ttx = s->tbuf;	/* empty TX buffer: produce == transmit position */
}
1058
/* Release the socket's own RX/TX buffers. Only *_alloc pointers are freed:
 * rbuf/tbuf may alias a caller-provided buffer (see sk_set_tbuf()). */
static void
sk_free_bufs(sock *s)
{
  if (s->rbuf_alloc)
  {
    xfree(s->rbuf_alloc);
    s->rbuf = s->rbuf_alloc = NULL;
  }
  if (s->tbuf_alloc)
  {
    xfree(s->tbuf_alloc);
    s->tbuf = s->tbuf_alloc = NULL;
  }
}
1073
1074 #ifdef HAVE_LIBSSH
/* Tear down the libssh state of an SSH socket: close+free the channel,
 * then disconnect+free the session. Idempotent (s->ssh is cleared first). */
static void
sk_ssh_free(sock *s)
{
  struct ssh_sock *ssh = s->ssh;

  if (s->ssh == NULL)
    return;

  /* Clear the pointer up front so a re-entrant call becomes a no-op */
  s->ssh = NULL;

  if (ssh->channel)
  {
    if (ssh_channel_is_open(ssh->channel))
      ssh_channel_close(ssh->channel);
    ssh_channel_free(ssh->channel);
    ssh->channel = NULL;
  }

  if (ssh->session)
  {
    ssh_disconnect(ssh->session);
    ssh_free(ssh->session);
    ssh->session = NULL;
  }
}
1100 #endif
1101
/*
 * Resource-class free hook for sockets: release buffers and SSH state,
 * unlink the socket from the main loop's bookkeeping (fixing up the
 * current/stored iteration cursors), and close the descriptor.
 */
static void
sk_free(resource *r)
{
  sock *s = (sock *) r;

  sk_free_bufs(s);

#ifdef HAVE_LIBSSH
  if (s->type == SK_SSH || s->type == SK_SSH_ACTIVE)
    sk_ssh_free(s);
#endif

  if (s->fd < 0)
    return;			/* never opened (or already closed) */

  /* FIXME: we should call sk_stop() for SKF_THREAD sockets */
  if (!(s->flags & SKF_THREAD))
  {
    /* Keep the poll-loop cursors valid if they point at this socket */
    if (s == current_sock)
      current_sock = sk_next(s);
    if (s == stored_sock)
      stored_sock = sk_next(s);
    rem_node(&s->n);
  }

  /* For SSH sockets the fd belongs to libssh and is closed by ssh_free() */
  if (s->type != SK_SSH && s->type != SK_SSH_ACTIVE)
    close(s->fd);

  s->fd = -1;
}
1132
/* Resize the receive buffer to @val bytes. Any buffered data is discarded
 * (the old buffer is freed and a fresh one allocated). Only valid while
 * rbuf is the socket's own allocation. */
void
sk_set_rbsize(sock *s, uint val)
{
  ASSERT(s->rbuf_alloc == s->rbuf);

  if (s->rbsize == val)
    return;

  s->rbsize = val;
  xfree(s->rbuf_alloc);
  s->rbuf_alloc = xmalloc(val);
  s->rpos = s->rbuf = s->rbuf_alloc;
}
1146
/* Resize the transmit buffer to @val bytes, preserving buffered data:
 * xrealloc keeps the contents and tpos/ttx are rebased onto the new
 * storage. Only valid while tbuf is the socket's own allocation. */
void
sk_set_tbsize(sock *s, uint val)
{
  ASSERT(s->tbuf_alloc == s->tbuf);

  if (s->tbsize == val)
    return;

  byte *old_tbuf = s->tbuf;

  s->tbsize = val;
  s->tbuf = s->tbuf_alloc = xrealloc(s->tbuf_alloc, val);
  /* Rebase the produce/transmit positions relative to the moved buffer */
  s->tpos = s->tbuf + (s->tpos - old_tbuf);
  s->ttx = s->tbuf + (s->ttx - old_tbuf);
}
1162
/* Point the socket at a caller-provided TX buffer (@tbuf), or back at its
 * own allocation when @tbuf is NULL; resets the TX positions either way. */
void
sk_set_tbuf(sock *s, void *tbuf)
{
  s->tbuf = tbuf ?: s->tbuf_alloc;	/* GNU ?: — NULL falls back to own buffer */
  s->ttx = s->tpos = s->tbuf;
}
1169
/* Drop and re-create the socket's buffers (any buffered data is lost). */
void
sk_reallocate(sock *s)
{
  sk_free_bufs(s);
  sk_alloc_bufs(s);
}
1176
/* Resource-class dump hook: one-line summary of the socket. The name table
 * is indexed by s->type; NULL slots correspond to unused type values. */
static void
sk_dump(resource *r)
{
  sock *s = (sock *) r;
  static char *sk_type_names[] = { "TCP<", "TCP>", "TCP", "UDP", NULL, "IP", NULL, "MAGIC", "UNIX<", "UNIX", "SSH>", "SSH", "DEL!" };

  debug("(%s, ud=%p, sa=%I, sp=%d, da=%I, dp=%d, tos=%d, ttl=%d, if=%s)\n",
	sk_type_names[s->type],
	s->data,
	s->saddr,
	s->sport,
	s->daddr,
	s->dport,
	s->tos,
	s->ttl,
	s->iface ? s->iface->name : "none");
}
1194
/* Resource class descriptor for sockets. Slot order follows struct resclass:
 * name, instance size, free hook, dump hook, then two optional hooks left
 * NULL (presumably lookup/memsize — confirm against lib/resource.h). */
static struct resclass sk_class = {
  "Socket",
  sizeof(sock),
  sk_free,
  sk_dump,
  NULL,
  NULL
};
1203
1204 /**
1205 * sk_new - create a socket
1206 * @p: pool
1207 *
1208 * This function creates a new socket resource. If you want to use it,
1209 * you need to fill in all the required fields of the structure and
1210 * call sk_open() to do the actual opening of the socket.
1211 *
1212 * The real function name is sock_new(), sk_new() is a macro wrapper
1213 * to avoid collision with OpenSSL.
1214 */
sock *
sock_new(pool *p)
{
  sock *s = ralloc(p, &sk_class);
  s->pool = p;			/* remember the pool for derived allocations (e.g. accepted sockets) */
  // s->saddr = s->daddr = IPA_NONE;
  s->tos = s->priority = s->ttl = -1;	/* -1 = "not set, keep kernel default" */
  s->fd = -1;			/* not opened yet */
  return s;
}
1225
/*
 * One-time setup of a fresh socket descriptor: non-blocking mode,
 * family-independent options, then per-family (IPv4/IPv6) TTL/TOS/cmsg
 * options. Returns 0 on success, -1 on error (ERR() presumably records
 * the failing option name in s->err — confirm against the macro).
 */
static int
sk_setup(sock *s)
{
  int y = 1;
  int fd = s->fd;

  /* SSH-managed sockets have no kernel options to set here */
  if (s->type == SK_SSH_ACTIVE)
    return 0;

  if (fcntl(fd, F_SETFL, O_NONBLOCK) < 0)
    ERR("O_NONBLOCK");

  /* Family-agnostic sockets (af == 0) stop here */
  if (!s->af)
    return 0;

  /* Source address requested without binding: select it per packet via PKTINFO */
  if (ipa_nonzero(s->saddr) && !(s->flags & SKF_BIND))
    s->flags |= SKF_PKTINFO;

#ifdef CONFIG_USE_HDRINCL
  /* On these systems raw IPv4 sockets build the IP header themselves
     instead of using PKTINFO (see CONFIG_USE_HDRINCL in sysdep config) */
  if (sk_is_ipv4(s) && (s->type == SK_IP) && (s->flags & SKF_PKTINFO))
  {
    s->flags &= ~SKF_PKTINFO;
    s->flags |= SKF_HDRINCL;
    if (setsockopt(fd, SOL_IP, IP_HDRINCL, &y, sizeof(y)) < 0)
      ERR("IP_HDRINCL");
  }
#endif

  if (s->iface)
  {
#ifdef SO_BINDTODEVICE
    struct ifreq ifr = {};
    strcpy(ifr.ifr_name, s->iface->name);
    if (setsockopt(s->fd, SOL_SOCKET, SO_BINDTODEVICE, &ifr, sizeof(ifr)) < 0)
      ERR("SO_BINDTODEVICE");
#endif

#ifdef CONFIG_UNIX_DONTROUTE
    if (setsockopt(s->fd, SOL_SOCKET, SO_DONTROUTE, &y, sizeof(y)) < 0)
      ERR("SO_DONTROUTE");
#endif
  }

  if (s->priority >= 0)
    if (sk_set_priority(s, s->priority) < 0)
      return -1;

  if (sk_is_ipv4(s))
  {
    if (s->flags & SKF_LADDR_RX)
      if (sk_request_cmsg4_pktinfo(s) < 0)
	return -1;

    if (s->flags & SKF_TTL_RX)
      if (sk_request_cmsg4_ttl(s) < 0)
	return -1;

    /* Datagram sockets: avoid kernel path-MTU fragmentation games */
    if ((s->type == SK_UDP) || (s->type == SK_IP))
      if (sk_disable_mtu_disc4(s) < 0)
	return -1;

    if (s->ttl >= 0)
      if (sk_set_ttl4(s, s->ttl) < 0)
	return -1;

    if (s->tos >= 0)
      if (sk_set_tos4(s, s->tos) < 0)
	return -1;
  }

  if (sk_is_ipv6(s))
  {
    /* Keep AF_INET6 sockets from also handling IPv4 (RFC 3493 IPV6_V6ONLY) */
    if ((s->type == SK_TCP_PASSIVE) || (s->type == SK_TCP_ACTIVE) || (s->type == SK_UDP))
      if (setsockopt(fd, SOL_IPV6, IPV6_V6ONLY, &y, sizeof(y)) < 0)
	ERR("IPV6_V6ONLY");

    if (s->flags & SKF_LADDR_RX)
      if (sk_request_cmsg6_pktinfo(s) < 0)
	return -1;

    if (s->flags & SKF_TTL_RX)
      if (sk_request_cmsg6_ttl(s) < 0)
	return -1;

    if ((s->type == SK_UDP) || (s->type == SK_IP))
      if (sk_disable_mtu_disc6(s) < 0)
	return -1;

    if (s->ttl >= 0)
      if (sk_set_ttl6(s, s->ttl) < 0)
	return -1;

    if (s->tos >= 0)
      if (sk_set_tos6(s, s->tos) < 0)
	return -1;
  }

  return 0;
}
1325
/* Register a newly opened socket in the global list polled by io_loop(). */
static void
sk_insert(sock *s)
{
  add_tail(&sock_list, &s->n);
}
1331
1332 static void
1333 sk_tcp_connected(sock *s)
1334 {
1335 sockaddr sa;
1336 int sa_len = sizeof(sa);
1337
1338 if ((getsockname(s->fd, &sa.sa, &sa_len) < 0) ||
1339 (sockaddr_read(&sa, s->af, &s->saddr, &s->iface, &s->sport) < 0))
1340 log(L_WARN "SOCK: Cannot get local IP address for TCP>");
1341
1342 s->type = SK_TCP;
1343 sk_alloc_bufs(s);
1344 s->tx_hook(s);
1345 }
1346
/* Finalize an outgoing SSH connection: allocate buffers, mark the socket
 * as an established SSH socket and notify the owner via tx_hook(). */
static void
sk_ssh_connected(sock *s)
{
  sk_alloc_bufs(s);
  s->type = SK_SSH;
  s->tx_hook(s);
}
1354
/*
 * Accept one incoming connection on a passive socket @s and build a new
 * child socket of the given @type (SK_TCP or SK_UNIX) in the same pool.
 * Returns 0 when nothing was accepted (transient error), 1 otherwise --
 * including the case where the child had to be thrown away.
 */
static int
sk_passive_connected(sock *s, int type)
{
  sockaddr loc_sa, rem_sa;
  int loc_sa_len = sizeof(loc_sa);
  int rem_sa_len = sizeof(rem_sa);

  /* For UNIX sockets the peer address is not interesting, so pass NULL */
  int fd = accept(s->fd, ((type == SK_TCP) ? &rem_sa.sa : NULL), &rem_sa_len);
  if (fd < 0)
  {
    /* EINTR/EAGAIN are transient; anything else is reported to the owner */
    if ((errno != EINTR) && (errno != EAGAIN))
      s->err_hook(s, errno);
    return 0;
  }

  /* The child inherits the basic parameters of the listening socket */
  sock *t = sk_new(s->pool);
  t->type = type;
  t->af = s->af;
  t->fd = fd;
  t->ttl = s->ttl;
  t->tos = s->tos;
  t->rbsize = s->rbsize;
  t->tbsize = s->tbsize;

  if (type == SK_TCP)
  {
    /* Fill in local and remote endpoints of the accepted connection */
    if ((getsockname(fd, &loc_sa.sa, &loc_sa_len) < 0) ||
	(sockaddr_read(&loc_sa, s->af, &t->saddr, &t->iface, &t->sport) < 0))
      log(L_WARN "SOCK: Cannot get local IP address for TCP<");

    if (sockaddr_read(&rem_sa, s->af, &t->daddr, &t->iface, &t->dport) < 0)
      log(L_WARN "SOCK: Cannot get remote IP address for TCP<");
  }

  if (sk_setup(t) < 0)
  {
    /* FIXME: Call err_hook instead ? */
    log(L_ERR "SOCK: Incoming connection: %s%#m", t->err);

    /* FIXME: handle it better in rfree() */
    close(t->fd);
    t->fd = -1;
    rfree(t);
    return 1;
  }

  sk_insert(t);
  sk_alloc_bufs(t);
  /* Hand the new child socket to the listener's rx_hook */
  s->rx_hook(t, 0);
  return 1;
}
1406
1407 #ifdef HAVE_LIBSSH
/*
 * Drive the non-blocking SSH connection state machine one step further.
 * Each case falls through to the next one on success, so a single call
 * may advance through several states. s->ssh->state records how far we
 * got so an SSH_AGAIN result resumes at the right place.
 *
 * Return SSH_OK or SSH_AGAIN or SSH_ERROR
 */
static int
sk_ssh_connect(sock *s)
{
  s->fd = ssh_get_fd(s->ssh->session);

  /* Big fall thru automata */
  switch (s->ssh->state)
  {
  case SK_SSH_CONNECT:
  {
    switch (ssh_connect(s->ssh->session))
    {
    case SSH_AGAIN:
      /* A quick look into libSSH shows that ssh_get_fd() should return non-(-1)
       * after SSH_AGAIN is returned by ssh_connect(). This is however nowhere
       * documented but our code relies on that.
       */
      return SSH_AGAIN;

    case SSH_OK:
      break;

    default:
      return SSH_ERROR;
    }
  }
  /* fall through */

  case SK_SSH_SERVER_KNOWN:
  {
    s->ssh->state = SK_SSH_SERVER_KNOWN;

    if (s->ssh->server_hostkey_path)
    {
      int server_identity_is_ok = 1;

      /* Check server identity */
      switch (ssh_is_server_known(s->ssh->session))
      {
#define LOG_WARN_ABOUT_SSH_SERVER_VALIDATION(s,msg,args...) log(L_WARN "SSH Identity %s@%s:%u: " msg, (s)->ssh->username, (s)->host, (s)->dport, ## args);
      case SSH_SERVER_KNOWN_OK:
	/* The server is known and has not changed. */
	break;

      case SSH_SERVER_NOT_KNOWN:
	LOG_WARN_ABOUT_SSH_SERVER_VALIDATION(s, "The server is unknown, its public key was not found in the known host file %s", s->ssh->server_hostkey_path);
	break;

      case SSH_SERVER_KNOWN_CHANGED:
	LOG_WARN_ABOUT_SSH_SERVER_VALIDATION(s, "The server key has changed. Either you are under attack or the administrator changed the key.");
	server_identity_is_ok = 0;
	break;

      case SSH_SERVER_FILE_NOT_FOUND:
	LOG_WARN_ABOUT_SSH_SERVER_VALIDATION(s, "The known host file %s does not exist", s->ssh->server_hostkey_path);
	server_identity_is_ok = 0;
	break;

      case SSH_SERVER_ERROR:
	LOG_WARN_ABOUT_SSH_SERVER_VALIDATION(s, "Some error happened");
	server_identity_is_ok = 0;
	break;

      case SSH_SERVER_FOUND_OTHER:
	LOG_WARN_ABOUT_SSH_SERVER_VALIDATION(s, "The server gave use a key of a type while we had an other type recorded. " \
					     "It is a possible attack.");
	server_identity_is_ok = 0;
	break;
      }

      if (!server_identity_is_ok)
	return SSH_ERROR;
    }
  }
  /* fall through */

  case SK_SSH_USERAUTH:
  {
    s->ssh->state = SK_SSH_USERAUTH;
    switch (ssh_userauth_publickey_auto(s->ssh->session, NULL, NULL))
    {
    case SSH_AUTH_AGAIN:
      return SSH_AGAIN;

    case SSH_AUTH_SUCCESS:
      break;

    default:
      return SSH_ERROR;
    }
  }
  /* fall through */

  case SK_SSH_CHANNEL:
  {
    s->ssh->state = SK_SSH_CHANNEL;
    s->ssh->channel = ssh_channel_new(s->ssh->session);
    if (s->ssh->channel == NULL)
      return SSH_ERROR;
  }
  /* fall through */

  case SK_SSH_SESSION:
  {
    s->ssh->state = SK_SSH_SESSION;
    switch (ssh_channel_open_session(s->ssh->channel))
    {
    case SSH_AGAIN:
      return SSH_AGAIN;

    case SSH_OK:
      break;

    default:
      return SSH_ERROR;
    }
  }
  /* fall through */

  case SK_SSH_SUBSYSTEM:
  {
    s->ssh->state = SK_SSH_SUBSYSTEM;
    if (s->ssh->subsystem)
    {
      switch (ssh_channel_request_subsystem(s->ssh->channel, s->ssh->subsystem))
      {
      case SSH_AGAIN:
	return SSH_AGAIN;

      case SSH_OK:
	break;

      default:
	return SSH_ERROR;
      }
    }
  }
  /* fall through */

  case SK_SSH_ESTABLISHED:
    s->ssh->state = SK_SSH_ESTABLISHED;
  }

  return SSH_OK;
}
1550
/*
 * Create and configure a libssh session for socket @s and start the
 * non-blocking connect sequence via sk_ssh_connect().
 *
 * Return file descriptor number if success
 * Return -1 if failed
 */
static int
sk_open_ssh(sock *s)
{
  if (!s->ssh)
    bug("sk_open() sock->ssh is not allocated");

  ssh_session sess = ssh_new();
  if (sess == NULL)
    ERR2("Cannot create a ssh session");
  s->ssh->session = sess;

  const int verbosity = SSH_LOG_NOLOG;
  ssh_options_set(sess, SSH_OPTIONS_LOG_VERBOSITY, &verbosity);
  ssh_options_set(sess, SSH_OPTIONS_HOST, s->host);
  ssh_options_set(sess, SSH_OPTIONS_PORT, &(s->dport));
  /* TODO: Add SSH_OPTIONS_BINDADDR */
  ssh_options_set(sess, SSH_OPTIONS_USER, s->ssh->username);

  if (s->ssh->server_hostkey_path)
    ssh_options_set(sess, SSH_OPTIONS_KNOWNHOSTS, s->ssh->server_hostkey_path);

  if (s->ssh->client_privkey_path)
    ssh_options_set(sess, SSH_OPTIONS_IDENTITY, s->ssh->client_privkey_path);

  /* The whole SSH state machine is driven in non-blocking mode */
  ssh_set_blocking(sess, 0);

  switch (sk_ssh_connect(s))
  {
  case SSH_AGAIN:
    break;

  case SSH_OK:
    sk_ssh_connected(s);
    break;

  case SSH_ERROR:
    /* ERR2 jumps to the err label below */
    ERR2(ssh_get_error(sess));
    break;
  }

  return ssh_get_fd(sess);

err:
  return -1;
}
1600 #endif
1601
/**
 * sk_open - open a socket
 * @s: socket
 *
 * This function takes a socket resource created by sk_new() and
 * initialized by the user and binds a corresponding network connection
 * to it. It resolves the address family, creates the OS-level socket,
 * optionally binds it, applies per-socket options via sk_setup() and
 * starts the connect/listen phase appropriate for the socket type.
 *
 * Note: ERR()/ERR2() macros set s->err and jump to the err label below.
 *
 * Result: 0 for success, -1 for an error.
 */
int
sk_open(sock *s)
{
  int af = AF_UNSPEC;
  int fd = -1;
  int do_bind = 0;
  int bind_port = 0;
  ip_addr bind_addr = IPA_NONE;
  sockaddr sa;

  if (s->type <= SK_IP)
  {
    /*
     * For TCP/IP sockets, Address family (IPv4 or IPv6) can be specified either
     * explicitly (SK_IPV4 or SK_IPV6) or implicitly (based on saddr, daddr).
     * But the specifications have to be consistent.
     */

    switch (s->subtype)
    {
    case 0:
      ASSERT(ipa_zero(s->saddr) || ipa_zero(s->daddr) ||
	     (ipa_is_ip4(s->saddr) == ipa_is_ip4(s->daddr)));
      af = (ipa_is_ip4(s->saddr) || ipa_is_ip4(s->daddr)) ? AF_INET : AF_INET6;
      break;

    case SK_IPV4:
      ASSERT(ipa_zero(s->saddr) || ipa_is_ip4(s->saddr));
      ASSERT(ipa_zero(s->daddr) || ipa_is_ip4(s->daddr));
      af = AF_INET;
      break;

    case SK_IPV6:
      ASSERT(ipa_zero(s->saddr) || !ipa_is_ip4(s->saddr));
      ASSERT(ipa_zero(s->daddr) || !ipa_is_ip4(s->daddr));
      af = AF_INET6;
      break;

    default:
      bug("Invalid subtype %d", s->subtype);
    }
  }

  /* Create the kernel socket and decide whether/where to bind it */
  switch (s->type)
  {
  case SK_TCP_ACTIVE:
    s->ttx = "";			/* Force s->ttx != s->tpos */
    /* Fall thru */
  case SK_TCP_PASSIVE:
    fd = socket(af, SOCK_STREAM, IPPROTO_TCP);
    bind_port = s->sport;
    bind_addr = s->saddr;
    do_bind = bind_port || ipa_nonzero(bind_addr);
    break;

#ifdef HAVE_LIBSSH
  case SK_SSH_ACTIVE:
    s->ttx = "";			/* Force s->ttx != s->tpos */
    fd = sk_open_ssh(s);
    break;
#endif

  case SK_UDP:
    fd = socket(af, SOCK_DGRAM, IPPROTO_UDP);
    bind_port = s->sport;
    bind_addr = (s->flags & SKF_BIND) ? s->saddr : IPA_NONE;
    do_bind = 1;
    break;

  case SK_IP:
    /* For raw IP sockets dport carries the IP protocol number */
    fd = socket(af, SOCK_RAW, s->dport);
    bind_port = 0;
    bind_addr = (s->flags & SKF_BIND) ? s->saddr : IPA_NONE;
    do_bind = ipa_nonzero(bind_addr);
    break;

  case SK_MAGIC:
    /* Pre-supplied fd (internal pipes etc.), nothing to create */
    af = 0;
    fd = s->fd;
    break;

  default:
    bug("sk_open() called for invalid sock type %d", s->type);
  }

  if (fd < 0)
    ERR("socket");

  s->af = af;
  s->fd = fd;

  if (sk_setup(s) < 0)
    goto err;

  if (do_bind)
  {
    if (bind_port)
    {
      int y = 1;

      if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &y, sizeof(y)) < 0)
	ERR2("SO_REUSEADDR");

#ifdef CONFIG_NO_IFACE_BIND
      /* Workaround missing ability to bind to an iface */
      if ((s->type == SK_UDP) && s->iface && ipa_zero(bind_addr))
      {
	if (setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &y, sizeof(y)) < 0)
	  ERR2("SO_REUSEPORT");
      }
#endif
    }
    else
      if (s->flags & SKF_HIGH_PORT)
	if (sk_set_high_port(s) < 0)
	  log(L_WARN "Socket error: %s%#m", s->err);

    sockaddr_fill(&sa, s->af, bind_addr, s->iface, bind_port);
    if (bind(fd, &sa.sa, SA_LEN(sa)) < 0)
      ERR2("bind");
  }

  if (s->password)
    if (sk_set_md5_auth(s, s->saddr, s->daddr, s->iface, s->password, 0) < 0)
      goto err;

  /* Type-specific final step: connect, listen or just allocate buffers */
  switch (s->type)
  {
  case SK_TCP_ACTIVE:
    sockaddr_fill(&sa, s->af, s->daddr, s->iface, s->dport);
    if (connect(fd, &sa.sa, SA_LEN(sa)) >= 0)
      sk_tcp_connected(s);
    else if (errno != EINTR && errno != EAGAIN && errno != EINPROGRESS &&
	     errno != ECONNREFUSED && errno != EHOSTUNREACH && errno != ENETUNREACH)
      ERR2("connect");
    break;

  case SK_TCP_PASSIVE:
    if (listen(fd, 8) < 0)
      ERR2("listen");
    break;

  case SK_SSH_ACTIVE:
  case SK_MAGIC:
    break;

  default:
    sk_alloc_bufs(s);
  }

  if (!(s->flags & SKF_THREAD))
    sk_insert(s);

  return 0;

err:
  close(fd);
  s->fd = -1;
  return -1;
}
1772
/*
 * Open a listening UNIX-domain stream socket bound to @name.
 * Returns 0 on success, -1 on failure.
 */
int
sk_open_unix(sock *s, char *name)
{
  struct sockaddr_un sa;
  int fd;

  /* We are sloppy during error (leak fd and not set s->err), but we die anyway */

  fd = socket(AF_UNIX, SOCK_STREAM, 0);
  if (fd < 0)
    return -1;

  if (fcntl(fd, F_SETFL, O_NONBLOCK) < 0)
    return -1;

  /* Path length checked in test_old_bird() */
  sa.sun_family = AF_UNIX;
  strcpy(sa.sun_path, name);

  if (bind(fd, (struct sockaddr *) &sa, SUN_LEN(&sa)) < 0)
    return -1;

  if (listen(fd, 8) < 0)
    return -1;

  s->fd = fd;
  sk_insert(s);
  return 0;
}
1802
1803
/* Upper bounds on the ancillary-data buffer sizes: RX may carry pktinfo
 * plus TTL for either address family, TX carries pktinfo only. */
#define CMSG_RX_SPACE MAX(CMSG4_SPACE_PKTINFO+CMSG4_SPACE_TTL, \
			  CMSG6_SPACE_PKTINFO+CMSG6_SPACE_TTL)
#define CMSG_TX_SPACE MAX(CMSG4_SPACE_PKTINFO,CMSG6_SPACE_PKTINFO)
1807
1808 static void
1809 sk_prepare_cmsgs(sock *s, struct msghdr *msg, void *cbuf, size_t cbuflen)
1810 {
1811 if (sk_is_ipv4(s))
1812 sk_prepare_cmsgs4(s, msg, cbuf, cbuflen);
1813 else
1814 sk_prepare_cmsgs6(s, msg, cbuf, cbuflen);
1815 }
1816
1817 static void
1818 sk_process_cmsgs(sock *s, struct msghdr *msg)
1819 {
1820 struct cmsghdr *cm;
1821
1822 s->laddr = IPA_NONE;
1823 s->lifindex = 0;
1824 s->rcv_ttl = -1;
1825
1826 for (cm = CMSG_FIRSTHDR(msg); cm != NULL; cm = CMSG_NXTHDR(msg, cm))
1827 {
1828 if ((cm->cmsg_level == SOL_IP) && sk_is_ipv4(s))
1829 {
1830 sk_process_cmsg4_pktinfo(s, cm);
1831 sk_process_cmsg4_ttl(s, cm);
1832 }
1833
1834 if ((cm->cmsg_level == SOL_IPV6) && sk_is_ipv6(s))
1835 {
1836 sk_process_cmsg6_pktinfo(s, cm);
1837 sk_process_cmsg6_ttl(s, cm);
1838 }
1839 }
1840 }
1841
1842
/*
 * Send the pending TX buffer (s->tbuf .. s->tpos) as one datagram to the
 * socket's destination, attaching a hand-built IP header (SKF_HDRINCL)
 * or source-address control message (SKF_PKTINFO) when requested.
 * Returns the sendmsg() result.
 */
static inline int
sk_sendmsg(sock *s)
{
  struct iovec iov = {s->tbuf, s->tpos - s->tbuf};
  byte cmsg_buf[CMSG_TX_SPACE];
  sockaddr dst;

  sockaddr_fill(&dst, s->af, s->daddr, s->iface, s->dport);

  struct msghdr msg = {
    .msg_name = &dst.sa,
    .msg_namelen = SA_LEN(dst),
    .msg_iov = &iov,
    .msg_iovlen = 1
  };

#ifdef CONFIG_USE_HDRINCL
  /* With IP_HDRINCL, prepend a raw 20-byte IPv4 header as a second iovec */
  byte hdr[20];
  struct iovec iov2[2] = { {hdr, 20}, iov };

  if (s->flags & SKF_HDRINCL)
  {
    sk_prepare_ip_header(s, hdr, iov.iov_len);
    msg.msg_iov = iov2;
    msg.msg_iovlen = 2;
  }
#endif

  if (s->flags & SKF_PKTINFO)
    sk_prepare_cmsgs(s, &msg, cmsg_buf, sizeof(cmsg_buf));

  return sendmsg(s->fd, &msg, 0);
}
1876
/*
 * Receive one datagram into s->rbuf, record the sender address/port in
 * s->faddr/s->fport, process ancillary data (local address, TTL) and
 * track kernel-side truncation via SKF_TRUNCATED.
 * Returns the recvmsg() result.
 */
static inline int
sk_recvmsg(sock *s)
{
  struct iovec iov = {s->rbuf, s->rbsize};
  byte cmsg_buf[CMSG_RX_SPACE];
  sockaddr src;

  struct msghdr msg = {
    .msg_name = &src.sa,
    .msg_namelen = sizeof(src), // XXXX ??
    .msg_iov = &iov,
    .msg_iovlen = 1,
    .msg_control = cmsg_buf,
    .msg_controllen = sizeof(cmsg_buf),
    .msg_flags = 0
  };

  int rv = recvmsg(s->fd, &msg, 0);
  if (rv < 0)
    return rv;

  //ifdef IPV4
  //  if (cf_type == SK_IP)
  //    rv = ipv4_skip_header(pbuf, rv);
  //endif

  sockaddr_read(&src, s->af, &s->faddr, NULL, &s->fport);
  sk_process_cmsgs(s, &msg);

  /* MSG_TRUNC means the datagram did not fit into rbuf */
  if (msg.msg_flags & MSG_TRUNC)
    s->flags |= SKF_TRUNCATED;
  else
    s->flags &= ~SKF_TRUNCATED;

  return rv;
}
1913
1914
1915 static inline void reset_tx_buffer(sock *s) { s->ttx = s->tpos = s->tbuf; }
1916
1917 static int
1918 sk_maybe_write(sock *s)
1919 {
1920 int e;
1921
1922 switch (s->type)
1923 {
1924 case SK_TCP:
1925 case SK_MAGIC:
1926 case SK_UNIX:
1927 while (s->ttx != s->tpos)
1928 {
1929 e = write(s->fd, s->ttx, s->tpos - s->ttx);
1930
1931 if (e < 0)
1932 {
1933 if (errno != EINTR && errno != EAGAIN)
1934 {
1935 reset_tx_buffer(s);
1936 /* EPIPE is just a connection close notification during TX */
1937 s->err_hook(s, (errno != EPIPE) ? errno : 0);
1938 return -1;
1939 }
1940 return 0;
1941 }
1942 s->ttx += e;
1943 }
1944 reset_tx_buffer(s);
1945 return 1;
1946
1947 #ifdef HAVE_LIBSSH
1948 case SK_SSH:
1949 while (s->ttx != s->tpos)
1950 {
1951 e = ssh_channel_write(s->ssh->channel, s->ttx, s->tpos - s->ttx);
1952
1953 if (e < 0)
1954 {
1955 s->err = ssh_get_error(s->ssh->session);
1956 s->err_hook(s, ssh_get_error_code(s->ssh->session));
1957
1958 reset_tx_buffer(s);
1959 /* EPIPE is just a connection close notification during TX */
1960 s->err_hook(s, (errno != EPIPE) ? errno : 0);
1961 return -1;
1962 }
1963 s->ttx += e;
1964 }
1965 reset_tx_buffer(s);
1966 return 1;
1967 #endif
1968
1969 case SK_UDP:
1970 case SK_IP:
1971 {
1972 if (s->tbuf == s->tpos)
1973 return 1;
1974
1975 e = sk_sendmsg(s);
1976
1977 if (e < 0)
1978 {
1979 if (errno != EINTR && errno != EAGAIN)
1980 {
1981 reset_tx_buffer(s);
1982 s->err_hook(s, errno);
1983 return -1;
1984 }
1985
1986 if (!s->tx_hook)
1987 reset_tx_buffer(s);
1988 return 0;
1989 }
1990 reset_tx_buffer(s);
1991 return 1;
1992 }
1993
1994 default:
1995 bug("sk_maybe_write: unknown socket type %d", s->type);
1996 }
1997 }
1998
1999 int
2000 sk_rx_ready(sock *s)
2001 {
2002 int rv;
2003 struct pollfd pfd = { .fd = s->fd };
2004 pfd.events |= POLLIN;
2005
2006 redo:
2007 rv = poll(&pfd, 1, 0);
2008
2009 if ((rv < 0) && (errno == EINTR || errno == EAGAIN))
2010 goto redo;
2011
2012 return rv;
2013 }
2014
2015 /**
2016 * sk_send - send data to a socket
2017 * @s: socket
2018 * @len: number of bytes to send
2019 *
2020 * This function sends @len bytes of data prepared in the
2021 * transmit buffer of the socket @s to the network connection.
2022 * If the packet can be sent immediately, it does so and returns
2023 * 1, else it queues the packet for later processing, returns 0
2024 * and calls the @tx_hook of the socket when the tranmission
2025 * takes place.
2026 */
2027 int
2028 sk_send(sock *s, unsigned len)
2029 {
2030 s->ttx = s->tbuf;
2031 s->tpos = s->tbuf + len;
2032 return sk_maybe_write(s);
2033 }
2034
2035 /**
2036 * sk_send_to - send data to a specific destination
2037 * @s: socket
2038 * @len: number of bytes to send
2039 * @addr: IP address to send the packet to
2040 * @port: port to send the packet to
2041 *
2042 * This is a sk_send() replacement for connection-less packet sockets
2043 * which allows destination of the packet to be chosen dynamically.
2044 * Raw IP sockets should use 0 for @port.
2045 */
2046 int
2047 sk_send_to(sock *s, unsigned len, ip_addr addr, unsigned port)
2048 {
2049 s->daddr = addr;
2050 if (port)
2051 s->dport = port;
2052
2053 s->ttx = s->tbuf;
2054 s->tpos = s->tbuf + len;
2055 return sk_maybe_write(s);
2056 }
2057
2058 /*
2059 int
2060 sk_send_full(sock *s, unsigned len, struct iface *ifa,
2061 ip_addr saddr, ip_addr daddr, unsigned dport)
2062 {
2063 s->iface = ifa;
2064 s->saddr = saddr;
2065 s->daddr = daddr;
2066 s->dport = dport;
2067 s->ttx = s->tbuf;
2068 s->tpos = s->tbuf + len;
2069 return sk_maybe_write(s);
2070 }
2071 */
2072
2073 static void
2074 call_rx_hook(sock *s, int size)
2075 {
2076 if (s->rx_hook(s, size))
2077 {
2078 /* We need to be careful since the socket could have been deleted by the hook */
2079 if (current_sock == s)
2080 s->rpos = s->rbuf;
2081 }
2082 }
2083
2084 #ifdef HAVE_LIBSSH
/*
 * Read available data from an established SSH channel into s->rbuf.
 * Returns 1 when progress was made (retry is worthwhile), 0 otherwise;
 * EOF and channel errors are reported through err_hook().
 */
static int
sk_read_ssh(sock *s)
{
  ssh_channel rchans[2] = { s->ssh->channel, NULL };
  struct timeval timev = { 1, 0 };

  if (ssh_channel_select(rchans, NULL, NULL, &timev) == SSH_EINTR)
    return 1; /* Try again */

  if (ssh_channel_is_eof(s->ssh->channel) != 0)
  {
    /* The remote side is closing the connection */
    s->err_hook(s, 0);
    return 0;
  }

  if (rchans[0] == NULL)
    return 0; /* No data is available on the socket */

  /* Append to whatever is already buffered between rbuf and rpos */
  const uint used_bytes = s->rpos - s->rbuf;
  const int read_bytes = ssh_channel_read_nonblocking(s->ssh->channel, s->rpos, s->rbsize - used_bytes, 0);
  if (read_bytes > 0)
  {
    /* Received data */
    s->rpos += read_bytes;
    call_rx_hook(s, used_bytes + read_bytes);
    return 1;
  }
  else if (read_bytes == 0)
  {
    if (ssh_channel_is_eof(s->ssh->channel) != 0)
    {
      /* The remote side is closing the connection */
      s->err_hook(s, 0);
    }
  }
  else
  {
    /* Negative result: report the libssh error to the owner */
    s->err = ssh_get_error(s->ssh->session);
    s->err_hook(s, ssh_get_error_code(s->ssh->session));
  }

  return 0; /* No data is available on the socket */
}
2129 #endif
2130
2131 /* sk_read() and sk_write() are called from BFD's event loop */
2132
/*
 * Handle a readable socket according to its type: accept incoming
 * connections on passive sockets, read stream data, or receive one
 * datagram. Returns nonzero when progress was made and another
 * immediate call may be useful, 0 otherwise.
 */
int
sk_read(sock *s, int revents)
{
  switch (s->type)
  {
  case SK_TCP_PASSIVE:
    return sk_passive_connected(s, SK_TCP);

  case SK_UNIX_PASSIVE:
    return sk_passive_connected(s, SK_UNIX);

  case SK_TCP:
  case SK_UNIX:
    {
      /* Append into the free tail of the RX buffer */
      int c = read(s->fd, s->rpos, s->rbuf + s->rbsize - s->rpos);

      if (c < 0)
      {
	if (errno != EINTR && errno != EAGAIN)
	  s->err_hook(s, errno);
	else if (errno == EAGAIN && !(revents & POLLIN))
	{
	  /* Spurious wakeup without POLLIN is treated as a socket error */
	  log(L_ERR "Got EAGAIN from read when revents=%x (without POLLIN)", revents);
	  s->err_hook(s, 0);
	}
      }
      else if (!c)
	/* read() returning 0 on a stream socket means EOF */
	s->err_hook(s, 0);
      else
      {
	s->rpos += c;
	call_rx_hook(s, s->rpos - s->rbuf);
	return 1;
      }
      return 0;
    }

#ifdef HAVE_LIBSSH
  case SK_SSH:
    return sk_read_ssh(s);
#endif

  case SK_MAGIC:
    return s->rx_hook(s, 0);

  default:
    {
      /* Datagram sockets (UDP, raw IP) */
      int e = sk_recvmsg(s);

      if (e < 0)
      {
	if (errno != EINTR && errno != EAGAIN)
	  s->err_hook(s, errno);
	return 0;
      }

      s->rpos = s->rbuf + e;
      s->rx_hook(s, e);
      return 1;
    }
  }
}
2195
/*
 * Handle a writable socket: finish a pending TCP or SSH connect, or
 * flush queued TX data and notify the owner via tx_hook(). Returns 1
 * when more TX work may follow immediately, 0 otherwise.
 */
int
sk_write(sock *s)
{
  switch (s->type)
  {
  case SK_TCP_ACTIVE:
    {
      sockaddr sa;
      sockaddr_fill(&sa, s->af, s->daddr, s->iface, s->dport);

      /* Re-issuing connect() reveals the result of the pending attempt;
	 EISCONN means it already succeeded */
      if (connect(s->fd, &sa.sa, SA_LEN(sa)) >= 0 || errno == EISCONN)
	sk_tcp_connected(s);
      else if (errno != EINTR && errno != EAGAIN && errno != EINPROGRESS)
	s->err_hook(s, errno);
      return 0;
    }

#ifdef HAVE_LIBSSH
  case SK_SSH_ACTIVE:
    {
      switch (sk_ssh_connect(s))
      {
      case SSH_OK:
	sk_ssh_connected(s);
	break;

      case SSH_AGAIN:
	return 1;

      case SSH_ERROR:
	s->err = ssh_get_error(s->ssh->session);
	s->err_hook(s, ssh_get_error_code(s->ssh->session));
	break;
      }
      return 0;
    }
#endif

  default:
    if (s->ttx != s->tpos && sk_maybe_write(s) > 0)
    {
      if (s->tx_hook)
	s->tx_hook(s);
      return 1;
    }
    return 0;
  }
}
2244
2245 int sk_is_ipv4(sock *s)
2246 { return s->af == AF_INET; }
2247
2248 int sk_is_ipv6(sock *s)
2249 { return s->af == AF_INET6; }
2250
2251 void
2252 sk_err(sock *s, int revents)
2253 {
2254 int se = 0, sse = sizeof(se);
2255 if ((s->type != SK_MAGIC) && (revents & POLLERR))
2256 if (getsockopt(s->fd, SOL_SOCKET, SO_ERROR, &se, &sse) < 0)
2257 {
2258 log(L_ERR "IO: Socket error: SO_ERROR: %m");
2259 se = 0;
2260 }
2261
2262 s->err_hook(s, se);
2263 }
2264
2265 void
2266 sk_dump_all(void)
2267 {
2268 node *n;
2269 sock *s;
2270
2271 debug("Open sockets:\n");
2272 WALK_LIST(n, sock_list)
2273 {
2274 s = SKIP_BACK(sock, n, n);
2275 debug("%p ", s);
2276 sk_dump(&s->r);
2277 }
2278 debug("\n");
2279 }
2280
2281
2282 /*
2283 * Internal event log and watchdog
2284 */
2285
#define EVENT_LOG_LENGTH 32

/* One record in the circular log of recently executed internal events. */
struct event_log_entry
{
  void *hook;			/* Event handler address */
  void *data;			/* Argument passed to the handler */
  btime timestamp;		/* When the event started */
  btime duration;		/* How long it ran; filled by io_update_time() */
};

static struct event_log_entry event_log[EVENT_LOG_LENGTH];	/* Circular buffer of recent events */
static struct event_log_entry *event_open;	/* Entry whose duration is still being measured, or NULL */
static int event_log_pos, event_log_num, watchdog_active;
static btime last_time;		/* Monotonic timestamp taken by the last io_update_time() */
static btime loop_time;		/* Timestamp of the current I/O loop iteration start */
2301
/*
 * Refresh @last_time from the monotonic clock and, if an event-log entry
 * is open, close it by computing its duration (warning when it exceeds
 * the configured latency limit).
 */
static void
io_update_time(void)
{
  struct timespec ts;
  int rv;

  if (!clock_monotonic_available)
    return;

  /*
   * This is third time-tracking procedure (after update_times() above and
   * times_update() in BFD), dedicated to internal event log and latency
   * tracking. Hopefully, we consolidate these sometimes.
   */

  rv = clock_gettime(CLOCK_MONOTONIC, &ts);
  if (rv < 0)
    die("clock_gettime: %m");

  /* Microsecond resolution: seconds scaled by S plus ns/1000 */
  last_time = ((s64) ts.tv_sec S) + (ts.tv_nsec / 1000);

  if (event_open)
  {
    event_open->duration = last_time - event_open->timestamp;

    if (event_open->duration > config->latency_limit)
      log(L_WARN "Event 0x%p 0x%p took %d ms",
	  event_open->hook, event_open->data, (int) (event_open->duration TO_MS));

    event_open = NULL;
  }
}
2334
2335 /**
2336 * io_log_event - mark approaching event into event log
2337 * @hook: event hook address
2338 * @data: event data address
2339 *
2340 * Store info (hook, data, timestamp) about the following internal event into
2341 * a circular event log (@event_log). When latency tracking is enabled, the log
2342 * entry is kept open (in @event_open) so the duration can be filled later.
2343 */
2344 void
2345 io_log_event(void *hook, void *data)
2346 {
2347 if (config->latency_debug)
2348 io_update_time();
2349
2350 struct event_log_entry *en = event_log + event_log_pos;
2351
2352 en->hook = hook;
2353 en->data = data;
2354 en->timestamp = last_time;
2355 en->duration = 0;
2356
2357 event_log_num++;
2358 event_log_pos++;
2359 event_log_pos %= EVENT_LOG_LENGTH;
2360
2361 event_open = config->latency_debug ? en : NULL;
2362 }
2363
2364 static inline void
2365 io_close_event(void)
2366 {
2367 if (event_open)
2368 io_update_time();
2369 }
2370
2371 void
2372 io_log_dump(void)
2373 {
2374 int i;
2375
2376 log(L_DEBUG "Event log:");
2377 for (i = 0; i < EVENT_LOG_LENGTH; i++)
2378 {
2379 struct event_log_entry *en = event_log + (event_log_pos + i) % EVENT_LOG_LENGTH;
2380 if (en->hook)
2381 log(L_DEBUG " Event 0x%p 0x%p at %8d for %d ms", en->hook, en->data,
2382 (int) ((last_time - en->timestamp) TO_MS), (int) (en->duration TO_MS));
2383 }
2384 }
2385
/* SIGALRM handler for the I/O-loop watchdog: a loop iteration exceeded
 * the configured watchdog timeout, so record timing and abort to get a
 * core dump of the stuck state. */
void
watchdog_sigalrm(int sig UNUSED)
{
  /* Update last_time and duration, but skip latency check */
  config->latency_limit = 0xffffffff;
  io_update_time();

  /* We want core dump */
  abort();
}
2396
/* Initial watchdog bookkeeping before the first loop iteration:
 * just record the loop start time, without arming the alarm. */
static inline void
watchdog_start1(void)
{
  io_update_time();

  loop_time = last_time;
}
2404
2405 static inline void
2406 watchdog_start(void)
2407 {
2408 io_update_time();
2409
2410 loop_time = last_time;
2411 event_log_num = 0;
2412
2413 if (config->watchdog_timeout)
2414 {
2415 alarm(config->watchdog_timeout);
2416 watchdog_active = 1;
2417 }
2418 }
2419
2420 static inline void
2421 watchdog_stop(void)
2422 {
2423 io_update_time();
2424
2425 if (watchdog_active)
2426 {
2427 alarm(0);
2428 watchdog_active = 0;
2429 }
2430
2431 btime duration = last_time - loop_time;
2432 if (duration > config->watchdog_warning)
2433 log(L_WARN "I/O loop cycle took %d ms for %d events",
2434 (int) (duration TO_MS), event_log_num);
2435 }
2436
2437
2438 /*
2439 * Main I/O Loop
2440 */
2441
/* Flags raised from signal handlers and polled once per io_loop() iteration */
volatile int async_config_flag;		/* Asynchronous reconfiguration/dump scheduled */
volatile int async_dump_flag;		/* Asynchronous state dump scheduled */
volatile int async_shutdown_flag;	/* Asynchronous shutdown scheduled */
2445
/* One-time initialization of the I/O subsystem: timer/socket/event lists,
 * kernel-route I/O, time keeping, and the PRNG seed. */
void
io_init(void)
{
  init_list(&near_timers);
  init_list(&far_timers);
  init_list(&sock_list);
  init_list(&global_event_list);
  krt_io_init();
  /* Times must be initialized before boot_time is sampled below */
  init_times();
  update_times();
  boot_time = now;
  srandom((int) now_real);
}
2459
/* Number of consecutive event-rich loop iterations; after SHORT_LOOP_MAX
 * of them io_loop() also serves the non-fast-rx sockets (see io_loop()). */
static int short_loops = 0;
#define SHORT_LOOP_MAX 10
2462
/*
 * The main I/O loop: runs pending events and timers, builds a pollfd set
 * from the socket list, handles asynchronous signal-driven requests
 * (reconfigure/dump/shutdown), then polls and dispatches RX/TX on active
 * sockets. Fast-rx sockets are served first with a per-socket step limit
 * (MAX_STEPS); the remaining sockets are served with a global budget
 * (MAX_RX_STEPS) and a round-robin cursor kept in @stored_sock.
 * @current_sock is the loop cursor; socket hooks may delete the socket
 * they run on, which is detected by comparing against @current_sock.
 */
void
io_loop(void)
{
  int poll_tout;
  time_t tout;
  int nfds, events, pout;
  sock *s;
  node *n;
  int fdmax = 256;
  struct pollfd *pfd = xmalloc(fdmax * sizeof(struct pollfd));

  watchdog_start1();
  for(;;)
    {
      events = ev_run_list(&global_event_list);
    timers:
      update_times();
      tout = tm_first_shot();
      if (tout <= now)
	{
	  tm_shot();
	  goto timers;
	}
      poll_tout = (events ? 0 : MIN(tout - now, 3)) * 1000; /* Time in milliseconds */

      io_close_event();

      /* Build the pollfd array from the socket list */
      nfds = 0;
      WALK_LIST(n, sock_list)
	{
	  pfd[nfds] = (struct pollfd) { .fd = -1 }; /* everything other set to 0 by this */
	  s = SKIP_BACK(sock, n, n);
	  if (s->rx_hook)
	    {
	      pfd[nfds].fd = s->fd;
	      pfd[nfds].events |= POLLIN;
	    }
	  if (s->tx_hook && s->ttx != s->tpos)
	    {
	      pfd[nfds].fd = s->fd;
	      pfd[nfds].events |= POLLOUT;
	    }
	  if (pfd[nfds].fd != -1)
	    {
	      s->index = nfds;
	      nfds++;
	    }
	  else
	    s->index = -1;

	  /* Grow the array on demand; it is never shrunk */
	  if (nfds >= fdmax)
	    {
	      fdmax *= 2;
	      pfd = xrealloc(pfd, fdmax * sizeof(struct pollfd));
	    }
	}

      /*
       * Yes, this is racy. But even if the signal comes before this test
       * and entering poll(), it gets caught on the next timer tick.
       */

      if (async_config_flag)
	{
	  io_log_event(async_config, NULL);
	  async_config();
	  async_config_flag = 0;
	  continue;
	}
      if (async_dump_flag)
	{
	  io_log_event(async_dump, NULL);
	  async_dump();
	  async_dump_flag = 0;
	  continue;
	}
      if (async_shutdown_flag)
	{
	  io_log_event(async_shutdown, NULL);
	  async_shutdown();
	  async_shutdown_flag = 0;
	  continue;
	}

      /* And finally enter poll() to find active sockets */
      watchdog_stop();
      pout = poll(pfd, nfds, poll_tout);
      watchdog_start();

      if (pout < 0)
	{
	  if (errno == EINTR || errno == EAGAIN)
	    continue;
	  die("poll: %m");
	}
      if (pout)
	{
	  /* guaranteed to be non-empty */
	  current_sock = SKIP_BACK(sock, n, HEAD(sock_list));

	  /* First pass: fast-rx sockets and all pending TX */
	  while (current_sock)
	    {
	      sock *s = current_sock;
	      if (s->index == -1)
		{
		  current_sock = sk_next(s);
		  goto next;
		}

	      int e;
	      int steps;

	      steps = MAX_STEPS;
	      if (s->fast_rx && (pfd[s->index].revents & POLLIN) && s->rx_hook)
		do
		  {
		    steps--;
		    io_log_event(s->rx_hook, s->data);
		    e = sk_read(s, pfd[s->index].revents);
		    /* The hook may have deleted this socket */
		    if (s != current_sock)
		      goto next;
		  }
		while (e && s->rx_hook && steps);

	      steps = MAX_STEPS;
	      if (pfd[s->index].revents & POLLOUT)
		do
		  {
		    steps--;
		    io_log_event(s->tx_hook, s->data);
		    e = sk_write(s);
		    if (s != current_sock)
		      goto next;
		  }
		while (e && steps);

	      current_sock = sk_next(s);
	    next: ;
	    }

	  /* When events keep arriving, delay slow RX processing for a
	     few iterations to keep latencies low */
	  short_loops++;
	  if (events && (short_loops < SHORT_LOOP_MAX))
	    continue;
	  short_loops = 0;

	  /* Second pass: non-fast-rx sockets, round-robin from stored_sock,
	     limited to MAX_RX_STEPS costly RX callbacks per iteration */
	  int count = 0;
	  current_sock = stored_sock;
	  if (current_sock == NULL)
	    current_sock = SKIP_BACK(sock, n, HEAD(sock_list));

	  while (current_sock && count < MAX_RX_STEPS)
	    {
	      sock *s = current_sock;
	      if (s->index == -1)
		{
		  current_sock = sk_next(s);
		  goto next2;
		}

	      if (!s->fast_rx && (pfd[s->index].revents & POLLIN) && s->rx_hook)
		{
		  count++;
		  io_log_event(s->rx_hook, s->data);
		  sk_read(s, pfd[s->index].revents);
		  if (s != current_sock)
		    goto next2;
		}

	      if (pfd[s->index].revents & (POLLHUP | POLLERR))
		{
		  sk_err(s, pfd[s->index].revents);
		  goto next2;
		}

	      current_sock = sk_next(s);
	    next2: ;
	    }


	  stored_sock = current_sock;
	}
    }
}
2646
/*
 * Check whether another BIRD instance is already listening on the UNIX
 * control socket @path; die() if so (or if the check cannot be set up).
 * Also validates the path length relied upon by sk_open_unix().
 */
void
test_old_bird(char *path)
{
  int fd;
  struct sockaddr_un sa;

  fd = socket(AF_UNIX, SOCK_STREAM, 0);
  if (fd < 0)
    die("Cannot create socket: %m");
  if (strlen(path) >= sizeof(sa.sun_path))
    die("Socket path too long");
  /* memset replaces the legacy bzero(), which POSIX.1-2008 removed */
  memset(&sa, 0, sizeof(sa));
  sa.sun_family = AF_UNIX;
  strcpy(sa.sun_path, path);
  if (connect(fd, (struct sockaddr *) &sa, SUN_LEN(&sa)) == 0)
    die("I found another BIRD running.");
  close(fd);
}