1 /*
2 * BIRD Internet Routing Daemon -- Unix I/O
3 *
4 * (c) 1998--2004 Martin Mares <mj@ucw.cz>
5 * (c) 2004 Ondrej Filip <feela@network.cz>
6 *
7 * Can be freely distributed and used under the terms of the GNU GPL.
8 */
9
10 /* Unfortunately, some glibc versions hide parts of the RFC 3542 API
11 if _GNU_SOURCE is not defined. */
12 #ifndef _GNU_SOURCE
13 #define _GNU_SOURCE
14 #endif
15
16 #include <stdio.h>
17 #include <stdlib.h>
18 #include <time.h>
19 #include <sys/time.h>
20 #include <sys/types.h>
21 #include <sys/socket.h>
22 #include <sys/uio.h>
23 #include <sys/un.h>
24 #include <poll.h>
25 #include <unistd.h>
26 #include <fcntl.h>
27 #include <errno.h>
28 #include <net/if.h>
29 #include <netinet/in.h>
30 #include <netinet/tcp.h>
31 #include <netinet/udp.h>
32 #include <netinet/icmp6.h>
33
34 #include "nest/bird.h"
35 #include "lib/lists.h"
36 #include "lib/resource.h"
37 #include "sysdep/unix/timer.h"
38 #include "lib/socket.h"
39 #include "lib/event.h"
40 #include "lib/string.h"
41 #include "nest/iface.h"
42
43 #include "sysdep/unix/unix.h"
44 #include CONFIG_INCLUDE_SYSIO_H
45
46 /* Maximum number of calls of tx handler for one socket in one
47 * poll iteration. Should be small enough that one protocol instance
48 * cannot monopolize the CPU.
49 */
50 #define MAX_STEPS 4
51
52 /* Maximum number of calls of rx handler for all sockets in one poll
53 iteration. RX callbacks are often much more costly so we limit
54 this to get small latencies */
55 #define MAX_RX_STEPS 4
56
57 /*
58 * Tracked Files
59 */
60
61 struct rfile {
62 resource r;
63 FILE *f;
64 };
65
66 static void
67 rf_free(resource *r)
68 {
69 struct rfile *a = (struct rfile *) r;
70
71 fclose(a->f);
72 }
73
74 static void
75 rf_dump(resource *r)
76 {
77 struct rfile *a = (struct rfile *) r;
78
79 debug("(FILE *%p)\n", a->f);
80 }
81
82 static struct resclass rf_class = {
83 "FILE",
84 sizeof(struct rfile),
85 rf_free,
86 rf_dump,
87 NULL,
88 NULL
89 };
90
91 void *
92 tracked_fopen(pool *p, char *name, char *mode)
93 {
94 FILE *f = fopen(name, mode);
95
96 if (f)
97 {
98 struct rfile *r = ralloc(p, &rf_class);
99 r->f = f;
100 }
101 return f;
102 }
103
104 /**
105 * DOC: Timers
106 *
107 * Timers are resources which represent a request by a module to have
108 * a function called at a specified time. The platform-dependent code
109 * doesn't guarantee exact timing, only that a timer function
110 * won't be called before the requested time.
111 *
112 * In BIRD, time is represented by values of the &bird_clock_t type,
113 * which are integers interpreted as a relative number of seconds since
114 * some fixed point in the past. The current time can be read
115 * from the variable @now with reasonable accuracy; it is monotonic. There is also
116 * a current 'absolute' time in the variable @now_real reported by the OS.
117 *
118 * Each timer is described by a &timer structure containing a pointer
119 * to the handler function (@hook), data private to this function (@data),
120 * the time the function should be called at (@expires, 0 for inactive timers);
121 * for the other fields see |timer.h|.
122 */
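/*
 * Illustrative note (not part of the original source): since @now is a
 * monotonic, relative second counter, time arithmetic here is plain integer
 * arithmetic. For instance, tm_dump() below prints the remaining lifetime
 * of an active timer as t->expires - now, and tm_start() computes the
 * absolute expiry as now + after.
 */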
123
124 #define NEAR_TIMER_LIMIT 4
125
126 static list near_timers, far_timers;
127 static bird_clock_t first_far_timer = TIME_INFINITY;
128
129 /* now must be different from 0, because 0 is a special value in timer->expires */
130 bird_clock_t now = 1, now_real, boot_time;
131
132 static void
133 update_times_plain(void)
134 {
135 bird_clock_t new_time = time(NULL);
136 int delta = new_time - now_real;
137
138 if ((delta >= 0) && (delta < 60))
139 now += delta;
140 else if (now_real != 0)
141 log(L_WARN "Time jump, delta %d s", delta);
142
143 now_real = new_time;
144 }
145
146 static void
147 update_times_gettime(void)
148 {
149 struct timespec ts;
150 int rv;
151
152 rv = clock_gettime(CLOCK_MONOTONIC, &ts);
153 if (rv != 0)
154 die("clock_gettime: %m");
155
156 if (ts.tv_sec != now) {
157 if (ts.tv_sec < now)
158 log(L_ERR "Monotonic timer is broken");
159
160 now = ts.tv_sec;
161 now_real = time(NULL);
162 }
163 }
164
165 static int clock_monotonic_available;
166
167 static inline void
168 update_times(void)
169 {
170 if (clock_monotonic_available)
171 update_times_gettime();
172 else
173 update_times_plain();
174 }
175
176 static inline void
177 init_times(void)
178 {
179 struct timespec ts;
180 clock_monotonic_available = (clock_gettime(CLOCK_MONOTONIC, &ts) == 0);
181 if (!clock_monotonic_available)
182 log(L_WARN "Monotonic timer is missing");
183 }
184
185
186 static void
187 tm_free(resource *r)
188 {
189 timer *t = (timer *) r;
190
191 tm_stop(t);
192 }
193
194 static void
195 tm_dump(resource *r)
196 {
197 timer *t = (timer *) r;
198
199 debug("(code %p, data %p, ", t->hook, t->data);
200 if (t->randomize)
201 debug("rand %d, ", t->randomize);
202 if (t->recurrent)
203 debug("recur %d, ", t->recurrent);
204 if (t->expires)
205 debug("expires in %d sec)\n", t->expires - now);
206 else
207 debug("inactive)\n");
208 }
209
210 static struct resclass tm_class = {
211 "Timer",
212 sizeof(timer),
213 tm_free,
214 tm_dump,
215 NULL,
216 NULL
217 };
218
219 /**
220 * tm_new - create a timer
221 * @p: pool
222 *
223 * This function creates a new timer resource and returns
224 * a pointer to it. To use the timer, you need to fill in
225 * the structure fields and call tm_start() to start timing.
226 */
227 timer *
228 tm_new(pool *p)
229 {
230 timer *t = ralloc(p, &tm_class);
231 return t;
232 }
233
234 static inline void
235 tm_insert_near(timer *t)
236 {
237 node *n = HEAD(near_timers);
238
239 while (n->next && (SKIP_BACK(timer, n, n)->expires < t->expires))
240 n = n->next;
241 insert_node(&t->n, n->prev);
242 }
243
244 /**
245 * tm_start - start a timer
246 * @t: timer
247 * @after: number of seconds the timer should be run after
248 *
249 * This function schedules the hook function of the timer to
250 * be called after @after seconds. If the timer has already been
251 * started, its @expires time is replaced by the new value.
252 *
253 * If you have set the @randomize field of @t, the timeout
254 * will be increased by a random number of seconds chosen
255 * uniformly from the range 0 .. @randomize.
256 *
257 * You can call tm_start() from the handler function of the timer
258 * to request another run of the timer. Also, you can set the @recurrent
259 * field to have the timer re-added automatically with the same timeout.
260 */
261 void
262 tm_start(timer *t, unsigned after)
263 {
264 bird_clock_t when;
265
266 if (t->randomize)
267 after += random() % (t->randomize + 1);
268 when = now + after;
269 if (t->expires == when)
270 return;
271 if (t->expires)
272 rem_node(&t->n);
273 t->expires = when;
274 if (after <= NEAR_TIMER_LIMIT)
275 tm_insert_near(t);
276 else
277 {
278 if (!first_far_timer || first_far_timer > when)
279 first_far_timer = when;
280 add_tail(&far_timers, &t->n);
281 }
282 }
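/*
 * Usage sketch (illustrative, not part of the original source): a module
 * allocates a timer from its pool, fills in the hook and data fields and
 * starts it; with @recurrent set, tm_shot() re-schedules it automatically
 * after each expiry. Assuming a hypothetical hook function hello_hook(timer *t)
 * and data pointer my_proto:
 *
 *   timer *t = tm_new(p);
 *   t->hook = hello_hook;
 *   t->data = my_proto;
 *   t->recurrent = 10;
 *   t->randomize = 2;
 *   tm_start(t, 10);
 */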
283
284 /**
285 * tm_stop - stop a timer
286 * @t: timer
287 *
288 * This function stops a timer. If the timer is already stopped,
289 * nothing happens.
290 */
291 void
292 tm_stop(timer *t)
293 {
294 if (t->expires)
295 {
296 rem_node(&t->n);
297 t->expires = 0;
298 }
299 }
300
301 static void
302 tm_dump_them(char *name, list *l)
303 {
304 node *n;
305 timer *t;
306
307 debug("%s timers:\n", name);
308 WALK_LIST(n, *l)
309 {
310 t = SKIP_BACK(timer, n, n);
311 debug("%p ", t);
312 tm_dump(&t->r);
313 }
314 debug("\n");
315 }
316
317 void
318 tm_dump_all(void)
319 {
320 tm_dump_them("Near", &near_timers);
321 tm_dump_them("Far", &far_timers);
322 }
323
324 static inline time_t
325 tm_first_shot(void)
326 {
327 time_t x = first_far_timer;
328
329 if (!EMPTY_LIST(near_timers))
330 {
331 timer *t = SKIP_BACK(timer, n, HEAD(near_timers));
332 if (t->expires < x)
333 x = t->expires;
334 }
335 return x;
336 }
337
338 void io_log_event(void *hook, void *data);
339
340 static void
341 tm_shot(void)
342 {
343 timer *t;
344 node *n, *m;
345
346 if (first_far_timer <= now)
347 {
348 bird_clock_t limit = now + NEAR_TIMER_LIMIT;
349 first_far_timer = TIME_INFINITY;
350 n = HEAD(far_timers);
351 while (m = n->next)
352 {
353 t = SKIP_BACK(timer, n, n);
354 if (t->expires <= limit)
355 {
356 rem_node(n);
357 tm_insert_near(t);
358 }
359 else if (t->expires < first_far_timer)
360 first_far_timer = t->expires;
361 n = m;
362 }
363 }
364 while ((n = HEAD(near_timers)) -> next)
365 {
366 int delay;
367 t = SKIP_BACK(timer, n, n);
368 if (t->expires > now)
369 break;
370 rem_node(n);
371 delay = t->expires - now;
372 t->expires = 0;
373 if (t->recurrent)
374 {
375 int i = t->recurrent - delay;
376 if (i < 0)
377 i = 0;
378 tm_start(t, i);
379 }
380 io_log_event(t->hook, t->data);
381 t->hook(t);
382 }
383 }
384
385 /**
386 * tm_parse_datetime - parse a date and time
387 * @x: datetime string
388 *
389 * tm_parse_datetime() takes a textual representation of
390 * a date and time (dd-mm-yyyy hh:mm:ss)
391 * and converts it to the corresponding value of type &bird_clock_t.
392 */
393 bird_clock_t
394 tm_parse_datetime(char *x)
395 {
396 struct tm tm;
397 int n;
398 time_t t;
399
400 if (sscanf(x, "%d-%d-%d %d:%d:%d%n", &tm.tm_mday, &tm.tm_mon, &tm.tm_year, &tm.tm_hour, &tm.tm_min, &tm.tm_sec, &n) != 6 || x[n])
401 return tm_parse_date(x);
402 tm.tm_mon--;
403 tm.tm_year -= 1900;
404 t = mktime(&tm);
405 if (t == (time_t) -1)
406 return 0;
407 return t;
408 }
409 /**
410 * tm_parse_date - parse a date
411 * @x: date string
412 *
413 * tm_parse_date() takes a textual representation of a date (dd-mm-yyyy)
414 * and converts it to the corresponding value of type &bird_clock_t.
415 */
416 bird_clock_t
417 tm_parse_date(char *x)
418 {
419 struct tm tm;
420 int n;
421 time_t t;
422
423 if (sscanf(x, "%d-%d-%d%n", &tm.tm_mday, &tm.tm_mon, &tm.tm_year, &n) != 3 || x[n])
424 return 0;
425 tm.tm_mon--;
426 tm.tm_year -= 1900;
427 tm.tm_hour = tm.tm_min = tm.tm_sec = 0;
428 t = mktime(&tm);
429 if (t == (time_t) -1)
430 return 0;
431 return t;
432 }
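/*
 * Example (illustrative, not part of the original source): both parsers use
 * day-month-year order, so
 *
 *   bird_clock_t d  = tm_parse_date("15-06-2017");
 *   bird_clock_t dt = tm_parse_datetime("15-06-2017 12:30:00");
 *
 * yield the corresponding mktime() values (interpreted in the local time
 * zone), or 0 when the string is malformed.
 */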
433
434 static void
435 tm_format_reltime(char *x, struct tm *tm, bird_clock_t delta)
436 {
437 static char *month_names[12] = { "Jan", "Feb", "Mar", "Apr", "May", "Jun",
438 "Jul", "Aug", "Sep", "Oct", "Nov", "Dec" };
439
440 if (delta < 20*3600)
441 bsprintf(x, "%02d:%02d", tm->tm_hour, tm->tm_min);
442 else if (delta < 360*86400)
443 bsprintf(x, "%s%02d", month_names[tm->tm_mon], tm->tm_mday);
444 else
445 bsprintf(x, "%d", tm->tm_year+1900);
446 }
447
448 #include "conf/conf.h"
449
450 /**
451 * tm_format_datetime - convert date and time to textual representation
452 * @x: destination buffer of size %TM_DATETIME_BUFFER_SIZE
453 * @fmt_spec: specification of resulting textual representation of the time
454 * @t: time
455 *
456 * This function converts the given relative time value @t to real time
457 * and formats it as a textual date/time representation (dd-mm-yyyy hh:mm:ss).
458 */
459 void
460 tm_format_datetime(char *x, struct timeformat *fmt_spec, bird_clock_t t)
461 {
462 const char *fmt_used;
463 struct tm *tm;
464 bird_clock_t delta = now - t;
465 t = now_real - delta;
466 tm = localtime(&t);
467
468 if (fmt_spec->fmt1 == NULL)
469 return tm_format_reltime(x, tm, delta);
470
471 if ((fmt_spec->limit == 0) || (delta < fmt_spec->limit))
472 fmt_used = fmt_spec->fmt1;
473 else
474 fmt_used = fmt_spec->fmt2;
475
476 int rv = strftime(x, TM_DATETIME_BUFFER_SIZE, fmt_used, tm);
477 if (((rv == 0) && fmt_used[0]) || (rv == TM_DATETIME_BUFFER_SIZE))
478 strcpy(x, "<too-long>");
479 }
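/*
 * Usage sketch (illustrative, not part of the original source), assuming a
 * struct timeformat with the fields used above (fmt1, fmt2, limit):
 *
 *   char buf[TM_DATETIME_BUFFER_SIZE];
 *   struct timeformat tf = { .fmt1 = "%d-%m-%Y %H:%M:%S", .fmt2 = NULL, .limit = 0 };
 *   tm_format_datetime(buf, &tf, now);
 *
 * With fmt1 == NULL the short relative form from tm_format_reltime() is
 * used instead (hh:mm, month+day, or just the year, depending on age).
 */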
480
481
482 /**
483 * DOC: Sockets
484 *
485 * Socket resources represent network connections. Their data structure (&socket)
486 * contains a lot of fields defining the exact type of the socket, the local and
487 * remote addresses and ports, pointers to socket buffers and finally pointers to
488 * hook functions to be called when new data have arrived in the receive buffer
489 * (@rx_hook), when the contents of the transmit buffer have been transmitted
490 * (@tx_hook) and when an error or connection close occurs (@err_hook).
491 *
492 * Freeing of sockets from inside socket hooks is perfectly safe.
493 */
494
495 #ifndef SOL_IP
496 #define SOL_IP IPPROTO_IP
497 #endif
498
499 #ifndef SOL_IPV6
500 #define SOL_IPV6 IPPROTO_IPV6
501 #endif
502
503 #ifndef SOL_ICMPV6
504 #define SOL_ICMPV6 IPPROTO_ICMPV6
505 #endif
506
507
508 /*
509 * Sockaddr helper functions
510 */
511
512 static inline int UNUSED sockaddr_length(int af)
513 { return (af == AF_INET) ? sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6); }
514
515 static inline void
516 sockaddr_fill4(struct sockaddr_in *sa, ip_addr a, uint port)
517 {
518 memset(sa, 0, sizeof(struct sockaddr_in));
519 #ifdef HAVE_STRUCT_SOCKADDR_SA_LEN
520 sa->sin_len = sizeof(struct sockaddr_in);
521 #endif
522 sa->sin_family = AF_INET;
523 sa->sin_port = htons(port);
524 sa->sin_addr = ipa_to_in4(a);
525 }
526
527 static inline void
528 sockaddr_fill6(struct sockaddr_in6 *sa, ip_addr a, struct iface *ifa, uint port)
529 {
530 memset(sa, 0, sizeof(struct sockaddr_in6));
531 #ifdef SIN6_LEN
532 sa->sin6_len = sizeof(struct sockaddr_in6);
533 #endif
534 sa->sin6_family = AF_INET6;
535 sa->sin6_port = htons(port);
536 sa->sin6_flowinfo = 0;
537 sa->sin6_addr = ipa_to_in6(a);
538
539 if (ifa && ipa_is_link_local(a))
540 sa->sin6_scope_id = ifa->index;
541 }
542
543 void
544 sockaddr_fill(sockaddr *sa, int af, ip_addr a, struct iface *ifa, uint port)
545 {
546 if (af == AF_INET)
547 sockaddr_fill4((struct sockaddr_in *) sa, a, port);
548 else if (af == AF_INET6)
549 sockaddr_fill6((struct sockaddr_in6 *) sa, a, ifa, port);
550 else
551 bug("Unknown AF");
552 }
553
554 static inline void
555 sockaddr_read4(struct sockaddr_in *sa, ip_addr *a, uint *port)
556 {
557 *port = ntohs(sa->sin_port);
558 *a = ipa_from_in4(sa->sin_addr);
559 }
560
561 static inline void
562 sockaddr_read6(struct sockaddr_in6 *sa, ip_addr *a, struct iface **ifa, uint *port)
563 {
564 *port = ntohs(sa->sin6_port);
565 *a = ipa_from_in6(sa->sin6_addr);
566
567 if (ifa && ipa_is_link_local(*a))
568 *ifa = if_find_by_index(sa->sin6_scope_id);
569 }
570
571 int
572 sockaddr_read(sockaddr *sa, int af, ip_addr *a, struct iface **ifa, uint *port)
573 {
574 if (sa->sa.sa_family != af)
575 goto fail;
576
577 if (af == AF_INET)
578 sockaddr_read4((struct sockaddr_in *) sa, a, port);
579 else if (af == AF_INET6)
580 sockaddr_read6((struct sockaddr_in6 *) sa, a, ifa, port);
581 else
582 goto fail;
583
584 return 0;
585
586 fail:
587 *a = IPA_NONE;
588 *port = 0;
589 return -1;
590 }
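/*
 * Round-trip sketch (illustrative, not part of the original source):
 * sockaddr_fill() and sockaddr_read() are inverses for a given address
 * family, e.g. for a hypothetical IPv4 peer address peer_addr:
 *
 *   sockaddr sa;
 *   ip_addr a;
 *   uint port;
 *
 *   sockaddr_fill(&sa, AF_INET, peer_addr, NULL, 179);
 *   if (sockaddr_read(&sa, AF_INET, &a, NULL, &port) < 0)
 *     bug("mismatched address family");
 *
 * For IPv6 link-local addresses the iface argument supplies or recovers
 * the scope id.
 */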
591
592
593 /*
594 * IPv6 multicast syscalls
595 */
596
597 /* Fortunately standardized in RFC 3493 */
598
599 #define INIT_MREQ6(maddr,ifa) \
600 { .ipv6mr_multiaddr = ipa_to_in6(maddr), .ipv6mr_interface = ifa->index }
601
602 static inline int
603 sk_setup_multicast6(sock *s)
604 {
605 int index = s->iface->index;
606 int ttl = s->ttl;
607 int n = 0;
608
609 if (setsockopt(s->fd, SOL_IPV6, IPV6_MULTICAST_IF, &index, sizeof(index)) < 0)
610 ERR("IPV6_MULTICAST_IF");
611
612 if (setsockopt(s->fd, SOL_IPV6, IPV6_MULTICAST_HOPS, &ttl, sizeof(ttl)) < 0)
613 ERR("IPV6_MULTICAST_HOPS");
614
615 if (setsockopt(s->fd, SOL_IPV6, IPV6_MULTICAST_LOOP, &n, sizeof(n)) < 0)
616 ERR("IPV6_MULTICAST_LOOP");
617
618 return 0;
619 }
620
621 static inline int
622 sk_join_group6(sock *s, ip_addr maddr)
623 {
624 struct ipv6_mreq mr = INIT_MREQ6(maddr, s->iface);
625
626 if (setsockopt(s->fd, SOL_IPV6, IPV6_JOIN_GROUP, &mr, sizeof(mr)) < 0)
627 ERR("IPV6_JOIN_GROUP");
628
629 return 0;
630 }
631
632 static inline int
633 sk_leave_group6(sock *s, ip_addr maddr)
634 {
635 struct ipv6_mreq mr = INIT_MREQ6(maddr, s->iface);
636
637 if (setsockopt(s->fd, SOL_IPV6, IPV6_LEAVE_GROUP, &mr, sizeof(mr)) < 0)
638 ERR("IPV6_LEAVE_GROUP");
639
640 return 0;
641 }
642
643
644 /*
645 * IPv6 packet control messages
646 */
647
648 /* Also standardized, in RFC 3542 */
649
650 /*
651 * RFC 2292 uses IPV6_PKTINFO for both the socket option and the cmsg
652 * type, RFC 3542 changed the socket option to IPV6_RECVPKTINFO. If we
653 * don't have IPV6_RECVPKTINFO we suppose the OS implements the older
654 * RFC and we use IPV6_PKTINFO.
655 */
656 #ifndef IPV6_RECVPKTINFO
657 #define IPV6_RECVPKTINFO IPV6_PKTINFO
658 #endif
659 /*
660 * Same goes for IPV6_HOPLIMIT -> IPV6_RECVHOPLIMIT.
661 */
662 #ifndef IPV6_RECVHOPLIMIT
663 #define IPV6_RECVHOPLIMIT IPV6_HOPLIMIT
664 #endif
665
666
667 #define CMSG6_SPACE_PKTINFO CMSG_SPACE(sizeof(struct in6_pktinfo))
668 #define CMSG6_SPACE_TTL CMSG_SPACE(sizeof(int))
669
670 static inline int
671 sk_request_cmsg6_pktinfo(sock *s)
672 {
673 int y = 1;
674
675 if (setsockopt(s->fd, SOL_IPV6, IPV6_RECVPKTINFO, &y, sizeof(y)) < 0)
676 ERR("IPV6_RECVPKTINFO");
677
678 return 0;
679 }
680
681 static inline int
682 sk_request_cmsg6_ttl(sock *s)
683 {
684 int y = 1;
685
686 if (setsockopt(s->fd, SOL_IPV6, IPV6_RECVHOPLIMIT, &y, sizeof(y)) < 0)
687 ERR("IPV6_RECVHOPLIMIT");
688
689 return 0;
690 }
691
692 static inline void
693 sk_process_cmsg6_pktinfo(sock *s, struct cmsghdr *cm)
694 {
695 if (cm->cmsg_type == IPV6_PKTINFO)
696 {
697 struct in6_pktinfo *pi = (struct in6_pktinfo *) CMSG_DATA(cm);
698 s->laddr = ipa_from_in6(pi->ipi6_addr);
699 s->lifindex = pi->ipi6_ifindex;
700 }
701 }
702
703 static inline void
704 sk_process_cmsg6_ttl(sock *s, struct cmsghdr *cm)
705 {
706 if (cm->cmsg_type == IPV6_HOPLIMIT)
707 s->rcv_ttl = * (int *) CMSG_DATA(cm);
708 }
709
710 static inline void
711 sk_prepare_cmsgs6(sock *s, struct msghdr *msg, void *cbuf, size_t cbuflen)
712 {
713 struct cmsghdr *cm;
714 struct in6_pktinfo *pi;
715 int controllen = 0;
716
717 msg->msg_control = cbuf;
718 msg->msg_controllen = cbuflen;
719
720 cm = CMSG_FIRSTHDR(msg);
721 cm->cmsg_level = SOL_IPV6;
722 cm->cmsg_type = IPV6_PKTINFO;
723 cm->cmsg_len = CMSG_LEN(sizeof(*pi));
724 controllen += CMSG_SPACE(sizeof(*pi));
725
726 pi = (struct in6_pktinfo *) CMSG_DATA(cm);
727 pi->ipi6_ifindex = s->iface ? s->iface->index : 0;
728 pi->ipi6_addr = ipa_to_in6(s->saddr);
729
730 msg->msg_controllen = controllen;
731 }
732
733
734 /*
735 * Miscellaneous socket syscalls
736 */
737
738 static inline int
739 sk_set_ttl4(sock *s, int ttl)
740 {
741 if (setsockopt(s->fd, SOL_IP, IP_TTL, &ttl, sizeof(ttl)) < 0)
742 ERR("IP_TTL");
743
744 return 0;
745 }
746
747 static inline int
748 sk_set_ttl6(sock *s, int ttl)
749 {
750 if (setsockopt(s->fd, SOL_IPV6, IPV6_UNICAST_HOPS, &ttl, sizeof(ttl)) < 0)
751 ERR("IPV6_UNICAST_HOPS");
752
753 return 0;
754 }
755
756 static inline int
757 sk_set_tos4(sock *s, int tos)
758 {
759 if (setsockopt(s->fd, SOL_IP, IP_TOS, &tos, sizeof(tos)) < 0)
760 ERR("IP_TOS");
761
762 return 0;
763 }
764
765 static inline int
766 sk_set_tos6(sock *s, int tos)
767 {
768 if (setsockopt(s->fd, SOL_IPV6, IPV6_TCLASS, &tos, sizeof(tos)) < 0)
769 ERR("IPV6_TCLASS");
770
771 return 0;
772 }
773
774 static inline int
775 sk_set_high_port(sock *s UNUSED)
776 {
777 /* Port range setting is optional, ignore it if not supported */
778
779 #ifdef IP_PORTRANGE
780 if (sk_is_ipv4(s))
781 {
782 int range = IP_PORTRANGE_HIGH;
783 if (setsockopt(s->fd, SOL_IP, IP_PORTRANGE, &range, sizeof(range)) < 0)
784 ERR("IP_PORTRANGE");
785 }
786 #endif
787
788 #ifdef IPV6_PORTRANGE
789 if (sk_is_ipv6(s))
790 {
791 int range = IPV6_PORTRANGE_HIGH;
792 if (setsockopt(s->fd, SOL_IPV6, IPV6_PORTRANGE, &range, sizeof(range)) < 0)
793 ERR("IPV6_PORTRANGE");
794 }
795 #endif
796
797 return 0;
798 }
799
800 static inline byte *
801 sk_skip_ip_header(byte *pkt, int *len)
802 {
803 if ((*len < 20) || ((*pkt & 0xf0) != 0x40))
804 return NULL;
805
806 int hlen = (*pkt & 0x0f) * 4;
807 if ((hlen < 20) || (hlen > *len))
808 return NULL;
809
810 *len -= hlen;
811 return pkt + hlen;
812 }
813
814 byte *
815 sk_rx_buffer(sock *s, int *len)
816 {
817 if (sk_is_ipv4(s) && (s->type == SK_IP))
818 return sk_skip_ip_header(s->rbuf, len);
819 else
820 return s->rbuf;
821 }
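/*
 * Worked example (illustrative, not part of the original source): for a raw
 * IPv4 socket the kernel hands us the packet including its IP header, so
 * sk_skip_ip_header() peels it off. A typical first byte of 0x45 means
 * version 4, IHL 5, hence hlen = 5 * 4 = 20 bytes are skipped and *len is
 * reduced accordingly; a version nibble other than 4 or an IHL below 5
 * makes the function return NULL.
 */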
822
823
824 /*
825 * Public socket functions
826 */
827
828 /**
829 * sk_setup_multicast - enable multicast for given socket
830 * @s: socket
831 *
832 * Prepare transmission of multicast packets for given datagram socket.
833 * The socket must have defined @iface.
834 *
835 * Result: 0 for success, -1 for an error.
836 */
837
838 int
839 sk_setup_multicast(sock *s)
840 {
841 ASSERT(s->iface);
842
843 if (sk_is_ipv4(s))
844 return sk_setup_multicast4(s);
845 else
846 return sk_setup_multicast6(s);
847 }
848
849 /**
850 * sk_join_group - join multicast group for given socket
851 * @s: socket
852 * @maddr: multicast address
853 *
854 * Join multicast group for given datagram socket and associated interface.
855 * The socket must have defined @iface.
856 *
857 * Result: 0 for success, -1 for an error.
858 */
859
860 int
861 sk_join_group(sock *s, ip_addr maddr)
862 {
863 if (sk_is_ipv4(s))
864 return sk_join_group4(s, maddr);
865 else
866 return sk_join_group6(s, maddr);
867 }
868
869 /**
870 * sk_leave_group - leave multicast group for given socket
871 * @s: socket
872 * @maddr: multicast address
873 *
874 * Leave multicast group for given datagram socket and associated interface.
875 * The socket must have defined @iface.
876 *
877 * Result: 0 for success, -1 for an error.
878 */
879
880 int
881 sk_leave_group(sock *s, ip_addr maddr)
882 {
883 if (sk_is_ipv4(s))
884 return sk_leave_group4(s, maddr);
885 else
886 return sk_leave_group6(s, maddr);
887 }
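/*
 * Usage sketch (illustrative, not part of the original source): a protocol
 * using a multicast datagram socket typically sets s->iface before opening
 * it and then enables multicast and joins its group, e.g.
 *
 *   if (sk_setup_multicast(s) < 0 || sk_join_group(s, group_addr) < 0)
 *     log(L_ERR "%s", s->err);
 *
 * where group_addr stands for the protocol's multicast address; the
 * matching sk_leave_group() call undoes the join.
 */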
888
889 /**
890 * sk_setup_broadcast - enable broadcast for given socket
891 * @s: socket
892 *
893 * Allow reception and transmission of broadcast packets for given datagram
894 * socket. The socket must have defined @iface. For transmission, packets should
895 * be sent to the @brd address of @iface.
896 *
897 * Result: 0 for success, -1 for an error.
898 */
899
900 int
901 sk_setup_broadcast(sock *s)
902 {
903 int y = 1;
904
905 if (setsockopt(s->fd, SOL_SOCKET, SO_BROADCAST, &y, sizeof(y)) < 0)
906 ERR("SO_BROADCAST");
907
908 return 0;
909 }
910
911 /**
912 * sk_set_ttl - set transmit TTL for given socket
913 * @s: socket
914 * @ttl: TTL value
915 *
916 * Set TTL for already opened connections when TTL was not set before. Useful
917 * for accepted connections when different ones should have different TTL.
918 *
919 * Result: 0 for success, -1 for an error.
920 */
921
922 int
923 sk_set_ttl(sock *s, int ttl)
924 {
925 s->ttl = ttl;
926
927 if (sk_is_ipv4(s))
928 return sk_set_ttl4(s, ttl);
929 else
930 return sk_set_ttl6(s, ttl);
931 }
932
933 /**
934 * sk_set_min_ttl - set minimal accepted TTL for given socket
935 * @s: socket
936 * @ttl: TTL value
937 *
938 * Set minimal accepted TTL for given socket. Can be used for TTL security
939 * implementations.
940 *
941 * Result: 0 for success, -1 for an error.
942 */
943
944 int
945 sk_set_min_ttl(sock *s, int ttl)
946 {
947 if (sk_is_ipv4(s))
948 return sk_set_min_ttl4(s, ttl);
949 else
950 return sk_set_min_ttl6(s, ttl);
951 }
952
953 #if 0
954 /**
955 * sk_set_md5_auth - add / remove MD5 security association for given socket
956 * @s: socket
957 * @local: IP address of local side
958 * @remote: IP address of remote side
959 * @ifa: Interface for link-local IP address
960 * @passwd: Password used for MD5 authentication
961 * @setkey: Update also system SA/SP database
962 *
963 * In the kernel's TCP MD5 handling code, there is a set of security associations
964 * used for choosing the password and other authentication parameters according to
965 * the local and remote address. This function is useful for listening sockets;
966 * for active sockets it may be enough to set the s->password field.
967 *
968 * When called with passwd != NULL, the new pair is added.
969 * When called with passwd == NULL, the existing pair is removed.
970 *
971 * Note that while in Linux the MD5 SAs are specific to the socket, in BSD they are
972 * stored in a global SA/SP database (but the behavior must also be enabled on a
973 * per-socket basis). In case of multiple sockets to the same neighbor, the
974 * socket-specific state must be configured for each socket, while the global
975 * state is configured just once per src-dst pair. The @setkey argument controls
976 * whether the global state (SA/SP database) is also updated.
977 *
978 * Result: 0 for success, -1 for an error.
979 */
980
981 int
982 sk_set_md5_auth(sock *s, ip_addr local, ip_addr remote, struct iface *ifa, char *passwd, int setkey)
983 { DUMMY; }
984 #endif
985
986 /**
987 * sk_set_ipv6_checksum - specify IPv6 checksum offset for given socket
988 * @s: socket
989 * @offset: offset
990 *
991 * Specify IPv6 checksum field offset for given raw IPv6 socket. After that, the
992 * kernel will automatically fill it for outgoing packets and check it for
993 * incoming packets. Should not be used on ICMPv6 sockets, where the position is
994 * known to the kernel.
995 *
996 * Result: 0 for success, -1 for an error.
997 */
998
999 int
1000 sk_set_ipv6_checksum(sock *s, int offset)
1001 {
1002 if (setsockopt(s->fd, SOL_IPV6, IPV6_CHECKSUM, &offset, sizeof(offset)) < 0)
1003 ERR("IPV6_CHECKSUM");
1004
1005 return 0;
1006 }
1007
1008 int
1009 sk_set_icmp6_filter(sock *s, int p1, int p2)
1010 {
1011 /* A bit of a lame interface, but it is here only for RAdv */
1012 struct icmp6_filter f;
1013
1014 ICMP6_FILTER_SETBLOCKALL(&f);
1015 ICMP6_FILTER_SETPASS(p1, &f);
1016 ICMP6_FILTER_SETPASS(p2, &f);
1017
1018 if (setsockopt(s->fd, SOL_ICMPV6, ICMP6_FILTER, &f, sizeof(f)) < 0)
1019 ERR("ICMP6_FILTER");
1020
1021 return 0;
1022 }
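/*
 * Usage sketch (illustrative, not part of the original source): a caller
 * such as the RAdv protocol might let only router solicitations and
 * advertisements through, assuming the standard ND_* constants from
 * <netinet/icmp6.h>:
 *
 *   sk_set_icmp6_filter(s, ND_ROUTER_SOLICIT, ND_ROUTER_ADVERT);
 */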
1023
1024 void
1025 sk_log_error(sock *s, const char *p)
1026 {
1027 log(L_ERR "%s: Socket error: %s%#m", p, s->err);
1028 }
1029
1030
1031 /*
1032 * Actual struct birdsock code
1033 */
1034
1035 static list sock_list;
1036 static struct birdsock *current_sock;
1037 static struct birdsock *stored_sock;
1038
1039 static inline sock *
1040 sk_next(sock *s)
1041 {
1042 if (!s->n.next->next)
1043 return NULL;
1044 else
1045 return SKIP_BACK(sock, n, s->n.next);
1046 }
1047
1048 static void
1049 sk_alloc_bufs(sock *s)
1050 {
1051 if (!s->rbuf && s->rbsize)
1052 s->rbuf = s->rbuf_alloc = xmalloc(s->rbsize);
1053 s->rpos = s->rbuf;
1054 if (!s->tbuf && s->tbsize)
1055 s->tbuf = s->tbuf_alloc = xmalloc(s->tbsize);
1056 s->tpos = s->ttx = s->tbuf;
1057 }
1058
1059 static void
1060 sk_free_bufs(sock *s)
1061 {
1062 if (s->rbuf_alloc)
1063 {
1064 xfree(s->rbuf_alloc);
1065 s->rbuf = s->rbuf_alloc = NULL;
1066 }
1067 if (s->tbuf_alloc)
1068 {
1069 xfree(s->tbuf_alloc);
1070 s->tbuf = s->tbuf_alloc = NULL;
1071 }
1072 }
1073
1074 #ifdef HAVE_LIBSSH
1075 static void
1076 sk_ssh_free(sock *s)
1077 {
1078 struct ssh_sock *ssh = s->ssh;
1079
1080 if (s->ssh == NULL)
1081 return;
1082
1083 s->ssh = NULL;
1084
1085 if (ssh->channel)
1086 {
1087 if (ssh_channel_is_open(ssh->channel))
1088 ssh_channel_close(ssh->channel);
1089 ssh_channel_free(ssh->channel);
1090 ssh->channel = NULL;
1091 }
1092
1093 if (ssh->session)
1094 {
1095 ssh_disconnect(ssh->session);
1096 ssh_free(ssh->session);
1097 ssh->session = NULL;
1098 }
1099 }
1100 #endif
1101
1102 static void
1103 sk_free(resource *r)
1104 {
1105 sock *s = (sock *) r;
1106
1107 sk_free_bufs(s);
1108
1109 #ifdef HAVE_LIBSSH
1110 if (s->type == SK_SSH || s->type == SK_SSH_ACTIVE)
1111 sk_ssh_free(s);
1112 #endif
1113
1114 if (s->fd < 0)
1115 return;
1116
1117 /* FIXME: we should call sk_stop() for SKF_THREAD sockets */
1118 if (!(s->flags & SKF_THREAD))
1119 {
1120 if (s == current_sock)
1121 current_sock = sk_next(s);
1122 if (s == stored_sock)
1123 stored_sock = sk_next(s);
1124 rem_node(&s->n);
1125 }
1126
1127 if (s->type != SK_SSH && s->type != SK_SSH_ACTIVE)
1128 close(s->fd);
1129
1130 s->fd = -1;
1131 }
1132
1133 void
1134 sk_set_rbsize(sock *s, uint val)
1135 {
1136 ASSERT(s->rbuf_alloc == s->rbuf);
1137
1138 if (s->rbsize == val)
1139 return;
1140
1141 s->rbsize = val;
1142 xfree(s->rbuf_alloc);
1143 s->rbuf_alloc = xmalloc(val);
1144 s->rpos = s->rbuf = s->rbuf_alloc;
1145 }
1146
1147 void
1148 sk_set_tbsize(sock *s, uint val)
1149 {
1150 ASSERT(s->tbuf_alloc == s->tbuf);
1151
1152 if (s->tbsize == val)
1153 return;
1154
1155 byte *old_tbuf = s->tbuf;
1156
1157 s->tbsize = val;
1158 s->tbuf = s->tbuf_alloc = xrealloc(s->tbuf_alloc, val);
1159 s->tpos = s->tbuf + (s->tpos - old_tbuf);
1160 s->ttx = s->tbuf + (s->ttx - old_tbuf);
1161 }
1162
1163 void
1164 sk_set_tbuf(sock *s, void *tbuf)
1165 {
1166 s->tbuf = tbuf ?: s->tbuf_alloc;
1167 s->ttx = s->tpos = s->tbuf;
1168 }
1169
1170 void
1171 sk_reallocate(sock *s)
1172 {
1173 sk_free_bufs(s);
1174 sk_alloc_bufs(s);
1175 }
1176
1177 static void
1178 sk_dump(resource *r)
1179 {
1180 sock *s = (sock *) r;
1181 static char *sk_type_names[] = { "TCP<", "TCP>", "TCP", "UDP", NULL, "IP", NULL, "MAGIC", "UNIX<", "UNIX", "SSH>", "SSH", "DEL!" };
1182
1183 debug("(%s, ud=%p, sa=%I, sp=%d, da=%I, dp=%d, tos=%d, ttl=%d, if=%s)\n",
1184 sk_type_names[s->type],
1185 s->data,
1186 s->saddr,
1187 s->sport,
1188 s->daddr,
1189 s->dport,
1190 s->tos,
1191 s->ttl,
1192 s->iface ? s->iface->name : "none");
1193 }
1194
1195 static struct resclass sk_class = {
1196 "Socket",
1197 sizeof(sock),
1198 sk_free,
1199 sk_dump,
1200 NULL,
1201 NULL
1202 };
1203
1204 /**
1205 * sk_new - create a socket
1206 * @p: pool
1207 *
1208 * This function creates a new socket resource. If you want to use it,
1209 * you need to fill in all the required fields of the structure and
1210 * call sk_open() to do the actual opening of the socket.
1211 *
1212 * The real function name is sock_new(), sk_new() is a macro wrapper
1213 * to avoid collision with OpenSSL.
1214 */
1215 sock *
1216 sock_new(pool *p)
1217 {
1218 sock *s = ralloc(p, &sk_class);
1219 s->pool = p;
1220 // s->saddr = s->daddr = IPA_NONE;
1221 s->tos = s->priority = s->ttl = -1;
1222 s->fd = -1;
1223 return s;
1224 }
1225
1226 static int
1227 sk_setup(sock *s)
1228 {
1229 int y = 1;
1230 int fd = s->fd;
1231
1232 if (s->type == SK_SSH_ACTIVE)
1233 return 0;
1234
1235 if (fcntl(fd, F_SETFL, O_NONBLOCK) < 0)
1236 ERR("O_NONBLOCK");
1237
1238 if (!s->af)
1239 return 0;
1240
1241 if (ipa_nonzero(s->saddr) && !(s->flags & SKF_BIND))
1242 s->flags |= SKF_PKTINFO;
1243
1244 #ifdef CONFIG_USE_HDRINCL
1245 if (sk_is_ipv4(s) && (s->type == SK_IP) && (s->flags & SKF_PKTINFO))
1246 {
1247 s->flags &= ~SKF_PKTINFO;
1248 s->flags |= SKF_HDRINCL;
1249 if (setsockopt(fd, SOL_IP, IP_HDRINCL, &y, sizeof(y)) < 0)
1250 ERR("IP_HDRINCL");
1251 }
1252 #endif
1253
1254 if (s->iface)
1255 {
1256 #ifdef SO_BINDTODEVICE
1257 struct ifreq ifr = {};
1258 strcpy(ifr.ifr_name, s->iface->name);
1259 if (setsockopt(s->fd, SOL_SOCKET, SO_BINDTODEVICE, &ifr, sizeof(ifr)) < 0)
1260 ERR("SO_BINDTODEVICE");
1261 #endif
1262
1263 #ifdef CONFIG_UNIX_DONTROUTE
1264 if (setsockopt(s->fd, SOL_SOCKET, SO_DONTROUTE, &y, sizeof(y)) < 0)
1265 ERR("SO_DONTROUTE");
1266 #endif
1267 }
1268
1269 if (s->priority >= 0)
1270 if (sk_set_priority(s, s->priority) < 0)
1271 return -1;
1272
1273 if (sk_is_ipv4(s))
1274 {
1275 if (s->flags & SKF_LADDR_RX)
1276 if (sk_request_cmsg4_pktinfo(s) < 0)
1277 return -1;
1278
1279 if (s->flags & SKF_TTL_RX)
1280 if (sk_request_cmsg4_ttl(s) < 0)
1281 return -1;
1282
1283 if ((s->type == SK_UDP) || (s->type == SK_IP))
1284 if (sk_disable_mtu_disc4(s) < 0)
1285 return -1;
1286
1287 if (s->ttl >= 0)
1288 if (sk_set_ttl4(s, s->ttl) < 0)
1289 return -1;
1290
1291 if (s->tos >= 0)
1292 if (sk_set_tos4(s, s->tos) < 0)
1293 return -1;
1294 }
1295
1296 if (sk_is_ipv6(s))
1297 {
1298 if ((s->type == SK_TCP_PASSIVE) || (s->type == SK_TCP_ACTIVE) || (s->type == SK_UDP))
1299 if (setsockopt(fd, SOL_IPV6, IPV6_V6ONLY, &y, sizeof(y)) < 0)
1300 ERR("IPV6_V6ONLY");
1301
1302 if (s->flags & SKF_LADDR_RX)
1303 if (sk_request_cmsg6_pktinfo(s) < 0)
1304 return -1;
1305
1306 if (s->flags & SKF_TTL_RX)
1307 if (sk_request_cmsg6_ttl(s) < 0)
1308 return -1;
1309
1310 if ((s->type == SK_UDP) || (s->type == SK_IP))
1311 if (sk_disable_mtu_disc6(s) < 0)
1312 return -1;
1313
1314 if (s->ttl >= 0)
1315 if (sk_set_ttl6(s, s->ttl) < 0)
1316 return -1;
1317
1318 if (s->tos >= 0)
1319 if (sk_set_tos6(s, s->tos) < 0)
1320 return -1;
1321 }
1322
1323 return 0;
1324 }
1325
1326 static void
1327 sk_insert(sock *s)
1328 {
1329 add_tail(&sock_list, &s->n);
1330 }
1331
1332 static void
1333 sk_tcp_connected(sock *s)
1334 {
1335 sockaddr sa;
1336 int sa_len = sizeof(sa);
1337
1338 if ((getsockname(s->fd, &sa.sa, &sa_len) < 0) ||
1339 (sockaddr_read(&sa, s->af, &s->saddr, &s->iface, &s->sport) < 0))
1340 log(L_WARN "SOCK: Cannot get local IP address for TCP>");
1341
1342 s->type = SK_TCP;
1343 sk_alloc_bufs(s);
1344 s->tx_hook(s);
1345 }
1346
1347 #ifdef HAVE_LIBSSH
1348 static void
1349 sk_ssh_connected(sock *s)
1350 {
1351 sk_alloc_bufs(s);
1352 s->type = SK_SSH;
1353 s->tx_hook(s);
1354 }
1355 #endif
1356
1357 static int
1358 sk_passive_connected(sock *s, int type)
1359 {
1360 sockaddr loc_sa, rem_sa;
1361 int loc_sa_len = sizeof(loc_sa);
1362 int rem_sa_len = sizeof(rem_sa);
1363
1364 int fd = accept(s->fd, ((type == SK_TCP) ? &rem_sa.sa : NULL), &rem_sa_len);
1365 if (fd < 0)
1366 {
1367 if ((errno != EINTR) && (errno != EAGAIN))
1368 s->err_hook(s, errno);
1369 return 0;
1370 }
1371
1372 sock *t = sk_new(s->pool);
1373 t->type = type;
1374 t->af = s->af;
1375 t->fd = fd;
1376 t->ttl = s->ttl;
1377 t->tos = s->tos;
1378 t->rbsize = s->rbsize;
1379 t->tbsize = s->tbsize;
1380
1381 if (type == SK_TCP)
1382 {
1383 if ((getsockname(fd, &loc_sa.sa, &loc_sa_len) < 0) ||
1384 (sockaddr_read(&loc_sa, s->af, &t->saddr, &t->iface, &t->sport) < 0))
1385 log(L_WARN "SOCK: Cannot get local IP address for TCP<");
1386
1387 if (sockaddr_read(&rem_sa, s->af, &t->daddr, &t->iface, &t->dport) < 0)
1388 log(L_WARN "SOCK: Cannot get remote IP address for TCP<");
1389 }
1390
1391 if (sk_setup(t) < 0)
1392 {
1393 /* FIXME: Call err_hook instead ? */
1394 log(L_ERR "SOCK: Incoming connection: %s%#m", t->err);
1395
1396 /* FIXME: handle it better in rfree() */
1397 close(t->fd);
1398 t->fd = -1;
1399 rfree(t);
1400 return 1;
1401 }
1402
1403 sk_insert(t);
1404 sk_alloc_bufs(t);
1405 s->rx_hook(t, 0);
1406 return 1;
1407 }
1408
1409 #ifdef HAVE_LIBSSH
1410 /*
1411 * Return SSH_OK or SSH_AGAIN or SSH_ERROR
1412 */
1413 static int
1414 sk_ssh_connect(sock *s)
1415 {
1416 s->fd = ssh_get_fd(s->ssh->session);
1417
1418 /* Big fall-through automaton */
1419 switch (s->ssh->state)
1420 {
1421 case SK_SSH_CONNECT:
1422 {
1423 switch (ssh_connect(s->ssh->session))
1424 {
1425 case SSH_AGAIN:
1426 /* A quick look into libSSH shows that ssh_get_fd() should return non-(-1)
1427 * after SSH_AGAIN is returned by ssh_connect(). This is, however, not
1428 * documented anywhere, but our code relies on it.
1429 */
1430 return SSH_AGAIN;
1431
1432 case SSH_OK:
1433 break;
1434
1435 default:
1436 return SSH_ERROR;
1437 }
1438 }
1439
1440 case SK_SSH_SERVER_KNOWN:
1441 {
1442 s->ssh->state = SK_SSH_SERVER_KNOWN;
1443
1444 if (s->ssh->server_hostkey_path)
1445 {
1446 int server_identity_is_ok = 1;
1447
1448 /* Check server identity */
1449 switch (ssh_is_server_known(s->ssh->session))
1450 {
1451 #define LOG_WARN_ABOUT_SSH_SERVER_VALIDATION(s,msg,args...) log(L_WARN "SSH Identity %s@%s:%u: " msg, (s)->ssh->username, (s)->host, (s)->dport, ## args);
1452 case SSH_SERVER_KNOWN_OK:
1453 /* The server is known and has not changed. */
1454 break;
1455
1456 case SSH_SERVER_NOT_KNOWN:
1457 LOG_WARN_ABOUT_SSH_SERVER_VALIDATION(s, "The server is unknown, its public key was not found in the known host file %s", s->ssh->server_hostkey_path);
1458 break;
1459
1460 case SSH_SERVER_KNOWN_CHANGED:
1461 LOG_WARN_ABOUT_SSH_SERVER_VALIDATION(s, "The server key has changed. Either you are under attack or the administrator changed the key.");
1462 server_identity_is_ok = 0;
1463 break;
1464
1465 case SSH_SERVER_FILE_NOT_FOUND:
1466 LOG_WARN_ABOUT_SSH_SERVER_VALIDATION(s, "The known host file %s does not exist", s->ssh->server_hostkey_path);
1467 server_identity_is_ok = 0;
1468 break;
1469
1470 case SSH_SERVER_ERROR:
1471 LOG_WARN_ABOUT_SSH_SERVER_VALIDATION(s, "Some error happened");
1472 server_identity_is_ok = 0;
1473 break;
1474
1475 case SSH_SERVER_FOUND_OTHER:
1476 LOG_WARN_ABOUT_SSH_SERVER_VALIDATION(s, "The server gave use a key of a type while we had an other type recorded. " \
1477 "It is a possible attack.");
1478 server_identity_is_ok = 0;
1479 break;
1480 }
1481
1482 if (!server_identity_is_ok)
1483 return SSH_ERROR;
1484 }
1485 }
1486
1487 case SK_SSH_USERAUTH:
1488 {
1489 s->ssh->state = SK_SSH_USERAUTH;
1490 switch (ssh_userauth_publickey_auto(s->ssh->session, NULL, NULL))
1491 {
1492 case SSH_AUTH_AGAIN:
1493 return SSH_AGAIN;
1494
1495 case SSH_AUTH_SUCCESS:
1496 break;
1497
1498 default:
1499 return SSH_ERROR;
1500 }
1501 }
1502
1503 case SK_SSH_CHANNEL:
1504 {
1505 s->ssh->state = SK_SSH_CHANNEL;
1506 s->ssh->channel = ssh_channel_new(s->ssh->session);
1507 if (s->ssh->channel == NULL)
1508 return SSH_ERROR;
1509 }
1510
1511 case SK_SSH_SESSION:
1512 {
1513 s->ssh->state = SK_SSH_SESSION;
1514 switch (ssh_channel_open_session(s->ssh->channel))
1515 {
1516 case SSH_AGAIN:
1517 return SSH_AGAIN;
1518
1519 case SSH_OK:
1520 break;
1521
1522 default:
1523 return SSH_ERROR;
1524 }
1525 }
1526
1527 case SK_SSH_SUBSYSTEM:
1528 {
1529 s->ssh->state = SK_SSH_SUBSYSTEM;
1530 if (s->ssh->subsystem)
1531 {
1532 switch (ssh_channel_request_subsystem(s->ssh->channel, s->ssh->subsystem))
1533 {
1534 case SSH_AGAIN:
1535 return SSH_AGAIN;
1536
1537 case SSH_OK:
1538 break;
1539
1540 default:
1541 return SSH_ERROR;
1542 }
1543 }
1544 }
1545
1546 case SK_SSH_ESTABLISHED:
1547 s->ssh->state = SK_SSH_ESTABLISHED;
1548 }
1549
1550 return SSH_OK;
1551 }
1552
1553 /*
1554 * Return file descriptor number if success
1555 * Return -1 if failed
1556 */
1557 static int
1558 sk_open_ssh(sock *s)
1559 {
1560 if (!s->ssh)
1561 bug("sk_open() sock->ssh is not allocated");
1562
1563 ssh_session sess = ssh_new();
1564 if (sess == NULL)
1565 ERR2("Cannot create a ssh session");
1566 s->ssh->session = sess;
1567
1568 const int verbosity = SSH_LOG_NOLOG;
1569 ssh_options_set(sess, SSH_OPTIONS_LOG_VERBOSITY, &verbosity);
1570 ssh_options_set(sess, SSH_OPTIONS_HOST, s->host);
1571 ssh_options_set(sess, SSH_OPTIONS_PORT, &(s->dport));
1572 /* TODO: Add SSH_OPTIONS_BINDADDR */
1573 ssh_options_set(sess, SSH_OPTIONS_USER, s->ssh->username);
1574
1575 if (s->ssh->server_hostkey_path)
1576 ssh_options_set(sess, SSH_OPTIONS_KNOWNHOSTS, s->ssh->server_hostkey_path);
1577
1578 if (s->ssh->client_privkey_path)
1579 ssh_options_set(sess, SSH_OPTIONS_IDENTITY, s->ssh->client_privkey_path);
1580
1581 ssh_set_blocking(sess, 0);
1582
1583 switch (sk_ssh_connect(s))
1584 {
1585 case SSH_AGAIN:
1586 break;
1587
1588 case SSH_OK:
1589 sk_ssh_connected(s);
1590 break;
1591
1592 case SSH_ERROR:
1593 ERR2(ssh_get_error(sess));
1594 break;
1595 }
1596
1597 return ssh_get_fd(sess);
1598
1599 err:
1600 return -1;
1601 }
1602 #endif
1603
1604 /**
1605 * sk_open - open a socket
1606 * @s: socket
1607 *
1608 * This function takes a socket resource created by sk_new() and
1609 * initialized by the user and binds a corresponding network connection
1610 * to it.
1611 *
1612 * Result: 0 for success, -1 for an error.
1613 */
1614 int
1615 sk_open(sock *s)
1616 {
1617 int af = AF_UNSPEC;
1618 int fd = -1;
1619 int do_bind = 0;
1620 int bind_port = 0;
1621 ip_addr bind_addr = IPA_NONE;
1622 sockaddr sa;
1623
1624 if (s->type <= SK_IP)
1625 {
1626 /*
1627 * For TCP/IP sockets, the address family (IPv4 or IPv6) can be specified either
1628 * explicitly (SK_IPV4 or SK_IPV6) or implicitly (based on saddr, daddr),
1629 * but the two specifications have to be consistent.
1630 */
1631
1632 switch (s->subtype)
1633 {
1634 case 0:
1635 ASSERT(ipa_zero(s->saddr) || ipa_zero(s->daddr) ||
1636 (ipa_is_ip4(s->saddr) == ipa_is_ip4(s->daddr)));
1637 af = (ipa_is_ip4(s->saddr) || ipa_is_ip4(s->daddr)) ? AF_INET : AF_INET6;
1638 break;
1639
1640 case SK_IPV4:
1641 ASSERT(ipa_zero(s->saddr) || ipa_is_ip4(s->saddr));
1642 ASSERT(ipa_zero(s->daddr) || ipa_is_ip4(s->daddr));
1643 af = AF_INET;
1644 break;
1645
1646 case SK_IPV6:
1647 ASSERT(ipa_zero(s->saddr) || !ipa_is_ip4(s->saddr));
1648 ASSERT(ipa_zero(s->daddr) || !ipa_is_ip4(s->daddr));
1649 af = AF_INET6;
1650 break;
1651
1652 default:
1653 bug("Invalid subtype %d", s->subtype);
1654 }
1655 }
1656
1657 switch (s->type)
1658 {
1659 case SK_TCP_ACTIVE:
1660 s->ttx = ""; /* Force s->ttx != s->tpos */
1661 /* Fall thru */
1662 case SK_TCP_PASSIVE:
1663 fd = socket(af, SOCK_STREAM, IPPROTO_TCP);
1664 bind_port = s->sport;
1665 bind_addr = s->saddr;
1666 do_bind = bind_port || ipa_nonzero(bind_addr);
1667 break;
1668
1669 #ifdef HAVE_LIBSSH
1670 case SK_SSH_ACTIVE:
1671 s->ttx = ""; /* Force s->ttx != s->tpos */
1672 fd = sk_open_ssh(s);
1673 break;
1674 #endif
1675
1676 case SK_UDP:
1677 fd = socket(af, SOCK_DGRAM, IPPROTO_UDP);
1678 bind_port = s->sport;
1679 bind_addr = (s->flags & SKF_BIND) ? s->saddr : IPA_NONE;
1680 do_bind = 1;
1681 break;
1682
1683 case SK_IP:
1684 fd = socket(af, SOCK_RAW, s->dport);
1685 bind_port = 0;
1686 bind_addr = (s->flags & SKF_BIND) ? s->saddr : IPA_NONE;
1687 do_bind = ipa_nonzero(bind_addr);
1688 break;
1689
1690 case SK_MAGIC:
1691 af = 0;
1692 fd = s->fd;
1693 break;
1694
1695 default:
1696 bug("sk_open() called for invalid sock type %d", s->type);
1697 }
1698
1699 if (fd < 0)
1700 ERR("socket");
1701
1702 s->af = af;
1703 s->fd = fd;
1704
1705 if (sk_setup(s) < 0)
1706 goto err;
1707
1708 if (do_bind)
1709 {
1710 if (bind_port)
1711 {
1712 int y = 1;
1713
1714 if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &y, sizeof(y)) < 0)
1715 ERR2("SO_REUSEADDR");
1716
1717 #ifdef CONFIG_NO_IFACE_BIND
1718 /* Workaround missing ability to bind to an iface */
1719 if ((s->type == SK_UDP) && s->iface && ipa_zero(bind_addr))
1720 {
1721 if (setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &y, sizeof(y)) < 0)
1722 ERR2("SO_REUSEPORT");
1723 }
1724 #endif
1725 }
1726 else
1727 if (s->flags & SKF_HIGH_PORT)
1728 if (sk_set_high_port(s) < 0)
1729 log(L_WARN "Socket error: %s%#m", s->err);
1730
1731 sockaddr_fill(&sa, s->af, bind_addr, s->iface, bind_port);
1732 if (bind(fd, &sa.sa, SA_LEN(sa)) < 0)
1733 ERR2("bind");
1734 }
1735
1736 if (s->password)
1737 if (sk_set_md5_auth(s, s->saddr, s->daddr, s->iface, s->password, 0) < 0)
1738 goto err;
1739
1740 switch (s->type)
1741 {
1742 case SK_TCP_ACTIVE:
1743 sockaddr_fill(&sa, s->af, s->daddr, s->iface, s->dport);
1744 if (connect(fd, &sa.sa, SA_LEN(sa)) >= 0)
1745 sk_tcp_connected(s);
1746 else if (errno != EINTR && errno != EAGAIN && errno != EINPROGRESS &&
1747 errno != ECONNREFUSED && errno != EHOSTUNREACH && errno != ENETUNREACH)
1748 ERR2("connect");
1749 break;
1750
1751 case SK_TCP_PASSIVE:
1752 if (listen(fd, 8) < 0)
1753 ERR2("listen");
1754 break;
1755
1756 case SK_SSH_ACTIVE:
1757 case SK_MAGIC:
1758 break;
1759
1760 default:
1761 sk_alloc_bufs(s);
1762 }
1763
1764 if (!(s->flags & SKF_THREAD))
1765 sk_insert(s);
1766
1767 return 0;
1768
1769 err:
1770 close(fd);
1771 s->fd = -1;
1772 return -1;
1773 }
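/*
 * Usage sketch (illustrative, not part of the original source): callers
 * allocate a socket with sk_new(), fill in the relevant fields and then
 * open it. For example, a hypothetical IPv4 UDP listener with placeholder
 * hooks my_rx_hook() / my_err_hook() and port 1234:
 *
 *   sock *s = sk_new(p);
 *   s->type = SK_UDP;
 *   s->subtype = SK_IPV4;
 *   s->sport = 1234;
 *   s->rbsize = 1024;
 *   s->rx_hook = my_rx_hook;
 *   s->err_hook = my_err_hook;
 *   if (sk_open(s) < 0)
 *     log(L_ERR "Cannot open socket: %s%#m", s->err);
 */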
1774
1775 int
1776 sk_open_unix(sock *s, char *name)
1777 {
1778 struct sockaddr_un sa;
1779 int fd;
1780
1781 /* We are sloppy on errors (we leak the fd and do not set s->err), but we die anyway */
1782
1783 fd = socket(AF_UNIX, SOCK_STREAM, 0);
1784 if (fd < 0)
1785 return -1;
1786
1787 if (fcntl(fd, F_SETFL, O_NONBLOCK) < 0)
1788 return -1;
1789
1790 /* Path length checked in test_old_bird() */
1791 sa.sun_family = AF_UNIX;
1792 strcpy(sa.sun_path, name);
1793
1794 if (bind(fd, (struct sockaddr *) &sa, SUN_LEN(&sa)) < 0)
1795 return -1;
1796
1797 if (listen(fd, 8) < 0)
1798 return -1;
1799
1800 s->fd = fd;
1801 sk_insert(s);
1802 return 0;
1803 }
1804
1805
1806 #define CMSG_RX_SPACE MAX(CMSG4_SPACE_PKTINFO+CMSG4_SPACE_TTL, \
1807 CMSG6_SPACE_PKTINFO+CMSG6_SPACE_TTL)
1808 #define CMSG_TX_SPACE MAX(CMSG4_SPACE_PKTINFO,CMSG6_SPACE_PKTINFO)
1809
1810 static void
1811 sk_prepare_cmsgs(sock *s, struct msghdr *msg, void *cbuf, size_t cbuflen)
1812 {
1813 if (sk_is_ipv4(s))
1814 sk_prepare_cmsgs4(s, msg, cbuf, cbuflen);
1815 else
1816 sk_prepare_cmsgs6(s, msg, cbuf, cbuflen);
1817 }
1818
1819 static void
1820 sk_process_cmsgs(sock *s, struct msghdr *msg)
1821 {
1822 struct cmsghdr *cm;
1823
1824 s->laddr = IPA_NONE;
1825 s->lifindex = 0;
1826 s->rcv_ttl = -1;
1827
1828 for (cm = CMSG_FIRSTHDR(msg); cm != NULL; cm = CMSG_NXTHDR(msg, cm))
1829 {
1830 if ((cm->cmsg_level == SOL_IP) && sk_is_ipv4(s))
1831 {
1832 sk_process_cmsg4_pktinfo(s, cm);
1833 sk_process_cmsg4_ttl(s, cm);
1834 }
1835
1836 if ((cm->cmsg_level == SOL_IPV6) && sk_is_ipv6(s))
1837 {
1838 sk_process_cmsg6_pktinfo(s, cm);
1839 sk_process_cmsg6_ttl(s, cm);
1840 }
1841 }
1842 }
1843
1844
1845 static inline int
1846 sk_sendmsg(sock *s)
1847 {
1848 struct iovec iov = {s->tbuf, s->tpos - s->tbuf};
1849 byte cmsg_buf[CMSG_TX_SPACE];
1850 sockaddr dst;
1851
1852 sockaddr_fill(&dst, s->af, s->daddr, s->iface, s->dport);
1853
1854 struct msghdr msg = {
1855 .msg_name = &dst.sa,
1856 .msg_namelen = SA_LEN(dst),
1857 .msg_iov = &iov,
1858 .msg_iovlen = 1
1859 };
1860
1861 #ifdef CONFIG_USE_HDRINCL
1862 byte hdr[20];
1863 struct iovec iov2[2] = { {hdr, 20}, iov };
1864
1865 if (s->flags & SKF_HDRINCL)
1866 {
1867 sk_prepare_ip_header(s, hdr, iov.iov_len);
1868 msg.msg_iov = iov2;
1869 msg.msg_iovlen = 2;
1870 }
1871 #endif
1872
1873 if (s->flags & SKF_PKTINFO)
1874 sk_prepare_cmsgs(s, &msg, cmsg_buf, sizeof(cmsg_buf));
1875
1876 return sendmsg(s->fd, &msg, 0);
1877 }
1878
1879 static inline int
1880 sk_recvmsg(sock *s)
1881 {
1882 struct iovec iov = {s->rbuf, s->rbsize};
1883 byte cmsg_buf[CMSG_RX_SPACE];
1884 sockaddr src;
1885
1886 struct msghdr msg = {
1887 .msg_name = &src.sa,
1888 .msg_namelen = sizeof(src), // XXXX ??
1889 .msg_iov = &iov,
1890 .msg_iovlen = 1,
1891 .msg_control = cmsg_buf,
1892 .msg_controllen = sizeof(cmsg_buf),
1893 .msg_flags = 0
1894 };
1895
1896 int rv = recvmsg(s->fd, &msg, 0);
1897 if (rv < 0)
1898 return rv;
1899
1900 //ifdef IPV4
1901 // if (cf_type == SK_IP)
1902 // rv = ipv4_skip_header(pbuf, rv);
1903 //endif
1904
1905 sockaddr_read(&src, s->af, &s->faddr, NULL, &s->fport);
1906 sk_process_cmsgs(s, &msg);
1907
1908 if (msg.msg_flags & MSG_TRUNC)
1909 s->flags |= SKF_TRUNCATED;
1910 else
1911 s->flags &= ~SKF_TRUNCATED;
1912
1913 return rv;
1914 }
1915
1916
1917 static inline void reset_tx_buffer(sock *s) { s->ttx = s->tpos = s->tbuf; }
1918
1919 static int
1920 sk_maybe_write(sock *s)
1921 {
1922 int e;
1923
1924 switch (s->type)
1925 {
1926 case SK_TCP:
1927 case SK_MAGIC:
1928 case SK_UNIX:
1929 while (s->ttx != s->tpos)
1930 {
1931 e = write(s->fd, s->ttx, s->tpos - s->ttx);
1932
1933 if (e < 0)
1934 {
1935 if (errno != EINTR && errno != EAGAIN)
1936 {
1937 reset_tx_buffer(s);
1938 /* EPIPE is just a connection close notification during TX */
1939 s->err_hook(s, (errno != EPIPE) ? errno : 0);
1940 return -1;
1941 }
1942 return 0;
1943 }
1944 s->ttx += e;
1945 }
1946 reset_tx_buffer(s);
1947 return 1;
1948
1949 #ifdef HAVE_LIBSSH
1950 case SK_SSH:
1951 while (s->ttx != s->tpos)
1952 {
1953 e = ssh_channel_write(s->ssh->channel, s->ttx, s->tpos - s->ttx);
1954
1955 if (e < 0)
1956 {
1957 s->err = ssh_get_error(s->ssh->session);
1958 s->err_hook(s, ssh_get_error_code(s->ssh->session));
1959
1960 reset_tx_buffer(s);
1961 /* EPIPE is just a connection close notification during TX */
1962 s->err_hook(s, (errno != EPIPE) ? errno : 0);
1963 return -1;
1964 }
1965 s->ttx += e;
1966 }
1967 reset_tx_buffer(s);
1968 return 1;
1969 #endif
1970
1971 case SK_UDP:
1972 case SK_IP:
1973 {
1974 if (s->tbuf == s->tpos)
1975 return 1;
1976
1977 e = sk_sendmsg(s);
1978
1979 if (e < 0)
1980 {
1981 if (errno != EINTR && errno != EAGAIN)
1982 {
1983 reset_tx_buffer(s);
1984 s->err_hook(s, errno);
1985 return -1;
1986 }
1987
1988 if (!s->tx_hook)
1989 reset_tx_buffer(s);
1990 return 0;
1991 }
1992 reset_tx_buffer(s);
1993 return 1;
1994 }
1995
1996 default:
1997 bug("sk_maybe_write: unknown socket type %d", s->type);
1998 }
1999 }
2000
2001 int
2002 sk_rx_ready(sock *s)
2003 {
2004 int rv;
2005 struct pollfd pfd = { .fd = s->fd };
2006 pfd.events |= POLLIN;
2007
2008 redo:
2009 rv = poll(&pfd, 1, 0);
2010
2011 if ((rv < 0) && (errno == EINTR || errno == EAGAIN))
2012 goto redo;
2013
2014 return rv;
2015 }
2016
2017 /**
2018 * sk_send - send data to a socket
2019 * @s: socket
2020 * @len: number of bytes to send
2021 *
2022 * This function sends @len bytes of data prepared in the
2023 * transmit buffer of the socket @s to the network connection.
2024 * If the packet can be sent immediately, it does so and returns
2025 * 1, else it queues the packet for later processing, returns 0
2026 * and calls the @tx_hook of the socket when the transmission
2027 * takes place.
2028 */
2029 int
2030 sk_send(sock *s, unsigned len)
2031 {
2032 s->ttx = s->tbuf;
2033 s->tpos = s->tbuf + len;
2034 return sk_maybe_write(s);
2035 }
2036
2037 /**
2038 * sk_send_to - send data to a specific destination
2039 * @s: socket
2040 * @len: number of bytes to send
2041 * @addr: IP address to send the packet to
2042 * @port: port to send the packet to
2043 *
2044 * This is a sk_send() replacement for connection-less packet sockets
2045 * which allows the destination of the packet to be chosen dynamically.
2046 * Raw IP sockets should use 0 for @port.
2047 */
2048 int
2049 sk_send_to(sock *s, unsigned len, ip_addr addr, unsigned port)
2050 {
2051 s->daddr = addr;
2052 if (port)
2053 s->dport = port;
2054
2055 s->ttx = s->tbuf;
2056 s->tpos = s->tbuf + len;
2057 return sk_maybe_write(s);
2058 }
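/*
 * Usage sketch (illustrative, not part of the original source): the caller
 * prepares the message in s->tbuf and then hands it over, assuming a
 * hypothetical build_packet() helper:
 *
 *   uint len = build_packet(s->tbuf);
 *   int done = sk_send(s, len);
 *
 * done == 1 means the data went out immediately; done == 0 means they were
 * queued and the socket's tx_hook will be called once the buffer has been
 * flushed (see sk_maybe_write() and sk_write()); done < 0 signals an error
 * already reported through err_hook.
 */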
2059
2060 /*
2061 int
2062 sk_send_full(sock *s, unsigned len, struct iface *ifa,
2063 ip_addr saddr, ip_addr daddr, unsigned dport)
2064 {
2065 s->iface = ifa;
2066 s->saddr = saddr;
2067 s->daddr = daddr;
2068 s->dport = dport;
2069 s->ttx = s->tbuf;
2070 s->tpos = s->tbuf + len;
2071 return sk_maybe_write(s);
2072 }
2073 */
2074
2075 static void
2076 call_rx_hook(sock *s, int size)
2077 {
2078 if (s->rx_hook(s, size))
2079 {
2080 /* We need to be careful since the socket could have been deleted by the hook */
2081 if (current_sock == s)
2082 s->rpos = s->rbuf;
2083 }
2084 }
2085
2086 #ifdef HAVE_LIBSSH
2087 static int
2088 sk_read_ssh(sock *s)
2089 {
2090 ssh_channel rchans[2] = { s->ssh->channel, NULL };
2091 struct timeval timev = { 1, 0 };
2092
2093 if (ssh_channel_select(rchans, NULL, NULL, &timev) == SSH_EINTR)
2094 return 1; /* Try again */
2095
2096 if (ssh_channel_is_eof(s->ssh->channel) != 0)
2097 {
2098 /* The remote side is closing the connection */
2099 s->err_hook(s, 0);
2100 return 0;
2101 }
2102
2103 if (rchans[0] == NULL)
2104 return 0; /* No data is available on the socket */
2105
2106 const uint used_bytes = s->rpos - s->rbuf;
2107 const int read_bytes = ssh_channel_read_nonblocking(s->ssh->channel, s->rpos, s->rbsize - used_bytes, 0);
2108 if (read_bytes > 0)
2109 {
2110 /* Received data */
2111 s->rpos += read_bytes;
2112 call_rx_hook(s, used_bytes + read_bytes);
2113 return 1;
2114 }
2115 else if (read_bytes == 0)
2116 {
2117 if (ssh_channel_is_eof(s->ssh->channel) != 0)
2118 {
2119 /* The remote side is closing the connection */
2120 s->err_hook(s, 0);
2121 }
2122 }
2123 else
2124 {
2125 s->err = ssh_get_error(s->ssh->session);
2126 s->err_hook(s, ssh_get_error_code(s->ssh->session));
2127 }
2128
2129 return 0; /* No data is available on the socket */
2130 }
2131 #endif
2132
2133 /* sk_read() and sk_write() are called from BFD's event loop */
2134
2135 int
2136 sk_read(sock *s, int revents)
2137 {
2138 switch (s->type)
2139 {
2140 case SK_TCP_PASSIVE:
2141 return sk_passive_connected(s, SK_TCP);
2142
2143 case SK_UNIX_PASSIVE:
2144 return sk_passive_connected(s, SK_UNIX);
2145
2146 case SK_TCP:
2147 case SK_UNIX:
2148 {
2149 int c = read(s->fd, s->rpos, s->rbuf + s->rbsize - s->rpos);
2150
2151 if (c < 0)
2152 {
2153 if (errno != EINTR && errno != EAGAIN)
2154 s->err_hook(s, errno);
2155 else if (errno == EAGAIN && !(revents & POLLIN))
2156 {
2157 log(L_ERR "Got EAGAIN from read when revents=%x (without POLLIN)", revents);
2158 s->err_hook(s, 0);
2159 }
2160 }
2161 else if (!c)
2162 s->err_hook(s, 0);
2163 else
2164 {
2165 s->rpos += c;
2166 call_rx_hook(s, s->rpos - s->rbuf);
2167 return 1;
2168 }
2169 return 0;
2170 }
2171
2172 #ifdef HAVE_LIBSSH
2173 case SK_SSH:
2174 return sk_read_ssh(s);
2175 #endif
2176
2177 case SK_MAGIC:
2178 return s->rx_hook(s, 0);
2179
2180 default:
2181 {
2182 int e = sk_recvmsg(s);
2183
2184 if (e < 0)
2185 {
2186 if (errno != EINTR && errno != EAGAIN)
2187 s->err_hook(s, errno);
2188 return 0;
2189 }
2190
2191 s->rpos = s->rbuf + e;
2192 s->rx_hook(s, e);
2193 return 1;
2194 }
2195 }
2196 }
2197
2198 int
2199 sk_write(sock *s)
2200 {
2201 switch (s->type)
2202 {
2203 case SK_TCP_ACTIVE:
2204 {
2205 sockaddr sa;
2206 sockaddr_fill(&sa, s->af, s->daddr, s->iface, s->dport);
2207
2208 if (connect(s->fd, &sa.sa, SA_LEN(sa)) >= 0 || errno == EISCONN)
2209 sk_tcp_connected(s);
2210 else if (errno != EINTR && errno != EAGAIN && errno != EINPROGRESS)
2211 s->err_hook(s, errno);
2212 return 0;
2213 }
2214
2215 #ifdef HAVE_LIBSSH
2216 case SK_SSH_ACTIVE:
2217 {
2218 switch (sk_ssh_connect(s))
2219 {
2220 case SSH_OK:
2221 sk_ssh_connected(s);
2222 break;
2223
2224 case SSH_AGAIN:
2225 return 1;
2226
2227 case SSH_ERROR:
2228 s->err = ssh_get_error(s->ssh->session);
2229 s->err_hook(s, ssh_get_error_code(s->ssh->session));
2230 break;
2231 }
2232 return 0;
2233 }
2234 #endif
2235
2236 default:
2237 if (s->ttx != s->tpos && sk_maybe_write(s) > 0)
2238 {
2239 if (s->tx_hook)
2240 s->tx_hook(s);
2241 return 1;
2242 }
2243 return 0;
2244 }
2245 }
2246
2247 int sk_is_ipv4(sock *s)
2248 { return s->af == AF_INET; }
2249
2250 int sk_is_ipv6(sock *s)
2251 { return s->af == AF_INET6; }
2252
2253 void
2254 sk_err(sock *s, int revents)
2255 {
2256 int se = 0, sse = sizeof(se);
2257 if ((s->type != SK_MAGIC) && (revents & POLLERR))
2258 if (getsockopt(s->fd, SOL_SOCKET, SO_ERROR, &se, &sse) < 0)
2259 {
2260 log(L_ERR "IO: Socket error: SO_ERROR: %m");
2261 se = 0;
2262 }
2263
2264 s->err_hook(s, se);
2265 }
2266
2267 void
2268 sk_dump_all(void)
2269 {
2270 node *n;
2271 sock *s;
2272
2273 debug("Open sockets:\n");
2274 WALK_LIST(n, sock_list)
2275 {
2276 s = SKIP_BACK(sock, n, n);
2277 debug("%p ", s);
2278 sk_dump(&s->r);
2279 }
2280 debug("\n");
2281 }
2282
2283
2284 /*
2285 * Internal event log and watchdog
2286 */
2287
2288 #define EVENT_LOG_LENGTH 32
2289
2290 struct event_log_entry
2291 {
2292 void *hook;
2293 void *data;
2294 btime timestamp;
2295 btime duration;
2296 };
2297
2298 static struct event_log_entry event_log[EVENT_LOG_LENGTH];
2299 static struct event_log_entry *event_open;
2300 static int event_log_pos, event_log_num, watchdog_active;
2301 static btime last_time;
2302 static btime loop_time;
2303
2304 static void
2305 io_update_time(void)
2306 {
2307 struct timespec ts;
2308 int rv;
2309
2310 if (!clock_monotonic_available)
2311 return;
2312
2313 /*
2314 * This is the third time-tracking procedure (after update_times() above and
2315 * times_update() in BFD), dedicated to the internal event log and latency
2316 * tracking. Hopefully, we will consolidate these sometime.
2317 */
2318
2319 rv = clock_gettime(CLOCK_MONOTONIC, &ts);
2320 if (rv < 0)
2321 die("clock_gettime: %m");
2322
2323 last_time = ((s64) ts.tv_sec S) + (ts.tv_nsec / 1000);
2324
2325 if (event_open)
2326 {
2327 event_open->duration = last_time - event_open->timestamp;
2328
2329 if (event_open->duration > config->latency_limit)
2330 log(L_WARN "Event 0x%p 0x%p took %d ms",
2331 event_open->hook, event_open->data, (int) (event_open->duration TO_MS));
2332
2333 event_open = NULL;
2334 }
2335 }
2336
2337 /**
2338 * io_log_event - record an upcoming event in the event log
2339 * @hook: event hook address
2340 * @data: event data address
2341 *
2342 * Store info (hook, data, timestamp) about the following internal event into
2343 * a circular event log (@event_log). When latency tracking is enabled, the log
2344 * entry is kept open (in @event_open) so the duration can be filled later.
2345 */
2346 void
2347 io_log_event(void *hook, void *data)
2348 {
2349 if (config->latency_debug)
2350 io_update_time();
2351
2352 struct event_log_entry *en = event_log + event_log_pos;
2353
2354 en->hook = hook;
2355 en->data = data;
2356 en->timestamp = last_time;
2357 en->duration = 0;
2358
2359 event_log_num++;
2360 event_log_pos++;
2361 event_log_pos %= EVENT_LOG_LENGTH;
2362
2363 event_open = config->latency_debug ? en : NULL;
2364 }
2365
2366 static inline void
2367 io_close_event(void)
2368 {
2369 if (event_open)
2370 io_update_time();
2371 }
2372
2373 void
2374 io_log_dump(void)
2375 {
2376 int i;
2377
2378 log(L_DEBUG "Event log:");
2379 for (i = 0; i < EVENT_LOG_LENGTH; i++)
2380 {
2381 struct event_log_entry *en = event_log + (event_log_pos + i) % EVENT_LOG_LENGTH;
2382 if (en->hook)
2383 log(L_DEBUG " Event 0x%p 0x%p at %8d for %d ms", en->hook, en->data,
2384 (int) ((last_time - en->timestamp) TO_MS), (int) (en->duration TO_MS));
2385 }
2386 }
2387
2388 void
2389 watchdog_sigalrm(int sig UNUSED)
2390 {
2391 /* Update last_time and duration, but skip latency check */
2392 config->latency_limit = 0xffffffff;
2393 io_update_time();
2394
2395 /* We want core dump */
2396 abort();
2397 }
2398
2399 static inline void
2400 watchdog_start1(void)
2401 {
2402 io_update_time();
2403
2404 loop_time = last_time;
2405 }
2406
2407 static inline void
2408 watchdog_start(void)
2409 {
2410 io_update_time();
2411
2412 loop_time = last_time;
2413 event_log_num = 0;
2414
2415 if (config->watchdog_timeout)
2416 {
2417 alarm(config->watchdog_timeout);
2418 watchdog_active = 1;
2419 }
2420 }
2421
2422 static inline void
2423 watchdog_stop(void)
2424 {
2425 io_update_time();
2426
2427 if (watchdog_active)
2428 {
2429 alarm(0);
2430 watchdog_active = 0;
2431 }
2432
2433 btime duration = last_time - loop_time;
2434 if (duration > config->watchdog_warning)
2435 log(L_WARN "I/O loop cycle took %d ms for %d events",
2436 (int) (duration TO_MS), event_log_num);
2437 }
2438
2439
2440 /*
2441 * Main I/O Loop
2442 */
2443
2444 volatile int async_config_flag; /* Asynchronous reconfiguration/dump scheduled */
2445 volatile int async_dump_flag;
2446 volatile int async_shutdown_flag;
2447
2448 void
2449 io_init(void)
2450 {
2451 init_list(&near_timers);
2452 init_list(&far_timers);
2453 init_list(&sock_list);
2454 init_list(&global_event_list);
2455 krt_io_init();
2456 init_times();
2457 update_times();
2458 boot_time = now;
2459 srandom((int) now_real);
2460 }
2461
2462 static int short_loops = 0;
2463 #define SHORT_LOOP_MAX 10
2464
2465 void
2466 io_loop(void)
2467 {
2468 int poll_tout;
2469 time_t tout;
2470 int nfds, events, pout;
2471 sock *s;
2472 node *n;
2473 int fdmax = 256;
2474 struct pollfd *pfd = xmalloc(fdmax * sizeof(struct pollfd));
2475
2476 watchdog_start1();
2477 for(;;)
2478 {
2479 events = ev_run_list(&global_event_list);
2480 timers:
2481 update_times();
2482 tout = tm_first_shot();
2483 if (tout <= now)
2484 {
2485 tm_shot();
2486 goto timers;
2487 }
2488 poll_tout = (events ? 0 : MIN(tout - now, 3)) * 1000; /* Time in milliseconds */
2489
2490 io_close_event();
2491
2492 nfds = 0;
2493 WALK_LIST(n, sock_list)
2494 {
2495 pfd[nfds] = (struct pollfd) { .fd = -1 }; /* everything else is set to 0 by this */
2496 s = SKIP_BACK(sock, n, n);
2497 if (s->rx_hook)
2498 {
2499 pfd[nfds].fd = s->fd;
2500 pfd[nfds].events |= POLLIN;
2501 }
2502 if (s->tx_hook && s->ttx != s->tpos)
2503 {
2504 pfd[nfds].fd = s->fd;
2505 pfd[nfds].events |= POLLOUT;
2506 }
2507 if (pfd[nfds].fd != -1)
2508 {
2509 s->index = nfds;
2510 nfds++;
2511 }
2512 else
2513 s->index = -1;
2514
2515 if (nfds >= fdmax)
2516 {
2517 fdmax *= 2;
2518 pfd = xrealloc(pfd, fdmax * sizeof(struct pollfd));
2519 }
2520 }
2521
2522 /*
2523 * Yes, this is racy. But even if the signal comes between this test
2524 * and entering poll(), it gets caught on the next timer tick.
2525 */
2526
2527 if (async_config_flag)
2528 {
2529 io_log_event(async_config, NULL);
2530 async_config();
2531 async_config_flag = 0;
2532 continue;
2533 }
2534 if (async_dump_flag)
2535 {
2536 io_log_event(async_dump, NULL);
2537 async_dump();
2538 async_dump_flag = 0;
2539 continue;
2540 }
2541 if (async_shutdown_flag)
2542 {
2543 io_log_event(async_shutdown, NULL);
2544 async_shutdown();
2545 async_shutdown_flag = 0;
2546 continue;
2547 }
2548
2549 /* And finally enter poll() to find active sockets */
2550 watchdog_stop();
2551 pout = poll(pfd, nfds, poll_tout);
2552 watchdog_start();
2553
2554 if (pout < 0)
2555 {
2556 if (errno == EINTR || errno == EAGAIN)
2557 continue;
2558 die("poll: %m");
2559 }
2560 if (pout)
2561 {
2562 /* guaranteed to be non-empty */
2563 current_sock = SKIP_BACK(sock, n, HEAD(sock_list));
2564
2565 while (current_sock)
2566 {
2567 sock *s = current_sock;
2568 if (s->index == -1)
2569 {
2570 current_sock = sk_next(s);
2571 goto next;
2572 }
2573
2574 int e;
2575 int steps;
2576
2577 steps = MAX_STEPS;
2578 if (s->fast_rx && (pfd[s->index].revents & POLLIN) && s->rx_hook)
2579 do
2580 {
2581 steps--;
2582 io_log_event(s->rx_hook, s->data);
2583 e = sk_read(s, pfd[s->index].revents);
2584 if (s != current_sock)
2585 goto next;
2586 }
2587 while (e && s->rx_hook && steps);
2588
2589 steps = MAX_STEPS;
2590 if (pfd[s->index].revents & POLLOUT)
2591 do
2592 {
2593 steps--;
2594 io_log_event(s->tx_hook, s->data);
2595 e = sk_write(s);
2596 if (s != current_sock)
2597 goto next;
2598 }
2599 while (e && steps);
2600
2601 current_sock = sk_next(s);
2602 next: ;
2603 }
2604
2605 short_loops++;
2606 if (events && (short_loops < SHORT_LOOP_MAX))
2607 continue;
2608 short_loops = 0;
2609
2610 int count = 0;
2611 current_sock = stored_sock;
2612 if (current_sock == NULL)
2613 current_sock = SKIP_BACK(sock, n, HEAD(sock_list));
2614
2615 while (current_sock && count < MAX_RX_STEPS)
2616 {
2617 sock *s = current_sock;
2618 if (s->index == -1)
2619 {
2620 current_sock = sk_next(s);
2621 goto next2;
2622 }
2623
2624 if (!s->fast_rx && (pfd[s->index].revents & POLLIN) && s->rx_hook)
2625 {
2626 count++;
2627 io_log_event(s->rx_hook, s->data);
2628 sk_read(s, pfd[s->index].revents);
2629 if (s != current_sock)
2630 goto next2;
2631 }
2632
2633 if (pfd[s->index].revents & (POLLHUP | POLLERR))
2634 {
2635 sk_err(s, pfd[s->index].revents);
2636 if (s != current_sock)
2637 goto next2;
2638 }
2639
2640 current_sock = sk_next(s);
2641 next2: ;
2642 }
2643
2644
2645 stored_sock = current_sock;
2646 }
2647 }
2648 }
2649
2650 void
2651 test_old_bird(char *path)
2652 {
2653 int fd;
2654 struct sockaddr_un sa;
2655
2656 fd = socket(AF_UNIX, SOCK_STREAM, 0);
2657 if (fd < 0)
2658 die("Cannot create socket: %m");
2659 if (strlen(path) >= sizeof(sa.sun_path))
2660 die("Socket path too long");
2661 bzero(&sa, sizeof(sa));
2662 sa.sun_family = AF_UNIX;
2663 strcpy(sa.sun_path, path);
2664 if (connect(fd, (struct sockaddr *) &sa, SUN_LEN(&sa)) == 0)
2665 die("I found another BIRD running.");
2666 close(fd);
2667 }