]> git.ipfire.org Git - thirdparty/bird.git/blob - sysdep/unix/io.c
IO: Fix the previous bugfix
[thirdparty/bird.git] / sysdep / unix / io.c
1 /*
2 * BIRD Internet Routing Daemon -- Unix I/O
3 *
4 * (c) 1998--2004 Martin Mares <mj@ucw.cz>
5 * (c) 2004 Ondrej Filip <feela@network.cz>
6 *
7 * Can be freely distributed and used under the terms of the GNU GPL.
8 */
9
10 /* Unfortunately, some glibc versions hide parts of RFC 3542 API
11 if _GNU_SOURCE is not defined. */
12 #define _GNU_SOURCE 1
13
14 #include <stdio.h>
15 #include <stdlib.h>
16 #include <time.h>
17 #include <sys/time.h>
18 #include <sys/types.h>
19 #include <sys/socket.h>
20 #include <sys/uio.h>
21 #include <sys/un.h>
22 #include <unistd.h>
23 #include <fcntl.h>
24 #include <errno.h>
25 #include <net/if.h>
26 #include <netinet/in.h>
27 #include <netinet/tcp.h>
28 #include <netinet/udp.h>
29 #include <netinet/icmp6.h>
30
31 #include "nest/bird.h"
32 #include "lib/lists.h"
33 #include "lib/resource.h"
34 #include "lib/timer.h"
35 #include "lib/socket.h"
36 #include "lib/event.h"
37 #include "lib/string.h"
38 #include "nest/iface.h"
39
40 #include "lib/unix.h"
41 #include "lib/sysio.h"
42
43 /* Maximum number of calls of tx handler for one socket in one
44 * select iteration. Should be small enough to not monopolize CPU by
45 * one protocol instance.
46 */
47 #define MAX_STEPS 4
48
49 /* Maximum number of calls of rx handler for all sockets in one select
50 iteration. RX callbacks are often much more costly so we limit
51 this to gen small latencies */
52 #define MAX_RX_STEPS 4
53
54 /*
55 * Tracked Files
56 */
57
/* A tracked file: ties a stdio stream to a resource pool,
   so the stream is closed automatically when the pool is freed. */
struct rfile {
  resource r;			/* Common resource header */
  FILE *f;			/* Underlying stdio stream, closed by rf_free() */
};
62
63 static void
64 rf_free(resource *r)
65 {
66 struct rfile *a = (struct rfile *) r;
67
68 fclose(a->f);
69 }
70
71 static void
72 rf_dump(resource *r)
73 {
74 struct rfile *a = (struct rfile *) r;
75
76 debug("(FILE *%p)\n", a->f);
77 }
78
/* Resource class descriptor for tracked files (see lib/resource.h) */
static struct resclass rf_class = {
  "FILE",			/* Class name shown in resource dumps */
  sizeof(struct rfile),
  rf_free,			/* Destructor: closes the stream */
  rf_dump,			/* Debug dump hook */
  NULL,
  NULL
};
87
88 void *
89 tracked_fopen(pool *p, char *name, char *mode)
90 {
91 FILE *f = fopen(name, mode);
92
93 if (f)
94 {
95 struct rfile *r = ralloc(p, &rf_class);
96 r->f = f;
97 }
98 return f;
99 }
100
101 /**
102 * DOC: Timers
103 *
104 * Timers are resources which represent a wish of a module to call
105 * a function at the specified time. The platform dependent code
106 * doesn't guarantee exact timing, only that a timer function
107 * won't be called before the requested time.
108 *
109 * In BIRD, time is represented by values of the &bird_clock_t type
110 * which are integral numbers interpreted as a relative number of seconds since
111 * some fixed time point in past. The current time can be read
112 * from variable @now with reasonable accuracy and is monotonic. There is also
113 * a current 'absolute' time in variable @now_real reported by OS.
114 *
115 * Each timer is described by a &timer structure containing a pointer
116 * to the handler function (@hook), data private to this function (@data),
117 * time the function should be called at (@expires, 0 for inactive timers),
118 * for the other fields see |timer.h|.
119 */
120
/* Timers expiring within NEAR_TIMER_LIMIT seconds are kept sorted in
   near_timers; all others sit unsorted in far_timers (see tm_start()). */
#define NEAR_TIMER_LIMIT 4

static list near_timers, far_timers;
/* Earliest expiry time present in far_timers; TIME_INFINITY when unknown */
static bird_clock_t first_far_timer = TIME_INFINITY;

/* now must be different from 0, because 0 is a special value in timer->expires */
bird_clock_t now = 1, now_real, boot_time;
128
129 static void
130 update_times_plain(void)
131 {
132 bird_clock_t new_time = time(NULL);
133 int delta = new_time - now_real;
134
135 if ((delta >= 0) && (delta < 60))
136 now += delta;
137 else if (now_real != 0)
138 log(L_WARN "Time jump, delta %d s", delta);
139
140 now_real = new_time;
141 }
142
143 static void
144 update_times_gettime(void)
145 {
146 struct timespec ts;
147 int rv;
148
149 rv = clock_gettime(CLOCK_MONOTONIC, &ts);
150 if (rv != 0)
151 die("clock_gettime: %m");
152
153 if (ts.tv_sec != now) {
154 if (ts.tv_sec < now)
155 log(L_ERR "Monotonic timer is broken");
156
157 now = ts.tv_sec;
158 now_real = time(NULL);
159 }
160 }
161
/* Nonzero when the OS provides CLOCK_MONOTONIC (probed in init_times()) */
static int clock_monotonic_available;

static inline void
update_times(void)
{
  /* Refresh @now / @now_real using the best available time source. */
  if (!clock_monotonic_available)
    update_times_plain();
  else
    update_times_gettime();
}
172
173 static inline void
174 init_times(void)
175 {
176 struct timespec ts;
177 clock_monotonic_available = (clock_gettime(CLOCK_MONOTONIC, &ts) == 0);
178 if (!clock_monotonic_available)
179 log(L_WARN "Monotonic timer is missing");
180 }
181
182
183 static void
184 tm_free(resource *r)
185 {
186 timer *t = (timer *) r;
187
188 tm_stop(t);
189 }
190
191 static void
192 tm_dump(resource *r)
193 {
194 timer *t = (timer *) r;
195
196 debug("(code %p, data %p, ", t->hook, t->data);
197 if (t->randomize)
198 debug("rand %d, ", t->randomize);
199 if (t->recurrent)
200 debug("recur %d, ", t->recurrent);
201 if (t->expires)
202 debug("expires in %d sec)\n", t->expires - now);
203 else
204 debug("inactive)\n");
205 }
206
/* Resource class descriptor for timers */
static struct resclass tm_class = {
  "Timer",
  sizeof(timer),
  tm_free,			/* Stops the timer when the resource is freed */
  tm_dump,
  NULL,
  NULL
};
215
216 /**
217 * tm_new - create a timer
218 * @p: pool
219 *
220 * This function creates a new timer resource and returns
221 * a pointer to it. To use the timer, you need to fill in
222 * the structure fields and call tm_start() to start timing.
223 */
224 timer *
225 tm_new(pool *p)
226 {
227 timer *t = ralloc(p, &tm_class);
228 return t;
229 }
230
static inline void
tm_insert_near(timer *t)
{
  /* Insert @t into near_timers, keeping the list sorted by expiry.
     Walk forward until we hit a timer expiring no earlier than @t
     (or the tail sentinel, whose ->next is NULL), then link @t
     just before that node. */
  node *n = HEAD(near_timers);

  while (n->next && (SKIP_BACK(timer, n, n)->expires < t->expires))
    n = n->next;
  insert_node(&t->n, n->prev);
}
240
241 /**
242 * tm_start - start a timer
243 * @t: timer
244 * @after: number of seconds the timer should be run after
245 *
246 * This function schedules the hook function of the timer to
247 * be called after @after seconds. If the timer has been already
248 * started, it's @expire time is replaced by the new value.
249 *
250 * You can have set the @randomize field of @t, the timeout
251 * will be increased by a random number of seconds chosen
252 * uniformly from range 0 .. @randomize.
253 *
254 * You can call tm_start() from the handler function of the timer
255 * to request another run of the timer. Also, you can set the @recurrent
256 * field to have the timer re-added automatically with the same timeout.
257 */
void
tm_start(timer *t, unsigned after)
{
  bird_clock_t when;

  /* Optionally spread the timeout by a uniform random 0 .. randomize secs */
  if (t->randomize)
    after += random() % (t->randomize + 1);
  when = now + after;
  if (t->expires == when)
    return;			/* Already scheduled for that very moment */
  if (t->expires)
    rem_node(&t->n);		/* Active timer: unlink before re-inserting */
  t->expires = when;
  if (after <= NEAR_TIMER_LIMIT)
    tm_insert_near(t);		/* Short timeout: sorted near list */
  else
    {
      /* Long timeout: unsorted far list; keep the cached minimum fresh */
      if (!first_far_timer || first_far_timer > when)
	first_far_timer = when;
      add_tail(&far_timers, &t->n);
    }
}
280
281 /**
282 * tm_stop - stop a timer
283 * @t: timer
284 *
285 * This function stops a timer. If the timer is already stopped,
286 * nothing happens.
287 */
288 void
289 tm_stop(timer *t)
290 {
291 if (t->expires)
292 {
293 rem_node(&t->n);
294 t->expires = 0;
295 }
296 }
297
298 static void
299 tm_dump_them(char *name, list *l)
300 {
301 node *n;
302 timer *t;
303
304 debug("%s timers:\n", name);
305 WALK_LIST(n, *l)
306 {
307 t = SKIP_BACK(timer, n, n);
308 debug("%p ", t);
309 tm_dump(&t->r);
310 }
311 debug("\n");
312 }
313
/* Debug dump of both timer queues. */
void
tm_dump_all(void)
{
  tm_dump_them("Near", &near_timers);
  tm_dump_them("Far", &far_timers);
}
320
321 static inline time_t
322 tm_first_shot(void)
323 {
324 time_t x = first_far_timer;
325
326 if (!EMPTY_LIST(near_timers))
327 {
328 timer *t = SKIP_BACK(timer, n, HEAD(near_timers));
329 if (t->expires < x)
330 x = t->expires;
331 }
332 return x;
333 }
334
335 void io_log_event(void *hook, void *data);
336
static void
tm_shot(void)
{
  /* Fire all timers that are due: first migrate far timers about to
     expire into the sorted near list, then run the expired near timers. */
  timer *t;
  node *n, *m;

  if (first_far_timer <= now)
    {
      bird_clock_t limit = now + NEAR_TIMER_LIMIT;
      first_far_timer = TIME_INFINITY;	/* Recomputed during the scan */
      n = HEAD(far_timers);
      while (m = n->next)		/* Save successor: node may be unlinked */
	{
	  t = SKIP_BACK(timer, n, n);
	  if (t->expires <= limit)
	    {
	      /* Close to expiry: move to the sorted near list */
	      rem_node(n);
	      tm_insert_near(t);
	    }
	  else if (t->expires < first_far_timer)
	    first_far_timer = t->expires;
	  n = m;
	}
    }
  while ((n = HEAD(near_timers)) -> next)
    {
      int delay;
      t = SKIP_BACK(timer, n, n);
      if (t->expires > now)
	break;			/* Sorted list: the rest is not due yet */
      rem_node(n);
      delay = t->expires - now;	/* <= 0 here: how late we fire */
      t->expires = 0;		/* Mark inactive before re-arming / calling hook */
      if (t->recurrent)
	{
	  /* Re-arm a recurrent timer, adjusting the next interval by
	     the lateness. NOTE(review): delay <= 0, so the interval
	     becomes recurrent - delay — verify the intended cadence. */
	  int i = t->recurrent - delay;
	  if (i < 0)
	    i = 0;
	  tm_start(t, i);
	}
      io_log_event(t->hook, t->data);
      t->hook(t);
    }
}
381
382 /**
383 * tm_parse_datetime - parse a date and time
384 * @x: datetime string
385 *
386 * tm_parse_datetime() takes a textual representation of
387 * a date and time (dd-mm-yyyy hh:mm:ss)
388 * and converts it to the corresponding value of type &bird_clock_t.
389 */
390 bird_clock_t
391 tm_parse_datetime(char *x)
392 {
393 struct tm tm;
394 int n;
395 time_t t;
396
397 if (sscanf(x, "%d-%d-%d %d:%d:%d%n", &tm.tm_mday, &tm.tm_mon, &tm.tm_year, &tm.tm_hour, &tm.tm_min, &tm.tm_sec, &n) != 6 || x[n])
398 return tm_parse_date(x);
399 tm.tm_mon--;
400 tm.tm_year -= 1900;
401 t = mktime(&tm);
402 if (t == (time_t) -1)
403 return 0;
404 return t;
405 }
406 /**
407 * tm_parse_date - parse a date
408 * @x: date string
409 *
410 * tm_parse_date() takes a textual representation of a date (dd-mm-yyyy)
411 * and converts it to the corresponding value of type &bird_clock_t.
412 */
413 bird_clock_t
414 tm_parse_date(char *x)
415 {
416 struct tm tm;
417 int n;
418 time_t t;
419
420 if (sscanf(x, "%d-%d-%d%n", &tm.tm_mday, &tm.tm_mon, &tm.tm_year, &n) != 3 || x[n])
421 return 0;
422 tm.tm_mon--;
423 tm.tm_year -= 1900;
424 tm.tm_hour = tm.tm_min = tm.tm_sec = 0;
425 t = mktime(&tm);
426 if (t == (time_t) -1)
427 return 0;
428 return t;
429 }
430
431 static void
432 tm_format_reltime(char *x, struct tm *tm, bird_clock_t delta)
433 {
434 static char *month_names[12] = { "Jan", "Feb", "Mar", "Apr", "May", "Jun",
435 "Jul", "Aug", "Sep", "Oct", "Nov", "Dec" };
436
437 if (delta < 20*3600)
438 bsprintf(x, "%02d:%02d", tm->tm_hour, tm->tm_min);
439 else if (delta < 360*86400)
440 bsprintf(x, "%s%02d", month_names[tm->tm_mon], tm->tm_mday);
441 else
442 bsprintf(x, "%d", tm->tm_year+1900);
443 }
444
445 #include "conf/conf.h"
446
447 /**
448 * tm_format_datetime - convert date and time to textual representation
449 * @x: destination buffer of size %TM_DATETIME_BUFFER_SIZE
450 * @t: time
451 *
452 * This function formats the given relative time value @t to a textual
453 * date/time representation (dd-mm-yyyy hh:mm:ss) in real time.
454 */
455 void
456 tm_format_datetime(char *x, struct timeformat *fmt_spec, bird_clock_t t)
457 {
458 const char *fmt_used;
459 struct tm *tm;
460 bird_clock_t delta = now - t;
461 t = now_real - delta;
462 tm = localtime(&t);
463
464 if (fmt_spec->fmt1 == NULL)
465 return tm_format_reltime(x, tm, delta);
466
467 if ((fmt_spec->limit == 0) || (delta < fmt_spec->limit))
468 fmt_used = fmt_spec->fmt1;
469 else
470 fmt_used = fmt_spec->fmt2;
471
472 int rv = strftime(x, TM_DATETIME_BUFFER_SIZE, fmt_used, tm);
473 if (((rv == 0) && fmt_used[0]) || (rv == TM_DATETIME_BUFFER_SIZE))
474 strcpy(x, "<too-long>");
475 }
476
477
478 /**
479 * DOC: Sockets
480 *
481 * Socket resources represent network connections. Their data structure (&socket)
482 * contains a lot of fields defining the exact type of the socket, the local and
483 * remote addresses and ports, pointers to socket buffers and finally pointers to
484 * hook functions to be called when new data have arrived to the receive buffer
485 * (@rx_hook), when the contents of the transmit buffer have been transmitted
486 * (@tx_hook) and when an error or connection close occurs (@err_hook).
487 *
488 * Freeing of sockets from inside socket hooks is perfectly safe.
489 */
490
491 #ifndef SOL_IP
492 #define SOL_IP IPPROTO_IP
493 #endif
494
495 #ifndef SOL_IPV6
496 #define SOL_IPV6 IPPROTO_IPV6
497 #endif
498
499 #ifndef SOL_ICMPV6
500 #define SOL_ICMPV6 IPPROTO_ICMPV6
501 #endif
502
503
504 /*
505 * Sockaddr helper functions
506 */
507
/* Size of the kernel sockaddr structure appropriate for family @af
   (anything other than AF_INET is treated as IPv6). */
static inline int sockaddr_length(int af)
{
  if (af == AF_INET)
    return sizeof(struct sockaddr_in);

  return sizeof(struct sockaddr_in6);
}
510
static inline void
sockaddr_fill4(struct sockaddr_in *sa, ip_addr a, struct iface *ifa, uint port)
{
  /* Fill an IPv4 sockaddr with address @a and port @port.
     @ifa is unused here; it exists only for signature symmetry
     with sockaddr_fill6(). */
  memset(sa, 0, sizeof(struct sockaddr_in));
#ifdef HAVE_SIN_LEN
  sa->sin_len = sizeof(struct sockaddr_in);	/* BSD-style length field */
#endif
  sa->sin_family = AF_INET;
  sa->sin_port = htons(port);
  sa->sin_addr = ipa_to_in4(a);
}
522
static inline void
sockaddr_fill6(struct sockaddr_in6 *sa, ip_addr a, struct iface *ifa, uint port)
{
  /* Fill an IPv6 sockaddr with address @a and port @port; for link-local
     addresses the scope id is taken from @ifa (when given). */
  memset(sa, 0, sizeof(struct sockaddr_in6));
#ifdef SIN6_LEN
  sa->sin6_len = sizeof(struct sockaddr_in6);	/* BSD-style length field */
#endif
  sa->sin6_family = AF_INET6;
  sa->sin6_port = htons(port);
  sa->sin6_flowinfo = 0;
  sa->sin6_addr = ipa_to_in6(a);

  /* Link-local addresses are ambiguous without an interface scope */
  if (ifa && ipa_is_link_local(a))
    sa->sin6_scope_id = ifa->index;
}
538
539 void
540 sockaddr_fill(sockaddr *sa, int af, ip_addr a, struct iface *ifa, uint port)
541 {
542 if (af == AF_INET)
543 sockaddr_fill4((struct sockaddr_in *) sa, a, ifa, port);
544 else if (af == AF_INET6)
545 sockaddr_fill6((struct sockaddr_in6 *) sa, a, ifa, port);
546 else
547 bug("Unknown AF");
548 }
549
550 static inline void
551 sockaddr_read4(struct sockaddr_in *sa, ip_addr *a, struct iface **ifa, uint *port)
552 {
553 *port = ntohs(sa->sin_port);
554 *a = ipa_from_in4(sa->sin_addr);
555 }
556
static inline void
sockaddr_read6(struct sockaddr_in6 *sa, ip_addr *a, struct iface **ifa, uint *port)
{
  /* Extract address and port from an IPv6 sockaddr; for link-local
     addresses also resolve the scope id to an interface (if requested). */
  *port = ntohs(sa->sin6_port);
  *a = ipa_from_in6(sa->sin6_addr);

  if (ifa && ipa_is_link_local(*a))
    *ifa = if_find_by_index(sa->sin6_scope_id);
}
566
567 int
568 sockaddr_read(sockaddr *sa, int af, ip_addr *a, struct iface **ifa, uint *port)
569 {
570 if (sa->sa.sa_family != af)
571 goto fail;
572
573 if (af == AF_INET)
574 sockaddr_read4((struct sockaddr_in *) sa, a, ifa, port);
575 else if (af == AF_INET6)
576 sockaddr_read6((struct sockaddr_in6 *) sa, a, ifa, port);
577 else
578 goto fail;
579
580 return 0;
581
582 fail:
583 *a = IPA_NONE;
584 *port = 0;
585 return -1;
586 }
587
588
589 /*
590 * IPv6 multicast syscalls
591 */
592
593 /* Fortunately standardized in RFC 3493 */
594
595 #define INIT_MREQ6(maddr,ifa) \
596 { .ipv6mr_multiaddr = ipa_to_in6(maddr), .ipv6mr_interface = ifa->index }
597
598 static inline int
599 sk_setup_multicast6(sock *s)
600 {
601 int index = s->iface->index;
602 int ttl = s->ttl;
603 int n = 0;
604
605 if (setsockopt(s->fd, SOL_IPV6, IPV6_MULTICAST_IF, &index, sizeof(index)) < 0)
606 ERR("IPV6_MULTICAST_IF");
607
608 if (setsockopt(s->fd, SOL_IPV6, IPV6_MULTICAST_HOPS, &ttl, sizeof(ttl)) < 0)
609 ERR("IPV6_MULTICAST_HOPS");
610
611 if (setsockopt(s->fd, SOL_IPV6, IPV6_MULTICAST_LOOP, &n, sizeof(n)) < 0)
612 ERR("IPV6_MULTICAST_LOOP");
613
614 return 0;
615 }
616
617 static inline int
618 sk_join_group6(sock *s, ip_addr maddr)
619 {
620 struct ipv6_mreq mr = INIT_MREQ6(maddr, s->iface);
621
622 if (setsockopt(s->fd, SOL_IPV6, IPV6_JOIN_GROUP, &mr, sizeof(mr)) < 0)
623 ERR("IPV6_JOIN_GROUP");
624
625 return 0;
626 }
627
628 static inline int
629 sk_leave_group6(sock *s, ip_addr maddr)
630 {
631 struct ipv6_mreq mr = INIT_MREQ6(maddr, s->iface);
632
633 if (setsockopt(s->fd, SOL_IPV6, IPV6_LEAVE_GROUP, &mr, sizeof(mr)) < 0)
634 ERR("IPV6_LEAVE_GROUP");
635
636 return 0;
637 }
638
639
640 /*
641 * IPv6 packet control messages
642 */
643
644 /* Also standardized, in RFC 3542 */
645
646 /*
647 * RFC 2292 uses IPV6_PKTINFO for both the socket option and the cmsg
648 * type, RFC 3542 changed the socket option to IPV6_RECVPKTINFO. If we
649 * don't have IPV6_RECVPKTINFO we suppose the OS implements the older
650 * RFC and we use IPV6_PKTINFO.
651 */
652 #ifndef IPV6_RECVPKTINFO
653 #define IPV6_RECVPKTINFO IPV6_PKTINFO
654 #endif
655 /*
656 * Same goes for IPV6_HOPLIMIT -> IPV6_RECVHOPLIMIT.
657 */
658 #ifndef IPV6_RECVHOPLIMIT
659 #define IPV6_RECVHOPLIMIT IPV6_HOPLIMIT
660 #endif
661
662
663 #define CMSG6_SPACE_PKTINFO CMSG_SPACE(sizeof(struct in6_pktinfo))
664 #define CMSG6_SPACE_TTL CMSG_SPACE(sizeof(int))
665
666 static inline int
667 sk_request_cmsg6_pktinfo(sock *s)
668 {
669 int y = 1;
670
671 if (setsockopt(s->fd, SOL_IPV6, IPV6_RECVPKTINFO, &y, sizeof(y)) < 0)
672 ERR("IPV6_RECVPKTINFO");
673
674 return 0;
675 }
676
677 static inline int
678 sk_request_cmsg6_ttl(sock *s)
679 {
680 int y = 1;
681
682 if (setsockopt(s->fd, SOL_IPV6, IPV6_RECVHOPLIMIT, &y, sizeof(y)) < 0)
683 ERR("IPV6_RECVHOPLIMIT");
684
685 return 0;
686 }
687
688 static inline void
689 sk_process_cmsg6_pktinfo(sock *s, struct cmsghdr *cm)
690 {
691 if (cm->cmsg_type == IPV6_PKTINFO)
692 {
693 struct in6_pktinfo *pi = (struct in6_pktinfo *) CMSG_DATA(cm);
694 s->laddr = ipa_from_in6(pi->ipi6_addr);
695 s->lifindex = pi->ipi6_ifindex;
696 }
697 }
698
699 static inline void
700 sk_process_cmsg6_ttl(sock *s, struct cmsghdr *cm)
701 {
702 if (cm->cmsg_type == IPV6_HOPLIMIT)
703 s->rcv_ttl = * (int *) CMSG_DATA(cm);
704 }
705
static inline void
sk_prepare_cmsgs6(sock *s, struct msghdr *msg, void *cbuf, size_t cbuflen)
{
  /* Attach an IPV6_PKTINFO control message to @msg, selecting the
     source address and (optionally) the outgoing interface. */
  struct cmsghdr *cm;
  struct in6_pktinfo *pi;
  int controllen = 0;

  msg->msg_control = cbuf;
  msg->msg_controllen = cbuflen;	/* Full size while building */

  cm = CMSG_FIRSTHDR(msg);
  cm->cmsg_level = SOL_IPV6;
  cm->cmsg_type = IPV6_PKTINFO;
  cm->cmsg_len = CMSG_LEN(sizeof(*pi));
  controllen += CMSG_SPACE(sizeof(*pi));

  pi = (struct in6_pktinfo *) CMSG_DATA(cm);
  pi->ipi6_ifindex = s->iface ? s->iface->index : 0;	/* 0: no explicit iface */
  pi->ipi6_addr = ipa_to_in6(s->saddr);

  msg->msg_controllen = controllen;	/* Shrink to what was actually used */
}
728
729
730 /*
731 * Miscellaneous socket syscalls
732 */
733
734 static inline int
735 sk_set_ttl4(sock *s, int ttl)
736 {
737 if (setsockopt(s->fd, SOL_IP, IP_TTL, &ttl, sizeof(ttl)) < 0)
738 ERR("IP_TTL");
739
740 return 0;
741 }
742
743 static inline int
744 sk_set_ttl6(sock *s, int ttl)
745 {
746 if (setsockopt(s->fd, SOL_IPV6, IPV6_UNICAST_HOPS, &ttl, sizeof(ttl)) < 0)
747 ERR("IPV6_UNICAST_HOPS");
748
749 return 0;
750 }
751
752 static inline int
753 sk_set_tos4(sock *s, int tos)
754 {
755 if (setsockopt(s->fd, SOL_IP, IP_TOS, &tos, sizeof(tos)) < 0)
756 ERR("IP_TOS");
757
758 return 0;
759 }
760
761 static inline int
762 sk_set_tos6(sock *s, int tos)
763 {
764 if (setsockopt(s->fd, SOL_IPV6, IPV6_TCLASS, &tos, sizeof(tos)) < 0)
765 ERR("IPV6_TCLASS");
766
767 return 0;
768 }
769
static inline int
sk_set_high_port(sock *s)
{
  /* Request an ephemeral port from the high range when binding with
     port 0 (BSD-style IP_PORTRANGE / IPV6_PORTRANGE options). */

  /* Port range setting is optional, ignore it if not supported */

#ifdef IP_PORTRANGE
  if (sk_is_ipv4(s))
  {
    int range = IP_PORTRANGE_HIGH;
    if (setsockopt(s->fd, SOL_IP, IP_PORTRANGE, &range, sizeof(range)) < 0)
      ERR("IP_PORTRANGE");
  }
#endif

#ifdef IPV6_PORTRANGE
  if (sk_is_ipv6(s))
  {
    int range = IPV6_PORTRANGE_HIGH;
    if (setsockopt(s->fd, SOL_IPV6, IPV6_PORTRANGE, &range, sizeof(range)) < 0)
      ERR("IPV6_PORTRANGE");
  }
#endif

  return 0;
}
795
796 static inline byte *
797 sk_skip_ip_header(byte *pkt, int *len)
798 {
799 if ((*len < 20) || ((*pkt & 0xf0) != 0x40))
800 return NULL;
801
802 int hlen = (*pkt & 0x0f) * 4;
803 if ((hlen < 20) || (hlen > *len))
804 return NULL;
805
806 *len -= hlen;
807 return pkt + hlen;
808 }
809
810 byte *
811 sk_rx_buffer(sock *s, int *len)
812 {
813 if (sk_is_ipv4(s) && (s->type == SK_IP))
814 return sk_skip_ip_header(s->rbuf, len);
815 else
816 return s->rbuf;
817 }
818
819
820 /*
821 * Public socket functions
822 */
823
824 /**
825 * sk_setup_multicast - enable multicast for given socket
826 * @s: socket
827 *
828 * Prepare transmission of multicast packets for given datagram socket.
829 * The socket must have defined @iface.
830 *
831 * Result: 0 for success, -1 for an error.
832 */
833
834 int
835 sk_setup_multicast(sock *s)
836 {
837 ASSERT(s->iface);
838
839 if (sk_is_ipv4(s))
840 return sk_setup_multicast4(s);
841 else
842 return sk_setup_multicast6(s);
843 }
844
845 /**
846 * sk_join_group - join multicast group for given socket
847 * @s: socket
848 * @maddr: multicast address
849 *
850 * Join multicast group for given datagram socket and associated interface.
851 * The socket must have defined @iface.
852 *
853 * Result: 0 for success, -1 for an error.
854 */
855
856 int
857 sk_join_group(sock *s, ip_addr maddr)
858 {
859 if (sk_is_ipv4(s))
860 return sk_join_group4(s, maddr);
861 else
862 return sk_join_group6(s, maddr);
863 }
864
865 /**
866 * sk_leave_group - leave multicast group for given socket
867 * @s: socket
868 * @maddr: multicast address
869 *
870 * Leave multicast group for given datagram socket and associated interface.
871 * The socket must have defined @iface.
872 *
873 * Result: 0 for success, -1 for an error.
874 */
875
876 int
877 sk_leave_group(sock *s, ip_addr maddr)
878 {
879 if (sk_is_ipv4(s))
880 return sk_leave_group4(s, maddr);
881 else
882 return sk_leave_group6(s, maddr);
883 }
884
885 /**
886 * sk_setup_broadcast - enable broadcast for given socket
887 * @s: socket
888 *
889 * Allow reception and transmission of broadcast packets for given datagram
890 * socket. The socket must have defined @iface. For transmission, packets should
891 * be send to @brd address of @iface.
892 *
893 * Result: 0 for success, -1 for an error.
894 */
895
896 int
897 sk_setup_broadcast(sock *s)
898 {
899 int y = 1;
900
901 if (setsockopt(s->fd, SOL_SOCKET, SO_BROADCAST, &y, sizeof(y)) < 0)
902 ERR("SO_BROADCAST");
903
904 return 0;
905 }
906
907 /**
908 * sk_set_ttl - set transmit TTL for given socket
909 * @s: socket
910 * @ttl: TTL value
911 *
912 * Set TTL for already opened connections when TTL was not set before. Useful
913 * for accepted connections when different ones should have different TTL.
914 *
915 * Result: 0 for success, -1 for an error.
916 */
917
918 int
919 sk_set_ttl(sock *s, int ttl)
920 {
921 s->ttl = ttl;
922
923 if (sk_is_ipv4(s))
924 return sk_set_ttl4(s, ttl);
925 else
926 return sk_set_ttl6(s, ttl);
927 }
928
929 /**
930 * sk_set_min_ttl - set minimal accepted TTL for given socket
931 * @s: socket
932 * @ttl: TTL value
933 *
934 * Set minimal accepted TTL for given socket. Can be used for TTL security.
935 * implementations.
936 *
937 * Result: 0 for success, -1 for an error.
938 */
939
940 int
941 sk_set_min_ttl(sock *s, int ttl)
942 {
943 if (sk_is_ipv4(s))
944 return sk_set_min_ttl4(s, ttl);
945 else
946 return sk_set_min_ttl6(s, ttl);
947 }
948
949 #if 0
950 /**
951 * sk_set_md5_auth - add / remove MD5 security association for given socket
952 * @s: socket
953 * @a: IP address of the other side
954 * @ifa: Interface for link-local IP address
955 * @passwd: password used for MD5 authentication
956 *
957 * In TCP MD5 handling code in kernel, there is a set of pairs (address,
958 * password) used to choose password according to address of the other side.
959 * This function is useful for listening socket, for active sockets it is enough
960 * to set s->password field.
961 *
962 * When called with passwd != NULL, the new pair is added,
963 * When called with passwd == NULL, the existing pair is removed.
964 *
965 * Result: 0 for success, -1 for an error.
966 */
967
968 int
969 sk_set_md5_auth(sock *s, ip_addr a, struct iface *ifa, char *passwd)
970 { DUMMY; }
971 #endif
972
973 /**
974 * sk_set_ipv6_checksum - specify IPv6 checksum offset for given socket
975 * @s: socket
976 * @offset: offset
977 *
978 * Specify IPv6 checksum field offset for given raw IPv6 socket. After that, the
979 * kernel will automatically fill it for outgoing packets and check it for
980 * incoming packets. Should not be used on ICMPv6 sockets, where the position is
981 * known to the kernel.
982 *
983 * Result: 0 for success, -1 for an error.
984 */
985
986 int
987 sk_set_ipv6_checksum(sock *s, int offset)
988 {
989 if (setsockopt(s->fd, SOL_IPV6, IPV6_CHECKSUM, &offset, sizeof(offset)) < 0)
990 ERR("IPV6_CHECKSUM");
991
992 return 0;
993 }
994
995 int
996 sk_set_icmp6_filter(sock *s, int p1, int p2)
997 {
998 /* a bit of lame interface, but it is here only for Radv */
999 struct icmp6_filter f;
1000
1001 ICMP6_FILTER_SETBLOCKALL(&f);
1002 ICMP6_FILTER_SETPASS(p1, &f);
1003 ICMP6_FILTER_SETPASS(p2, &f);
1004
1005 if (setsockopt(s->fd, SOL_ICMPV6, ICMP6_FILTER, &f, sizeof(f)) < 0)
1006 ERR("ICMP6_FILTER");
1007
1008 return 0;
1009 }
1010
void
sk_log_error(sock *s, const char *p)
{
  /* Log the socket's stored error text plus errno (%#m) under prefix @p */
  log(L_ERR "%s: Socket error: %s%#m", p, s->err);
}
1016
1017
1018 /*
1019 * Actual struct birdsock code
1020 */
1021
static list sock_list;			/* All registered sockets (see sk_insert) */
/* NOTE(review): current_sock/stored_sock appear to be iteration cursors of
   the main I/O loop, fixed up in sk_free() when a socket goes away — confirm
   against the select loop below this chunk. */
static struct birdsock *current_sock;
static struct birdsock *stored_sock;
static int sock_recalc_fdsets_p;	/* Set when the fd sets must be rebuilt */
1026
1027 static inline sock *
1028 sk_next(sock *s)
1029 {
1030 if (!s->n.next->next)
1031 return NULL;
1032 else
1033 return SKIP_BACK(sock, n, s->n.next);
1034 }
1035
1036 static void
1037 sk_alloc_bufs(sock *s)
1038 {
1039 if (!s->rbuf && s->rbsize)
1040 s->rbuf = s->rbuf_alloc = xmalloc(s->rbsize);
1041 s->rpos = s->rbuf;
1042 if (!s->tbuf && s->tbsize)
1043 s->tbuf = s->tbuf_alloc = xmalloc(s->tbsize);
1044 s->tpos = s->ttx = s->tbuf;
1045 }
1046
1047 static void
1048 sk_free_bufs(sock *s)
1049 {
1050 if (s->rbuf_alloc)
1051 {
1052 xfree(s->rbuf_alloc);
1053 s->rbuf = s->rbuf_alloc = NULL;
1054 }
1055 if (s->tbuf_alloc)
1056 {
1057 xfree(s->tbuf_alloc);
1058 s->tbuf = s->tbuf_alloc = NULL;
1059 }
1060 }
1061
static void
sk_free(resource *r)
{
  /* Resource destructor: release buffers, close the fd and unlink the
     socket from the main-loop bookkeeping. */
  sock *s = (sock *) r;

  sk_free_bufs(s);
  if (s->fd >= 0)
  {
    close(s->fd);

    /* FIXME: we should call sk_stop() for SKF_THREAD sockets */
    if (s->flags & SKF_THREAD)
      return;

    /* Keep the main loop's iteration cursors valid when this socket
       is removed from under them */
    if (s == current_sock)
      current_sock = sk_next(s);
    if (s == stored_sock)
      stored_sock = sk_next(s);
    rem_node(&s->n);
    sock_recalc_fdsets_p = 1;
  }
}
1084
void
sk_set_rbsize(sock *s, uint val)
{
  /* Resize the receive buffer to @val bytes; any buffered data is
     discarded. Only valid while the internal buffer is in use. */
  ASSERT(s->rbuf_alloc == s->rbuf);

  if (s->rbsize == val)
    return;

  s->rbsize = val;
  xfree(s->rbuf_alloc);
  s->rbuf_alloc = xmalloc(val);
  s->rpos = s->rbuf = s->rbuf_alloc;
}
1098
void
sk_set_tbsize(sock *s, uint val)
{
  /* Resize the transmit buffer to @val bytes, preserving any buffered
     data (via xrealloc) and fixing up tpos/ttx relative to the new
     allocation. Only valid while the internal buffer is in use. */
  ASSERT(s->tbuf_alloc == s->tbuf);

  if (s->tbsize == val)
    return;

  byte *old_tbuf = s->tbuf;

  s->tbsize = val;
  s->tbuf = s->tbuf_alloc = xrealloc(s->tbuf_alloc, val);
  /* Translate positions into the possibly relocated buffer */
  s->tpos = s->tbuf + (s->tpos - old_tbuf);
  s->ttx = s->tbuf + (s->ttx - old_tbuf);
}
1114
1115 void
1116 sk_set_tbuf(sock *s, void *tbuf)
1117 {
1118 s->tbuf = tbuf ?: s->tbuf_alloc;
1119 s->ttx = s->tpos = s->tbuf;
1120 }
1121
void
sk_reallocate(sock *s)
{
  /* Drop and re-create the socket buffers from the configured sizes. */
  sk_free_bufs(s);
  sk_alloc_bufs(s);
}
1128
static void
sk_dump(resource *r)
{
  /* Dump hook: one-line summary of the socket's type, addresses,
     ports and options. The name table is indexed by SK_* type. */
  sock *s = (sock *) r;
  static char *sk_type_names[] = { "TCP<", "TCP>", "TCP", "UDP", NULL, "IP", NULL, "MAGIC", "UNIX<", "UNIX", "DEL!" };

  debug("(%s, ud=%p, sa=%I, sp=%d, da=%I, dp=%d, tos=%d, ttl=%d, if=%s)\n",
	sk_type_names[s->type],
	s->data,
	s->saddr,
	s->sport,
	s->daddr,
	s->dport,
	s->tos,
	s->ttl,
	s->iface ? s->iface->name : "none");
}
1146
/* Resource class descriptor for sockets */
static struct resclass sk_class = {
  "Socket",
  sizeof(sock),
  sk_free,			/* Closes fd and unlinks from main loop */
  sk_dump,
  NULL,
  NULL
};
1155
1156 /**
1157 * sk_new - create a socket
1158 * @p: pool
1159 *
1160 * This function creates a new socket resource. If you want to use it,
1161 * you need to fill in all the required fields of the structure and
1162 * call sk_open() to do the actual opening of the socket.
1163 *
1164 * The real function name is sock_new(), sk_new() is a macro wrapper
1165 * to avoid collision with OpenSSL.
1166 */
sock *
sock_new(pool *p)
{
  sock *s = ralloc(p, &sk_class);
  s->pool = p;
  // s->saddr = s->daddr = IPA_NONE;
  s->tos = s->priority = s->ttl = -1;	/* -1: keep the system defaults */
  s->fd = -1;				/* Not opened yet */
  return s;
}
1177
static int
sk_setup(sock *s)
{
  /* Apply per-socket options common to all newly created sockets.
     Returns 0 on success, -1 (via ERR(), with s->err set) on failure. */
  int y = 1;
  int fd = s->fd;

  if (fcntl(fd, F_SETFL, O_NONBLOCK) < 0)
    ERR("O_NONBLOCK");

  /* Sockets without an address family (e.g. SK_MAGIC) need nothing more */
  if (!s->af)
    return 0;

  /* Sending from a specific source without binding requires per-packet
     source selection through PKTINFO ancillary data */
  if (ipa_nonzero(s->saddr) && !(s->flags & SKF_BIND))
    s->flags |= SKF_PKTINFO;

#ifdef CONFIG_USE_HDRINCL
  /* On these systems, raw IPv4 sockets build the IP header themselves
     instead of using PKTINFO */
  if (sk_is_ipv4(s) && (s->type == SK_IP) && (s->flags & SKF_PKTINFO))
  {
    s->flags &= ~SKF_PKTINFO;
    s->flags |= SKF_HDRINCL;
    if (setsockopt(fd, SOL_IP, IP_HDRINCL, &y, sizeof(y)) < 0)
      ERR("IP_HDRINCL");
  }
#endif

  if (s->iface)
  {
#ifdef SO_BINDTODEVICE
    /* NOTE(review): strcpy into ifr_name assumes the interface name
       is shorter than IFNAMSIZ — confirm upstream guarantees this */
    struct ifreq ifr;
    strcpy(ifr.ifr_name, s->iface->name);
    if (setsockopt(s->fd, SOL_SOCKET, SO_BINDTODEVICE, &ifr, sizeof(ifr)) < 0)
      ERR("SO_BINDTODEVICE");
#endif

#ifdef CONFIG_UNIX_DONTROUTE
    if (setsockopt(s->fd, SOL_SOCKET, SO_DONTROUTE, &y, sizeof(y)) < 0)
      ERR("SO_DONTROUTE");
#endif
  }

  if (s->priority >= 0)
    if (sk_set_priority(s, s->priority) < 0)
      return -1;

  if (sk_is_ipv4(s))
  {
    /* Optional reception of local address / TTL as ancillary data */
    if (s->flags & SKF_LADDR_RX)
      if (sk_request_cmsg4_pktinfo(s) < 0)
	return -1;

    if (s->flags & SKF_TTL_RX)
      if (sk_request_cmsg4_ttl(s) < 0)
	return -1;

    /* Datagram sockets must not rely on path MTU discovery */
    if ((s->type == SK_UDP) || (s->type == SK_IP))
      if (sk_disable_mtu_disc4(s) < 0)
	return -1;

    if (s->ttl >= 0)
      if (sk_set_ttl4(s, s->ttl) < 0)
	return -1;

    if (s->tos >= 0)
      if (sk_set_tos4(s, s->tos) < 0)
	return -1;
  }

  if (sk_is_ipv6(s))
  {
    if (s->flags & SKF_V6ONLY)
      if (setsockopt(fd, SOL_IPV6, IPV6_V6ONLY, &y, sizeof(y)) < 0)
	ERR("IPV6_V6ONLY");

    if (s->flags & SKF_LADDR_RX)
      if (sk_request_cmsg6_pktinfo(s) < 0)
	return -1;

    if (s->flags & SKF_TTL_RX)
      if (sk_request_cmsg6_ttl(s) < 0)
	return -1;

    if ((s->type == SK_UDP) || (s->type == SK_IP))
      if (sk_disable_mtu_disc6(s) < 0)
	return -1;

    if (s->ttl >= 0)
      if (sk_set_ttl6(s, s->ttl) < 0)
	return -1;

    if (s->tos >= 0)
      if (sk_set_tos6(s, s->tos) < 0)
	return -1;
  }

  return 0;
}
1274
static void
sk_insert(sock *s)
{
  /* Register @s in the global socket list and request fd set rebuild. */
  add_tail(&sock_list, &s->n);
  sock_recalc_fdsets_p = 1;
}
1281
1282 static void
1283 sk_tcp_connected(sock *s)
1284 {
1285 sockaddr sa;
1286 int sa_len = sizeof(sa);
1287
1288 if ((getsockname(s->fd, &sa.sa, &sa_len) < 0) ||
1289 (sockaddr_read(&sa, s->af, &s->saddr, &s->iface, &s->sport) < 0))
1290 log(L_WARN "SOCK: Cannot get local IP address for TCP>");
1291
1292 s->type = SK_TCP;
1293 sk_alloc_bufs(s);
1294 s->tx_hook(s);
1295 }
1296
/*
 * sk_passive_connected - accept a connection on a passive socket.
 *
 * Accepts one pending connection on listening socket @s and builds a new
 * socket of @type (SK_TCP or SK_UNIX) inheriting basic parameters from
 * @s. Returns 0 when nothing was accepted (transient error), 1 when the
 * accept event was consumed — even if the new connection itself had to
 * be rejected.
 */
static int
sk_passive_connected(sock *s, int type)
{
  sockaddr loc_sa, rem_sa;
  int loc_sa_len = sizeof(loc_sa);
  int rem_sa_len = sizeof(rem_sa);

  /* Peer address is only meaningful for TCP; UNIX sockets pass NULL */
  int fd = accept(s->fd, ((type == SK_TCP) ? &rem_sa.sa : NULL), &rem_sa_len);
  if (fd < 0)
  {
    if ((errno != EINTR) && (errno != EAGAIN))
      s->err_hook(s, errno);
    return 0;
  }

  /* Child socket inherits basic parameters from the listener */
  sock *t = sk_new(s->pool);
  t->type = type;
  t->fd = fd;
  t->af = s->af;
  t->ttl = s->ttl;
  t->tos = s->tos;
  t->rbsize = s->rbsize;
  t->tbsize = s->tbsize;

  if (type == SK_TCP)
  {
    if ((getsockname(fd, &loc_sa.sa, &loc_sa_len) < 0) ||
	(sockaddr_read(&loc_sa, s->af, &t->saddr, &t->iface, &t->sport) < 0))
      log(L_WARN "SOCK: Cannot get local IP address for TCP<");

    if (sockaddr_read(&rem_sa, s->af, &t->daddr, &t->iface, &t->dport) < 0)
      log(L_WARN "SOCK: Cannot get remote IP address for TCP<");
  }

  /* select()-based loop cannot track fds beyond FD_SETSIZE */
  if (fd >= FD_SETSIZE)
  {
    /* FIXME: Call err_hook instead ? */
    log(L_ERR "SOCK: Incoming connection from %I%J (port %d) %s",
	t->daddr, ipa_is_link_local(t->daddr) ? t->iface : NULL,
	t->dport, "rejected due to FD_SETSIZE limit");
    close(fd);
    t->fd = -1;		/* Prevent rfree() from touching the closed fd */
    rfree(t);
    return 1;
  }

  if (sk_setup(t) < 0)
  {
    /* FIXME: Call err_hook instead ? */
    log(L_ERR "SOCK: Incoming connection: %s%#m", t->err);

    /* FIXME: handle it better in rfree() */
    close(t->fd);
    t->fd = -1;
    rfree(t);
    return 1;
  }

  sk_insert(t);
  sk_alloc_bufs(t);
  s->rx_hook(t, 0);	/* Hand the new socket to the listener's owner */
  return 1;
}
1360
/**
 * sk_open - open a socket
 * @s: socket
 *
 * This function takes a socket resource created by sk_new() and
 * initialized by the user and binds a corresponding network connection
 * to it. Depending on s->type it creates a TCP, UDP or raw IP socket,
 * optionally binds it, applies MD5 authentication and (for active TCP)
 * initiates the connection.
 *
 * Result: 0 for success, -1 for an error (ERR/ERR2 presumably jump to
 * the err: label and record s->err — confirm in the macro definitions).
 */
int
sk_open(sock *s)
{
  int af = BIRD_AF;
  int fd = -1;
  int do_bind = 0;
  int bind_port = 0;
  ip_addr bind_addr = IPA_NONE;
  sockaddr sa;

  switch (s->type)
  {
  case SK_TCP_ACTIVE:
    s->ttx = "";			/* Force s->ttx != s->tpos */
    /* Fall thru */
  case SK_TCP_PASSIVE:
    fd = socket(af, SOCK_STREAM, IPPROTO_TCP);
    bind_port = s->sport;
    bind_addr = s->saddr;
    do_bind = bind_port || ipa_nonzero(bind_addr);
    break;

  case SK_UDP:
    fd = socket(af, SOCK_DGRAM, IPPROTO_UDP);
    bind_port = s->sport;
    /* Bind to a concrete source address only when explicitly requested */
    bind_addr = (s->flags & SKF_BIND) ? s->saddr : IPA_NONE;
    do_bind = 1;
    break;

  case SK_IP:
    /* For raw IP sockets, dport carries the IP protocol number */
    fd = socket(af, SOCK_RAW, s->dport);
    bind_port = 0;
    bind_addr = (s->flags & SKF_BIND) ? s->saddr : IPA_NONE;
    do_bind = ipa_nonzero(bind_addr);
    break;

  case SK_MAGIC:
    /* Pre-existing fd supplied by the caller; no socket() call */
    af = 0;
    fd = s->fd;
    break;

  default:
    bug("sk_open() called for invalid sock type %d", s->type);
  }

  if (fd < 0)
    ERR("socket");

  if (fd >= FD_SETSIZE)
    ERR2("FD_SETSIZE limit reached");

  s->af = af;
  s->fd = fd;

  if (sk_setup(s) < 0)
    goto err;

  if (do_bind)
  {
    if (bind_port)
    {
      int y = 1;

      if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &y, sizeof(y)) < 0)
	ERR2("SO_REUSEADDR");

#ifdef CONFIG_NO_IFACE_BIND
      /* Workaround missing ability to bind to an iface */
      if ((s->type == SK_UDP) && s->iface && ipa_zero(bind_addr))
      {
	if (setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &y, sizeof(y)) < 0)
	  ERR2("SO_REUSEPORT");
      }
#endif
    }
    else
      if (s->flags & SKF_HIGH_PORT)
	if (sk_set_high_port(s) < 0)
	  log(L_WARN "Socket error: %s%#m", s->err);

    sockaddr_fill(&sa, af, bind_addr, s->iface, bind_port);
    if (bind(fd, &sa.sa, SA_LEN(sa)) < 0)
      ERR2("bind");
  }

  if (s->password)
    if (sk_set_md5_auth(s, s->daddr, s->iface, s->password) < 0)
      goto err;

  switch (s->type)
  {
  case SK_TCP_ACTIVE:
    sockaddr_fill(&sa, af, s->daddr, s->iface, s->dport);
    if (connect(fd, &sa.sa, SA_LEN(sa)) >= 0)
      sk_tcp_connected(s);
    /* In-progress and common transient failures are handled later by
       sk_write(); only unexpected errno values abort the open */
    else if (errno != EINTR && errno != EAGAIN && errno != EINPROGRESS &&
	     errno != ECONNREFUSED && errno != EHOSTUNREACH && errno != ENETUNREACH)
      ERR2("connect");
    break;

  case SK_TCP_PASSIVE:
    if (listen(fd, 8) < 0)
      ERR2("listen");
    break;

  case SK_MAGIC:
    break;

  default:
    sk_alloc_bufs(s);
  }

  /* SKF_THREAD sockets run outside the main loop and register themselves */
  if (!(s->flags & SKF_THREAD))
    sk_insert(s);
  return 0;

err:
  close(fd);
  s->fd = -1;
  return -1;
}
1492
1493 int
1494 sk_open_unix(sock *s, char *name)
1495 {
1496 struct sockaddr_un sa;
1497 int fd;
1498
1499 /* We are sloppy during error (leak fd and not set s->err), but we die anyway */
1500
1501 fd = socket(AF_UNIX, SOCK_STREAM, 0);
1502 if (fd < 0)
1503 return -1;
1504
1505 if (fcntl(fd, F_SETFL, O_NONBLOCK) < 0)
1506 return -1;
1507
1508 /* Path length checked in test_old_bird() */
1509 sa.sun_family = AF_UNIX;
1510 strcpy(sa.sun_path, name);
1511
1512 if (bind(fd, (struct sockaddr *) &sa, SUN_LEN(&sa)) < 0)
1513 return -1;
1514
1515 if (listen(fd, 8) < 0)
1516 return -1;
1517
1518 s->fd = fd;
1519 sk_insert(s);
1520 return 0;
1521 }
1522
1523
/* Upper bounds for ancillary-data buffers, sized to cover whichever
   address family (IPv4 or IPv6) needs more space */
#define CMSG_RX_SPACE MAX(CMSG4_SPACE_PKTINFO+CMSG4_SPACE_TTL, \
			  CMSG6_SPACE_PKTINFO+CMSG6_SPACE_TTL)
#define CMSG_TX_SPACE MAX(CMSG4_SPACE_PKTINFO,CMSG6_SPACE_PKTINFO)
1527
1528 static void
1529 sk_prepare_cmsgs(sock *s, struct msghdr *msg, void *cbuf, size_t cbuflen)
1530 {
1531 if (sk_is_ipv4(s))
1532 sk_prepare_cmsgs4(s, msg, cbuf, cbuflen);
1533 else
1534 sk_prepare_cmsgs6(s, msg, cbuf, cbuflen);
1535 }
1536
1537 static void
1538 sk_process_cmsgs(sock *s, struct msghdr *msg)
1539 {
1540 struct cmsghdr *cm;
1541
1542 s->laddr = IPA_NONE;
1543 s->lifindex = 0;
1544 s->rcv_ttl = -1;
1545
1546 for (cm = CMSG_FIRSTHDR(msg); cm != NULL; cm = CMSG_NXTHDR(msg, cm))
1547 {
1548 if ((cm->cmsg_level == SOL_IP) && sk_is_ipv4(s))
1549 {
1550 sk_process_cmsg4_pktinfo(s, cm);
1551 sk_process_cmsg4_ttl(s, cm);
1552 }
1553
1554 if ((cm->cmsg_level == SOL_IPV6) && sk_is_ipv6(s))
1555 {
1556 sk_process_cmsg6_pktinfo(s, cm);
1557 sk_process_cmsg6_ttl(s, cm);
1558 }
1559 }
1560 }
1561
1562
/*
 * sk_sendmsg - transmit the pending TX buffer as one datagram.
 *
 * Sends s->tbuf..s->tpos to s->daddr:s->dport via sendmsg(). When
 * SKF_HDRINCL is set (and supported), a raw IP header is prepended as a
 * separate iovec; when SKF_PKTINFO is set, source-address control
 * messages are attached. Returns the sendmsg() result (bytes sent, or
 * -1 with errno set).
 */
static inline int
sk_sendmsg(sock *s)
{
  struct iovec iov = {s->tbuf, s->tpos - s->tbuf};
  byte cmsg_buf[CMSG_TX_SPACE];
  sockaddr dst;

  sockaddr_fill(&dst, s->af, s->daddr, s->iface, s->dport);

  struct msghdr msg = {
    .msg_name = &dst.sa,
    .msg_namelen = SA_LEN(dst),
    .msg_iov = &iov,
    .msg_iovlen = 1
  };

#ifdef CONFIG_USE_HDRINCL
  /* Hand-built 20-byte IPv4 header prepended as the first iovec */
  byte hdr[20];
  struct iovec iov2[2] = { {hdr, 20}, iov };

  if (s->flags & SKF_HDRINCL)
  {
    sk_prepare_ip_header(s, hdr, iov.iov_len);
    msg.msg_iov = iov2;
    msg.msg_iovlen = 2;
  }
#endif

  if (s->flags & SKF_PKTINFO)
    sk_prepare_cmsgs(s, &msg, cmsg_buf, sizeof(cmsg_buf));

  return sendmsg(s->fd, &msg, 0);
}
1596
/*
 * sk_recvmsg - receive one datagram into the RX buffer.
 *
 * Reads a packet via recvmsg() into s->rbuf, stores the sender's
 * address/port in s->faddr/s->fport, processes ancillary data (local
 * address, ifindex, TTL) and records truncation in SKF_TRUNCATED.
 * Returns the number of bytes received, or a negative value with errno
 * set on error.
 */
static inline int
sk_recvmsg(sock *s)
{
  struct iovec iov = {s->rbuf, s->rbsize};
  byte cmsg_buf[CMSG_RX_SPACE];
  sockaddr src;

  struct msghdr msg = {
    .msg_name = &src.sa,
    .msg_namelen = sizeof(src), // XXXX ??
    .msg_iov = &iov,
    .msg_iovlen = 1,
    .msg_control = cmsg_buf,
    .msg_controllen = sizeof(cmsg_buf),
    .msg_flags = 0
  };

  int rv = recvmsg(s->fd, &msg, 0);
  if (rv < 0)
    return rv;

  //ifdef IPV4
  //  if (cf_type == SK_IP)
  //    rv = ipv4_skip_header(pbuf, rv);
  //endif

  sockaddr_read(&src, s->af, &s->faddr, NULL, &s->fport);
  sk_process_cmsgs(s, &msg);

  /* MSG_TRUNC means the datagram did not fit into rbuf */
  if (msg.msg_flags & MSG_TRUNC)
    s->flags |= SKF_TRUNCATED;
  else
    s->flags &= ~SKF_TRUNCATED;

  return rv;
}
1633
1634
/* Mark the TX buffer as empty — no data pending for transmission */
static inline void reset_tx_buffer(sock *s) { s->ttx = s->tpos = s->tbuf; }
1636
/*
 * sk_maybe_write - try to flush the pending TX buffer.
 *
 * For stream-like sockets (TCP, MAGIC, UNIX) writes the buffer in a
 * loop until done or the kernel would block; for datagram sockets (UDP,
 * raw IP) sends it as a single packet. Returns 1 when the buffer was
 * fully sent, 0 when (part of) it remains queued for a later sk_write(),
 * -1 on a hard error (err_hook has been called and the buffer reset).
 */
static int
sk_maybe_write(sock *s)
{
  int e;

  switch (s->type)
  {
  case SK_TCP:
  case SK_MAGIC:
  case SK_UNIX:
    while (s->ttx != s->tpos)
    {
      e = write(s->fd, s->ttx, s->tpos - s->ttx);

      if (e < 0)
      {
	if (errno != EINTR && errno != EAGAIN)
	{
	  reset_tx_buffer(s);
	  /* EPIPE is just a connection close notification during TX */
	  s->err_hook(s, (errno != EPIPE) ? errno : 0);
	  return -1;
	}
	return 0;	/* Would block — leave the rest for sk_write() */
      }
      s->ttx += e;	/* Partial write; advance and retry */
    }
    reset_tx_buffer(s);
    return 1;

  case SK_UDP:
  case SK_IP:
    {
      if (s->tbuf == s->tpos)
	return 1;	/* Nothing queued */

      e = sk_sendmsg(s);

      if (e < 0)
      {
	if (errno != EINTR && errno != EAGAIN)
	{
	  reset_tx_buffer(s);
	  s->err_hook(s, errno);
	  return -1;
	}

	/* Without a tx_hook nobody would ever retry — drop the packet */
	if (!s->tx_hook)
	  reset_tx_buffer(s);
	return 0;
      }
      reset_tx_buffer(s);
      return 1;
    }
  default:
    bug("sk_maybe_write: unknown socket type %d", s->type);
  }
}
1695
1696 int
1697 sk_rx_ready(sock *s)
1698 {
1699 fd_set rd, wr;
1700 struct timeval timo;
1701 int rv;
1702
1703 FD_ZERO(&rd);
1704 FD_ZERO(&wr);
1705 FD_SET(s->fd, &rd);
1706
1707 timo.tv_sec = 0;
1708 timo.tv_usec = 0;
1709
1710 redo:
1711 rv = select(s->fd+1, &rd, &wr, NULL, &timo);
1712
1713 if ((rv < 0) && (errno == EINTR || errno == EAGAIN))
1714 goto redo;
1715
1716 return rv;
1717 }
1718
1719 /**
1720 * sk_send - send data to a socket
1721 * @s: socket
1722 * @len: number of bytes to send
1723 *
1724 * This function sends @len bytes of data prepared in the
1725 * transmit buffer of the socket @s to the network connection.
1726 * If the packet can be sent immediately, it does so and returns
1727 * 1, else it queues the packet for later processing, returns 0
1728 * and calls the @tx_hook of the socket when the tranmission
1729 * takes place.
1730 */
1731 int
1732 sk_send(sock *s, unsigned len)
1733 {
1734 s->ttx = s->tbuf;
1735 s->tpos = s->tbuf + len;
1736 return sk_maybe_write(s);
1737 }
1738
1739 /**
1740 * sk_send_to - send data to a specific destination
1741 * @s: socket
1742 * @len: number of bytes to send
1743 * @addr: IP address to send the packet to
1744 * @port: port to send the packet to
1745 *
1746 * This is a sk_send() replacement for connection-less packet sockets
1747 * which allows destination of the packet to be chosen dynamically.
1748 * Raw IP sockets should use 0 for @port.
1749 */
1750 int
1751 sk_send_to(sock *s, unsigned len, ip_addr addr, unsigned port)
1752 {
1753 s->daddr = addr;
1754 if (port)
1755 s->dport = port;
1756
1757 s->ttx = s->tbuf;
1758 s->tpos = s->tbuf + len;
1759 return sk_maybe_write(s);
1760 }
1761
1762 /*
1763 int
1764 sk_send_full(sock *s, unsigned len, struct iface *ifa,
1765 ip_addr saddr, ip_addr daddr, unsigned dport)
1766 {
1767 s->iface = ifa;
1768 s->saddr = saddr;
1769 s->daddr = daddr;
1770 s->dport = dport;
1771 s->ttx = s->tbuf;
1772 s->tpos = s->tbuf + len;
1773 return sk_maybe_write(s);
1774 }
1775 */
1776
/* sk_read() and sk_write() are called from BFD's event loop */

/*
 * sk_read - handle a read-ready event on a socket.
 *
 * Passive sockets accept a connection; stream sockets read into the RX
 * buffer and deliver accumulated data via rx_hook; datagram/raw sockets
 * receive one packet. Returns nonzero if useful work was done (so the
 * caller may iterate). The socket may be deleted from inside rx_hook,
 * hence the current_sock comparisons before touching @s again.
 */
int
sk_read(sock *s)
{
  switch (s->type)
  {
  case SK_TCP_PASSIVE:
    return sk_passive_connected(s, SK_TCP);

  case SK_UNIX_PASSIVE:
    return sk_passive_connected(s, SK_UNIX);

  case SK_TCP:
  case SK_UNIX:
    {
      /* Append to whatever is already buffered in rbuf */
      int c = read(s->fd, s->rpos, s->rbuf + s->rbsize - s->rpos);

      if (c < 0)
      {
	if (errno != EINTR && errno != EAGAIN)
	  s->err_hook(s, errno);
      }
      else if (!c)
	s->err_hook(s, 0);	/* EOF — peer closed the connection */
      else
      {
	s->rpos += c;
	/* rx_hook returning nonzero means the data was consumed */
	if (s->rx_hook(s, s->rpos - s->rbuf))
	{
	  /* We need to be careful since the socket could have been deleted by the hook */
	  if (current_sock == s)
	    s->rpos = s->rbuf;
	}
	return 1;
      }
      return 0;
    }

  case SK_MAGIC:
    return s->rx_hook(s, 0);

  default:
    {
      /* Datagram sockets: one packet per call */
      int e = sk_recvmsg(s);

      if (e < 0)
      {
	if (errno != EINTR && errno != EAGAIN)
	  s->err_hook(s, errno);
	return 0;
      }

      s->rpos = s->rbuf + e;
      s->rx_hook(s, e);
      return 1;
    }
  }
}
1836
/*
 * sk_write - handle a write-ready event on a socket.
 *
 * For SK_TCP_ACTIVE this means the non-blocking connect() finished:
 * retry connect() to learn the result (EISCONN also counts as success).
 * For other types, flush pending TX data and invoke tx_hook once the
 * buffer drains. Returns 1 if the buffer was flushed, 0 otherwise.
 */
int
sk_write(sock *s)
{
  switch (s->type)
  {
  case SK_TCP_ACTIVE:
    {
      sockaddr sa;
      sockaddr_fill(&sa, s->af, s->daddr, s->iface, s->dport);

      if (connect(s->fd, &sa.sa, SA_LEN(sa)) >= 0 || errno == EISCONN)
	sk_tcp_connected(s);
      else if (errno != EINTR && errno != EAGAIN && errno != EINPROGRESS)
	s->err_hook(s, errno);
      return 0;
    }

  default:
    if (s->ttx != s->tpos && sk_maybe_write(s) > 0)
    {
      if (s->tx_hook)
	s->tx_hook(s);
      return 1;
    }
    return 0;
  }
}
1864
1865 void
1866 sk_dump_all(void)
1867 {
1868 node *n;
1869 sock *s;
1870
1871 debug("Open sockets:\n");
1872 WALK_LIST(n, sock_list)
1873 {
1874 s = SKIP_BACK(sock, n, n);
1875 debug("%p ", s);
1876 sk_dump(&s->r);
1877 }
1878 debug("\n");
1879 }
1880
1881
/*
 *	Internal event log and watchdog
 */

#define EVENT_LOG_LENGTH 32

/* One record in the circular log of recently executed internal events */
struct event_log_entry
{
  void *hook;			/* Event handler that was invoked */
  void *data;			/* Argument passed to the handler */
  btime timestamp;		/* When the event started */
  btime duration;		/* How long it ran (filled on completion) */
};

static struct event_log_entry event_log[EVENT_LOG_LENGTH];
static struct event_log_entry *event_open;	/* Entry still running, awaiting its duration */
static int event_log_pos, event_log_num, watchdog_active;
static btime last_time;		/* Monotonic clock sample from io_update_time() */
static btime loop_time;		/* Start time of the current I/O loop cycle */
1901
/*
 * io_update_time - sample the monotonic clock for latency tracking.
 *
 * Refreshes @last_time and, when an event log entry is open, closes it
 * by computing its duration and warning if it exceeded the configured
 * latency limit. A no-op when no monotonic clock is available.
 */
static void
io_update_time(void)
{
  struct timespec ts;
  int rv;

  if (!clock_monotonic_available)
    return;

  /*
   * This is third time-tracking procedure (after update_times() above and
   * times_update() in BFD), dedicated to internal event log and latency
   * tracking. Hopefully, we consolidate these sometimes.
   */

  rv = clock_gettime(CLOCK_MONOTONIC, &ts);
  if (rv < 0)
    die("clock_gettime: %m");

  /* Convert to btime: seconds scaled by S plus microseconds */
  last_time = ((s64) ts.tv_sec S) + (ts.tv_nsec / 1000);

  if (event_open)
  {
    event_open->duration = last_time - event_open->timestamp;

    if (event_open->duration > config->latency_limit)
      log(L_WARN "Event 0x%p 0x%p took %d ms",
	  event_open->hook, event_open->data, (int) (event_open->duration TO_MS));

    event_open = NULL;
  }
}
1934
1935 /**
1936 * io_log_event - mark approaching event into event log
1937 * @hook: event hook address
1938 * @data: event data address
1939 *
1940 * Store info (hook, data, timestamp) about the following internal event into
1941 * a circular event log (@event_log). When latency tracking is enabled, the log
1942 * entry is kept open (in @event_open) so the duration can be filled later.
1943 */
1944 void
1945 io_log_event(void *hook, void *data)
1946 {
1947 if (config->latency_debug)
1948 io_update_time();
1949
1950 struct event_log_entry *en = event_log + event_log_pos;
1951
1952 en->hook = hook;
1953 en->data = data;
1954 en->timestamp = last_time;
1955 en->duration = 0;
1956
1957 event_log_num++;
1958 event_log_pos++;
1959 event_log_pos %= EVENT_LOG_LENGTH;
1960
1961 event_open = config->latency_debug ? en : NULL;
1962 }
1963
1964 static inline void
1965 io_close_event(void)
1966 {
1967 if (event_open)
1968 io_update_time();
1969 }
1970
1971 void
1972 io_log_dump(void)
1973 {
1974 int i;
1975
1976 log(L_DEBUG "Event log:");
1977 for (i = 0; i < EVENT_LOG_LENGTH; i++)
1978 {
1979 struct event_log_entry *en = event_log + (event_log_pos + i) % EVENT_LOG_LENGTH;
1980 if (en->hook)
1981 log(L_DEBUG " Event 0x%p 0x%p at %8d for %d ms", en->hook, en->data,
1982 (int) ((last_time - en->timestamp) TO_MS), (int) (en->duration TO_MS));
1983 }
1984 }
1985
/*
 * watchdog_sigalrm - SIGALRM handler for the I/O loop watchdog.
 *
 * Fired when one loop cycle exceeds the configured watchdog timeout.
 * Records the final timing data and aborts to produce a core dump.
 */
void
watchdog_sigalrm(int sig UNUSED)
{
  /* Update last_time and duration, but skip latency check */
  config->latency_limit = 0xffffffff;
  io_update_time();

  /* We want core dump */
  abort();
}
1996
/*
 * watchdog_start1 - initial variant of watchdog_start().
 *
 * Used once before the first loop iteration: records the cycle start
 * time but does not arm the alarm or reset the event counter.
 */
static inline void
watchdog_start1(void)
{
  io_update_time();

  loop_time = last_time;
}
2004
/*
 * watchdog_start - begin timing one I/O loop cycle.
 *
 * Records the cycle start time, resets the per-cycle event counter and,
 * if a watchdog timeout is configured, arms an alarm that triggers
 * watchdog_sigalrm() if the cycle runs too long.
 */
static inline void
watchdog_start(void)
{
  io_update_time();

  loop_time = last_time;
  event_log_num = 0;

  if (config->watchdog_timeout)
  {
    alarm(config->watchdog_timeout);
    watchdog_active = 1;
  }
}
2019
/*
 * watchdog_stop - finish timing one I/O loop cycle.
 *
 * Disarms the alarm (if armed) and warns when the cycle took longer
 * than the configured watchdog warning threshold.
 */
static inline void
watchdog_stop(void)
{
  io_update_time();

  if (watchdog_active)
  {
    alarm(0);		/* Cancel the pending SIGALRM */
    watchdog_active = 0;
  }

  btime duration = last_time - loop_time;
  if (duration > config->watchdog_warning)
    log(L_WARN "I/O loop cycle took %d ms for %d events",
	(int) (duration TO_MS), event_log_num);
}
2036
2037
2038 /*
2039 * Main I/O Loop
2040 */
2041
2042 volatile int async_config_flag; /* Asynchronous reconfiguration/dump scheduled */
2043 volatile int async_dump_flag;
2044
/*
 * io_init - initialize the I/O subsystem.
 *
 * Sets up timer/socket/event lists, platform kernel-route I/O, the time
 * subsystem, records the boot time and seeds the PRNG. Must run before
 * io_loop().
 */
void
io_init(void)
{
  init_list(&near_timers);
  init_list(&far_timers);
  init_list(&sock_list);
  init_list(&global_event_list);
  krt_io_init();
  init_times();
  update_times();
  boot_time = now;
  srandom((int) now_real);	/* Seed with wall-clock time */
}
2058
/* Count of consecutive loop iterations that skipped the low-priority RX
   phase; capped by SHORT_LOOP_MAX so ordinary sockets are not starved */
static int short_loops = 0;
#define SHORT_LOOP_MAX 10
2061
/*
 * io_loop - the main I/O loop of the daemon.
 *
 * Forever: run queued events, fire expired timers, rebuild fd sets when
 * requested, handle asynchronous reconfiguration/dump/shutdown requests,
 * then select() on all sockets. Ready sockets are serviced in two
 * phases: high-priority sockets (type >= SK_MAGIC) and all TX first,
 * then low-priority RX (type < SK_MAGIC) rate-limited by MAX_RX_STEPS
 * and resumed round-robin from @stored_sock across iterations.
 * Never returns.
 */
void
io_loop(void)
{
  fd_set rd, wr;
  struct timeval timo;
  time_t tout;
  int hi, events;
  sock *s;
  node *n;

  watchdog_start1();
  sock_recalc_fdsets_p = 1;
  for(;;)
    {
      events = ev_run_list(&global_event_list);
      update_times();
      tout = tm_first_shot();
      if (tout <= now)
	{
	  tm_shot();
	  continue;	/* Timers may have queued events — restart the cycle */
	}
      /* Poll (zero timeout) while events remain, otherwise sleep up to 3 s */
      timo.tv_sec = events ? 0 : MIN(tout - now, 3);
      timo.tv_usec = 0;

      io_close_event();

      if (sock_recalc_fdsets_p)
	{
	  sock_recalc_fdsets_p = 0;
	  FD_ZERO(&rd);
	  FD_ZERO(&wr);
	}

      hi = 0;
      WALK_LIST(n, sock_list)
	{
	  s = SKIP_BACK(sock, n, n);
	  if (s->rx_hook)
	    {
	      FD_SET(s->fd, &rd);
	      if (s->fd > hi)
		hi = s->fd;
	    }
	  else
	    FD_CLR(s->fd, &rd);
	  /* Watch for writability only while TX data is pending */
	  if (s->tx_hook && s->ttx != s->tpos)
	    {
	      FD_SET(s->fd, &wr);
	      if (s->fd > hi)
		hi = s->fd;
	    }
	  else
	    FD_CLR(s->fd, &wr);
	}

      /*
       * Yes, this is racy. But even if the signal comes before this test
       * and entering select(), it gets caught on the next timer tick.
       */

      if (async_config_flag)
	{
	  io_log_event(async_config, NULL);
	  async_config();
	  async_config_flag = 0;
	  continue;
	}
      if (async_dump_flag)
	{
	  io_log_event(async_dump, NULL);
	  async_dump();
	  async_dump_flag = 0;
	  continue;
	}
      if (async_shutdown_flag)
	{
	  io_log_event(async_shutdown, NULL);
	  async_shutdown();
	  async_shutdown_flag = 0;
	  continue;
	}

      /* And finally enter select() to find active sockets */
      watchdog_stop();
      hi = select(hi+1, &rd, &wr, NULL, &timo);
      watchdog_start();

      if (hi < 0)
	{
	  if (errno == EINTR || errno == EAGAIN)
	    continue;
	  die("select: %m");
	}
      if (hi)
	{
	  /* guaranteed to be non-empty */
	  current_sock = SKIP_BACK(sock, n, HEAD(sock_list));

	  /* Phase 1: high-priority RX (type >= SK_MAGIC) and all TX.
	     Hooks may delete sockets; the current_sock comparison after
	     each hook detects that and skips to the next socket. */
	  while (current_sock)
	    {
	      sock *s = current_sock;
	      int e;
	      int steps;

	      steps = MAX_STEPS;
	      if ((s->type >= SK_MAGIC) && FD_ISSET(s->fd, &rd) && s->rx_hook)
		do
		  {
		    steps--;
		    io_log_event(s->rx_hook, s->data);
		    e = sk_read(s);
		    if (s != current_sock)
		      goto next;
		  }
		while (e && s->rx_hook && steps);

	      steps = MAX_STEPS;
	      if (FD_ISSET(s->fd, &wr))
		do
		  {
		    steps--;
		    io_log_event(s->tx_hook, s->data);
		    e = sk_write(s);
		    if (s != current_sock)
		      goto next;
		  }
		while (e && steps);
	      current_sock = sk_next(s);
	    next: ;
	    }

	  /* Prefer pending events over low-priority RX for a bounded
	     number of iterations to keep latencies low */
	  short_loops++;
	  if (events && (short_loops < SHORT_LOOP_MAX))
	    continue;
	  short_loops = 0;

	  /* Phase 2: low-priority RX, resumed round-robin from where the
	     previous iteration stopped (stored_sock) */
	  int count = 0;
	  current_sock = stored_sock;
	  if (current_sock == NULL)
	    current_sock = SKIP_BACK(sock, n, HEAD(sock_list));

	  while (current_sock && count < MAX_RX_STEPS)
	    {
	      sock *s = current_sock;
	      int e UNUSED;

	      if ((s->type < SK_MAGIC) && FD_ISSET(s->fd, &rd) && s->rx_hook)
		{
		  count++;
		  io_log_event(s->rx_hook, s->data);
		  e = sk_read(s);
		  if (s != current_sock)
		    goto next2;
		}
	      current_sock = sk_next(s);
	    next2: ;
	    }

	  stored_sock = current_sock;
	}
    }
}
2225
/*
 * test_old_bird - check whether another BIRD instance is running.
 *
 * Attempts to connect to the control socket at @path; if the connect
 * succeeds, another daemon owns the socket and we die. Also dies on
 * socket creation failure or an over-long path. On the expected
 * (no-other-instance) path the probe fd is closed and the function
 * returns normally.
 *
 * Fix: bzero() was removed from POSIX.1-2008 — use memset() instead.
 */
void
test_old_bird(char *path)
{
  int fd;
  struct sockaddr_un sa;

  fd = socket(AF_UNIX, SOCK_STREAM, 0);
  if (fd < 0)
    die("Cannot create socket: %m");
  if (strlen(path) >= sizeof(sa.sun_path))
    die("Socket path too long");
  memset(&sa, 0, sizeof(sa));
  sa.sun_family = AF_UNIX;
  strcpy(sa.sun_path, path);
  if (connect(fd, (struct sockaddr *) &sa, SUN_LEN(&sa)) == 0)
    die("I found another BIRD running.");
  close(fd);
}