]> git.ipfire.org Git - thirdparty/bird.git/blob - sysdep/unix/io.c
IO: Fix socket priority
[thirdparty/bird.git] / sysdep / unix / io.c
1 /*
2 * BIRD Internet Routing Daemon -- Unix I/O
3 *
4 * (c) 1998--2004 Martin Mares <mj@ucw.cz>
5 * (c) 2004 Ondrej Filip <feela@network.cz>
6 *
7 * Can be freely distributed and used under the terms of the GNU GPL.
8 */
9
10 /* Unfortunately, some glibc versions hide parts of RFC 3542 API
11 if _GNU_SOURCE is not defined. */
12 #ifndef _GNU_SOURCE
13 #define _GNU_SOURCE
14 #endif
15
16 #include <stdio.h>
17 #include <stdlib.h>
18 #include <time.h>
19 #include <sys/time.h>
20 #include <sys/types.h>
21 #include <sys/socket.h>
22 #include <sys/uio.h>
23 #include <sys/un.h>
24 #include <poll.h>
25 #include <unistd.h>
26 #include <fcntl.h>
27 #include <errno.h>
28 #include <net/if.h>
29 #include <netinet/in.h>
30 #include <netinet/tcp.h>
31 #include <netinet/udp.h>
32 #include <netinet/icmp6.h>
33
34 #include "nest/bird.h"
35 #include "lib/lists.h"
36 #include "lib/resource.h"
37 #include "lib/timer.h"
38 #include "lib/socket.h"
39 #include "lib/event.h"
40 #include "lib/string.h"
41 #include "nest/iface.h"
42
43 #include "lib/unix.h"
44 #include "lib/sysio.h"
45
46 /* Maximum number of calls of tx handler for one socket in one
47 * poll iteration. Should be small enough to not monopolize CPU by
48 * one protocol instance.
49 */
50 #define MAX_STEPS 4
51
52 /* Maximum number of calls of rx handler for all sockets in one poll
53 iteration. RX callbacks are often much more costly so we limit
54 this to get small latencies */
55 #define MAX_RX_STEPS 4
56
57 /*
58 * Tracked Files
59 */
60
61 struct rfile {
62 resource r;
63 FILE *f;
64 };
65
66 static void
67 rf_free(resource *r)
68 {
69 struct rfile *a = (struct rfile *) r;
70
71 fclose(a->f);
72 }
73
74 static void
75 rf_dump(resource *r)
76 {
77 struct rfile *a = (struct rfile *) r;
78
79 debug("(FILE *%p)\n", a->f);
80 }
81
82 static struct resclass rf_class = {
83 "FILE",
84 sizeof(struct rfile),
85 rf_free,
86 rf_dump,
87 NULL,
88 NULL
89 };
90
91 void *
92 tracked_fopen(pool *p, char *name, char *mode)
93 {
94 FILE *f = fopen(name, mode);
95
96 if (f)
97 {
98 struct rfile *r = ralloc(p, &rf_class);
99 r->f = f;
100 }
101 return f;
102 }
103
104 /**
105 * DOC: Timers
106 *
107 * Timers are resources which represent a wish of a module to call
108 * a function at the specified time. The platform dependent code
109 * doesn't guarantee exact timing, only that a timer function
110 * won't be called before the requested time.
111 *
112 * In BIRD, time is represented by values of the &bird_clock_t type
113 * which are integral numbers interpreted as a relative number of seconds since
114 * some fixed time point in past. The current time can be read
115 * from variable @now with reasonable accuracy and is monotonic. There is also
116 * a current 'absolute' time in variable @now_real reported by OS.
117 *
118 * Each timer is described by a &timer structure containing a pointer
119 * to the handler function (@hook), data private to this function (@data),
120 * time the function should be called at (@expires, 0 for inactive timers),
121 * for the other fields see |timer.h|.
122 */
123
124 #define NEAR_TIMER_LIMIT 4
125
126 static list near_timers, far_timers;
127 static bird_clock_t first_far_timer = TIME_INFINITY;
128
129 /* now must be different from 0, because 0 is a special value in timer->expires */
130 bird_clock_t now = 1, now_real, boot_time;
131
132 static void
133 update_times_plain(void)
134 {
135 bird_clock_t new_time = time(NULL);
136 int delta = new_time - now_real;
137
138 if ((delta >= 0) && (delta < 60))
139 now += delta;
140 else if (now_real != 0)
141 log(L_WARN "Time jump, delta %d s", delta);
142
143 now_real = new_time;
144 }
145
146 static void
147 update_times_gettime(void)
148 {
149 struct timespec ts;
150 int rv;
151
152 rv = clock_gettime(CLOCK_MONOTONIC, &ts);
153 if (rv != 0)
154 die("clock_gettime: %m");
155
156 if (ts.tv_sec != now) {
157 if (ts.tv_sec < now)
158 log(L_ERR "Monotonic timer is broken");
159
160 now = ts.tv_sec;
161 now_real = time(NULL);
162 }
163 }
164
/* Nonzero when CLOCK_MONOTONIC worked in init_times() */
static int clock_monotonic_available;

/* Refresh @now and @now_real using the best available clock source. */
static inline void
update_times(void)
{
  if (!clock_monotonic_available)
  {
    update_times_plain();
    return;
  }

  update_times_gettime();
}
175
176 static inline void
177 init_times(void)
178 {
179 struct timespec ts;
180 clock_monotonic_available = (clock_gettime(CLOCK_MONOTONIC, &ts) == 0);
181 if (!clock_monotonic_available)
182 log(L_WARN "Monotonic timer is missing");
183 }
184
185
186 static void
187 tm_free(resource *r)
188 {
189 timer *t = (timer *) r;
190
191 tm_stop(t);
192 }
193
194 static void
195 tm_dump(resource *r)
196 {
197 timer *t = (timer *) r;
198
199 debug("(code %p, data %p, ", t->hook, t->data);
200 if (t->randomize)
201 debug("rand %d, ", t->randomize);
202 if (t->recurrent)
203 debug("recur %d, ", t->recurrent);
204 if (t->expires)
205 debug("expires in %d sec)\n", t->expires - now);
206 else
207 debug("inactive)\n");
208 }
209
210 static struct resclass tm_class = {
211 "Timer",
212 sizeof(timer),
213 tm_free,
214 tm_dump,
215 NULL,
216 NULL
217 };
218
219 /**
220 * tm_new - create a timer
221 * @p: pool
222 *
223 * This function creates a new timer resource and returns
224 * a pointer to it. To use the timer, you need to fill in
225 * the structure fields and call tm_start() to start timing.
226 */
227 timer *
228 tm_new(pool *p)
229 {
230 timer *t = ralloc(p, &tm_class);
231 return t;
232 }
233
234 static inline void
235 tm_insert_near(timer *t)
236 {
237 node *n = HEAD(near_timers);
238
239 while (n->next && (SKIP_BACK(timer, n, n)->expires < t->expires))
240 n = n->next;
241 insert_node(&t->n, n->prev);
242 }
243
244 /**
245 * tm_start - start a timer
246 * @t: timer
247 * @after: number of seconds the timer should be run after
248 *
249 * This function schedules the hook function of the timer to
250 * be called after @after seconds. If the timer has been already
251 * started, it's @expire time is replaced by the new value.
252 *
253 * You can have set the @randomize field of @t, the timeout
254 * will be increased by a random number of seconds chosen
255 * uniformly from range 0 .. @randomize.
256 *
257 * You can call tm_start() from the handler function of the timer
258 * to request another run of the timer. Also, you can set the @recurrent
259 * field to have the timer re-added automatically with the same timeout.
260 */
261 void
262 tm_start(timer *t, unsigned after)
263 {
264 bird_clock_t when;
265
266 if (t->randomize)
267 after += random() % (t->randomize + 1);
268 when = now + after;
269 if (t->expires == when)
270 return;
271 if (t->expires)
272 rem_node(&t->n);
273 t->expires = when;
274 if (after <= NEAR_TIMER_LIMIT)
275 tm_insert_near(t);
276 else
277 {
278 if (!first_far_timer || first_far_timer > when)
279 first_far_timer = when;
280 add_tail(&far_timers, &t->n);
281 }
282 }
283
284 /**
285 * tm_stop - stop a timer
286 * @t: timer
287 *
288 * This function stops a timer. If the timer is already stopped,
289 * nothing happens.
290 */
291 void
292 tm_stop(timer *t)
293 {
294 if (t->expires)
295 {
296 rem_node(&t->n);
297 t->expires = 0;
298 }
299 }
300
301 static void
302 tm_dump_them(char *name, list *l)
303 {
304 node *n;
305 timer *t;
306
307 debug("%s timers:\n", name);
308 WALK_LIST(n, *l)
309 {
310 t = SKIP_BACK(timer, n, n);
311 debug("%p ", t);
312 tm_dump(&t->r);
313 }
314 debug("\n");
315 }
316
317 void
318 tm_dump_all(void)
319 {
320 tm_dump_them("Near", &near_timers);
321 tm_dump_them("Far", &far_timers);
322 }
323
324 static inline time_t
325 tm_first_shot(void)
326 {
327 time_t x = first_far_timer;
328
329 if (!EMPTY_LIST(near_timers))
330 {
331 timer *t = SKIP_BACK(timer, n, HEAD(near_timers));
332 if (t->expires < x)
333 x = t->expires;
334 }
335 return x;
336 }
337
338 void io_log_event(void *hook, void *data);
339
340 static void
341 tm_shot(void)
342 {
343 timer *t;
344 node *n, *m;
345
346 if (first_far_timer <= now)
347 {
348 bird_clock_t limit = now + NEAR_TIMER_LIMIT;
349 first_far_timer = TIME_INFINITY;
350 n = HEAD(far_timers);
351 while (m = n->next)
352 {
353 t = SKIP_BACK(timer, n, n);
354 if (t->expires <= limit)
355 {
356 rem_node(n);
357 tm_insert_near(t);
358 }
359 else if (t->expires < first_far_timer)
360 first_far_timer = t->expires;
361 n = m;
362 }
363 }
364 while ((n = HEAD(near_timers)) -> next)
365 {
366 int delay;
367 t = SKIP_BACK(timer, n, n);
368 if (t->expires > now)
369 break;
370 rem_node(n);
371 delay = t->expires - now;
372 t->expires = 0;
373 if (t->recurrent)
374 {
375 int i = t->recurrent - delay;
376 if (i < 0)
377 i = 0;
378 tm_start(t, i);
379 }
380 io_log_event(t->hook, t->data);
381 t->hook(t);
382 }
383 }
384
385 /**
386 * tm_parse_datetime - parse a date and time
387 * @x: datetime string
388 *
389 * tm_parse_datetime() takes a textual representation of
390 * a date and time (dd-mm-yyyy hh:mm:ss)
391 * and converts it to the corresponding value of type &bird_clock_t.
392 */
393 bird_clock_t
394 tm_parse_datetime(char *x)
395 {
396 struct tm tm;
397 int n;
398 time_t t;
399
400 if (sscanf(x, "%d-%d-%d %d:%d:%d%n", &tm.tm_mday, &tm.tm_mon, &tm.tm_year, &tm.tm_hour, &tm.tm_min, &tm.tm_sec, &n) != 6 || x[n])
401 return tm_parse_date(x);
402 tm.tm_mon--;
403 tm.tm_year -= 1900;
404 t = mktime(&tm);
405 if (t == (time_t) -1)
406 return 0;
407 return t;
408 }
409 /**
410 * tm_parse_date - parse a date
411 * @x: date string
412 *
413 * tm_parse_date() takes a textual representation of a date (dd-mm-yyyy)
414 * and converts it to the corresponding value of type &bird_clock_t.
415 */
416 bird_clock_t
417 tm_parse_date(char *x)
418 {
419 struct tm tm;
420 int n;
421 time_t t;
422
423 if (sscanf(x, "%d-%d-%d%n", &tm.tm_mday, &tm.tm_mon, &tm.tm_year, &n) != 3 || x[n])
424 return 0;
425 tm.tm_mon--;
426 tm.tm_year -= 1900;
427 tm.tm_hour = tm.tm_min = tm.tm_sec = 0;
428 t = mktime(&tm);
429 if (t == (time_t) -1)
430 return 0;
431 return t;
432 }
433
434 static void
435 tm_format_reltime(char *x, struct tm *tm, bird_clock_t delta)
436 {
437 static char *month_names[12] = { "Jan", "Feb", "Mar", "Apr", "May", "Jun",
438 "Jul", "Aug", "Sep", "Oct", "Nov", "Dec" };
439
440 if (delta < 20*3600)
441 bsprintf(x, "%02d:%02d", tm->tm_hour, tm->tm_min);
442 else if (delta < 360*86400)
443 bsprintf(x, "%s%02d", month_names[tm->tm_mon], tm->tm_mday);
444 else
445 bsprintf(x, "%d", tm->tm_year+1900);
446 }
447
448 #include "conf/conf.h"
449
450 /**
451 * tm_format_datetime - convert date and time to textual representation
452 * @x: destination buffer of size %TM_DATETIME_BUFFER_SIZE
453 * @fmt_spec: specification of resulting textual representation of the time
454 * @t: time
455 *
456 * This function formats the given relative time value @t to a textual
457 * date/time representation (dd-mm-yyyy hh:mm:ss) in real time.
458 */
459 void
460 tm_format_datetime(char *x, struct timeformat *fmt_spec, bird_clock_t t)
461 {
462 const char *fmt_used;
463 struct tm *tm;
464 bird_clock_t delta = now - t;
465 t = now_real - delta;
466 tm = localtime(&t);
467
468 if (fmt_spec->fmt1 == NULL)
469 return tm_format_reltime(x, tm, delta);
470
471 if ((fmt_spec->limit == 0) || (delta < fmt_spec->limit))
472 fmt_used = fmt_spec->fmt1;
473 else
474 fmt_used = fmt_spec->fmt2;
475
476 int rv = strftime(x, TM_DATETIME_BUFFER_SIZE, fmt_used, tm);
477 if (((rv == 0) && fmt_used[0]) || (rv == TM_DATETIME_BUFFER_SIZE))
478 strcpy(x, "<too-long>");
479 }
480
481
482 /**
483 * DOC: Sockets
484 *
485 * Socket resources represent network connections. Their data structure (&socket)
486 * contains a lot of fields defining the exact type of the socket, the local and
487 * remote addresses and ports, pointers to socket buffers and finally pointers to
488 * hook functions to be called when new data have arrived to the receive buffer
489 * (@rx_hook), when the contents of the transmit buffer have been transmitted
490 * (@tx_hook) and when an error or connection close occurs (@err_hook).
491 *
492 * Freeing of sockets from inside socket hooks is perfectly safe.
493 */
494
495 #ifndef SOL_IP
496 #define SOL_IP IPPROTO_IP
497 #endif
498
499 #ifndef SOL_IPV6
500 #define SOL_IPV6 IPPROTO_IPV6
501 #endif
502
503 #ifndef SOL_ICMPV6
504 #define SOL_ICMPV6 IPPROTO_ICMPV6
505 #endif
506
507
508 /*
509 * Sockaddr helper functions
510 */
511
512 static inline int UNUSED sockaddr_length(int af)
513 { return (af == AF_INET) ? sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6); }
514
515 static inline void
516 sockaddr_fill4(struct sockaddr_in *sa, ip_addr a, uint port)
517 {
518 memset(sa, 0, sizeof(struct sockaddr_in));
519 #ifdef HAVE_STRUCT_SOCKADDR_SA_LEN
520 sa->sin_len = sizeof(struct sockaddr_in);
521 #endif
522 sa->sin_family = AF_INET;
523 sa->sin_port = htons(port);
524 sa->sin_addr = ipa_to_in4(a);
525 }
526
527 static inline void
528 sockaddr_fill6(struct sockaddr_in6 *sa, ip_addr a, struct iface *ifa, uint port)
529 {
530 memset(sa, 0, sizeof(struct sockaddr_in6));
531 #ifdef SIN6_LEN
532 sa->sin6_len = sizeof(struct sockaddr_in6);
533 #endif
534 sa->sin6_family = AF_INET6;
535 sa->sin6_port = htons(port);
536 sa->sin6_flowinfo = 0;
537 sa->sin6_addr = ipa_to_in6(a);
538
539 if (ifa && ipa_is_link_local(a))
540 sa->sin6_scope_id = ifa->index;
541 }
542
543 void
544 sockaddr_fill(sockaddr *sa, int af, ip_addr a, struct iface *ifa, uint port)
545 {
546 if (af == AF_INET)
547 sockaddr_fill4((struct sockaddr_in *) sa, a, port);
548 else if (af == AF_INET6)
549 sockaddr_fill6((struct sockaddr_in6 *) sa, a, ifa, port);
550 else
551 bug("Unknown AF");
552 }
553
554 static inline void
555 sockaddr_read4(struct sockaddr_in *sa, ip_addr *a, uint *port)
556 {
557 *port = ntohs(sa->sin_port);
558 *a = ipa_from_in4(sa->sin_addr);
559 }
560
561 static inline void
562 sockaddr_read6(struct sockaddr_in6 *sa, ip_addr *a, struct iface **ifa, uint *port)
563 {
564 *port = ntohs(sa->sin6_port);
565 *a = ipa_from_in6(sa->sin6_addr);
566
567 if (ifa && ipa_is_link_local(*a))
568 *ifa = if_find_by_index(sa->sin6_scope_id);
569 }
570
571 int
572 sockaddr_read(sockaddr *sa, int af, ip_addr *a, struct iface **ifa, uint *port)
573 {
574 if (sa->sa.sa_family != af)
575 goto fail;
576
577 if (af == AF_INET)
578 sockaddr_read4((struct sockaddr_in *) sa, a, port);
579 else if (af == AF_INET6)
580 sockaddr_read6((struct sockaddr_in6 *) sa, a, ifa, port);
581 else
582 goto fail;
583
584 return 0;
585
586 fail:
587 *a = IPA_NONE;
588 *port = 0;
589 return -1;
590 }
591
592
593 /*
594 * IPv6 multicast syscalls
595 */
596
597 /* Fortunately standardized in RFC 3493 */
598
/*
 * Initializer for struct ipv6_mreq: multicast address @maddr on interface @ifa.
 * The @ifa argument is parenthesized so that any pointer expression can be passed.
 */
#define INIT_MREQ6(maddr,ifa) \
  { .ipv6mr_multiaddr = ipa_to_in6(maddr), .ipv6mr_interface = (ifa)->index }
601
602 static inline int
603 sk_setup_multicast6(sock *s)
604 {
605 int index = s->iface->index;
606 int ttl = s->ttl;
607 int n = 0;
608
609 if (setsockopt(s->fd, SOL_IPV6, IPV6_MULTICAST_IF, &index, sizeof(index)) < 0)
610 ERR("IPV6_MULTICAST_IF");
611
612 if (setsockopt(s->fd, SOL_IPV6, IPV6_MULTICAST_HOPS, &ttl, sizeof(ttl)) < 0)
613 ERR("IPV6_MULTICAST_HOPS");
614
615 if (setsockopt(s->fd, SOL_IPV6, IPV6_MULTICAST_LOOP, &n, sizeof(n)) < 0)
616 ERR("IPV6_MULTICAST_LOOP");
617
618 return 0;
619 }
620
621 static inline int
622 sk_join_group6(sock *s, ip_addr maddr)
623 {
624 struct ipv6_mreq mr = INIT_MREQ6(maddr, s->iface);
625
626 if (setsockopt(s->fd, SOL_IPV6, IPV6_JOIN_GROUP, &mr, sizeof(mr)) < 0)
627 ERR("IPV6_JOIN_GROUP");
628
629 return 0;
630 }
631
632 static inline int
633 sk_leave_group6(sock *s, ip_addr maddr)
634 {
635 struct ipv6_mreq mr = INIT_MREQ6(maddr, s->iface);
636
637 if (setsockopt(s->fd, SOL_IPV6, IPV6_LEAVE_GROUP, &mr, sizeof(mr)) < 0)
638 ERR("IPV6_LEAVE_GROUP");
639
640 return 0;
641 }
642
643
644 /*
645 * IPv6 packet control messages
646 */
647
648 /* Also standardized, in RFC 3542 */
649
650 /*
651 * RFC 2292 uses IPV6_PKTINFO for both the socket option and the cmsg
652 * type, RFC 3542 changed the socket option to IPV6_RECVPKTINFO. If we
653 * don't have IPV6_RECVPKTINFO we suppose the OS implements the older
654 * RFC and we use IPV6_PKTINFO.
655 */
656 #ifndef IPV6_RECVPKTINFO
657 #define IPV6_RECVPKTINFO IPV6_PKTINFO
658 #endif
659 /*
660 * Same goes for IPV6_HOPLIMIT -> IPV6_RECVHOPLIMIT.
661 */
662 #ifndef IPV6_RECVHOPLIMIT
663 #define IPV6_RECVHOPLIMIT IPV6_HOPLIMIT
664 #endif
665
666
667 #define CMSG6_SPACE_PKTINFO CMSG_SPACE(sizeof(struct in6_pktinfo))
668 #define CMSG6_SPACE_TTL CMSG_SPACE(sizeof(int))
669
670 static inline int
671 sk_request_cmsg6_pktinfo(sock *s)
672 {
673 int y = 1;
674
675 if (setsockopt(s->fd, SOL_IPV6, IPV6_RECVPKTINFO, &y, sizeof(y)) < 0)
676 ERR("IPV6_RECVPKTINFO");
677
678 return 0;
679 }
680
681 static inline int
682 sk_request_cmsg6_ttl(sock *s)
683 {
684 int y = 1;
685
686 if (setsockopt(s->fd, SOL_IPV6, IPV6_RECVHOPLIMIT, &y, sizeof(y)) < 0)
687 ERR("IPV6_RECVHOPLIMIT");
688
689 return 0;
690 }
691
692 static inline void
693 sk_process_cmsg6_pktinfo(sock *s, struct cmsghdr *cm)
694 {
695 if (cm->cmsg_type == IPV6_PKTINFO)
696 {
697 struct in6_pktinfo *pi = (struct in6_pktinfo *) CMSG_DATA(cm);
698 s->laddr = ipa_from_in6(pi->ipi6_addr);
699 s->lifindex = pi->ipi6_ifindex;
700 }
701 }
702
703 static inline void
704 sk_process_cmsg6_ttl(sock *s, struct cmsghdr *cm)
705 {
706 if (cm->cmsg_type == IPV6_HOPLIMIT)
707 s->rcv_ttl = * (int *) CMSG_DATA(cm);
708 }
709
710 static inline void
711 sk_prepare_cmsgs6(sock *s, struct msghdr *msg, void *cbuf, size_t cbuflen)
712 {
713 struct cmsghdr *cm;
714 struct in6_pktinfo *pi;
715 int controllen = 0;
716
717 msg->msg_control = cbuf;
718 msg->msg_controllen = cbuflen;
719
720 cm = CMSG_FIRSTHDR(msg);
721 cm->cmsg_level = SOL_IPV6;
722 cm->cmsg_type = IPV6_PKTINFO;
723 cm->cmsg_len = CMSG_LEN(sizeof(*pi));
724 controllen += CMSG_SPACE(sizeof(*pi));
725
726 pi = (struct in6_pktinfo *) CMSG_DATA(cm);
727 pi->ipi6_ifindex = s->iface ? s->iface->index : 0;
728 pi->ipi6_addr = ipa_to_in6(s->saddr);
729
730 msg->msg_controllen = controllen;
731 }
732
733
734 /*
735 * Miscellaneous socket syscalls
736 */
737
738 static inline int
739 sk_set_ttl4(sock *s, int ttl)
740 {
741 if (setsockopt(s->fd, SOL_IP, IP_TTL, &ttl, sizeof(ttl)) < 0)
742 ERR("IP_TTL");
743
744 return 0;
745 }
746
747 static inline int
748 sk_set_ttl6(sock *s, int ttl)
749 {
750 if (setsockopt(s->fd, SOL_IPV6, IPV6_UNICAST_HOPS, &ttl, sizeof(ttl)) < 0)
751 ERR("IPV6_UNICAST_HOPS");
752
753 return 0;
754 }
755
756 static inline int
757 sk_set_tos4(sock *s, int tos)
758 {
759 if (setsockopt(s->fd, SOL_IP, IP_TOS, &tos, sizeof(tos)) < 0)
760 ERR("IP_TOS");
761
762 return 0;
763 }
764
765 static inline int
766 sk_set_tos6(sock *s, int tos)
767 {
768 if (setsockopt(s->fd, SOL_IPV6, IPV6_TCLASS, &tos, sizeof(tos)) < 0)
769 ERR("IPV6_TCLASS");
770
771 return 0;
772 }
773
774 static inline int
775 sk_set_high_port(sock *s UNUSED)
776 {
777 /* Port range setting is optional, ignore it if not supported */
778
779 #ifdef IP_PORTRANGE
780 if (sk_is_ipv4(s))
781 {
782 int range = IP_PORTRANGE_HIGH;
783 if (setsockopt(s->fd, SOL_IP, IP_PORTRANGE, &range, sizeof(range)) < 0)
784 ERR("IP_PORTRANGE");
785 }
786 #endif
787
788 #ifdef IPV6_PORTRANGE
789 if (sk_is_ipv6(s))
790 {
791 int range = IPV6_PORTRANGE_HIGH;
792 if (setsockopt(s->fd, SOL_IPV6, IPV6_PORTRANGE, &range, sizeof(range)) < 0)
793 ERR("IPV6_PORTRANGE");
794 }
795 #endif
796
797 return 0;
798 }
799
800 static inline byte *
801 sk_skip_ip_header(byte *pkt, int *len)
802 {
803 if ((*len < 20) || ((*pkt & 0xf0) != 0x40))
804 return NULL;
805
806 int hlen = (*pkt & 0x0f) * 4;
807 if ((hlen < 20) || (hlen > *len))
808 return NULL;
809
810 *len -= hlen;
811 return pkt + hlen;
812 }
813
814 byte *
815 sk_rx_buffer(sock *s, int *len)
816 {
817 if (sk_is_ipv4(s) && (s->type == SK_IP))
818 return sk_skip_ip_header(s->rbuf, len);
819 else
820 return s->rbuf;
821 }
822
823
824 /*
825 * Public socket functions
826 */
827
828 /**
829 * sk_setup_multicast - enable multicast for given socket
830 * @s: socket
831 *
832 * Prepare transmission of multicast packets for given datagram socket.
833 * The socket must have defined @iface.
834 *
835 * Result: 0 for success, -1 for an error.
836 */
837
838 int
839 sk_setup_multicast(sock *s)
840 {
841 ASSERT(s->iface);
842
843 if (sk_is_ipv4(s))
844 return sk_setup_multicast4(s);
845 else
846 return sk_setup_multicast6(s);
847 }
848
849 /**
850 * sk_join_group - join multicast group for given socket
851 * @s: socket
852 * @maddr: multicast address
853 *
854 * Join multicast group for given datagram socket and associated interface.
855 * The socket must have defined @iface.
856 *
857 * Result: 0 for success, -1 for an error.
858 */
859
860 int
861 sk_join_group(sock *s, ip_addr maddr)
862 {
863 if (sk_is_ipv4(s))
864 return sk_join_group4(s, maddr);
865 else
866 return sk_join_group6(s, maddr);
867 }
868
869 /**
870 * sk_leave_group - leave multicast group for given socket
871 * @s: socket
872 * @maddr: multicast address
873 *
874 * Leave multicast group for given datagram socket and associated interface.
875 * The socket must have defined @iface.
876 *
877 * Result: 0 for success, -1 for an error.
878 */
879
880 int
881 sk_leave_group(sock *s, ip_addr maddr)
882 {
883 if (sk_is_ipv4(s))
884 return sk_leave_group4(s, maddr);
885 else
886 return sk_leave_group6(s, maddr);
887 }
888
889 /**
890 * sk_setup_broadcast - enable broadcast for given socket
891 * @s: socket
892 *
893 * Allow reception and transmission of broadcast packets for given datagram
894 * socket. The socket must have defined @iface. For transmission, packets should
895 * be send to @brd address of @iface.
896 *
897 * Result: 0 for success, -1 for an error.
898 */
899
900 int
901 sk_setup_broadcast(sock *s)
902 {
903 int y = 1;
904
905 if (setsockopt(s->fd, SOL_SOCKET, SO_BROADCAST, &y, sizeof(y)) < 0)
906 ERR("SO_BROADCAST");
907
908 return 0;
909 }
910
911 /**
912 * sk_set_ttl - set transmit TTL for given socket
913 * @s: socket
914 * @ttl: TTL value
915 *
916 * Set TTL for already opened connections when TTL was not set before. Useful
917 * for accepted connections when different ones should have different TTL.
918 *
919 * Result: 0 for success, -1 for an error.
920 */
921
922 int
923 sk_set_ttl(sock *s, int ttl)
924 {
925 s->ttl = ttl;
926
927 if (sk_is_ipv4(s))
928 return sk_set_ttl4(s, ttl);
929 else
930 return sk_set_ttl6(s, ttl);
931 }
932
933 /**
934 * sk_set_min_ttl - set minimal accepted TTL for given socket
935 * @s: socket
936 * @ttl: TTL value
937 *
938 * Set minimal accepted TTL for given socket. Can be used for TTL security.
939 * implementations.
940 *
941 * Result: 0 for success, -1 for an error.
942 */
943
944 int
945 sk_set_min_ttl(sock *s, int ttl)
946 {
947 if (sk_is_ipv4(s))
948 return sk_set_min_ttl4(s, ttl);
949 else
950 return sk_set_min_ttl6(s, ttl);
951 }
952
953 #if 0
954 /**
955 * sk_set_md5_auth - add / remove MD5 security association for given socket
956 * @s: socket
957 * @local: IP address of local side
958 * @remote: IP address of remote side
959 * @ifa: Interface for link-local IP address
960 * @passwd: Password used for MD5 authentication
961 * @setkey: Update also system SA/SP database
962 *
963 * In TCP MD5 handling code in kernel, there is a set of security associations
964 * used for choosing password and other authentication parameters according to
965 * the local and remote address. This function is useful for listening socket,
966 * for active sockets it may be enough to set s->password field.
967 *
968 * When called with passwd != NULL, the new pair is added,
969 * When called with passwd == NULL, the existing pair is removed.
970 *
971 * Note that while in Linux, the MD5 SAs are specific to socket, in BSD they are
972 * stored in global SA/SP database (but the behavior also must be enabled on
973 * per-socket basis). In case of multiple sockets to the same neighbor, the
974 * socket-specific state must be configured for each socket while global state
975 * just once per src-dst pair. The @setkey argument controls whether the global
976 * state (SA/SP database) is also updated.
977 *
978 * Result: 0 for success, -1 for an error.
979 */
980
981 int
982 sk_set_md5_auth(sock *s, ip_addr local, ip_addr remote, struct iface *ifa, char *passwd, int setkey)
983 { DUMMY; }
984 #endif
985
986 /**
987 * sk_set_ipv6_checksum - specify IPv6 checksum offset for given socket
988 * @s: socket
989 * @offset: offset
990 *
991 * Specify IPv6 checksum field offset for given raw IPv6 socket. After that, the
992 * kernel will automatically fill it for outgoing packets and check it for
993 * incoming packets. Should not be used on ICMPv6 sockets, where the position is
994 * known to the kernel.
995 *
996 * Result: 0 for success, -1 for an error.
997 */
998
999 int
1000 sk_set_ipv6_checksum(sock *s, int offset)
1001 {
1002 if (setsockopt(s->fd, SOL_IPV6, IPV6_CHECKSUM, &offset, sizeof(offset)) < 0)
1003 ERR("IPV6_CHECKSUM");
1004
1005 return 0;
1006 }
1007
1008 int
1009 sk_set_icmp6_filter(sock *s, int p1, int p2)
1010 {
1011 /* a bit of lame interface, but it is here only for Radv */
1012 struct icmp6_filter f;
1013
1014 ICMP6_FILTER_SETBLOCKALL(&f);
1015 ICMP6_FILTER_SETPASS(p1, &f);
1016 ICMP6_FILTER_SETPASS(p2, &f);
1017
1018 if (setsockopt(s->fd, SOL_ICMPV6, ICMP6_FILTER, &f, sizeof(f)) < 0)
1019 ERR("ICMP6_FILTER");
1020
1021 return 0;
1022 }
1023
1024 void
1025 sk_log_error(sock *s, const char *p)
1026 {
1027 log(L_ERR "%s: Socket error: %s%#m", p, s->err);
1028 }
1029
1030
1031 /*
1032 * Actual struct birdsock code
1033 */
1034
1035 static list sock_list;
1036 static struct birdsock *current_sock;
1037 static struct birdsock *stored_sock;
1038
1039 static inline sock *
1040 sk_next(sock *s)
1041 {
1042 if (!s->n.next->next)
1043 return NULL;
1044 else
1045 return SKIP_BACK(sock, n, s->n.next);
1046 }
1047
1048 static void
1049 sk_alloc_bufs(sock *s)
1050 {
1051 if (!s->rbuf && s->rbsize)
1052 s->rbuf = s->rbuf_alloc = xmalloc(s->rbsize);
1053 s->rpos = s->rbuf;
1054 if (!s->tbuf && s->tbsize)
1055 s->tbuf = s->tbuf_alloc = xmalloc(s->tbsize);
1056 s->tpos = s->ttx = s->tbuf;
1057 }
1058
1059 static void
1060 sk_free_bufs(sock *s)
1061 {
1062 if (s->rbuf_alloc)
1063 {
1064 xfree(s->rbuf_alloc);
1065 s->rbuf = s->rbuf_alloc = NULL;
1066 }
1067 if (s->tbuf_alloc)
1068 {
1069 xfree(s->tbuf_alloc);
1070 s->tbuf = s->tbuf_alloc = NULL;
1071 }
1072 }
1073
1074 static void
1075 sk_free(resource *r)
1076 {
1077 sock *s = (sock *) r;
1078
1079 sk_free_bufs(s);
1080 if (s->fd >= 0)
1081 {
1082 close(s->fd);
1083
1084 /* FIXME: we should call sk_stop() for SKF_THREAD sockets */
1085 if (s->flags & SKF_THREAD)
1086 return;
1087
1088 if (s == current_sock)
1089 current_sock = sk_next(s);
1090 if (s == stored_sock)
1091 stored_sock = sk_next(s);
1092 rem_node(&s->n);
1093 }
1094 }
1095
1096 void
1097 sk_set_rbsize(sock *s, uint val)
1098 {
1099 ASSERT(s->rbuf_alloc == s->rbuf);
1100
1101 if (s->rbsize == val)
1102 return;
1103
1104 s->rbsize = val;
1105 xfree(s->rbuf_alloc);
1106 s->rbuf_alloc = xmalloc(val);
1107 s->rpos = s->rbuf = s->rbuf_alloc;
1108 }
1109
1110 void
1111 sk_set_tbsize(sock *s, uint val)
1112 {
1113 ASSERT(s->tbuf_alloc == s->tbuf);
1114
1115 if (s->tbsize == val)
1116 return;
1117
1118 byte *old_tbuf = s->tbuf;
1119
1120 s->tbsize = val;
1121 s->tbuf = s->tbuf_alloc = xrealloc(s->tbuf_alloc, val);
1122 s->tpos = s->tbuf + (s->tpos - old_tbuf);
1123 s->ttx = s->tbuf + (s->ttx - old_tbuf);
1124 }
1125
1126 void
1127 sk_set_tbuf(sock *s, void *tbuf)
1128 {
1129 s->tbuf = tbuf ?: s->tbuf_alloc;
1130 s->ttx = s->tpos = s->tbuf;
1131 }
1132
1133 void
1134 sk_reallocate(sock *s)
1135 {
1136 sk_free_bufs(s);
1137 sk_alloc_bufs(s);
1138 }
1139
1140 static void
1141 sk_dump(resource *r)
1142 {
1143 sock *s = (sock *) r;
1144 static char *sk_type_names[] = { "TCP<", "TCP>", "TCP", "UDP", NULL, "IP", NULL, "MAGIC", "UNIX<", "UNIX", "DEL!" };
1145
1146 debug("(%s, ud=%p, sa=%I, sp=%d, da=%I, dp=%d, tos=%d, ttl=%d, if=%s)\n",
1147 sk_type_names[s->type],
1148 s->data,
1149 s->saddr,
1150 s->sport,
1151 s->daddr,
1152 s->dport,
1153 s->tos,
1154 s->ttl,
1155 s->iface ? s->iface->name : "none");
1156 }
1157
1158 static struct resclass sk_class = {
1159 "Socket",
1160 sizeof(sock),
1161 sk_free,
1162 sk_dump,
1163 NULL,
1164 NULL
1165 };
1166
1167 /**
1168 * sk_new - create a socket
1169 * @p: pool
1170 *
1171 * This function creates a new socket resource. If you want to use it,
1172 * you need to fill in all the required fields of the structure and
1173 * call sk_open() to do the actual opening of the socket.
1174 *
1175 * The real function name is sock_new(), sk_new() is a macro wrapper
1176 * to avoid collision with OpenSSL.
1177 */
1178 sock *
1179 sock_new(pool *p)
1180 {
1181 sock *s = ralloc(p, &sk_class);
1182 s->pool = p;
1183 // s->saddr = s->daddr = IPA_NONE;
1184 s->tos = s->priority = s->ttl = -1;
1185 s->fd = -1;
1186 return s;
1187 }
1188
1189 static int
1190 sk_setup(sock *s)
1191 {
1192 int y = 1;
1193 int fd = s->fd;
1194
1195 if (fcntl(fd, F_SETFL, O_NONBLOCK) < 0)
1196 ERR("O_NONBLOCK");
1197
1198 if (!s->af)
1199 return 0;
1200
1201 if (ipa_nonzero(s->saddr) && !(s->flags & SKF_BIND))
1202 s->flags |= SKF_PKTINFO;
1203
1204 #ifdef CONFIG_USE_HDRINCL
1205 if (sk_is_ipv4(s) && (s->type == SK_IP) && (s->flags & SKF_PKTINFO))
1206 {
1207 s->flags &= ~SKF_PKTINFO;
1208 s->flags |= SKF_HDRINCL;
1209 if (setsockopt(fd, SOL_IP, IP_HDRINCL, &y, sizeof(y)) < 0)
1210 ERR("IP_HDRINCL");
1211 }
1212 #endif
1213
1214 if (s->vrf && !s->iface)
1215 {
1216 /* Bind socket to associated VRF interface.
1217 This is Linux-specific, but so is SO_BINDTODEVICE. */
1218 #ifdef SO_BINDTODEVICE
1219 struct ifreq ifr = {};
1220 strcpy(ifr.ifr_name, s->vrf->name);
1221 if (setsockopt(s->fd, SOL_SOCKET, SO_BINDTODEVICE, &ifr, sizeof(ifr)) < 0)
1222 ERR("SO_BINDTODEVICE");
1223 #endif
1224 }
1225
1226 if (s->iface)
1227 {
1228 #ifdef SO_BINDTODEVICE
1229 struct ifreq ifr = {};
1230 strcpy(ifr.ifr_name, s->iface->name);
1231 if (setsockopt(s->fd, SOL_SOCKET, SO_BINDTODEVICE, &ifr, sizeof(ifr)) < 0)
1232 ERR("SO_BINDTODEVICE");
1233 #endif
1234
1235 #ifdef CONFIG_UNIX_DONTROUTE
1236 if (setsockopt(s->fd, SOL_SOCKET, SO_DONTROUTE, &y, sizeof(y)) < 0)
1237 ERR("SO_DONTROUTE");
1238 #endif
1239 }
1240
1241 if (sk_is_ipv4(s))
1242 {
1243 if (s->flags & SKF_LADDR_RX)
1244 if (sk_request_cmsg4_pktinfo(s) < 0)
1245 return -1;
1246
1247 if (s->flags & SKF_TTL_RX)
1248 if (sk_request_cmsg4_ttl(s) < 0)
1249 return -1;
1250
1251 if ((s->type == SK_UDP) || (s->type == SK_IP))
1252 if (sk_disable_mtu_disc4(s) < 0)
1253 return -1;
1254
1255 if (s->ttl >= 0)
1256 if (sk_set_ttl4(s, s->ttl) < 0)
1257 return -1;
1258
1259 if (s->tos >= 0)
1260 if (sk_set_tos4(s, s->tos) < 0)
1261 return -1;
1262 }
1263
1264 if (sk_is_ipv6(s))
1265 {
1266 if (s->flags & SKF_V6ONLY)
1267 if (setsockopt(fd, SOL_IPV6, IPV6_V6ONLY, &y, sizeof(y)) < 0)
1268 ERR("IPV6_V6ONLY");
1269
1270 if (s->flags & SKF_LADDR_RX)
1271 if (sk_request_cmsg6_pktinfo(s) < 0)
1272 return -1;
1273
1274 if (s->flags & SKF_TTL_RX)
1275 if (sk_request_cmsg6_ttl(s) < 0)
1276 return -1;
1277
1278 if ((s->type == SK_UDP) || (s->type == SK_IP))
1279 if (sk_disable_mtu_disc6(s) < 0)
1280 return -1;
1281
1282 if (s->ttl >= 0)
1283 if (sk_set_ttl6(s, s->ttl) < 0)
1284 return -1;
1285
1286 if (s->tos >= 0)
1287 if (sk_set_tos6(s, s->tos) < 0)
1288 return -1;
1289 }
1290
1291 /* Must be after sk_set_tos4() as setting ToS on Linux also mangles priority */
1292 if (s->priority >= 0)
1293 if (sk_set_priority(s, s->priority) < 0)
1294 return -1;
1295
1296 return 0;
1297 }
1298
1299 static void
1300 sk_insert(sock *s)
1301 {
1302 add_tail(&sock_list, &s->n);
1303 }
1304
1305 static void
1306 sk_tcp_connected(sock *s)
1307 {
1308 sockaddr sa;
1309 int sa_len = sizeof(sa);
1310
1311 if ((getsockname(s->fd, &sa.sa, &sa_len) < 0) ||
1312 (sockaddr_read(&sa, s->af, &s->saddr, &s->iface, &s->sport) < 0))
1313 log(L_WARN "SOCK: Cannot get local IP address for TCP>");
1314
1315 s->type = SK_TCP;
1316 sk_alloc_bufs(s);
1317 s->tx_hook(s);
1318 }
1319
1320 static int
1321 sk_passive_connected(sock *s, int type)
1322 {
1323 sockaddr loc_sa, rem_sa;
1324 int loc_sa_len = sizeof(loc_sa);
1325 int rem_sa_len = sizeof(rem_sa);
1326
1327 int fd = accept(s->fd, ((type == SK_TCP) ? &rem_sa.sa : NULL), &rem_sa_len);
1328 if (fd < 0)
1329 {
1330 if ((errno != EINTR) && (errno != EAGAIN))
1331 s->err_hook(s, errno);
1332 return 0;
1333 }
1334
1335 sock *t = sk_new(s->pool);
1336 t->type = type;
1337 t->fd = fd;
1338 t->af = s->af;
1339 t->ttl = s->ttl;
1340 t->tos = s->tos;
1341 t->rbsize = s->rbsize;
1342 t->tbsize = s->tbsize;
1343
1344 if (type == SK_TCP)
1345 {
1346 if ((getsockname(fd, &loc_sa.sa, &loc_sa_len) < 0) ||
1347 (sockaddr_read(&loc_sa, s->af, &t->saddr, &t->iface, &t->sport) < 0))
1348 log(L_WARN "SOCK: Cannot get local IP address for TCP<");
1349
1350 if (sockaddr_read(&rem_sa, s->af, &t->daddr, &t->iface, &t->dport) < 0)
1351 log(L_WARN "SOCK: Cannot get remote IP address for TCP<");
1352 }
1353
1354 if (sk_setup(t) < 0)
1355 {
1356 /* FIXME: Call err_hook instead ? */
1357 log(L_ERR "SOCK: Incoming connection: %s%#m", t->err);
1358
1359 /* FIXME: handle it better in rfree() */
1360 close(t->fd);
1361 t->fd = -1;
1362 rfree(t);
1363 return 1;
1364 }
1365
1366 sk_insert(t);
1367 sk_alloc_bufs(t);
1368 s->rx_hook(t, 0);
1369 return 1;
1370 }
1371
1372 /**
1373 * sk_open - open a socket
1374 * @s: socket
1375 *
1376 * This function takes a socket resource created by sk_new() and
1377 * initialized by the user and binds a corresponding network connection
1378 * to it.
1379 *
1380 * Result: 0 for success, -1 for an error.
1381 */
1382 int
1383 sk_open(sock *s)
1384 {
1385 int af = BIRD_AF;
1386 int fd = -1;
1387 int do_bind = 0;
1388 int bind_port = 0;
1389 ip_addr bind_addr = IPA_NONE;
1390 sockaddr sa;
1391
1392 switch (s->type)
1393 {
1394 case SK_TCP_ACTIVE:
1395 s->ttx = ""; /* Force s->ttx != s->tpos */
1396 /* Fall thru */
1397 case SK_TCP_PASSIVE:
1398 fd = socket(af, SOCK_STREAM, IPPROTO_TCP);
1399 bind_port = s->sport;
1400 bind_addr = s->saddr;
1401 do_bind = bind_port || ipa_nonzero(bind_addr);
1402 break;
1403
1404 case SK_UDP:
1405 fd = socket(af, SOCK_DGRAM, IPPROTO_UDP);
1406 bind_port = s->sport;
1407 bind_addr = (s->flags & SKF_BIND) ? s->saddr : IPA_NONE;
1408 do_bind = 1;
1409 break;
1410
1411 case SK_IP:
1412 fd = socket(af, SOCK_RAW, s->dport);
1413 bind_port = 0;
1414 bind_addr = (s->flags & SKF_BIND) ? s->saddr : IPA_NONE;
1415 do_bind = ipa_nonzero(bind_addr);
1416 break;
1417
1418 case SK_MAGIC:
1419 af = 0;
1420 fd = s->fd;
1421 break;
1422
1423 default:
1424 bug("sk_open() called for invalid sock type %d", s->type);
1425 }
1426
1427 if (fd < 0)
1428 ERR("socket");
1429
1430 s->af = af;
1431 s->fd = fd;
1432
1433 if (sk_setup(s) < 0)
1434 goto err;
1435
1436 if (do_bind)
1437 {
1438 if (bind_port)
1439 {
1440 int y = 1;
1441
1442 if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &y, sizeof(y)) < 0)
1443 ERR2("SO_REUSEADDR");
1444
1445 #ifdef CONFIG_NO_IFACE_BIND
1446 /* Workaround missing ability to bind to an iface */
1447 if ((s->type == SK_UDP) && s->iface && ipa_zero(bind_addr))
1448 {
1449 if (setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &y, sizeof(y)) < 0)
1450 ERR2("SO_REUSEPORT");
1451 }
1452 #endif
1453 }
1454 else
1455 if (s->flags & SKF_HIGH_PORT)
1456 if (sk_set_high_port(s) < 0)
1457 log(L_WARN "Socket error: %s%#m", s->err);
1458
1459 sockaddr_fill(&sa, af, bind_addr, s->iface, bind_port);
1460 if (bind(fd, &sa.sa, SA_LEN(sa)) < 0)
1461 ERR2("bind");
1462 }
1463
1464 if (s->password)
1465 if (sk_set_md5_auth(s, s->saddr, s->daddr, s->iface, s->password, 0) < 0)
1466 goto err;
1467
1468 switch (s->type)
1469 {
1470 case SK_TCP_ACTIVE:
1471 sockaddr_fill(&sa, af, s->daddr, s->iface, s->dport);
1472 if (connect(fd, &sa.sa, SA_LEN(sa)) >= 0)
1473 sk_tcp_connected(s);
1474 else if (errno != EINTR && errno != EAGAIN && errno != EINPROGRESS &&
1475 errno != ECONNREFUSED && errno != EHOSTUNREACH && errno != ENETUNREACH)
1476 ERR2("connect");
1477 break;
1478
1479 case SK_TCP_PASSIVE:
1480 if (listen(fd, 8) < 0)
1481 ERR2("listen");
1482 break;
1483
1484 case SK_MAGIC:
1485 break;
1486
1487 default:
1488 sk_alloc_bufs(s);
1489 }
1490
1491 if (!(s->flags & SKF_THREAD))
1492 sk_insert(s);
1493 return 0;
1494
1495 err:
1496 close(fd);
1497 s->fd = -1;
1498 return -1;
1499 }
1500
1501 int
1502 sk_open_unix(sock *s, char *name)
1503 {
1504 struct sockaddr_un sa;
1505 int fd;
1506
1507 /* We are sloppy during error (leak fd and not set s->err), but we die anyway */
1508
1509 fd = socket(AF_UNIX, SOCK_STREAM, 0);
1510 if (fd < 0)
1511 return -1;
1512
1513 if (fcntl(fd, F_SETFL, O_NONBLOCK) < 0)
1514 return -1;
1515
1516 /* Path length checked in test_old_bird() */
1517 sa.sun_family = AF_UNIX;
1518 strcpy(sa.sun_path, name);
1519
1520 if (bind(fd, (struct sockaddr *) &sa, SUN_LEN(&sa)) < 0)
1521 return -1;
1522
1523 if (listen(fd, 8) < 0)
1524 return -1;
1525
1526 s->fd = fd;
1527 sk_insert(s);
1528 return 0;
1529 }
1530
1531
/* Size of the control-message buffer used by sk_recvmsg(): the larger of
   the IPv4 and IPv6 (PKTINFO + TTL) space requirements. */
#define CMSG_RX_SPACE \
  MAX(CMSG4_SPACE_PKTINFO+CMSG4_SPACE_TTL, CMSG6_SPACE_PKTINFO+CMSG6_SPACE_TTL)

/* Size of the control-message buffer used by sk_sendmsg(): the larger of
   the IPv4 and IPv6 PKTINFO space requirements. */
#define CMSG_TX_SPACE \
  MAX(CMSG4_SPACE_PKTINFO,CMSG6_SPACE_PKTINFO)
1535
1536 static void
1537 sk_prepare_cmsgs(sock *s, struct msghdr *msg, void *cbuf, size_t cbuflen)
1538 {
1539 if (sk_is_ipv4(s))
1540 sk_prepare_cmsgs4(s, msg, cbuf, cbuflen);
1541 else
1542 sk_prepare_cmsgs6(s, msg, cbuf, cbuflen);
1543 }
1544
1545 static void
1546 sk_process_cmsgs(sock *s, struct msghdr *msg)
1547 {
1548 struct cmsghdr *cm;
1549
1550 s->laddr = IPA_NONE;
1551 s->lifindex = 0;
1552 s->rcv_ttl = -1;
1553
1554 for (cm = CMSG_FIRSTHDR(msg); cm != NULL; cm = CMSG_NXTHDR(msg, cm))
1555 {
1556 if ((cm->cmsg_level == SOL_IP) && sk_is_ipv4(s))
1557 {
1558 sk_process_cmsg4_pktinfo(s, cm);
1559 sk_process_cmsg4_ttl(s, cm);
1560 }
1561
1562 if ((cm->cmsg_level == SOL_IPV6) && sk_is_ipv6(s))
1563 {
1564 sk_process_cmsg6_pktinfo(s, cm);
1565 sk_process_cmsg6_ttl(s, cm);
1566 }
1567 }
1568 }
1569
1570
1571 static inline int
1572 sk_sendmsg(sock *s)
1573 {
1574 struct iovec iov = {s->tbuf, s->tpos - s->tbuf};
1575 byte cmsg_buf[CMSG_TX_SPACE];
1576 sockaddr dst;
1577
1578 sockaddr_fill(&dst, s->af, s->daddr, s->iface, s->dport);
1579
1580 struct msghdr msg = {
1581 .msg_name = &dst.sa,
1582 .msg_namelen = SA_LEN(dst),
1583 .msg_iov = &iov,
1584 .msg_iovlen = 1
1585 };
1586
1587 #ifdef CONFIG_USE_HDRINCL
1588 byte hdr[20];
1589 struct iovec iov2[2] = { {hdr, 20}, iov };
1590
1591 if (s->flags & SKF_HDRINCL)
1592 {
1593 sk_prepare_ip_header(s, hdr, iov.iov_len);
1594 msg.msg_iov = iov2;
1595 msg.msg_iovlen = 2;
1596 }
1597 #endif
1598
1599 if (s->flags & SKF_PKTINFO)
1600 sk_prepare_cmsgs(s, &msg, cmsg_buf, sizeof(cmsg_buf));
1601
1602 return sendmsg(s->fd, &msg, 0);
1603 }
1604
1605 static inline int
1606 sk_recvmsg(sock *s)
1607 {
1608 struct iovec iov = {s->rbuf, s->rbsize};
1609 byte cmsg_buf[CMSG_RX_SPACE];
1610 sockaddr src;
1611
1612 struct msghdr msg = {
1613 .msg_name = &src.sa,
1614 .msg_namelen = sizeof(src), // XXXX ??
1615 .msg_iov = &iov,
1616 .msg_iovlen = 1,
1617 .msg_control = cmsg_buf,
1618 .msg_controllen = sizeof(cmsg_buf),
1619 .msg_flags = 0
1620 };
1621
1622 int rv = recvmsg(s->fd, &msg, 0);
1623 if (rv < 0)
1624 return rv;
1625
1626 //ifdef IPV4
1627 // if (cf_type == SK_IP)
1628 // rv = ipv4_skip_header(pbuf, rv);
1629 //endif
1630
1631 sockaddr_read(&src, s->af, &s->faddr, NULL, &s->fport);
1632 sk_process_cmsgs(s, &msg);
1633
1634 if (msg.msg_flags & MSG_TRUNC)
1635 s->flags |= SKF_TRUNCATED;
1636 else
1637 s->flags &= ~SKF_TRUNCATED;
1638
1639 return rv;
1640 }
1641
1642
1643 static inline void reset_tx_buffer(sock *s) { s->ttx = s->tpos = s->tbuf; }
1644
1645 static int
1646 sk_maybe_write(sock *s)
1647 {
1648 int e;
1649
1650 switch (s->type)
1651 {
1652 case SK_TCP:
1653 case SK_MAGIC:
1654 case SK_UNIX:
1655 while (s->ttx != s->tpos)
1656 {
1657 e = write(s->fd, s->ttx, s->tpos - s->ttx);
1658
1659 if (e < 0)
1660 {
1661 if (errno != EINTR && errno != EAGAIN)
1662 {
1663 reset_tx_buffer(s);
1664 /* EPIPE is just a connection close notification during TX */
1665 s->err_hook(s, (errno != EPIPE) ? errno : 0);
1666 return -1;
1667 }
1668 return 0;
1669 }
1670 s->ttx += e;
1671 }
1672 reset_tx_buffer(s);
1673 return 1;
1674
1675 case SK_UDP:
1676 case SK_IP:
1677 {
1678 if (s->tbuf == s->tpos)
1679 return 1;
1680
1681 e = sk_sendmsg(s);
1682
1683 if (e < 0)
1684 {
1685 if (errno != EINTR && errno != EAGAIN)
1686 {
1687 reset_tx_buffer(s);
1688 s->err_hook(s, errno);
1689 return -1;
1690 }
1691
1692 if (!s->tx_hook)
1693 reset_tx_buffer(s);
1694 return 0;
1695 }
1696 reset_tx_buffer(s);
1697 return 1;
1698 }
1699 default:
1700 bug("sk_maybe_write: unknown socket type %d", s->type);
1701 }
1702 }
1703
1704 int
1705 sk_rx_ready(sock *s)
1706 {
1707 int rv;
1708 struct pollfd pfd = { .fd = s->fd };
1709 pfd.events |= POLLIN;
1710
1711 redo:
1712 rv = poll(&pfd, 1, 0);
1713
1714 if ((rv < 0) && (errno == EINTR || errno == EAGAIN))
1715 goto redo;
1716
1717 return rv;
1718 }
1719
1720 /**
1721 * sk_send - send data to a socket
1722 * @s: socket
1723 * @len: number of bytes to send
1724 *
1725 * This function sends @len bytes of data prepared in the
1726 * transmit buffer of the socket @s to the network connection.
1727 * If the packet can be sent immediately, it does so and returns
1728 * 1, else it queues the packet for later processing, returns 0
1729 * and calls the @tx_hook of the socket when the tranmission
1730 * takes place.
1731 */
1732 int
1733 sk_send(sock *s, unsigned len)
1734 {
1735 s->ttx = s->tbuf;
1736 s->tpos = s->tbuf + len;
1737 return sk_maybe_write(s);
1738 }
1739
1740 /**
1741 * sk_send_to - send data to a specific destination
1742 * @s: socket
1743 * @len: number of bytes to send
1744 * @addr: IP address to send the packet to
1745 * @port: port to send the packet to
1746 *
1747 * This is a sk_send() replacement for connection-less packet sockets
1748 * which allows destination of the packet to be chosen dynamically.
1749 * Raw IP sockets should use 0 for @port.
1750 */
1751 int
1752 sk_send_to(sock *s, unsigned len, ip_addr addr, unsigned port)
1753 {
1754 s->daddr = addr;
1755 if (port)
1756 s->dport = port;
1757
1758 s->ttx = s->tbuf;
1759 s->tpos = s->tbuf + len;
1760 return sk_maybe_write(s);
1761 }
1762
1763 /*
1764 int
1765 sk_send_full(sock *s, unsigned len, struct iface *ifa,
1766 ip_addr saddr, ip_addr daddr, unsigned dport)
1767 {
1768 s->iface = ifa;
1769 s->saddr = saddr;
1770 s->daddr = daddr;
1771 s->dport = dport;
1772 s->ttx = s->tbuf;
1773 s->tpos = s->tbuf + len;
1774 return sk_maybe_write(s);
1775 }
1776 */
1777
1778 /* sk_read() and sk_write() are called from BFD's event loop */
1779
1780 int
1781 sk_read(sock *s, int revents)
1782 {
1783 switch (s->type)
1784 {
1785 case SK_TCP_PASSIVE:
1786 return sk_passive_connected(s, SK_TCP);
1787
1788 case SK_UNIX_PASSIVE:
1789 return sk_passive_connected(s, SK_UNIX);
1790
1791 case SK_TCP:
1792 case SK_UNIX:
1793 {
1794 int c = read(s->fd, s->rpos, s->rbuf + s->rbsize - s->rpos);
1795
1796 if (c < 0)
1797 {
1798 if (errno != EINTR && errno != EAGAIN)
1799 s->err_hook(s, errno);
1800 else if (errno == EAGAIN && !(revents & POLLIN))
1801 {
1802 log(L_ERR "Got EAGAIN from read when revents=%x (without POLLIN)", revents);
1803 s->err_hook(s, 0);
1804 }
1805 }
1806 else if (!c)
1807 s->err_hook(s, 0);
1808 else
1809 {
1810 s->rpos += c;
1811 if (s->rx_hook(s, s->rpos - s->rbuf))
1812 {
1813 /* We need to be careful since the socket could have been deleted by the hook */
1814 if (current_sock == s)
1815 s->rpos = s->rbuf;
1816 }
1817 return 1;
1818 }
1819 return 0;
1820 }
1821
1822 case SK_MAGIC:
1823 return s->rx_hook(s, 0);
1824
1825 default:
1826 {
1827 int e = sk_recvmsg(s);
1828
1829 if (e < 0)
1830 {
1831 if (errno != EINTR && errno != EAGAIN)
1832 s->err_hook(s, errno);
1833 return 0;
1834 }
1835
1836 s->rpos = s->rbuf + e;
1837 s->rx_hook(s, e);
1838 return 1;
1839 }
1840 }
1841 }
1842
1843 int
1844 sk_write(sock *s)
1845 {
1846 switch (s->type)
1847 {
1848 case SK_TCP_ACTIVE:
1849 {
1850 sockaddr sa;
1851 sockaddr_fill(&sa, s->af, s->daddr, s->iface, s->dport);
1852
1853 if (connect(s->fd, &sa.sa, SA_LEN(sa)) >= 0 || errno == EISCONN)
1854 sk_tcp_connected(s);
1855 else if (errno != EINTR && errno != EAGAIN && errno != EINPROGRESS)
1856 s->err_hook(s, errno);
1857 return 0;
1858 }
1859
1860 default:
1861 if (s->ttx != s->tpos && sk_maybe_write(s) > 0)
1862 {
1863 if (s->tx_hook)
1864 s->tx_hook(s);
1865 return 1;
1866 }
1867 return 0;
1868 }
1869 }
1870
1871 void
1872 sk_err(sock *s, int revents)
1873 {
1874 int se = 0, sse = sizeof(se);
1875 if ((s->type != SK_MAGIC) && (revents & POLLERR))
1876 if (getsockopt(s->fd, SOL_SOCKET, SO_ERROR, &se, &sse) < 0)
1877 {
1878 log(L_ERR "IO: Socket error: SO_ERROR: %m");
1879 se = 0;
1880 }
1881
1882 s->err_hook(s, se);
1883 }
1884
1885 void
1886 sk_dump_all(void)
1887 {
1888 node *n;
1889 sock *s;
1890
1891 debug("Open sockets:\n");
1892 WALK_LIST(n, sock_list)
1893 {
1894 s = SKIP_BACK(sock, n, n);
1895 debug("%p ", s);
1896 sk_dump(&s->r);
1897 }
1898 debug("\n");
1899 }
1900
1901
1902 /*
1903 * Internal event log and watchdog
1904 */
1905
1906 #define EVENT_LOG_LENGTH 32
1907
1908 struct event_log_entry
1909 {
1910 void *hook;
1911 void *data;
1912 btime timestamp;
1913 btime duration;
1914 };
1915
1916 static struct event_log_entry event_log[EVENT_LOG_LENGTH];
1917 static struct event_log_entry *event_open;
1918 static int event_log_pos, event_log_num, watchdog_active;
1919 static btime last_time;
1920 static btime loop_time;
1921
1922 static void
1923 io_update_time(void)
1924 {
1925 struct timespec ts;
1926 int rv;
1927
1928 if (!clock_monotonic_available)
1929 return;
1930
1931 /*
1932 * This is third time-tracking procedure (after update_times() above and
1933 * times_update() in BFD), dedicated to internal event log and latency
1934 * tracking. Hopefully, we consolidate these sometimes.
1935 */
1936
1937 rv = clock_gettime(CLOCK_MONOTONIC, &ts);
1938 if (rv < 0)
1939 die("clock_gettime: %m");
1940
1941 last_time = ((s64) ts.tv_sec S) + (ts.tv_nsec / 1000);
1942
1943 if (event_open)
1944 {
1945 event_open->duration = last_time - event_open->timestamp;
1946
1947 if (event_open->duration > config->latency_limit)
1948 log(L_WARN "Event 0x%p 0x%p took %d ms",
1949 event_open->hook, event_open->data, (int) (event_open->duration TO_MS));
1950
1951 event_open = NULL;
1952 }
1953 }
1954
1955 /**
1956 * io_log_event - mark approaching event into event log
1957 * @hook: event hook address
1958 * @data: event data address
1959 *
1960 * Store info (hook, data, timestamp) about the following internal event into
1961 * a circular event log (@event_log). When latency tracking is enabled, the log
1962 * entry is kept open (in @event_open) so the duration can be filled later.
1963 */
1964 void
1965 io_log_event(void *hook, void *data)
1966 {
1967 if (config->latency_debug)
1968 io_update_time();
1969
1970 struct event_log_entry *en = event_log + event_log_pos;
1971
1972 en->hook = hook;
1973 en->data = data;
1974 en->timestamp = last_time;
1975 en->duration = 0;
1976
1977 event_log_num++;
1978 event_log_pos++;
1979 event_log_pos %= EVENT_LOG_LENGTH;
1980
1981 event_open = config->latency_debug ? en : NULL;
1982 }
1983
1984 static inline void
1985 io_close_event(void)
1986 {
1987 if (event_open)
1988 io_update_time();
1989 }
1990
1991 void
1992 io_log_dump(void)
1993 {
1994 int i;
1995
1996 log(L_DEBUG "Event log:");
1997 for (i = 0; i < EVENT_LOG_LENGTH; i++)
1998 {
1999 struct event_log_entry *en = event_log + (event_log_pos + i) % EVENT_LOG_LENGTH;
2000 if (en->hook)
2001 log(L_DEBUG " Event 0x%p 0x%p at %8d for %d ms", en->hook, en->data,
2002 (int) ((last_time - en->timestamp) TO_MS), (int) (en->duration TO_MS));
2003 }
2004 }
2005
2006 void
2007 watchdog_sigalrm(int sig UNUSED)
2008 {
2009 /* Update last_time and duration, but skip latency check */
2010 config->latency_limit = 0xffffffff;
2011 io_update_time();
2012
2013 /* We want core dump */
2014 abort();
2015 }
2016
2017 static inline void
2018 watchdog_start1(void)
2019 {
2020 io_update_time();
2021
2022 loop_time = last_time;
2023 }
2024
2025 static inline void
2026 watchdog_start(void)
2027 {
2028 io_update_time();
2029
2030 loop_time = last_time;
2031 event_log_num = 0;
2032
2033 if (config->watchdog_timeout)
2034 {
2035 alarm(config->watchdog_timeout);
2036 watchdog_active = 1;
2037 }
2038 }
2039
2040 static inline void
2041 watchdog_stop(void)
2042 {
2043 io_update_time();
2044
2045 if (watchdog_active)
2046 {
2047 alarm(0);
2048 watchdog_active = 0;
2049 }
2050
2051 btime duration = last_time - loop_time;
2052 if (duration > config->watchdog_warning)
2053 log(L_WARN "I/O loop cycle took %d ms for %d events",
2054 (int) (duration TO_MS), event_log_num);
2055 }
2056
2057
2058 /*
2059 * Main I/O Loop
2060 */
2061
/* Requests for asynchronous actions; io_loop() checks them before entering
   poll(), runs the matching handler and clears the flag. */
volatile int async_config_flag;		/* async_config() is scheduled */
volatile int async_dump_flag;		/* async_dump() is scheduled */
volatile int async_shutdown_flag;	/* async_shutdown() is scheduled */
2065
2066 void
2067 io_init(void)
2068 {
2069 init_list(&near_timers);
2070 init_list(&far_timers);
2071 init_list(&sock_list);
2072 init_list(&global_event_list);
2073 krt_io_init();
2074 init_times();
2075 update_times();
2076 boot_time = now;
2077 srandom((int) now_real);
2078 }
2079
2080 static int short_loops = 0;
2081 #define SHORT_LOOP_MAX 10
2082
2083 void
2084 io_loop(void)
2085 {
2086 int poll_tout;
2087 time_t tout;
2088 int nfds, events, pout;
2089 sock *s;
2090 node *n;
2091 int fdmax = 256;
2092 struct pollfd *pfd = xmalloc(fdmax * sizeof(struct pollfd));
2093
2094 watchdog_start1();
2095 for(;;)
2096 {
2097 events = ev_run_list(&global_event_list);
2098 timers:
2099 update_times();
2100 tout = tm_first_shot();
2101 if (tout <= now)
2102 {
2103 tm_shot();
2104 goto timers;
2105 }
2106 poll_tout = (events ? 0 : MIN(tout - now, 3)) * 1000; /* Time in milliseconds */
2107
2108 io_close_event();
2109
2110 nfds = 0;
2111 WALK_LIST(n, sock_list)
2112 {
2113 pfd[nfds] = (struct pollfd) { .fd = -1 }; /* everything other set to 0 by this */
2114 s = SKIP_BACK(sock, n, n);
2115 if (s->rx_hook)
2116 {
2117 pfd[nfds].fd = s->fd;
2118 pfd[nfds].events |= POLLIN;
2119 }
2120 if (s->tx_hook && s->ttx != s->tpos)
2121 {
2122 pfd[nfds].fd = s->fd;
2123 pfd[nfds].events |= POLLOUT;
2124 }
2125 if (pfd[nfds].fd != -1)
2126 {
2127 s->index = nfds;
2128 nfds++;
2129 }
2130 else
2131 s->index = -1;
2132
2133 if (nfds >= fdmax)
2134 {
2135 fdmax *= 2;
2136 pfd = xrealloc(pfd, fdmax * sizeof(struct pollfd));
2137 }
2138 }
2139
2140 /*
2141 * Yes, this is racy. But even if the signal comes before this test
2142 * and entering poll(), it gets caught on the next timer tick.
2143 */
2144
2145 if (async_config_flag)
2146 {
2147 io_log_event(async_config, NULL);
2148 async_config();
2149 async_config_flag = 0;
2150 continue;
2151 }
2152 if (async_dump_flag)
2153 {
2154 io_log_event(async_dump, NULL);
2155 async_dump();
2156 async_dump_flag = 0;
2157 continue;
2158 }
2159 if (async_shutdown_flag)
2160 {
2161 io_log_event(async_shutdown, NULL);
2162 async_shutdown();
2163 async_shutdown_flag = 0;
2164 continue;
2165 }
2166
2167 /* And finally enter poll() to find active sockets */
2168 watchdog_stop();
2169 pout = poll(pfd, nfds, poll_tout);
2170 watchdog_start();
2171
2172 if (pout < 0)
2173 {
2174 if (errno == EINTR || errno == EAGAIN)
2175 continue;
2176 die("poll: %m");
2177 }
2178 if (pout)
2179 {
2180 /* guaranteed to be non-empty */
2181 current_sock = SKIP_BACK(sock, n, HEAD(sock_list));
2182
2183 while (current_sock)
2184 {
2185 sock *s = current_sock;
2186 if (s->index == -1)
2187 {
2188 current_sock = sk_next(s);
2189 goto next;
2190 }
2191
2192 int e;
2193 int steps;
2194
2195 steps = MAX_STEPS;
2196 if (s->fast_rx && (pfd[s->index].revents & POLLIN) && s->rx_hook)
2197 do
2198 {
2199 steps--;
2200 io_log_event(s->rx_hook, s->data);
2201 e = sk_read(s, pfd[s->index].revents);
2202 if (s != current_sock)
2203 goto next;
2204 }
2205 while (e && s->rx_hook && steps);
2206
2207 steps = MAX_STEPS;
2208 if (pfd[s->index].revents & POLLOUT)
2209 do
2210 {
2211 steps--;
2212 io_log_event(s->tx_hook, s->data);
2213 e = sk_write(s);
2214 if (s != current_sock)
2215 goto next;
2216 }
2217 while (e && steps);
2218
2219 current_sock = sk_next(s);
2220 next: ;
2221 }
2222
2223 short_loops++;
2224 if (events && (short_loops < SHORT_LOOP_MAX))
2225 continue;
2226 short_loops = 0;
2227
2228 int count = 0;
2229 current_sock = stored_sock;
2230 if (current_sock == NULL)
2231 current_sock = SKIP_BACK(sock, n, HEAD(sock_list));
2232
2233 while (current_sock && count < MAX_RX_STEPS)
2234 {
2235 sock *s = current_sock;
2236 if (s->index == -1)
2237 {
2238 current_sock = sk_next(s);
2239 goto next2;
2240 }
2241
2242 if (!s->fast_rx && (pfd[s->index].revents & POLLIN) && s->rx_hook)
2243 {
2244 count++;
2245 io_log_event(s->rx_hook, s->data);
2246 sk_read(s, pfd[s->index].revents);
2247 if (s != current_sock)
2248 goto next2;
2249 }
2250
2251 if (pfd[s->index].revents & (POLLHUP | POLLERR))
2252 {
2253 sk_err(s, pfd[s->index].revents);
2254 if (s != current_sock)
2255 goto next2;
2256 }
2257
2258 current_sock = sk_next(s);
2259 next2: ;
2260 }
2261
2262
2263 stored_sock = current_sock;
2264 }
2265 }
2266 }
2267
/*
 * Check whether another BIRD instance is listening on the control socket
 * at @path by trying to connect to it. Dies when the connection succeeds,
 * when the socket cannot be created or when @path does not fit into a
 * UNIX socket address.
 */
void
test_old_bird(char *path)
{
  struct sockaddr_un sa;
  int fd = socket(AF_UNIX, SOCK_STREAM, 0);

  if (fd < 0)
    die("Cannot create socket: %m");

  /* The path, including its terminating zero, has to fit into sun_path */
  if (strlen(path) >= sizeof(sa.sun_path))
    die("Socket path too long");

  bzero(&sa, sizeof(sa));
  sa.sun_family = AF_UNIX;
  strcpy(sa.sun_path, path);

  /* A successful connect means somebody is listening there */
  if (connect(fd, (struct sockaddr *) &sa, SUN_LEN(&sa)) == 0)
    die("I found another BIRD running.");

  close(fd);
}