]> git.ipfire.org Git - thirdparty/bird.git/blame - sysdep/unix/io.c
OSPF: Fix ECMP external merging
[thirdparty/bird.git] / sysdep / unix / io.c
CommitLineData
b5d9ee5c
MM
1/*
2 * BIRD Internet Routing Daemon -- Unix I/O
3 *
38a608c5 4 * (c) 1998--2004 Martin Mares <mj@ucw.cz>
b1a1faba 5 * (c) 2004 Ondrej Filip <feela@network.cz>
b5d9ee5c
MM
6 *
7 * Can be freely distributed and used under the terms of the GNU GPL.
8 */
9
607d9914
OZ
10/* Unfortunately, some glibc versions hide parts of RFC 3542 API
11 if _GNU_SOURCE is not defined. */
c8cafc8e
OZ
12#ifndef _GNU_SOURCE
13#define _GNU_SOURCE
14#endif
607d9914 15
b5d9ee5c
MM
16#include <stdio.h>
17#include <stdlib.h>
01b776e1 18#include <time.h>
b5d9ee5c
MM
19#include <sys/time.h>
20#include <sys/types.h>
21#include <sys/socket.h>
46a82e9c 22#include <sys/uio.h>
b93abffa 23#include <sys/un.h>
e1c13a5a 24#include <poll.h>
b5d9ee5c 25#include <unistd.h>
a0b176e3 26#include <fcntl.h>
b5d9ee5c 27#include <errno.h>
05476c4d 28#include <net/if.h>
d0e9b36d 29#include <netinet/in.h>
48e5f32d
OZ
30#include <netinet/tcp.h>
31#include <netinet/udp.h>
93e868c7 32#include <netinet/icmp6.h>
b5d9ee5c
MM
33
34#include "nest/bird.h"
35#include "lib/lists.h"
36#include "lib/resource.h"
37#include "lib/timer.h"
38#include "lib/socket.h"
e8f73195 39#include "lib/event.h"
afa8937a 40#include "lib/string.h"
b5d9ee5c
MM
41#include "nest/iface.h"
42
43#include "lib/unix.h"
a2867cd9 44#include "lib/sysio.h"
b5d9ee5c 45
ea89da38 46/* Maximum number of calls of tx handler for one socket in one
e1c13a5a 47 * poll iteration. Should be small enough to not monopolize CPU by
4323099d
OZ
48 * one protocol instance.
49 */
50#define MAX_STEPS 4
51
e1c13a5a 52/* Maximum number of calls of rx handler for all sockets in one poll
ea89da38
OZ
53 iteration. RX callbacks are often much more costly so we limit
54 this to get small latencies */
55#define MAX_RX_STEPS 4
56
a9c986f9
MM
57/*
58 * Tracked Files
59 */
60
61struct rfile {
62 resource r;
63 FILE *f;
64};
65
66static void
67rf_free(resource *r)
68{
69 struct rfile *a = (struct rfile *) r;
70
71 fclose(a->f);
72}
73
74static void
75rf_dump(resource *r)
76{
77 struct rfile *a = (struct rfile *) r;
78
79 debug("(FILE *%p)\n", a->f);
80}
81
82static struct resclass rf_class = {
83 "FILE",
84 sizeof(struct rfile),
85 rf_free,
e81b440f 86 rf_dump,
acb60628 87 NULL,
e81b440f 88 NULL
a9c986f9
MM
89};
90
91void *
f78056fb 92tracked_fopen(pool *p, char *name, char *mode)
a9c986f9
MM
93{
94 FILE *f = fopen(name, mode);
95
96 if (f)
97 {
98 struct rfile *r = ralloc(p, &rf_class);
99 r->f = f;
100 }
101 return f;
102}
103
525fa2c1
MM
104/**
105 * DOC: Timers
106 *
107 * Timers are resources which represent a wish of a module to call
108 * a function at the specified time. The platform dependent code
58f7d004 109 * doesn't guarantee exact timing, only that a timer function
525fa2c1
MM
110 * won't be called before the requested time.
111 *
fd91ae33
OZ
112 * In BIRD, time is represented by values of the &bird_clock_t type
113 * which are integral numbers interpreted as a relative number of seconds since
114 * some fixed time point in past. The current time can be read
115 * from variable @now with reasonable accuracy and is monotonic. There is also
116 * a current 'absolute' time in variable @now_real reported by OS.
525fa2c1
MM
117 *
118 * Each timer is described by a &timer structure containing a pointer
119 * to the handler function (@hook), data private to this function (@data),
120 * time the function should be called at (@expires, 0 for inactive timers),
121 * for the other fields see |timer.h|.
b5d9ee5c
MM
122 */
123
124#define NEAR_TIMER_LIMIT 4
125
b5d9ee5c
MM
126static list near_timers, far_timers;
127static bird_clock_t first_far_timer = TIME_INFINITY;
128
002b6423 129/* now must be different from 0, because 0 is a special value in timer->expires */
a92cf57d 130bird_clock_t now = 1, now_real, boot_time;
fd91ae33
OZ
131
132static void
133update_times_plain(void)
134{
135 bird_clock_t new_time = time(NULL);
136 int delta = new_time - now_real;
137
138 if ((delta >= 0) && (delta < 60))
139 now += delta;
140 else if (now_real != 0)
141 log(L_WARN "Time jump, delta %d s", delta);
142
143 now_real = new_time;
144}
145
146static void
147update_times_gettime(void)
148{
149 struct timespec ts;
150 int rv;
151
152 rv = clock_gettime(CLOCK_MONOTONIC, &ts);
153 if (rv != 0)
154 die("clock_gettime: %m");
155
156 if (ts.tv_sec != now) {
157 if (ts.tv_sec < now)
158 log(L_ERR "Monotonic timer is broken");
159
160 now = ts.tv_sec;
161 now_real = time(NULL);
162 }
163}
164
/* Nonzero when CLOCK_MONOTONIC is usable; determined by init_times() */
static int clock_monotonic_available;

/* Refresh @now and @now_real from whichever clock source is available. */
static inline void
update_times(void)
{
  if (!clock_monotonic_available)
    update_times_plain();
  else
    update_times_gettime();
}
175
176static inline void
177init_times(void)
178{
179 struct timespec ts;
180 clock_monotonic_available = (clock_gettime(CLOCK_MONOTONIC, &ts) == 0);
181 if (!clock_monotonic_available)
182 log(L_WARN "Monotonic timer is missing");
183}
184
b5d9ee5c
MM
185
186static void
187tm_free(resource *r)
188{
189 timer *t = (timer *) r;
190
191 tm_stop(t);
192}
193
194static void
195tm_dump(resource *r)
196{
197 timer *t = (timer *) r;
198
e8f73195 199 debug("(code %p, data %p, ", t->hook, t->data);
af847acc
MM
200 if (t->randomize)
201 debug("rand %d, ", t->randomize);
202 if (t->recurrent)
203 debug("recur %d, ", t->recurrent);
b5d9ee5c
MM
204 if (t->expires)
205 debug("expires in %d sec)\n", t->expires - now);
206 else
207 debug("inactive)\n");
208}
209
210static struct resclass tm_class = {
211 "Timer",
212 sizeof(timer),
213 tm_free,
e81b440f 214 tm_dump,
acb60628 215 NULL,
e81b440f 216 NULL
b5d9ee5c
MM
217};
218
525fa2c1
MM
219/**
220 * tm_new - create a timer
221 * @p: pool
222 *
223 * This function creates a new timer resource and returns
224 * a pointer to it. To use the timer, you need to fill in
225 * the structure fields and call tm_start() to start timing.
226 */
b5d9ee5c
MM
227timer *
228tm_new(pool *p)
229{
230 timer *t = ralloc(p, &tm_class);
b5d9ee5c
MM
231 return t;
232}
233
234static inline void
235tm_insert_near(timer *t)
236{
237 node *n = HEAD(near_timers);
238
239 while (n->next && (SKIP_BACK(timer, n, n)->expires < t->expires))
240 n = n->next;
241 insert_node(&t->n, n->prev);
242}
243
525fa2c1
MM
244/**
245 * tm_start - start a timer
246 * @t: timer
247 * @after: number of seconds the timer should be run after
248 *
249 * This function schedules the hook function of the timer to
250 * be called after @after seconds. If the timer has been already
251 * started, it's @expire time is replaced by the new value.
252 *
253 * You can have set the @randomize field of @t, the timeout
254 * will be increased by a random number of seconds chosen
255 * uniformly from range 0 .. @randomize.
256 *
257 * You can call tm_start() from the handler function of the timer
258 * to request another run of the timer. Also, you can set the @recurrent
259 * field to have the timer re-added automatically with the same timeout.
260 */
b5d9ee5c
MM
261void
262tm_start(timer *t, unsigned after)
263{
264 bird_clock_t when;
265
266 if (t->randomize)
af847acc 267 after += random() % (t->randomize + 1);
b5d9ee5c
MM
268 when = now + after;
269 if (t->expires == when)
270 return;
271 if (t->expires)
272 rem_node(&t->n);
273 t->expires = when;
274 if (after <= NEAR_TIMER_LIMIT)
275 tm_insert_near(t);
276 else
277 {
278 if (!first_far_timer || first_far_timer > when)
279 first_far_timer = when;
280 add_tail(&far_timers, &t->n);
281 }
282}
283
525fa2c1
MM
284/**
285 * tm_stop - stop a timer
286 * @t: timer
287 *
288 * This function stops a timer. If the timer is already stopped,
289 * nothing happens.
290 */
b5d9ee5c
MM
291void
292tm_stop(timer *t)
293{
294 if (t->expires)
295 {
296 rem_node(&t->n);
297 t->expires = 0;
298 }
299}
300
301static void
302tm_dump_them(char *name, list *l)
303{
304 node *n;
305 timer *t;
306
307 debug("%s timers:\n", name);
308 WALK_LIST(n, *l)
309 {
310 t = SKIP_BACK(timer, n, n);
311 debug("%p ", t);
312 tm_dump(&t->r);
313 }
314 debug("\n");
315}
316
317void
318tm_dump_all(void)
319{
320 tm_dump_them("Near", &near_timers);
321 tm_dump_them("Far", &far_timers);
322}
323
324static inline time_t
325tm_first_shot(void)
326{
327 time_t x = first_far_timer;
328
329 if (!EMPTY_LIST(near_timers))
330 {
331 timer *t = SKIP_BACK(timer, n, HEAD(near_timers));
332 if (t->expires < x)
333 x = t->expires;
334 }
335 return x;
336}
337
8bcb5fb1
OZ
338void io_log_event(void *hook, void *data);
339
b5d9ee5c
MM
340static void
341tm_shot(void)
342{
343 timer *t;
344 node *n, *m;
345
346 if (first_far_timer <= now)
347 {
28a9a189 348 bird_clock_t limit = now + NEAR_TIMER_LIMIT;
b5d9ee5c
MM
349 first_far_timer = TIME_INFINITY;
350 n = HEAD(far_timers);
351 while (m = n->next)
352 {
353 t = SKIP_BACK(timer, n, n);
354 if (t->expires <= limit)
355 {
356 rem_node(n);
357 tm_insert_near(t);
358 }
359 else if (t->expires < first_far_timer)
360 first_far_timer = t->expires;
361 n = m;
362 }
363 }
364 while ((n = HEAD(near_timers)) -> next)
365 {
af847acc 366 int delay;
b5d9ee5c
MM
367 t = SKIP_BACK(timer, n, n);
368 if (t->expires > now)
369 break;
370 rem_node(n);
af847acc 371 delay = t->expires - now;
b5d9ee5c 372 t->expires = 0;
af847acc
MM
373 if (t->recurrent)
374 {
375 int i = t->recurrent - delay;
376 if (i < 0)
377 i = 0;
378 tm_start(t, i);
379 }
8bcb5fb1 380 io_log_event(t->hook, t->data);
b5d9ee5c
MM
381 t->hook(t);
382 }
383}
384
0d3effcf
OF
385/**
386 * tm_parse_datetime - parse a date and time
387 * @x: datetime string
388 *
389 * tm_parse_datetime() takes a textual representation of
390 * a date and time (dd-mm-yyyy hh:mm:ss)
391 * and converts it to the corresponding value of type &bird_clock_t.
392 */
393bird_clock_t
394tm_parse_datetime(char *x)
395{
396 struct tm tm;
397 int n;
398 time_t t;
399
400 if (sscanf(x, "%d-%d-%d %d:%d:%d%n", &tm.tm_mday, &tm.tm_mon, &tm.tm_year, &tm.tm_hour, &tm.tm_min, &tm.tm_sec, &n) != 6 || x[n])
401 return tm_parse_date(x);
402 tm.tm_mon--;
403 tm.tm_year -= 1900;
404 t = mktime(&tm);
405 if (t == (time_t) -1)
406 return 0;
407 return t;
408}
525fa2c1
MM
409/**
410 * tm_parse_date - parse a date
411 * @x: date string
412 *
413 * tm_parse_date() takes a textual representation of a date (dd-mm-yyyy)
414 * and converts it to the corresponding value of type &bird_clock_t.
415 */
913f7dc9
MM
416bird_clock_t
417tm_parse_date(char *x)
418{
419 struct tm tm;
420 int n;
421 time_t t;
422
423 if (sscanf(x, "%d-%d-%d%n", &tm.tm_mday, &tm.tm_mon, &tm.tm_year, &n) != 3 || x[n])
424 return 0;
425 tm.tm_mon--;
426 tm.tm_year -= 1900;
427 tm.tm_hour = tm.tm_min = tm.tm_sec = 0;
428 t = mktime(&tm);
429 if (t == (time_t) -1)
430 return 0;
431 return t;
432}
433
c37e7851
OZ
434static void
435tm_format_reltime(char *x, struct tm *tm, bird_clock_t delta)
913f7dc9 436{
c37e7851
OZ
437 static char *month_names[12] = { "Jan", "Feb", "Mar", "Apr", "May", "Jun",
438 "Jul", "Aug", "Sep", "Oct", "Nov", "Dec" };
913f7dc9 439
c37e7851
OZ
440 if (delta < 20*3600)
441 bsprintf(x, "%02d:%02d", tm->tm_hour, tm->tm_min);
442 else if (delta < 360*86400)
443 bsprintf(x, "%s%02d", month_names[tm->tm_mon], tm->tm_mday);
444 else
445 bsprintf(x, "%d", tm->tm_year+1900);
913f7dc9
MM
446}
447
c37e7851
OZ
448#include "conf/conf.h"
449
525fa2c1
MM
450/**
451 * tm_format_datetime - convert date and time to textual representation
452 * @x: destination buffer of size %TM_DATETIME_BUFFER_SIZE
8e433d6a 453 * @fmt_spec: specification of resulting textual representation of the time
525fa2c1
MM
454 * @t: time
455 *
fd91ae33
OZ
456 * This function formats the given relative time value @t to a textual
457 * date/time representation (dd-mm-yyyy hh:mm:ss) in real time.
525fa2c1 458 */
7a88832e 459void
c37e7851 460tm_format_datetime(char *x, struct timeformat *fmt_spec, bird_clock_t t)
7a88832e 461{
c37e7851 462 const char *fmt_used;
7a88832e 463 struct tm *tm;
fd91ae33
OZ
464 bird_clock_t delta = now - t;
465 t = now_real - delta;
7a88832e 466 tm = localtime(&t);
7a88832e 467
c37e7851
OZ
468 if (fmt_spec->fmt1 == NULL)
469 return tm_format_reltime(x, tm, delta);
afa8937a 470
c37e7851
OZ
471 if ((fmt_spec->limit == 0) || (delta < fmt_spec->limit))
472 fmt_used = fmt_spec->fmt1;
afa8937a 473 else
c37e7851
OZ
474 fmt_used = fmt_spec->fmt2;
475
476 int rv = strftime(x, TM_DATETIME_BUFFER_SIZE, fmt_used, tm);
477 if (((rv == 0) && fmt_used[0]) || (rv == TM_DATETIME_BUFFER_SIZE))
478 strcpy(x, "<too-long>");
afa8937a
MM
479}
480
05476c4d 481
525fa2c1
MM
482/**
483 * DOC: Sockets
484 *
485 * Socket resources represent network connections. Their data structure (&socket)
486 * contains a lot of fields defining the exact type of the socket, the local and
487 * remote addresses and ports, pointers to socket buffers and finally pointers to
488 * hook functions to be called when new data have arrived to the receive buffer
489 * (@rx_hook), when the contents of the transmit buffer have been transmitted
490 * (@tx_hook) and when an error or connection close occurs (@err_hook).
491 *
38a608c5 492 * Freeing of sockets from inside socket hooks is perfectly safe.
b5d9ee5c
MM
493 */
494
abae6e9c
MM
/* Fall back to the IPPROTO_* constants where the SOL_* names are not defined */
#ifndef SOL_IP
#define SOL_IP IPPROTO_IP
#endif

#ifndef SOL_IPV6
#define SOL_IPV6 IPPROTO_IPV6
#endif

#ifndef SOL_ICMPV6
#define SOL_ICMPV6 IPPROTO_ICMPV6
#endif
506
507
05476c4d
OZ
508/*
509 * Sockaddr helper functions
510 */
38a608c5 511
3e236955 512static inline int UNUSED sockaddr_length(int af)
05476c4d
OZ
513{ return (af == AF_INET) ? sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6); }
514
515static inline void
3e236955 516sockaddr_fill4(struct sockaddr_in *sa, ip_addr a, uint port)
38a608c5 517{
05476c4d
OZ
518 memset(sa, 0, sizeof(struct sockaddr_in));
519#ifdef HAVE_SIN_LEN
520 sa->sin_len = sizeof(struct sockaddr_in);
521#endif
522 sa->sin_family = AF_INET;
523 sa->sin_port = htons(port);
524 sa->sin_addr = ipa_to_in4(a);
38a608c5 525}
b5d9ee5c 526
05476c4d
OZ
527static inline void
528sockaddr_fill6(struct sockaddr_in6 *sa, ip_addr a, struct iface *ifa, uint port)
b5d9ee5c 529{
05476c4d
OZ
530 memset(sa, 0, sizeof(struct sockaddr_in6));
531#ifdef SIN6_LEN
532 sa->sin6_len = sizeof(struct sockaddr_in6);
533#endif
534 sa->sin6_family = AF_INET6;
535 sa->sin6_port = htons(port);
536 sa->sin6_flowinfo = 0;
537 sa->sin6_addr = ipa_to_in6(a);
538
539 if (ifa && ipa_is_link_local(a))
540 sa->sin6_scope_id = ifa->index;
4da25acb 541}
b5d9ee5c 542
05476c4d
OZ
543void
544sockaddr_fill(sockaddr *sa, int af, ip_addr a, struct iface *ifa, uint port)
4da25acb 545{
05476c4d 546 if (af == AF_INET)
3e236955 547 sockaddr_fill4((struct sockaddr_in *) sa, a, port);
05476c4d
OZ
548 else if (af == AF_INET6)
549 sockaddr_fill6((struct sockaddr_in6 *) sa, a, ifa, port);
550 else
551 bug("Unknown AF");
4da25acb
MM
552}
553
05476c4d 554static inline void
3e236955 555sockaddr_read4(struct sockaddr_in *sa, ip_addr *a, uint *port)
4da25acb 556{
05476c4d
OZ
557 *port = ntohs(sa->sin_port);
558 *a = ipa_from_in4(sa->sin_addr);
b5d9ee5c
MM
559}
560
05476c4d
OZ
561static inline void
562sockaddr_read6(struct sockaddr_in6 *sa, ip_addr *a, struct iface **ifa, uint *port)
48e5f32d 563{
05476c4d
OZ
564 *port = ntohs(sa->sin6_port);
565 *a = ipa_from_in6(sa->sin6_addr);
48e5f32d 566
05476c4d
OZ
567 if (ifa && ipa_is_link_local(*a))
568 *ifa = if_find_by_index(sa->sin6_scope_id);
48e5f32d
OZ
569}
570
05476c4d
OZ
571int
572sockaddr_read(sockaddr *sa, int af, ip_addr *a, struct iface **ifa, uint *port)
48e5f32d 573{
05476c4d
OZ
574 if (sa->sa.sa_family != af)
575 goto fail;
48e5f32d 576
05476c4d 577 if (af == AF_INET)
3e236955 578 sockaddr_read4((struct sockaddr_in *) sa, a, port);
05476c4d
OZ
579 else if (af == AF_INET6)
580 sockaddr_read6((struct sockaddr_in6 *) sa, a, ifa, port);
581 else
582 goto fail;
48e5f32d 583
05476c4d 584 return 0;
48e5f32d 585
05476c4d
OZ
586 fail:
587 *a = IPA_NONE;
588 *port = 0;
589 return -1;
48e5f32d
OZ
590}
591
48e5f32d 592
05476c4d
OZ
593/*
594 * IPv6 multicast syscalls
595 */
4da25acb 596
05476c4d 597/* Fortunately standardized in RFC 3493 */
b5d9ee5c 598
05476c4d
OZ
/*
 * Initializer for struct ipv6_mreq: multicast group @maddr on interface @ifa.
 * The arguments are parenthesized so that any expression can be passed safely.
 */
#define INIT_MREQ6(maddr,ifa) \
  { .ipv6mr_multiaddr = ipa_to_in6((maddr)), .ipv6mr_interface = (ifa)->index }
b5d9ee5c 601
05476c4d
OZ
602static inline int
603sk_setup_multicast6(sock *s)
b5d9ee5c 604{
05476c4d
OZ
605 int index = s->iface->index;
606 int ttl = s->ttl;
607 int n = 0;
b5d9ee5c 608
05476c4d
OZ
609 if (setsockopt(s->fd, SOL_IPV6, IPV6_MULTICAST_IF, &index, sizeof(index)) < 0)
610 ERR("IPV6_MULTICAST_IF");
b5d9ee5c 611
05476c4d
OZ
612 if (setsockopt(s->fd, SOL_IPV6, IPV6_MULTICAST_HOPS, &ttl, sizeof(ttl)) < 0)
613 ERR("IPV6_MULTICAST_HOPS");
4f22c981 614
05476c4d
OZ
615 if (setsockopt(s->fd, SOL_IPV6, IPV6_MULTICAST_LOOP, &n, sizeof(n)) < 0)
616 ERR("IPV6_MULTICAST_LOOP");
4f22c981 617
05476c4d 618 return 0;
061ab802
OZ
619}
620
05476c4d
OZ
621static inline int
622sk_join_group6(sock *s, ip_addr maddr)
4f22c981 623{
05476c4d 624 struct ipv6_mreq mr = INIT_MREQ6(maddr, s->iface);
eb1451a3 625
05476c4d
OZ
626 if (setsockopt(s->fd, SOL_IPV6, IPV6_JOIN_GROUP, &mr, sizeof(mr)) < 0)
627 ERR("IPV6_JOIN_GROUP");
4f22c981 628
05476c4d 629 return 0;
b5d9ee5c
MM
630}
631
05476c4d
OZ
632static inline int
633sk_leave_group6(sock *s, ip_addr maddr)
b5d9ee5c 634{
05476c4d 635 struct ipv6_mreq mr = INIT_MREQ6(maddr, s->iface);
b5d9ee5c 636
05476c4d
OZ
637 if (setsockopt(s->fd, SOL_IPV6, IPV6_LEAVE_GROUP, &mr, sizeof(mr)) < 0)
638 ERR("IPV6_LEAVE_GROUP");
639
640 return 0;
641}
4f22c981 642
bed41728 643
05476c4d
OZ
644/*
645 * IPv6 packet control messages
646 */
bed41728 647
05476c4d 648/* Also standardized, in RFC 3542 */
bed41728 649
dcc60494
OZ
/*
 * RFC 2292 uses IPV6_PKTINFO for both the socket option and the cmsg
 * type, while RFC 3542 renamed the socket option to IPV6_RECVPKTINFO.
 * When IPV6_RECVPKTINFO is missing, we suppose the OS implements the
 * older RFC and fall back to IPV6_PKTINFO.
 */
#ifndef IPV6_RECVPKTINFO
#define IPV6_RECVPKTINFO IPV6_PKTINFO
#endif

/*
 * The same applies to IPV6_HOPLIMIT -> IPV6_RECVHOPLIMIT.
 */
#ifndef IPV6_RECVHOPLIMIT
#define IPV6_RECVHOPLIMIT IPV6_HOPLIMIT
#endif


/* Control buffer space needed for one pktinfo / one hop limit message */
#define CMSG6_SPACE_PKTINFO CMSG_SPACE(sizeof(struct in6_pktinfo))
#define CMSG6_SPACE_TTL CMSG_SPACE(sizeof(int))
bed41728 669
05476c4d
OZ
670static inline int
671sk_request_cmsg6_pktinfo(sock *s)
672{
673 int y = 1;
70e212f9 674
05476c4d
OZ
675 if (setsockopt(s->fd, SOL_IPV6, IPV6_RECVPKTINFO, &y, sizeof(y)) < 0)
676 ERR("IPV6_RECVPKTINFO");
677
678 return 0;
bed41728
OZ
679}
680
05476c4d
OZ
681static inline int
682sk_request_cmsg6_ttl(sock *s)
bed41728 683{
05476c4d 684 int y = 1;
bed41728 685
05476c4d
OZ
686 if (setsockopt(s->fd, SOL_IPV6, IPV6_RECVHOPLIMIT, &y, sizeof(y)) < 0)
687 ERR("IPV6_RECVHOPLIMIT");
70e212f9 688
05476c4d
OZ
689 return 0;
690}
70e212f9 691
05476c4d
OZ
692static inline void
693sk_process_cmsg6_pktinfo(sock *s, struct cmsghdr *cm)
694{
695 if (cm->cmsg_type == IPV6_PKTINFO)
70e212f9 696 {
05476c4d
OZ
697 struct in6_pktinfo *pi = (struct in6_pktinfo *) CMSG_DATA(cm);
698 s->laddr = ipa_from_in6(pi->ipi6_addr);
699 s->lifindex = pi->ipi6_ifindex;
70e212f9 700 }
05476c4d 701}
70e212f9 702
05476c4d
OZ
703static inline void
704sk_process_cmsg6_ttl(sock *s, struct cmsghdr *cm)
705{
706 if (cm->cmsg_type == IPV6_HOPLIMIT)
707 s->rcv_ttl = * (int *) CMSG_DATA(cm);
bed41728
OZ
708}
709
05476c4d
OZ
710static inline void
711sk_prepare_cmsgs6(sock *s, struct msghdr *msg, void *cbuf, size_t cbuflen)
bed41728
OZ
712{
713 struct cmsghdr *cm;
714 struct in6_pktinfo *pi;
8945f73d 715 int controllen = 0;
bed41728 716
bed41728
OZ
717 msg->msg_control = cbuf;
718 msg->msg_controllen = cbuflen;
719
720 cm = CMSG_FIRSTHDR(msg);
48e5f32d 721 cm->cmsg_level = SOL_IPV6;
bed41728
OZ
722 cm->cmsg_type = IPV6_PKTINFO;
723 cm->cmsg_len = CMSG_LEN(sizeof(*pi));
8945f73d 724 controllen += CMSG_SPACE(sizeof(*pi));
bed41728
OZ
725
726 pi = (struct in6_pktinfo *) CMSG_DATA(cm);
bed41728 727 pi->ipi6_ifindex = s->iface ? s->iface->index : 0;
05476c4d 728 pi->ipi6_addr = ipa_to_in6(s->saddr);
bed41728 729
8945f73d 730 msg->msg_controllen = controllen;
bed41728 731}
48e5f32d 732
bed41728 733
05476c4d
OZ
734/*
735 * Miscellaneous socket syscalls
736 */
737
738static inline int
739sk_set_ttl4(sock *s, int ttl)
a39b165e 740{
05476c4d
OZ
741 if (setsockopt(s->fd, SOL_IP, IP_TTL, &ttl, sizeof(ttl)) < 0)
742 ERR("IP_TTL");
743
744 return 0;
a39b165e
OZ
745}
746
05476c4d
OZ
747static inline int
748sk_set_ttl6(sock *s, int ttl)
749{
750 if (setsockopt(s->fd, SOL_IPV6, IPV6_UNICAST_HOPS, &ttl, sizeof(ttl)) < 0)
751 ERR("IPV6_UNICAST_HOPS");
38a608c5 752
05476c4d
OZ
753 return 0;
754}
755
756static inline int
757sk_set_tos4(sock *s, int tos)
b5d9ee5c 758{
05476c4d
OZ
759 if (setsockopt(s->fd, SOL_IP, IP_TOS, &tos, sizeof(tos)) < 0)
760 ERR("IP_TOS");
b5d9ee5c 761
05476c4d
OZ
762 return 0;
763}
ef4a50be 764
05476c4d
OZ
765static inline int
766sk_set_tos6(sock *s, int tos)
767{
768 if (setsockopt(s->fd, SOL_IPV6, IPV6_TCLASS, &tos, sizeof(tos)) < 0)
769 ERR("IPV6_TCLASS");
48e5f32d 770
05476c4d
OZ
771 return 0;
772}
48e5f32d 773
b867a87c 774static inline int
3e236955 775sk_set_high_port(sock *s UNUSED)
b867a87c
OZ
776{
777 /* Port range setting is optional, ignore it if not supported */
778
779#ifdef IP_PORTRANGE
780 if (sk_is_ipv4(s))
781 {
782 int range = IP_PORTRANGE_HIGH;
783 if (setsockopt(s->fd, SOL_IP, IP_PORTRANGE, &range, sizeof(range)) < 0)
784 ERR("IP_PORTRANGE");
785 }
786#endif
787
788#ifdef IPV6_PORTRANGE
789 if (sk_is_ipv6(s))
790 {
791 int range = IPV6_PORTRANGE_HIGH;
792 if (setsockopt(s->fd, SOL_IPV6, IPV6_PORTRANGE, &range, sizeof(range)) < 0)
793 ERR("IPV6_PORTRANGE");
794 }
795#endif
796
797 return 0;
798}
799
88a183c6
OZ
800static inline byte *
801sk_skip_ip_header(byte *pkt, int *len)
802{
803 if ((*len < 20) || ((*pkt & 0xf0) != 0x40))
804 return NULL;
805
806 int hlen = (*pkt & 0x0f) * 4;
807 if ((hlen < 20) || (hlen > *len))
808 return NULL;
809
810 *len -= hlen;
811 return pkt + hlen;
812}
813
814byte *
815sk_rx_buffer(sock *s, int *len)
816{
817 if (sk_is_ipv4(s) && (s->type == SK_IP))
818 return sk_skip_ip_header(s->rbuf, len);
819 else
820 return s->rbuf;
821}
822
48e5f32d 823
05476c4d
OZ
824/*
825 * Public socket functions
826 */
48e5f32d 827
05476c4d
OZ
828/**
829 * sk_setup_multicast - enable multicast for given socket
830 * @s: socket
831 *
832 * Prepare transmission of multicast packets for given datagram socket.
833 * The socket must have defined @iface.
834 *
835 * Result: 0 for success, -1 for an error.
836 */
48e5f32d 837
05476c4d
OZ
838int
839sk_setup_multicast(sock *s)
840{
841 ASSERT(s->iface);
48e5f32d 842
05476c4d
OZ
843 if (sk_is_ipv4(s))
844 return sk_setup_multicast4(s);
845 else
846 return sk_setup_multicast6(s);
847}
48e5f32d 848
05476c4d
OZ
849/**
850 * sk_join_group - join multicast group for given socket
851 * @s: socket
852 * @maddr: multicast address
853 *
854 * Join multicast group for given datagram socket and associated interface.
855 * The socket must have defined @iface.
856 *
857 * Result: 0 for success, -1 for an error.
858 */
789772ed 859
05476c4d
OZ
860int
861sk_join_group(sock *s, ip_addr maddr)
862{
863 if (sk_is_ipv4(s))
864 return sk_join_group4(s, maddr);
865 else
866 return sk_join_group6(s, maddr);
867}
ef4a50be 868
05476c4d
OZ
869/**
870 * sk_leave_group - leave multicast group for given socket
871 * @s: socket
872 * @maddr: multicast address
873 *
874 * Leave multicast group for given datagram socket and associated interface.
875 * The socket must have defined @iface.
876 *
877 * Result: 0 for success, -1 for an error.
878 */
789772ed 879
05476c4d
OZ
880int
881sk_leave_group(sock *s, ip_addr maddr)
882{
883 if (sk_is_ipv4(s))
884 return sk_leave_group4(s, maddr);
885 else
886 return sk_leave_group6(s, maddr);
b5d9ee5c
MM
887}
888
a39b165e 889/**
05476c4d
OZ
890 * sk_setup_broadcast - enable broadcast for given socket
891 * @s: socket
892 *
893 * Allow reception and transmission of broadcast packets for given datagram
894 * socket. The socket must have defined @iface. For transmission, packets should
895 * be send to @brd address of @iface.
896 *
897 * Result: 0 for success, -1 for an error.
898 */
899
900int
901sk_setup_broadcast(sock *s)
902{
903 int y = 1;
904
905 if (setsockopt(s->fd, SOL_SOCKET, SO_BROADCAST, &y, sizeof(y)) < 0)
906 ERR("SO_BROADCAST");
907
908 return 0;
909}
910
911/**
912 * sk_set_ttl - set transmit TTL for given socket
a39b165e
OZ
913 * @s: socket
914 * @ttl: TTL value
915 *
05476c4d
OZ
916 * Set TTL for already opened connections when TTL was not set before. Useful
917 * for accepted connections when different ones should have different TTL.
a39b165e
OZ
918 *
919 * Result: 0 for success, -1 for an error.
920 */
921
922int
923sk_set_ttl(sock *s, int ttl)
924{
a39b165e 925 s->ttl = ttl;
a39b165e 926
05476c4d
OZ
927 if (sk_is_ipv4(s))
928 return sk_set_ttl4(s, ttl);
929 else
930 return sk_set_ttl6(s, ttl);
a39b165e
OZ
931}
932
b1b19433 933/**
05476c4d 934 * sk_set_min_ttl - set minimal accepted TTL for given socket
b1b19433
OZ
935 * @s: socket
936 * @ttl: TTL value
937 *
05476c4d
OZ
938 * Set minimal accepted TTL for given socket. Can be used for TTL security.
939 * implementations.
b1b19433
OZ
940 *
941 * Result: 0 for success, -1 for an error.
942 */
943
944int
945sk_set_min_ttl(sock *s, int ttl)
946{
05476c4d
OZ
947 if (sk_is_ipv4(s))
948 return sk_set_min_ttl4(s, ttl);
949 else
950 return sk_set_min_ttl6(s, ttl);
b1b19433 951}
d51aa281 952
#if 0
/**
 * sk_set_md5_auth - add / remove MD5 security association for given socket
 * @s: socket
 * @local: IP address of local side
 * @remote: IP address of remote side
 * @ifa: Interface for link-local IP address
 * @passwd: Password used for MD5 authentication
 * @setkey: Update also system SA/SP database
 *
 * The TCP MD5 handling code in the kernel keeps a set of security
 * associations used for choosing the password and other authentication
 * parameters according to the local and remote address. This function is
 * useful for listening sockets; for active sockets it may be enough to set
 * the s->password field.
 *
 * When called with passwd != NULL, the new pair is added.
 * When called with passwd == NULL, the existing pair is removed.
 *
 * Note that while in Linux the MD5 SAs are specific to a socket, in BSD they
 * are stored in a global SA/SP database (but the behavior also must be
 * enabled on a per-socket basis). In case of multiple sockets to the same
 * neighbor, the socket-specific state must be configured for each socket
 * while the global state just once per src-dst pair. The @setkey argument
 * controls whether the global state (SA/SP database) is also updated.
 *
 * Result: 0 for success, -1 for an error.
 */

int
sk_set_md5_auth(sock *s, ip_addr local, ip_addr remote, struct iface *ifa, char *passwd, int setkey)
{ DUMMY; }
#endif
f9c799a0 985
05476c4d
OZ
986/**
987 * sk_set_ipv6_checksum - specify IPv6 checksum offset for given socket
988 * @s: socket
989 * @offset: offset
990 *
991 * Specify IPv6 checksum field offset for given raw IPv6 socket. After that, the
992 * kernel will automatically fill it for outgoing packets and check it for
993 * incoming packets. Should not be used on ICMPv6 sockets, where the position is
994 * known to the kernel.
995 *
996 * Result: 0 for success, -1 for an error.
997 */
f9c799a0 998
4ac7c834
OZ
999int
1000sk_set_ipv6_checksum(sock *s, int offset)
1001{
48e5f32d 1002 if (setsockopt(s->fd, SOL_IPV6, IPV6_CHECKSUM, &offset, sizeof(offset)) < 0)
05476c4d 1003 ERR("IPV6_CHECKSUM");
4ac7c834
OZ
1004
1005 return 0;
1006}
1007
93e868c7 1008int
05476c4d 1009sk_set_icmp6_filter(sock *s, int p1, int p2)
93e868c7
OZ
1010{
1011 /* a bit of lame interface, but it is here only for Radv */
1012 struct icmp6_filter f;
1013
1014 ICMP6_FILTER_SETBLOCKALL(&f);
1015 ICMP6_FILTER_SETPASS(p1, &f);
1016 ICMP6_FILTER_SETPASS(p2, &f);
1017
48e5f32d 1018 if (setsockopt(s->fd, SOL_ICMPV6, ICMP6_FILTER, &f, sizeof(f)) < 0)
05476c4d 1019 ERR("ICMP6_FILTER");
93e868c7
OZ
1020
1021 return 0;
1022}
1023
05476c4d
OZ
1024void
1025sk_log_error(sock *s, const char *p)
1026{
1027 log(L_ERR "%s: Socket error: %s%#m", p, s->err);
1028}
1029
1030
1031/*
1032 * Actual struct birdsock code
1033 */
1034
1035static list sock_list;
1036static struct birdsock *current_sock;
1037static struct birdsock *stored_sock;
05476c4d
OZ
1038
1039static inline sock *
1040sk_next(sock *s)
1041{
1042 if (!s->n.next->next)
1043 return NULL;
1044 else
1045 return SKIP_BACK(sock, n, s->n.next);
1046}
1047
1048static void
1049sk_alloc_bufs(sock *s)
1050{
1051 if (!s->rbuf && s->rbsize)
1052 s->rbuf = s->rbuf_alloc = xmalloc(s->rbsize);
1053 s->rpos = s->rbuf;
1054 if (!s->tbuf && s->tbsize)
1055 s->tbuf = s->tbuf_alloc = xmalloc(s->tbsize);
1056 s->tpos = s->ttx = s->tbuf;
1057}
1058
1059static void
1060sk_free_bufs(sock *s)
1061{
1062 if (s->rbuf_alloc)
1063 {
1064 xfree(s->rbuf_alloc);
1065 s->rbuf = s->rbuf_alloc = NULL;
1066 }
1067 if (s->tbuf_alloc)
1068 {
1069 xfree(s->tbuf_alloc);
1070 s->tbuf = s->tbuf_alloc = NULL;
1071 }
1072}
1073
1074static void
1075sk_free(resource *r)
1076{
1077 sock *s = (sock *) r;
1078
1079 sk_free_bufs(s);
1080 if (s->fd >= 0)
1081 {
1082 close(s->fd);
1083
1084 /* FIXME: we should call sk_stop() for SKF_THREAD sockets */
1085 if (s->flags & SKF_THREAD)
1086 return;
1087
1088 if (s == current_sock)
1089 current_sock = sk_next(s);
1090 if (s == stored_sock)
1091 stored_sock = sk_next(s);
1092 rem_node(&s->n);
05476c4d
OZ
1093 }
1094}
1095
1096void
1097sk_set_rbsize(sock *s, uint val)
1098{
1099 ASSERT(s->rbuf_alloc == s->rbuf);
1100
1101 if (s->rbsize == val)
1102 return;
1103
1104 s->rbsize = val;
1105 xfree(s->rbuf_alloc);
1106 s->rbuf_alloc = xmalloc(val);
1107 s->rpos = s->rbuf = s->rbuf_alloc;
1108}
1109
1110void
1111sk_set_tbsize(sock *s, uint val)
1112{
1113 ASSERT(s->tbuf_alloc == s->tbuf);
1114
1115 if (s->tbsize == val)
1116 return;
1117
1118 byte *old_tbuf = s->tbuf;
1119
1120 s->tbsize = val;
1121 s->tbuf = s->tbuf_alloc = xrealloc(s->tbuf_alloc, val);
1122 s->tpos = s->tbuf + (s->tpos - old_tbuf);
1123 s->ttx = s->tbuf + (s->ttx - old_tbuf);
1124}
1125
1126void
1127sk_set_tbuf(sock *s, void *tbuf)
1128{
1129 s->tbuf = tbuf ?: s->tbuf_alloc;
1130 s->ttx = s->tpos = s->tbuf;
1131}
1132
1133void
1134sk_reallocate(sock *s)
1135{
1136 sk_free_bufs(s);
1137 sk_alloc_bufs(s);
1138}
1139
1140static void
1141sk_dump(resource *r)
1142{
1143 sock *s = (sock *) r;
1144 static char *sk_type_names[] = { "TCP<", "TCP>", "TCP", "UDP", NULL, "IP", NULL, "MAGIC", "UNIX<", "UNIX", "DEL!" };
1145
af454f9b 1146 debug("(%s, ud=%p, sa=%I, sp=%d, da=%I, dp=%d, tos=%d, ttl=%d, if=%s)\n",
05476c4d
OZ
1147 sk_type_names[s->type],
1148 s->data,
1149 s->saddr,
1150 s->sport,
1151 s->daddr,
1152 s->dport,
1153 s->tos,
1154 s->ttl,
1155 s->iface ? s->iface->name : "none");
1156}
1157
1158static struct resclass sk_class = {
1159 "Socket",
1160 sizeof(sock),
1161 sk_free,
1162 sk_dump,
1163 NULL,
1164 NULL
1165};
1166
1167/**
1168 * sk_new - create a socket
1169 * @p: pool
1170 *
1171 * This function creates a new socket resource. If you want to use it,
1172 * you need to fill in all the required fields of the structure and
1173 * call sk_open() to do the actual opening of the socket.
1174 *
1175 * The real function name is sock_new(), sk_new() is a macro wrapper
1176 * to avoid collision with OpenSSL.
1177 */
1178sock *
1179sock_new(pool *p)
1180{
1181 sock *s = ralloc(p, &sk_class);
1182 s->pool = p;
1183 // s->saddr = s->daddr = IPA_NONE;
1184 s->tos = s->priority = s->ttl = -1;
1185 s->fd = -1;
1186 return s;
1187}
1188
1189static int
1190sk_setup(sock *s)
f9c799a0 1191{
05476c4d
OZ
1192 int y = 1;
1193 int fd = s->fd;
f9c799a0 1194
05476c4d
OZ
1195 if (fcntl(fd, F_SETFL, O_NONBLOCK) < 0)
1196 ERR("O_NONBLOCK");
f9c799a0 1197
05476c4d
OZ
1198 if (!s->af)
1199 return 0;
f9c799a0 1200
05476c4d
OZ
1201 if (ipa_nonzero(s->saddr) && !(s->flags & SKF_BIND))
1202 s->flags |= SKF_PKTINFO;
f9c799a0 1203
05476c4d
OZ
1204#ifdef CONFIG_USE_HDRINCL
1205 if (sk_is_ipv4(s) && (s->type == SK_IP) && (s->flags & SKF_PKTINFO))
1206 {
1207 s->flags &= ~SKF_PKTINFO;
1208 s->flags |= SKF_HDRINCL;
1209 if (setsockopt(fd, SOL_IP, IP_HDRINCL, &y, sizeof(y)) < 0)
1210 ERR("IP_HDRINCL");
1211 }
48e5f32d
OZ
1212#endif
1213
05476c4d
OZ
1214 if (s->iface)
1215 {
1216#ifdef SO_BINDTODEVICE
f7a99acb 1217 struct ifreq ifr = {};
05476c4d
OZ
1218 strcpy(ifr.ifr_name, s->iface->name);
1219 if (setsockopt(s->fd, SOL_SOCKET, SO_BINDTODEVICE, &ifr, sizeof(ifr)) < 0)
1220 ERR("SO_BINDTODEVICE");
1221#endif
f1aceff5 1222
05476c4d
OZ
1223#ifdef CONFIG_UNIX_DONTROUTE
1224 if (setsockopt(s->fd, SOL_SOCKET, SO_DONTROUTE, &y, sizeof(y)) < 0)
1225 ERR("SO_DONTROUTE");
1226#endif
1227 }
f9c799a0 1228
05476c4d
OZ
1229 if (s->priority >= 0)
1230 if (sk_set_priority(s, s->priority) < 0)
f9c799a0 1231 return -1;
f9c799a0 1232
05476c4d
OZ
1233 if (sk_is_ipv4(s))
1234 {
1235 if (s->flags & SKF_LADDR_RX)
1236 if (sk_request_cmsg4_pktinfo(s) < 0)
1237 return -1;
f9c799a0 1238
05476c4d
OZ
1239 if (s->flags & SKF_TTL_RX)
1240 if (sk_request_cmsg4_ttl(s) < 0)
1241 return -1;
f9c799a0 1242
05476c4d
OZ
1243 if ((s->type == SK_UDP) || (s->type == SK_IP))
1244 if (sk_disable_mtu_disc4(s) < 0)
1245 return -1;
f9c799a0 1246
05476c4d
OZ
1247 if (s->ttl >= 0)
1248 if (sk_set_ttl4(s, s->ttl) < 0)
1249 return -1;
f9c799a0 1250
05476c4d
OZ
1251 if (s->tos >= 0)
1252 if (sk_set_tos4(s, s->tos) < 0)
1253 return -1;
1254 }
f9c799a0 1255
05476c4d
OZ
1256 if (sk_is_ipv6(s))
1257 {
1258 if (s->flags & SKF_V6ONLY)
1259 if (setsockopt(fd, SOL_IPV6, IPV6_V6ONLY, &y, sizeof(y)) < 0)
1260 ERR("IPV6_V6ONLY");
f9c799a0 1261
05476c4d
OZ
1262 if (s->flags & SKF_LADDR_RX)
1263 if (sk_request_cmsg6_pktinfo(s) < 0)
1264 return -1;
f9c799a0 1265
05476c4d
OZ
1266 if (s->flags & SKF_TTL_RX)
1267 if (sk_request_cmsg6_ttl(s) < 0)
1268 return -1;
f9c799a0 1269
05476c4d
OZ
1270 if ((s->type == SK_UDP) || (s->type == SK_IP))
1271 if (sk_disable_mtu_disc6(s) < 0)
1272 return -1;
f9c799a0 1273
05476c4d
OZ
1274 if (s->ttl >= 0)
1275 if (sk_set_ttl6(s, s->ttl) < 0)
1276 return -1;
f9c799a0 1277
05476c4d
OZ
1278 if (s->tos >= 0)
1279 if (sk_set_tos6(s, s->tos) < 0)
1280 return -1;
1281 }
f9c799a0
OZ
1282
1283 return 0;
1284}
1285
05476c4d
OZ
1286static void
1287sk_insert(sock *s)
f9c799a0 1288{
05476c4d 1289 add_tail(&sock_list, &s->n);
f9c799a0
OZ
1290}
1291
b93abffa 1292static void
b5d9ee5c
MM
1293sk_tcp_connected(sock *s)
1294{
05476c4d
OZ
1295 sockaddr sa;
1296 int sa_len = sizeof(sa);
1297
1298 if ((getsockname(s->fd, &sa.sa, &sa_len) < 0) ||
1299 (sockaddr_read(&sa, s->af, &s->saddr, &s->iface, &s->sport) < 0))
1300 log(L_WARN "SOCK: Cannot get local IP address for TCP>");
9be9a264 1301
b5d9ee5c
MM
1302 s->type = SK_TCP;
1303 sk_alloc_bufs(s);
320f4173 1304 s->tx_hook(s);
b5d9ee5c
MM
1305}
1306
b93abffa 1307static int
05476c4d 1308sk_passive_connected(sock *s, int type)
b93abffa 1309{
05476c4d
OZ
1310 sockaddr loc_sa, rem_sa;
1311 int loc_sa_len = sizeof(loc_sa);
1312 int rem_sa_len = sizeof(rem_sa);
cf31112f 1313
05476c4d
OZ
1314 int fd = accept(s->fd, ((type == SK_TCP) ? &rem_sa.sa : NULL), &rem_sa_len);
1315 if (fd < 0)
1316 {
1317 if ((errno != EINTR) && (errno != EAGAIN))
c025b852 1318 s->err_hook(s, errno);
05476c4d
OZ
1319 return 0;
1320 }
1321
1322 sock *t = sk_new(s->pool);
1323 t->type = type;
1324 t->fd = fd;
1325 t->af = s->af;
1326 t->ttl = s->ttl;
1327 t->tos = s->tos;
1328 t->rbsize = s->rbsize;
1329 t->tbsize = s->tbsize;
1330
1331 if (type == SK_TCP)
1332 {
1333 if ((getsockname(fd, &loc_sa.sa, &loc_sa_len) < 0) ||
1334 (sockaddr_read(&loc_sa, s->af, &t->saddr, &t->iface, &t->sport) < 0))
1335 log(L_WARN "SOCK: Cannot get local IP address for TCP<");
1336
1337 if (sockaddr_read(&rem_sa, s->af, &t->daddr, &t->iface, &t->dport) < 0)
1338 log(L_WARN "SOCK: Cannot get remote IP address for TCP<");
1339 }
1340
1341 if (sk_setup(t) < 0)
1342 {
1343 /* FIXME: Call err_hook instead ? */
1344 log(L_ERR "SOCK: Incoming connection: %s%#m", t->err);
1345
1346 /* FIXME: handle it better in rfree() */
9c89560e 1347 close(t->fd);
05476c4d
OZ
1348 t->fd = -1;
1349 rfree(t);
1350 return 1;
1351 }
1352
1353 sk_insert(t);
1354 sk_alloc_bufs(t);
1355 s->rx_hook(t, 0);
1356 return 1;
b93abffa
MM
1357}
1358
525fa2c1
MM
1359/**
1360 * sk_open - open a socket
1361 * @s: socket
1362 *
1363 * This function takes a socket resource created by sk_new() and
1364 * initialized by the user and binds a corresponding network connection
1365 * to it.
1366 *
1367 * Result: 0 for success, -1 for an error.
1368 */
b5d9ee5c
MM
1369int
1370sk_open(sock *s)
1371{
05476c4d
OZ
1372 int af = BIRD_AF;
1373 int fd = -1;
48e5f32d
OZ
1374 int do_bind = 0;
1375 int bind_port = 0;
1376 ip_addr bind_addr = IPA_NONE;
1377 sockaddr sa;
b5d9ee5c 1378
48e5f32d 1379 switch (s->type)
05476c4d
OZ
1380 {
1381 case SK_TCP_ACTIVE:
1382 s->ttx = ""; /* Force s->ttx != s->tpos */
1383 /* Fall thru */
1384 case SK_TCP_PASSIVE:
1385 fd = socket(af, SOCK_STREAM, IPPROTO_TCP);
1386 bind_port = s->sport;
1387 bind_addr = s->saddr;
1388 do_bind = bind_port || ipa_nonzero(bind_addr);
1389 break;
9c89560e 1390
05476c4d
OZ
1391 case SK_UDP:
1392 fd = socket(af, SOCK_DGRAM, IPPROTO_UDP);
1393 bind_port = s->sport;
1394 bind_addr = (s->flags & SKF_BIND) ? s->saddr : IPA_NONE;
1395 do_bind = 1;
1396 break;
1397
1398 case SK_IP:
1399 fd = socket(af, SOCK_RAW, s->dport);
1400 bind_port = 0;
1401 bind_addr = (s->flags & SKF_BIND) ? s->saddr : IPA_NONE;
1402 do_bind = ipa_nonzero(bind_addr);
1403 break;
1404
1405 case SK_MAGIC:
1406 af = 0;
1407 fd = s->fd;
1408 break;
1409
1410 default:
1411 bug("sk_open() called for invalid sock type %d", s->type);
1412 }
1413
b5d9ee5c 1414 if (fd < 0)
05476c4d
OZ
1415 ERR("socket");
1416
1417 s->af = af;
b5d9ee5c
MM
1418 s->fd = fd;
1419
05476c4d
OZ
1420 if (sk_setup(s) < 0)
1421 goto err;
38a608c5 1422
48e5f32d 1423 if (do_bind)
05476c4d
OZ
1424 {
1425 if (bind_port)
b5d9ee5c 1426 {
05476c4d
OZ
1427 int y = 1;
1428
1429 if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &y, sizeof(y)) < 0)
1430 ERR2("SO_REUSEADDR");
48e5f32d 1431
8931425d 1432#ifdef CONFIG_NO_IFACE_BIND
05476c4d
OZ
1433 /* Workaround missing ability to bind to an iface */
1434 if ((s->type == SK_UDP) && s->iface && ipa_zero(bind_addr))
1435 {
1436 if (setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &y, sizeof(y)) < 0)
1437 ERR2("SO_REUSEPORT");
1438 }
8931425d 1439#endif
b5d9ee5c 1440 }
b867a87c
OZ
1441 else
1442 if (s->flags & SKF_HIGH_PORT)
1443 if (sk_set_high_port(s) < 0)
1444 log(L_WARN "Socket error: %s%#m", s->err);
48e5f32d 1445
05476c4d
OZ
1446 sockaddr_fill(&sa, af, bind_addr, s->iface, bind_port);
1447 if (bind(fd, &sa.sa, SA_LEN(sa)) < 0)
1448 ERR2("bind");
1449 }
d51aa281
OZ
1450
1451 if (s->password)
a7baa098 1452 if (sk_set_md5_auth(s, s->saddr, s->daddr, s->iface, s->password, 0) < 0)
05476c4d 1453 goto err;
d51aa281 1454
48e5f32d 1455 switch (s->type)
05476c4d
OZ
1456 {
1457 case SK_TCP_ACTIVE:
1458 sockaddr_fill(&sa, af, s->daddr, s->iface, s->dport);
1459 if (connect(fd, &sa.sa, SA_LEN(sa)) >= 0)
1460 sk_tcp_connected(s);
1461 else if (errno != EINTR && errno != EAGAIN && errno != EINPROGRESS &&
1462 errno != ECONNREFUSED && errno != EHOSTUNREACH && errno != ENETUNREACH)
1463 ERR2("connect");
1464 break;
1465
1466 case SK_TCP_PASSIVE:
1467 if (listen(fd, 8) < 0)
1468 ERR2("listen");
1469 break;
1470
1471 case SK_MAGIC:
1472 break;
1473
1474 default:
1475 sk_alloc_bufs(s);
1476 }
b5d9ee5c 1477
bf139664
OZ
1478 if (!(s->flags & SKF_THREAD))
1479 sk_insert(s);
b5d9ee5c
MM
1480 return 0;
1481
05476c4d 1482err:
b5d9ee5c
MM
1483 close(fd);
1484 s->fd = -1;
1485 return -1;
1486}
1487
05476c4d 1488int
b93abffa
MM
1489sk_open_unix(sock *s, char *name)
1490{
b93abffa 1491 struct sockaddr_un sa;
05476c4d
OZ
1492 int fd;
1493
1494 /* We are sloppy during error (leak fd and not set s->err), but we die anyway */
b93abffa
MM
1495
1496 fd = socket(AF_UNIX, SOCK_STREAM, 0);
1497 if (fd < 0)
05476c4d
OZ
1498 return -1;
1499
1500 if (fcntl(fd, F_SETFL, O_NONBLOCK) < 0)
1501 return -1;
68fa95cf 1502
97e46d28 1503 /* Path length checked in test_old_bird() */
b93abffa 1504 sa.sun_family = AF_UNIX;
97c6fa02 1505 strcpy(sa.sun_path, name);
05476c4d 1506
0b3bf4b1 1507 if (bind(fd, (struct sockaddr *) &sa, SUN_LEN(&sa)) < 0)
05476c4d
OZ
1508 return -1;
1509
1510 if (listen(fd, 8) < 0)
1511 return -1;
1512
1513 s->fd = fd;
38a608c5 1514 sk_insert(s);
05476c4d
OZ
1515 return 0;
1516}
1517
1518
/*
 * Sizes of the on-stack control message buffers: big enough for either
 * address family. The RX buffer (sk_recvmsg) holds packet info and TTL,
 * the TX buffer (sk_sendmsg) holds packet info only.
 */
#define CMSG_RX_SPACE MAX(CMSG4_SPACE_PKTINFO+CMSG4_SPACE_TTL, \
			  CMSG6_SPACE_PKTINFO+CMSG6_SPACE_TTL)
#define CMSG_TX_SPACE MAX(CMSG4_SPACE_PKTINFO,CMSG6_SPACE_PKTINFO)
1522
1523static void
1524sk_prepare_cmsgs(sock *s, struct msghdr *msg, void *cbuf, size_t cbuflen)
1525{
1526 if (sk_is_ipv4(s))
1527 sk_prepare_cmsgs4(s, msg, cbuf, cbuflen);
1528 else
1529 sk_prepare_cmsgs6(s, msg, cbuf, cbuflen);
1530}
1531
1532static void
1533sk_process_cmsgs(sock *s, struct msghdr *msg)
1534{
1535 struct cmsghdr *cm;
1536
1537 s->laddr = IPA_NONE;
1538 s->lifindex = 0;
1539 s->rcv_ttl = -1;
1540
1541 for (cm = CMSG_FIRSTHDR(msg); cm != NULL; cm = CMSG_NXTHDR(msg, cm))
1542 {
1543 if ((cm->cmsg_level == SOL_IP) && sk_is_ipv4(s))
1544 {
1545 sk_process_cmsg4_pktinfo(s, cm);
1546 sk_process_cmsg4_ttl(s, cm);
1547 }
b93abffa 1548
05476c4d
OZ
1549 if ((cm->cmsg_level == SOL_IPV6) && sk_is_ipv6(s))
1550 {
1551 sk_process_cmsg6_pktinfo(s, cm);
1552 sk_process_cmsg6_ttl(s, cm);
1553 }
1554 }
b93abffa
MM
1555}
1556
48e5f32d
OZ
1557
1558static inline int
1559sk_sendmsg(sock *s)
1560{
1561 struct iovec iov = {s->tbuf, s->tpos - s->tbuf};
1562 byte cmsg_buf[CMSG_TX_SPACE];
1563 sockaddr dst;
1564
05476c4d 1565 sockaddr_fill(&dst, s->af, s->daddr, s->iface, s->dport);
48e5f32d
OZ
1566
1567 struct msghdr msg = {
05476c4d
OZ
1568 .msg_name = &dst.sa,
1569 .msg_namelen = SA_LEN(dst),
48e5f32d
OZ
1570 .msg_iov = &iov,
1571 .msg_iovlen = 1
1572 };
1573
1574#ifdef CONFIG_USE_HDRINCL
1575 byte hdr[20];
1576 struct iovec iov2[2] = { {hdr, 20}, iov };
1577
1578 if (s->flags & SKF_HDRINCL)
1579 {
05476c4d 1580 sk_prepare_ip_header(s, hdr, iov.iov_len);
48e5f32d
OZ
1581 msg.msg_iov = iov2;
1582 msg.msg_iovlen = 2;
1583 }
1584#endif
1585
1586 if (s->flags & SKF_PKTINFO)
05476c4d 1587 sk_prepare_cmsgs(s, &msg, cmsg_buf, sizeof(cmsg_buf));
48e5f32d
OZ
1588
1589 return sendmsg(s->fd, &msg, 0);
1590}
1591
1592static inline int
1593sk_recvmsg(sock *s)
1594{
1595 struct iovec iov = {s->rbuf, s->rbsize};
1596 byte cmsg_buf[CMSG_RX_SPACE];
1597 sockaddr src;
1598
1599 struct msghdr msg = {
05476c4d
OZ
1600 .msg_name = &src.sa,
1601 .msg_namelen = sizeof(src), // XXXX ??
48e5f32d
OZ
1602 .msg_iov = &iov,
1603 .msg_iovlen = 1,
1604 .msg_control = cmsg_buf,
1605 .msg_controllen = sizeof(cmsg_buf),
1606 .msg_flags = 0
1607 };
1608
1609 int rv = recvmsg(s->fd, &msg, 0);
1610 if (rv < 0)
1611 return rv;
1612
1613 //ifdef IPV4
1614 // if (cf_type == SK_IP)
1615 // rv = ipv4_skip_header(pbuf, rv);
1616 //endif
1617
05476c4d
OZ
1618 sockaddr_read(&src, s->af, &s->faddr, NULL, &s->fport);
1619 sk_process_cmsgs(s, &msg);
48e5f32d
OZ
1620
1621 if (msg.msg_flags & MSG_TRUNC)
1622 s->flags |= SKF_TRUNCATED;
1623 else
1624 s->flags &= ~SKF_TRUNCATED;
1625
1626 return rv;
1627}
1628
1629
353729f5
OZ
1630static inline void reset_tx_buffer(sock *s) { s->ttx = s->tpos = s->tbuf; }
1631
b5d9ee5c
MM
1632static int
1633sk_maybe_write(sock *s)
1634{
1635 int e;
1636
1637 switch (s->type)
05476c4d
OZ
1638 {
1639 case SK_TCP:
1640 case SK_MAGIC:
1641 case SK_UNIX:
1642 while (s->ttx != s->tpos)
b5d9ee5c 1643 {
05476c4d
OZ
1644 e = write(s->fd, s->ttx, s->tpos - s->ttx);
1645
1646 if (e < 0)
1647 {
1648 if (errno != EINTR && errno != EAGAIN)
b5d9ee5c 1649 {
05476c4d
OZ
1650 reset_tx_buffer(s);
1651 /* EPIPE is just a connection close notification during TX */
1652 s->err_hook(s, (errno != EPIPE) ? errno : 0);
1653 return -1;
b5d9ee5c 1654 }
05476c4d
OZ
1655 return 0;
1656 }
1657 s->ttx += e;
1658 }
1659 reset_tx_buffer(s);
1660 return 1;
1661
1662 case SK_UDP:
1663 case SK_IP:
1664 {
1665 if (s->tbuf == s->tpos)
b5d9ee5c 1666 return 1;
05476c4d
OZ
1667
1668 e = sk_sendmsg(s);
1669
1670 if (e < 0)
1671 {
1672 if (errno != EINTR && errno != EAGAIN)
1673 {
1674 reset_tx_buffer(s);
1675 s->err_hook(s, errno);
1676 return -1;
1677 }
1678
1679 if (!s->tx_hook)
1680 reset_tx_buffer(s);
1681 return 0;
b5d9ee5c 1682 }
05476c4d
OZ
1683 reset_tx_buffer(s);
1684 return 1;
b5d9ee5c 1685 }
05476c4d
OZ
1686 default:
1687 bug("sk_maybe_write: unknown socket type %d", s->type);
1688 }
b5d9ee5c
MM
1689}
1690
ea89da38
OZ
1691int
1692sk_rx_ready(sock *s)
1693{
ea89da38 1694 int rv;
9c92f692
MM
1695 struct pollfd pfd = { .fd = s->fd };
1696 pfd.events |= POLLIN;
ea89da38
OZ
1697
1698 redo:
9c92f692 1699 rv = poll(&pfd, 1, 0);
9c89560e 1700
ea89da38
OZ
1701 if ((rv < 0) && (errno == EINTR || errno == EAGAIN))
1702 goto redo;
1703
1704 return rv;
1705}
1706
525fa2c1
MM
1707/**
1708 * sk_send - send data to a socket
1709 * @s: socket
1710 * @len: number of bytes to send
1711 *
1712 * This function sends @len bytes of data prepared in the
1713 * transmit buffer of the socket @s to the network connection.
1714 * If the packet can be sent immediately, it does so and returns
1715 * 1, else it queues the packet for later processing, returns 0
1716 * and calls the @tx_hook of the socket when the tranmission
1717 * takes place.
1718 */
b5d9ee5c
MM
1719int
1720sk_send(sock *s, unsigned len)
1721{
b5d9ee5c
MM
1722 s->ttx = s->tbuf;
1723 s->tpos = s->tbuf + len;
1724 return sk_maybe_write(s);
1725}
1726
525fa2c1
MM
1727/**
1728 * sk_send_to - send data to a specific destination
1729 * @s: socket
1730 * @len: number of bytes to send
1731 * @addr: IP address to send the packet to
1732 * @port: port to send the packet to
1733 *
2e9b2421 1734 * This is a sk_send() replacement for connection-less packet sockets
525fa2c1 1735 * which allows destination of the packet to be chosen dynamically.
48e5f32d 1736 * Raw IP sockets should use 0 for @port.
525fa2c1 1737 */
b5d9ee5c
MM
1738int
1739sk_send_to(sock *s, unsigned len, ip_addr addr, unsigned port)
1740{
353729f5 1741 s->daddr = addr;
48e5f32d
OZ
1742 if (port)
1743 s->dport = port;
1744
b5d9ee5c
MM
1745 s->ttx = s->tbuf;
1746 s->tpos = s->tbuf + len;
1747 return sk_maybe_write(s);
1748}
1749
353729f5
OZ
1750/*
1751int
1752sk_send_full(sock *s, unsigned len, struct iface *ifa,
1753 ip_addr saddr, ip_addr daddr, unsigned dport)
1754{
1755 s->iface = ifa;
1756 s->saddr = saddr;
1757 s->daddr = daddr;
1758 s->dport = dport;
1759 s->ttx = s->tbuf;
1760 s->tpos = s->tbuf + len;
1761 return sk_maybe_write(s);
1762}
1763*/
1764
6a8d3f1c
OZ
1765 /* sk_read() and sk_write() are called from BFD's event loop */
1766
1767int
fd926ed4 1768sk_read(sock *s, int revents)
b5d9ee5c
MM
1769{
1770 switch (s->type)
05476c4d
OZ
1771 {
1772 case SK_TCP_PASSIVE:
1773 return sk_passive_connected(s, SK_TCP);
1774
1775 case SK_UNIX_PASSIVE:
1776 return sk_passive_connected(s, SK_UNIX);
1777
1778 case SK_TCP:
1779 case SK_UNIX:
b5d9ee5c 1780 {
05476c4d
OZ
1781 int c = read(s->fd, s->rpos, s->rbuf + s->rbsize - s->rpos);
1782
1783 if (c < 0)
b93abffa 1784 {
05476c4d
OZ
1785 if (errno != EINTR && errno != EAGAIN)
1786 s->err_hook(s, errno);
fd926ed4
MM
1787 else if (errno == EAGAIN && !(revents & POLLIN))
1788 {
1789 log(L_ERR "Got EAGAIN from read when revents=%x (without POLLIN)", revents);
1790 s->err_hook(s, 0);
1791 }
b5d9ee5c 1792 }
05476c4d
OZ
1793 else if (!c)
1794 s->err_hook(s, 0);
1795 else
b5d9ee5c 1796 {
05476c4d
OZ
1797 s->rpos += c;
1798 if (s->rx_hook(s, s->rpos - s->rbuf))
1799 {
1800 /* We need to be careful since the socket could have been deleted by the hook */
1801 if (current_sock == s)
1802 s->rpos = s->rbuf;
1803 }
1804 return 1;
b5d9ee5c 1805 }
05476c4d
OZ
1806 return 0;
1807 }
353729f5 1808
05476c4d
OZ
1809 case SK_MAGIC:
1810 return s->rx_hook(s, 0);
b5d9ee5c 1811
05476c4d
OZ
1812 default:
1813 {
1814 int e = sk_recvmsg(s);
353729f5 1815
05476c4d
OZ
1816 if (e < 0)
1817 {
1818 if (errno != EINTR && errno != EAGAIN)
1819 s->err_hook(s, errno);
1820 return 0;
b5d9ee5c 1821 }
05476c4d
OZ
1822
1823 s->rpos = s->rbuf + e;
1824 s->rx_hook(s, e);
1825 return 1;
b5d9ee5c 1826 }
05476c4d 1827 }
b5d9ee5c
MM
1828}
1829
6a8d3f1c 1830int
b5d9ee5c
MM
1831sk_write(sock *s)
1832{
320f4173 1833 switch (s->type)
05476c4d
OZ
1834 {
1835 case SK_TCP_ACTIVE:
320f4173 1836 {
05476c4d
OZ
1837 sockaddr sa;
1838 sockaddr_fill(&sa, s->af, s->daddr, s->iface, s->dport);
1839
1840 if (connect(s->fd, &sa.sa, SA_LEN(sa)) >= 0 || errno == EISCONN)
1841 sk_tcp_connected(s);
1842 else if (errno != EINTR && errno != EAGAIN && errno != EINPROGRESS)
1843 s->err_hook(s, errno);
38a608c5 1844 return 0;
320f4173 1845 }
05476c4d
OZ
1846
1847 default:
1848 if (s->ttx != s->tpos && sk_maybe_write(s) > 0)
1849 {
1850 if (s->tx_hook)
1851 s->tx_hook(s);
1852 return 1;
1853 }
1854 return 0;
1855 }
b5d9ee5c
MM
1856}
1857
9dbcb11c
MM
1858void
1859sk_err(sock *s, int revents)
1860{
1861 int se = 0, sse = sizeof(se);
ccd2a3ed 1862 if ((s->type != SK_MAGIC) && (revents & POLLERR))
9dbcb11c
MM
1863 if (getsockopt(s->fd, SOL_SOCKET, SO_ERROR, &se, &sse) < 0)
1864 {
1865 log(L_ERR "IO: Socket error: SO_ERROR: %m");
1866 se = 0;
1867 }
1868
1869 s->err_hook(s, se);
1870}
1871
b5d9ee5c
MM
1872void
1873sk_dump_all(void)
1874{
1875 node *n;
1876 sock *s;
1877
1878 debug("Open sockets:\n");
1879 WALK_LIST(n, sock_list)
05476c4d
OZ
1880 {
1881 s = SKIP_BACK(sock, n, n);
1882 debug("%p ", s);
1883 sk_dump(&s->r);
1884 }
b5d9ee5c
MM
1885 debug("\n");
1886}
1887
b5d9ee5c 1888
8bcb5fb1
OZ
1889/*
1890 * Internal event log and watchdog
1891 */
1892
1893#define EVENT_LOG_LENGTH 32
1894
1895struct event_log_entry
1896{
1897 void *hook;
1898 void *data;
1899 btime timestamp;
1900 btime duration;
1901};
1902
1903static struct event_log_entry event_log[EVENT_LOG_LENGTH];
1904static struct event_log_entry *event_open;
1905static int event_log_pos, event_log_num, watchdog_active;
1906static btime last_time;
1907static btime loop_time;
1908
1909static void
1910io_update_time(void)
1911{
1912 struct timespec ts;
1913 int rv;
1914
1915 if (!clock_monotonic_available)
1916 return;
1917
1918 /*
1919 * This is third time-tracking procedure (after update_times() above and
1920 * times_update() in BFD), dedicated to internal event log and latency
1921 * tracking. Hopefully, we consolidate these sometimes.
1922 */
1923
1924 rv = clock_gettime(CLOCK_MONOTONIC, &ts);
1925 if (rv < 0)
1926 die("clock_gettime: %m");
1927
1928 last_time = ((s64) ts.tv_sec S) + (ts.tv_nsec / 1000);
1929
1930 if (event_open)
1931 {
1932 event_open->duration = last_time - event_open->timestamp;
1933
1934 if (event_open->duration > config->latency_limit)
1935 log(L_WARN "Event 0x%p 0x%p took %d ms",
1936 event_open->hook, event_open->data, (int) (event_open->duration TO_MS));
1937
1938 event_open = NULL;
1939 }
1940}
1941
1942/**
1943 * io_log_event - mark approaching event into event log
1944 * @hook: event hook address
1945 * @data: event data address
1946 *
1947 * Store info (hook, data, timestamp) about the following internal event into
1948 * a circular event log (@event_log). When latency tracking is enabled, the log
1949 * entry is kept open (in @event_open) so the duration can be filled later.
1950 */
1951void
1952io_log_event(void *hook, void *data)
1953{
1954 if (config->latency_debug)
1955 io_update_time();
1956
1957 struct event_log_entry *en = event_log + event_log_pos;
1958
1959 en->hook = hook;
1960 en->data = data;
1961 en->timestamp = last_time;
1962 en->duration = 0;
1963
1964 event_log_num++;
1965 event_log_pos++;
1966 event_log_pos %= EVENT_LOG_LENGTH;
1967
1968 event_open = config->latency_debug ? en : NULL;
1969}
1970
1971static inline void
1972io_close_event(void)
1973{
1974 if (event_open)
1975 io_update_time();
1976}
1977
1978void
1979io_log_dump(void)
1980{
1981 int i;
1982
1983 log(L_DEBUG "Event log:");
1984 for (i = 0; i < EVENT_LOG_LENGTH; i++)
1985 {
1986 struct event_log_entry *en = event_log + (event_log_pos + i) % EVENT_LOG_LENGTH;
1987 if (en->hook)
1988 log(L_DEBUG " Event 0x%p 0x%p at %8d for %d ms", en->hook, en->data,
1989 (int) ((last_time - en->timestamp) TO_MS), (int) (en->duration TO_MS));
1990 }
1991}
1992
1993void
1994watchdog_sigalrm(int sig UNUSED)
1995{
1996 /* Update last_time and duration, but skip latency check */
1997 config->latency_limit = 0xffffffff;
1998 io_update_time();
1999
2000 /* We want core dump */
2001 abort();
2002}
2003
2004static inline void
2005watchdog_start1(void)
2006{
2007 io_update_time();
2008
2009 loop_time = last_time;
2010}
2011
2012static inline void
2013watchdog_start(void)
2014{
2015 io_update_time();
2016
2017 loop_time = last_time;
2018 event_log_num = 0;
2019
2020 if (config->watchdog_timeout)
2021 {
2022 alarm(config->watchdog_timeout);
2023 watchdog_active = 1;
2024 }
2025}
2026
2027static inline void
2028watchdog_stop(void)
2029{
2030 io_update_time();
2031
2032 if (watchdog_active)
2033 {
2034 alarm(0);
2035 watchdog_active = 0;
2036 }
2037
2038 btime duration = last_time - loop_time;
2039 if (duration > config->watchdog_warning)
2040 log(L_WARN "I/O loop cycle took %d ms for %d events",
2041 (int) (duration TO_MS), event_log_num);
2042}
2043
2044
b5d9ee5c
MM
2045/*
2046 * Main I/O Loop
2047 */
2048
/*
 * Requests for asynchronous actions. io_loop() checks these flags before
 * entering poll(), runs the matching handler and clears the flag.
 */
volatile int async_config_flag;		/* Asynchronous reconfiguration scheduled */
volatile int async_dump_flag;		/* Asynchronous dump scheduled */
volatile int async_shutdown_flag;	/* Asynchronous shutdown scheduled */
4c9dd1e4 2052
b5d9ee5c
MM
2053void
2054io_init(void)
2055{
2056 init_list(&near_timers);
2057 init_list(&far_timers);
2058 init_list(&sock_list);
e8f73195 2059 init_list(&global_event_list);
7e5f5ffd 2060 krt_io_init();
fd91ae33
OZ
2061 init_times();
2062 update_times();
a92cf57d 2063 boot_time = now;
fd91ae33 2064 srandom((int) now_real);
b5d9ee5c
MM
2065}
2066
ea89da38
OZ
2067static int short_loops = 0;
2068#define SHORT_LOOP_MAX 10
2069
b5d9ee5c
MM
2070void
2071io_loop(void)
2072{
e1c13a5a 2073 int poll_tout;
b5d9ee5c 2074 time_t tout;
ea0a8be2 2075 int nfds, events, pout;
b5d9ee5c 2076 sock *s;
38a608c5 2077 node *n;
e1c13a5a
MM
2078 int fdmax = 256;
2079 struct pollfd *pfd = xmalloc(fdmax * sizeof(struct pollfd));
b5d9ee5c 2080
8bcb5fb1 2081 watchdog_start1();
b5d9ee5c
MM
2082 for(;;)
2083 {
30770df2 2084 events = ev_run_list(&global_event_list);
bd22d7f4 2085 timers:
fd91ae33 2086 update_times();
b5d9ee5c
MM
2087 tout = tm_first_shot();
2088 if (tout <= now)
2089 {
2090 tm_shot();
bd22d7f4 2091 goto timers;
b5d9ee5c 2092 }
e1c13a5a 2093 poll_tout = (events ? 0 : MIN(tout - now, 3)) * 1000; /* Time in milliseconds */
b5d9ee5c 2094
8bcb5fb1
OZ
2095 io_close_event();
2096
e1c13a5a 2097 nfds = 0;
b5d9ee5c
MM
2098 WALK_LIST(n, sock_list)
2099 {
e1c13a5a 2100 pfd[nfds] = (struct pollfd) { .fd = -1 }; /* everything other set to 0 by this */
b5d9ee5c
MM
2101 s = SKIP_BACK(sock, n, n);
2102 if (s->rx_hook)
2103 {
e1c13a5a
MM
2104 pfd[nfds].fd = s->fd;
2105 pfd[nfds].events |= POLLIN;
b5d9ee5c
MM
2106 }
2107 if (s->tx_hook && s->ttx != s->tpos)
2108 {
e1c13a5a
MM
2109 pfd[nfds].fd = s->fd;
2110 pfd[nfds].events |= POLLOUT;
2111 }
2112 if (pfd[nfds].fd != -1)
2113 {
2114 s->index = nfds;
2115 nfds++;
b5d9ee5c 2116 }
38a608c5 2117 else
e1c13a5a
MM
2118 s->index = -1;
2119
2120 if (nfds >= fdmax)
2121 {
2122 fdmax *= 2;
2123 pfd = xrealloc(pfd, fdmax * sizeof(struct pollfd));
2124 }
b5d9ee5c
MM
2125 }
2126
4c9dd1e4
MM
2127 /*
2128 * Yes, this is racy. But even if the signal comes before this test
e1c13a5a 2129 * and entering poll(), it gets caught on the next timer tick.
4c9dd1e4
MM
2130 */
2131
2132 if (async_config_flag)
2133 {
8bcb5fb1 2134 io_log_event(async_config, NULL);
4c9dd1e4
MM
2135 async_config();
2136 async_config_flag = 0;
f4aabcee 2137 continue;
4c9dd1e4
MM
2138 }
2139 if (async_dump_flag)
2140 {
8bcb5fb1 2141 io_log_event(async_dump, NULL);
4c9dd1e4
MM
2142 async_dump();
2143 async_dump_flag = 0;
f4aabcee
MM
2144 continue;
2145 }
2146 if (async_shutdown_flag)
2147 {
8bcb5fb1 2148 io_log_event(async_shutdown, NULL);
f4aabcee
MM
2149 async_shutdown();
2150 async_shutdown_flag = 0;
2151 continue;
4c9dd1e4
MM
2152 }
2153
e1c13a5a 2154 /* And finally enter poll() to find active sockets */
8bcb5fb1 2155 watchdog_stop();
ea0a8be2 2156 pout = poll(pfd, nfds, poll_tout);
8bcb5fb1 2157 watchdog_start();
ea89da38 2158
ea0a8be2 2159 if (pout < 0)
b5d9ee5c
MM
2160 {
2161 if (errno == EINTR || errno == EAGAIN)
2162 continue;
e1c13a5a 2163 die("poll: %m");
b5d9ee5c 2164 }
ea0a8be2 2165 if (pout)
b5d9ee5c 2166 {
ea89da38
OZ
2167 /* guaranteed to be non-empty */
2168 current_sock = SKIP_BACK(sock, n, HEAD(sock_list));
2169
38a608c5 2170 while (current_sock)
b5d9ee5c 2171 {
38a608c5 2172 sock *s = current_sock;
e1c13a5a
MM
2173 if (s->index == -1)
2174 {
2175 current_sock = sk_next(s);
2176 goto next;
2177 }
2178
38a608c5 2179 int e;
ea89da38
OZ
2180 int steps;
2181
2182 steps = MAX_STEPS;
9dbcb11c 2183 if (s->fast_rx && (pfd[s->index].revents & POLLIN) && s->rx_hook)
38a608c5
MM
2184 do
2185 {
4323099d 2186 steps--;
8bcb5fb1 2187 io_log_event(s->rx_hook, s->data);
fd926ed4 2188 e = sk_read(s, pfd[s->index].revents);
38a608c5
MM
2189 if (s != current_sock)
2190 goto next;
2191 }
4323099d
OZ
2192 while (e && s->rx_hook && steps);
2193
2194 steps = MAX_STEPS;
e1c13a5a 2195 if (pfd[s->index].revents & POLLOUT)
38a608c5
MM
2196 do
2197 {
4323099d 2198 steps--;
8bcb5fb1 2199 io_log_event(s->tx_hook, s->data);
38a608c5
MM
2200 e = sk_write(s);
2201 if (s != current_sock)
2202 goto next;
2203 }
4323099d 2204 while (e && steps);
9dbcb11c 2205
38a608c5
MM
2206 current_sock = sk_next(s);
2207 next: ;
b5d9ee5c 2208 }
ea89da38
OZ
2209
2210 short_loops++;
2211 if (events && (short_loops < SHORT_LOOP_MAX))
2212 continue;
2213 short_loops = 0;
2214
2215 int count = 0;
2216 current_sock = stored_sock;
2217 if (current_sock == NULL)
2218 current_sock = SKIP_BACK(sock, n, HEAD(sock_list));
2219
2220 while (current_sock && count < MAX_RX_STEPS)
2221 {
2222 sock *s = current_sock;
e1c13a5a
MM
2223 if (s->index == -1)
2224 {
2225 current_sock = sk_next(s);
2226 goto next2;
2227 }
ea89da38 2228
9dbcb11c 2229 if (!s->fast_rx && (pfd[s->index].revents & POLLIN) && s->rx_hook)
ea89da38
OZ
2230 {
2231 count++;
8bcb5fb1 2232 io_log_event(s->rx_hook, s->data);
fd926ed4 2233 sk_read(s, pfd[s->index].revents);
ea89da38 2234 if (s != current_sock)
9dbcb11c
MM
2235 goto next2;
2236 }
2237
2238 if (pfd[s->index].revents & (POLLHUP | POLLERR))
2239 {
2240 sk_err(s, pfd[s->index].revents);
33d22f0e 2241 goto next2;
ea89da38 2242 }
9dbcb11c 2243
ea89da38
OZ
2244 current_sock = sk_next(s);
2245 next2: ;
2246 }
2247
9dbcb11c 2248
ea89da38 2249 stored_sock = current_sock;
b5d9ee5c
MM
2250 }
2251 }
2252}
41c8976e
OF
2253
/*
 * Check whether another BIRD instance is already listening on the control
 * socket @path. The process dies when the probing socket cannot be
 * created, when @path does not fit into a UNIX socket address, or when a
 * connection to @path succeeds.
 */
void
test_old_bird(char *path)
{
  struct sockaddr_un sa;
  int fd = socket(AF_UNIX, SOCK_STREAM, 0);

  if (fd < 0)
    die("Cannot create socket: %m");

  if (strlen(path) >= sizeof(sa.sun_path))
    die("Socket path too long");

  bzero(&sa, sizeof(sa));
  sa.sun_family = AF_UNIX;
  strcpy(sa.sun_path, path);

  /* A successful connect means somebody is serving this socket */
  int rv = connect(fd, (struct sockaddr *) &sa, SUN_LEN(&sa));
  if (rv == 0)
    die("I found another BIRD running.");

  close(fd);
}