1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
5 #include "sd-netlink.h"
7 #include "alloc-util.h"
12 #include "netlink-internal.h"
13 #include "netlink-slot.h"
14 #include "netlink-util.h"
15 #include "process-util.h"
16 #include "socket-util.h"
17 #include "string-util.h"
20 /* Some really high limit, to catch programming errors */
21 #define REPLY_CALLBACKS_MAX UINT16_MAX
23 static int sd_netlink_new(sd_netlink
**ret
) {
24 _cleanup_(sd_netlink_unrefp
) sd_netlink
*rtnl
= NULL
;
26 assert_return(ret
, -EINVAL
);
28 rtnl
= new(sd_netlink
, 1);
32 *rtnl
= (sd_netlink
) {
35 .sockaddr
.nl
.nl_family
= AF_NETLINK
,
36 .original_pid
= getpid_cached(),
39 /* Kernel change notification messages have sequence number 0. We want to avoid that with our
40 * own serials, in order not to get confused when matching up kernel replies to our earlier
43 * Moreover, when using netlink socket activation (i.e. where PID 1 binds an AF_NETLINK
44 * socket for us and passes it to us across execve()) and we get restarted multiple times
45 * while the socket sticks around we might get confused by replies from earlier runs coming
46 * in late — which is pretty likely if we'd start our sequence numbers always from 1. Hence,
47 * let's start with a value based on the system clock. This should make collisions much less
48 * likely (though still theoretically possible). We use a 32 bit µs counter starting at boot
49 * for this (and explicitly exclude the zero, see above). This counter will wrap around after
50 * a bit more than 1h, but that's hopefully OK as the kernel shouldn't take that long to
51 * reply to our requests.
53 * We only pick the initial start value this way. For each message we simply increase the
54 * sequence number by 1. This means we could enqueue 1 netlink message per µs without risking
55 * collisions, which should be OK.
57 * Note this means the serials will be in the range 1…UINT32_MAX here.
59 * (In an ideal world we'd attach the current serial counter to the netlink socket itself
60 * somehow, to avoid all this, but I couldn't come up with a nice way to do this) */
61 .serial
= (uint32_t) (now(CLOCK_MONOTONIC
) % UINT32_MAX
) + 1,
64 /* We guarantee that the read buffer has at least space for
66 if (!greedy_realloc((void**)&rtnl
->rbuffer
, &rtnl
->rbuffer_allocated
,
67 sizeof(struct nlmsghdr
), sizeof(uint8_t)))
70 *ret
= TAKE_PTR(rtnl
);
75 int sd_netlink_new_from_netlink(sd_netlink
**ret
, int fd
) {
76 _cleanup_(sd_netlink_unrefp
) sd_netlink
*rtnl
= NULL
;
80 assert_return(ret
, -EINVAL
);
82 r
= sd_netlink_new(&rtnl
);
86 addrlen
= sizeof(rtnl
->sockaddr
);
88 r
= getsockname(fd
, &rtnl
->sockaddr
.sa
, &addrlen
);
92 if (rtnl
->sockaddr
.nl
.nl_family
!= AF_NETLINK
)
97 *ret
= TAKE_PTR(rtnl
);
102 static bool rtnl_pid_changed(const sd_netlink
*rtnl
) {
105 /* We don't support people creating an rtnl connection and
106 * keeping it around over a fork(). Let's complain. */
108 return rtnl
->original_pid
!= getpid_cached();
111 int sd_netlink_open_fd(sd_netlink
**ret
, int fd
) {
112 _cleanup_(sd_netlink_unrefp
) sd_netlink
*rtnl
= NULL
;
115 assert_return(ret
, -EINVAL
);
116 assert_return(fd
>= 0, -EBADF
);
118 r
= sd_netlink_new(&rtnl
);
122 r
= getsockopt_int(fd
, SOL_SOCKET
, SO_PROTOCOL
, &protocol
);
127 rtnl
->protocol
= protocol
;
129 r
= setsockopt_int(fd
, SOL_NETLINK
, NETLINK_EXT_ACK
, 1);
131 log_debug_errno(r
, "sd-netlink: Failed to enable NETLINK_EXT_ACK option, ignoring: %m");
133 r
= socket_bind(rtnl
);
135 rtnl
->fd
= -1; /* on failure, the caller remains owner of the fd, hence don't close it here */
140 *ret
= TAKE_PTR(rtnl
);
145 int netlink_open_family(sd_netlink
**ret
, int family
) {
146 _cleanup_close_
int fd
= -1;
149 fd
= socket_open(family
);
153 r
= sd_netlink_open_fd(ret
, fd
);
161 int sd_netlink_open(sd_netlink
**ret
) {
162 return netlink_open_family(ret
, NETLINK_ROUTE
);
165 int sd_netlink_inc_rcvbuf(sd_netlink
*rtnl
, size_t size
) {
166 assert_return(rtnl
, -EINVAL
);
167 assert_return(!rtnl_pid_changed(rtnl
), -ECHILD
);
169 return fd_inc_rcvbuf(rtnl
->fd
, size
);
172 static sd_netlink
*netlink_free(sd_netlink
*rtnl
) {
178 for (i
= 0; i
< rtnl
->rqueue_size
; i
++)
179 sd_netlink_message_unref(rtnl
->rqueue
[i
]);
182 for (i
= 0; i
< rtnl
->rqueue_partial_size
; i
++)
183 sd_netlink_message_unref(rtnl
->rqueue_partial
[i
]);
184 free(rtnl
->rqueue_partial
);
188 while ((s
= rtnl
->slots
)) {
190 netlink_slot_disconnect(s
, true);
192 hashmap_free(rtnl
->reply_callbacks
);
193 prioq_free(rtnl
->reply_callbacks_prioq
);
195 sd_event_source_unref(rtnl
->io_event_source
);
196 sd_event_source_unref(rtnl
->time_event_source
);
197 sd_event_unref(rtnl
->event
);
199 hashmap_free(rtnl
->broadcast_group_refs
);
201 hashmap_free(rtnl
->genl_family_to_nlmsg_type
);
202 hashmap_free(rtnl
->nlmsg_type_to_genl_family
);
204 safe_close(rtnl
->fd
);
/* Generates sd_netlink_ref()/sd_netlink_unref(), calling netlink_free() at refcount 0. */
DEFINE_TRIVIAL_REF_UNREF_FUNC(sd_netlink, sd_netlink, netlink_free);
210 static void rtnl_seal_message(sd_netlink
*rtnl
, sd_netlink_message
*m
) {
214 assert(!rtnl_pid_changed(rtnl
));
218 /* Avoid collisions with outstanding requests */
220 picked
= rtnl
->serial
;
222 /* Don't use seq == 0, as that is used for broadcasts, so we would get confused by replies to
224 rtnl
->serial
= rtnl
->serial
== UINT32_MAX
? 1 : rtnl
->serial
+ 1;
226 } while (hashmap_contains(rtnl
->reply_callbacks
, UINT32_TO_PTR(picked
)));
228 m
->hdr
->nlmsg_seq
= picked
;
229 rtnl_message_seal(m
);
232 int sd_netlink_send(sd_netlink
*nl
,
233 sd_netlink_message
*message
,
237 assert_return(nl
, -EINVAL
);
238 assert_return(!rtnl_pid_changed(nl
), -ECHILD
);
239 assert_return(message
, -EINVAL
);
240 assert_return(!message
->sealed
, -EPERM
);
242 rtnl_seal_message(nl
, message
);
244 r
= socket_write_message(nl
, message
);
249 *serial
= rtnl_message_get_serial(message
);
254 int sd_netlink_sendv(sd_netlink
*nl
,
255 sd_netlink_message
**messages
,
257 uint32_t **ret_serial
) {
258 _cleanup_free_
uint32_t *serials
= NULL
;
262 assert_return(nl
, -EINVAL
);
263 assert_return(!rtnl_pid_changed(nl
), -ECHILD
);
264 assert_return(messages
, -EINVAL
);
265 assert_return(msgcount
> 0, -EINVAL
);
268 serials
= new0(uint32_t, msgcount
);
273 for (i
= 0; i
< msgcount
; i
++) {
274 assert_return(!messages
[i
]->sealed
, -EPERM
);
275 rtnl_seal_message(nl
, messages
[i
]);
277 serials
[i
] = rtnl_message_get_serial(messages
[i
]);
280 r
= socket_writev_message(nl
, messages
, msgcount
);
285 *ret_serial
= TAKE_PTR(serials
);
290 int rtnl_rqueue_make_room(sd_netlink
*rtnl
) {
293 if (rtnl
->rqueue_size
>= RTNL_RQUEUE_MAX
)
294 return log_debug_errno(SYNTHETIC_ERRNO(ENOBUFS
),
295 "rtnl: exhausted the read queue size (%d)",
298 if (!GREEDY_REALLOC(rtnl
->rqueue
, rtnl
->rqueue_allocated
, rtnl
->rqueue_size
+ 1))
304 int rtnl_rqueue_partial_make_room(sd_netlink
*rtnl
) {
307 if (rtnl
->rqueue_partial_size
>= RTNL_RQUEUE_MAX
)
308 return log_debug_errno(SYNTHETIC_ERRNO(ENOBUFS
),
309 "rtnl: exhausted the partial read queue size (%d)",
312 if (!GREEDY_REALLOC(rtnl
->rqueue_partial
, rtnl
->rqueue_partial_allocated
,
313 rtnl
->rqueue_partial_size
+ 1))
319 static int dispatch_rqueue(sd_netlink
*rtnl
, sd_netlink_message
**message
) {
325 if (rtnl
->rqueue_size
<= 0) {
326 /* Try to read a new message */
327 r
= socket_read_message(rtnl
);
328 if (r
== -ENOBUFS
) { /* FIXME: ignore buffer overruns for now */
329 log_debug_errno(r
, "Got ENOBUFS from netlink socket, ignoring.");
336 /* Dispatch a queued message */
337 *message
= rtnl
->rqueue
[0];
339 memmove(rtnl
->rqueue
, rtnl
->rqueue
+ 1, sizeof(sd_netlink_message
*) * rtnl
->rqueue_size
);
344 static int process_timeout(sd_netlink
*rtnl
) {
345 _cleanup_(sd_netlink_message_unrefp
) sd_netlink_message
*m
= NULL
;
346 struct reply_callback
*c
;
347 sd_netlink_slot
*slot
;
353 c
= prioq_peek(rtnl
->reply_callbacks_prioq
);
357 n
= now(CLOCK_MONOTONIC
);
361 r
= rtnl_message_new_synthetic_error(rtnl
, -ETIMEDOUT
, c
->serial
, &m
);
365 assert_se(prioq_pop(rtnl
->reply_callbacks_prioq
) == c
);
367 hashmap_remove(rtnl
->reply_callbacks
, UINT32_TO_PTR(c
->serial
));
369 slot
= container_of(c
, sd_netlink_slot
, reply_callback
);
371 r
= c
->callback(rtnl
, m
, slot
->userdata
);
373 log_debug_errno(r
, "sd-netlink: timedout callback %s%s%sfailed: %m",
374 slot
->description
? "'" : "",
375 strempty(slot
->description
),
376 slot
->description
? "' " : "");
379 netlink_slot_disconnect(slot
, true);
384 static int process_reply(sd_netlink
*rtnl
, sd_netlink_message
*m
) {
385 struct reply_callback
*c
;
386 sd_netlink_slot
*slot
;
394 serial
= rtnl_message_get_serial(m
);
395 c
= hashmap_remove(rtnl
->reply_callbacks
, UINT32_TO_PTR(serial
));
399 if (c
->timeout
!= 0) {
400 prioq_remove(rtnl
->reply_callbacks_prioq
, c
, &c
->prioq_idx
);
404 r
= sd_netlink_message_get_type(m
, &type
);
408 if (type
== NLMSG_DONE
)
411 slot
= container_of(c
, sd_netlink_slot
, reply_callback
);
413 r
= c
->callback(rtnl
, m
, slot
->userdata
);
415 log_debug_errno(r
, "sd-netlink: reply callback %s%s%sfailed: %m",
416 slot
->description
? "'" : "",
417 strempty(slot
->description
),
418 slot
->description
? "' " : "");
421 netlink_slot_disconnect(slot
, true);
426 static int process_match(sd_netlink
*rtnl
, sd_netlink_message
*m
) {
427 struct match_callback
*c
;
428 sd_netlink_slot
*slot
;
435 r
= sd_netlink_message_get_type(m
, &type
);
439 LIST_FOREACH(match_callbacks
, c
, rtnl
->match_callbacks
) {
443 slot
= container_of(c
, sd_netlink_slot
, match_callback
);
445 r
= c
->callback(rtnl
, m
, slot
->userdata
);
447 log_debug_errno(r
, "sd-netlink: match callback %s%s%sfailed: %m",
448 slot
->description
? "'" : "",
449 strempty(slot
->description
),
450 slot
->description
? "' " : "");
458 static int process_running(sd_netlink
*rtnl
, sd_netlink_message
**ret
) {
459 _cleanup_(sd_netlink_message_unrefp
) sd_netlink_message
*m
= NULL
;
464 r
= process_timeout(rtnl
);
468 r
= dispatch_rqueue(rtnl
, &m
);
474 if (sd_netlink_message_is_broadcast(m
)) {
475 r
= process_match(rtnl
, m
);
479 r
= process_reply(rtnl
, m
);
499 int sd_netlink_process(sd_netlink
*rtnl
, sd_netlink_message
**ret
) {
500 NETLINK_DONT_DESTROY(rtnl
);
503 assert_return(rtnl
, -EINVAL
);
504 assert_return(!rtnl_pid_changed(rtnl
), -ECHILD
);
505 assert_return(!rtnl
->processing
, -EBUSY
);
507 rtnl
->processing
= true;
508 r
= process_running(rtnl
, ret
);
509 rtnl
->processing
= false;
514 static usec_t
calc_elapse(uint64_t usec
) {
515 if (usec
== (uint64_t) -1)
519 usec
= RTNL_DEFAULT_TIMEOUT
;
521 return now(CLOCK_MONOTONIC
) + usec
;
524 static int rtnl_poll(sd_netlink
*rtnl
, bool need_more
, uint64_t timeout_usec
) {
525 usec_t m
= USEC_INFINITY
;
530 e
= sd_netlink_get_events(rtnl
);
535 /* Caller wants more data, and doesn't care about
536 * what's been read or any other timeouts. */
540 /* Caller wants to process if there is something to
541 * process, but doesn't care otherwise */
543 r
= sd_netlink_get_timeout(rtnl
, &until
);
548 nw
= now(CLOCK_MONOTONIC
);
549 m
= until
> nw
? until
- nw
: 0;
553 if (timeout_usec
!= (uint64_t) -1 && (m
== USEC_INFINITY
|| timeout_usec
< m
))
556 r
= fd_wait_for_event(rtnl
->fd
, e
, m
);
563 int sd_netlink_wait(sd_netlink
*nl
, uint64_t timeout_usec
) {
564 assert_return(nl
, -EINVAL
);
565 assert_return(!rtnl_pid_changed(nl
), -ECHILD
);
567 if (nl
->rqueue_size
> 0)
570 return rtnl_poll(nl
, false, timeout_usec
);
573 static int timeout_compare(const void *a
, const void *b
) {
574 const struct reply_callback
*x
= a
, *y
= b
;
576 if (x
->timeout
!= 0 && y
->timeout
== 0)
579 if (x
->timeout
== 0 && y
->timeout
!= 0)
582 return CMP(x
->timeout
, y
->timeout
);
585 int sd_netlink_call_async(
587 sd_netlink_slot
**ret_slot
,
588 sd_netlink_message
*m
,
589 sd_netlink_message_handler_t callback
,
590 sd_netlink_destroy_t destroy_callback
,
593 const char *description
) {
594 _cleanup_free_ sd_netlink_slot
*slot
= NULL
;
597 assert_return(nl
, -EINVAL
);
598 assert_return(m
, -EINVAL
);
599 assert_return(callback
, -EINVAL
);
600 assert_return(!rtnl_pid_changed(nl
), -ECHILD
);
602 if (hashmap_size(nl
->reply_callbacks
) >= REPLY_CALLBACKS_MAX
)
605 r
= hashmap_ensure_allocated(&nl
->reply_callbacks
, &trivial_hash_ops
);
609 if (usec
!= (uint64_t) -1) {
610 r
= prioq_ensure_allocated(&nl
->reply_callbacks_prioq
, timeout_compare
);
615 r
= netlink_slot_allocate(nl
, !ret_slot
, NETLINK_REPLY_CALLBACK
, sizeof(struct reply_callback
), userdata
, description
, &slot
);
619 slot
->reply_callback
.callback
= callback
;
620 slot
->reply_callback
.timeout
= calc_elapse(usec
);
622 k
= sd_netlink_send(nl
, m
, &slot
->reply_callback
.serial
);
626 r
= hashmap_put(nl
->reply_callbacks
, UINT32_TO_PTR(slot
->reply_callback
.serial
), &slot
->reply_callback
);
630 if (slot
->reply_callback
.timeout
!= 0) {
631 r
= prioq_put(nl
->reply_callbacks_prioq
, &slot
->reply_callback
, &slot
->reply_callback
.prioq_idx
);
633 (void) hashmap_remove(nl
->reply_callbacks
, UINT32_TO_PTR(slot
->reply_callback
.serial
));
638 /* Set this at last. Otherwise, some failures in above call the destroy callback but some do not. */
639 slot
->destroy_callback
= destroy_callback
;
649 int sd_netlink_read(sd_netlink
*rtnl
,
652 sd_netlink_message
**ret
) {
656 assert_return(rtnl
, -EINVAL
);
657 assert_return(!rtnl_pid_changed(rtnl
), -ECHILD
);
659 timeout
= calc_elapse(usec
);
665 for (i
= 0; i
< rtnl
->rqueue_size
; i
++) {
666 uint32_t received_serial
;
668 received_serial
= rtnl_message_get_serial(rtnl
->rqueue
[i
]);
670 if (received_serial
== serial
) {
671 _cleanup_(sd_netlink_message_unrefp
) sd_netlink_message
*incoming
= NULL
;
674 incoming
= rtnl
->rqueue
[i
];
676 /* found a match, remove from rqueue and return it */
677 memmove(rtnl
->rqueue
+ i
,rtnl
->rqueue
+ i
+ 1,
678 sizeof(sd_netlink_message
*) * (rtnl
->rqueue_size
- i
- 1));
681 r
= sd_netlink_message_get_errno(incoming
);
685 r
= sd_netlink_message_get_type(incoming
, &type
);
689 if (type
== NLMSG_DONE
) {
695 *ret
= TAKE_PTR(incoming
);
701 r
= socket_read_message(rtnl
);
705 /* received message, so try to process straight away */
711 n
= now(CLOCK_MONOTONIC
);
717 left
= (uint64_t) -1;
719 r
= rtnl_poll(rtnl
, true, left
);
727 int sd_netlink_call(sd_netlink
*rtnl
,
728 sd_netlink_message
*message
,
730 sd_netlink_message
**ret
) {
734 assert_return(rtnl
, -EINVAL
);
735 assert_return(!rtnl_pid_changed(rtnl
), -ECHILD
);
736 assert_return(message
, -EINVAL
);
738 r
= sd_netlink_send(rtnl
, message
, &serial
);
742 return sd_netlink_read(rtnl
, serial
, usec
, ret
);
745 int sd_netlink_get_events(const sd_netlink
*rtnl
) {
746 assert_return(rtnl
, -EINVAL
);
747 assert_return(!rtnl_pid_changed(rtnl
), -ECHILD
);
749 if (rtnl
->rqueue_size
== 0)
755 int sd_netlink_get_timeout(const sd_netlink
*rtnl
, uint64_t *timeout_usec
) {
756 struct reply_callback
*c
;
758 assert_return(rtnl
, -EINVAL
);
759 assert_return(timeout_usec
, -EINVAL
);
760 assert_return(!rtnl_pid_changed(rtnl
), -ECHILD
);
762 if (rtnl
->rqueue_size
> 0) {
767 c
= prioq_peek(rtnl
->reply_callbacks_prioq
);
769 *timeout_usec
= (uint64_t) -1;
773 *timeout_usec
= c
->timeout
;
778 static int io_callback(sd_event_source
*s
, int fd
, uint32_t revents
, void *userdata
) {
779 sd_netlink
*rtnl
= userdata
;
784 r
= sd_netlink_process(rtnl
, NULL
);
791 static int time_callback(sd_event_source
*s
, uint64_t usec
, void *userdata
) {
792 sd_netlink
*rtnl
= userdata
;
797 r
= sd_netlink_process(rtnl
, NULL
);
804 static int prepare_callback(sd_event_source
*s
, void *userdata
) {
805 sd_netlink
*rtnl
= userdata
;
812 e
= sd_netlink_get_events(rtnl
);
816 r
= sd_event_source_set_io_events(rtnl
->io_event_source
, e
);
820 r
= sd_netlink_get_timeout(rtnl
, &until
);
826 j
= sd_event_source_set_time(rtnl
->time_event_source
, until
);
831 r
= sd_event_source_set_enabled(rtnl
->time_event_source
, r
> 0);
838 int sd_netlink_attach_event(sd_netlink
*rtnl
, sd_event
*event
, int64_t priority
) {
841 assert_return(rtnl
, -EINVAL
);
842 assert_return(!rtnl
->event
, -EBUSY
);
844 assert(!rtnl
->io_event_source
);
845 assert(!rtnl
->time_event_source
);
848 rtnl
->event
= sd_event_ref(event
);
850 r
= sd_event_default(&rtnl
->event
);
855 r
= sd_event_add_io(rtnl
->event
, &rtnl
->io_event_source
, rtnl
->fd
, 0, io_callback
, rtnl
);
859 r
= sd_event_source_set_priority(rtnl
->io_event_source
, priority
);
863 r
= sd_event_source_set_description(rtnl
->io_event_source
, "rtnl-receive-message");
867 r
= sd_event_source_set_prepare(rtnl
->io_event_source
, prepare_callback
);
871 r
= sd_event_add_time(rtnl
->event
, &rtnl
->time_event_source
, CLOCK_MONOTONIC
, 0, 0, time_callback
, rtnl
);
875 r
= sd_event_source_set_priority(rtnl
->time_event_source
, priority
);
879 r
= sd_event_source_set_description(rtnl
->time_event_source
, "rtnl-timer");
886 sd_netlink_detach_event(rtnl
);
890 int sd_netlink_detach_event(sd_netlink
*rtnl
) {
891 assert_return(rtnl
, -EINVAL
);
892 assert_return(rtnl
->event
, -ENXIO
);
894 rtnl
->io_event_source
= sd_event_source_unref(rtnl
->io_event_source
);
896 rtnl
->time_event_source
= sd_event_source_unref(rtnl
->time_event_source
);
898 rtnl
->event
= sd_event_unref(rtnl
->event
);
903 int sd_netlink_add_match(
905 sd_netlink_slot
**ret_slot
,
907 sd_netlink_message_handler_t callback
,
908 sd_netlink_destroy_t destroy_callback
,
910 const char *description
) {
911 _cleanup_free_ sd_netlink_slot
*slot
= NULL
;
914 assert_return(rtnl
, -EINVAL
);
915 assert_return(callback
, -EINVAL
);
916 assert_return(!rtnl_pid_changed(rtnl
), -ECHILD
);
918 r
= netlink_slot_allocate(rtnl
, !ret_slot
, NETLINK_MATCH_CALLBACK
, sizeof(struct match_callback
), userdata
, description
, &slot
);
922 slot
->match_callback
.callback
= callback
;
923 slot
->match_callback
.type
= type
;
928 r
= socket_broadcast_group_ref(rtnl
, RTNLGRP_LINK
);
935 r
= socket_broadcast_group_ref(rtnl
, RTNLGRP_IPV4_IFADDR
);
939 r
= socket_broadcast_group_ref(rtnl
, RTNLGRP_IPV6_IFADDR
);
946 r
= socket_broadcast_group_ref(rtnl
, RTNLGRP_NEIGH
);
953 r
= socket_broadcast_group_ref(rtnl
, RTNLGRP_IPV4_ROUTE
);
957 r
= socket_broadcast_group_ref(rtnl
, RTNLGRP_IPV6_ROUTE
);
963 r
= socket_broadcast_group_ref(rtnl
, RTNLGRP_IPV4_RULE
);
967 r
= socket_broadcast_group_ref(rtnl
, RTNLGRP_IPV6_RULE
);
973 r
= socket_broadcast_group_ref(rtnl
, RTNLGRP_NEXTHOP
);
982 LIST_PREPEND(match_callbacks
, rtnl
->match_callbacks
, &slot
->match_callback
);
984 /* Set this at last. Otherwise, some failures in above call the destroy callback but some do not. */
985 slot
->destroy_callback
= destroy_callback
;