1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
3 #include <linux/filter.h>
8 #include "sd-netlink.h"
10 #include "alloc-util.h"
11 #include "errno-util.h"
16 #include "netlink-genl.h"
17 #include "netlink-internal.h"
18 #include "netlink-slot.h"
19 #include "netlink-util.h"
20 #include "ordered-set.h"
22 #include "process-util.h"
23 #include "socket-util.h"
24 #include "string-util.h"
25 #include "time-util.h"
27 /* Some really high limit, to catch programming errors */
28 #define REPLY_CALLBACKS_MAX UINT16_MAX
/* Allocates a fresh sd_netlink object and fills in its invariant fields.
 * NOTE(review): this chunk is incomplete — the OOM check after new(), the
 * designated-initializer opener for *nl, the fd/socket setup fields and the
 * final TAKE_PTR/return are missing from the visible source; fragment kept as-is. */
static int netlink_new(sd_netlink **ret) {
        _cleanup_(sd_netlink_unrefp) sd_netlink *nl = NULL;

        assert_return(ret, -EINVAL);

        nl = new(sd_netlink, 1);

                /* Address family of the socket, and the PID that created the object —
                 * presumably consumed by netlink_pid_changed() to detect use after fork();
                 * TODO confirm against the sd_netlink struct definition. */
                .sockaddr.nl.nl_family = AF_NETLINK,
                .original_pid = getpid_cached(),

                /* Kernel change notification messages have sequence number 0. We want to avoid that with our
                 * own serials, in order not to get confused when matching up kernel replies to our earlier
                 * requests.
                 *
                 * Moreover, when using netlink socket activation (i.e. where PID 1 binds an AF_NETLINK
                 * socket for us and passes it to us across execve()) and we get restarted multiple times
                 * while the socket sticks around we might get confused by replies from earlier runs coming
                 * in late — which is pretty likely if we'd start our sequence numbers always from 1. Hence,
                 * let's start with a value based on the system clock. This should make collisions much less
                 * likely (though still theoretically possible). We use a 32 bit μs counter starting at boot
                 * for this (and explicitly exclude the zero, see above). This counter will wrap around after
                 * a bit more than 1h, but that's hopefully OK as the kernel shouldn't take that long to
                 * reply to our requests.
                 *
                 * We only pick the initial start value this way. For each message we simply increase the
                 * sequence number by 1. This means we could enqueue 1 netlink message per μs without risking
                 * collisions, which should be OK.
                 *
                 * Note this means the serials will be in the range 1…UINT32_MAX here.
                 *
                 * (In an ideal world we'd attach the current serial counter to the netlink socket itself
                 * somehow, to avoid all this, but I couldn't come up with a nice way to do this) */
                .serial = (uint32_t) (now(CLOCK_MONOTONIC) % UINT32_MAX) + 1,
/* Wraps an already-open AF_NETLINK file descriptor in a new sd_netlink object.
 * The protocol is queried from the socket itself via SO_PROTOCOL.
 * NOTE(review): incomplete fragment — the netlink_new() call, error-return
 * checks, fd assignment and the success/TAKE_PTR path are missing from view. */
int sd_netlink_open_fd(sd_netlink **ret, int fd) {
        _cleanup_(sd_netlink_unrefp) sd_netlink *nl = NULL;
        int r, protocol = 0; /* Avoid maybe-uninitialized false positive */

        assert_return(ret, -EINVAL);
        assert_return(fd >= 0, -EBADF);

        /* Ask the kernel which netlink protocol this socket speaks. */
        r = getsockopt_int(fd, SOL_SOCKET, SO_PROTOCOL, &protocol);

        nl->protocol = protocol;

        /* Both options below are best-effort: failure is logged and ignored. */
        r = setsockopt_int(fd, SOL_NETLINK, NETLINK_EXT_ACK, true);
                log_debug_errno(r, "sd-netlink: Failed to enable NETLINK_EXT_ACK option, ignoring: %m");

        r = setsockopt_int(fd, SOL_NETLINK, NETLINK_GET_STRICT_CHK, true);
                log_debug_errno(r, "sd-netlink: Failed to enable NETLINK_GET_STRICT_CHK option, ignoring: %m");

        nl->fd = -EBADF; /* on failure, the caller remains owner of the fd, hence don't close it here */
113 int sd_netlink_open(sd_netlink
**ret
) {
114 return netlink_open_family(ret
, NETLINK_ROUTE
);
117 int sd_netlink_increase_rxbuf(sd_netlink
*nl
, size_t size
) {
118 assert_return(nl
, -EINVAL
);
119 assert_return(!netlink_pid_changed(nl
), -ECHILD
);
121 return fd_increase_rxbuf(nl
->fd
, size
);
/* Destructor invoked when the last reference is dropped: releases queues,
 * slots, callbacks, event sources and broadcast-group state.
 * NOTE(review): incomplete fragment — the declaration of the slot iterator,
 * the loop's closing brace, the fd close and the final mfree()/return are
 * missing from the visible source. */
static sd_netlink *netlink_free(sd_netlink *nl) {
        /* Drop all queued incoming messages and their serial indexes. */
        ordered_set_free(nl->rqueue);
        hashmap_free(nl->rqueue_by_serial);
        hashmap_free(nl->rqueue_partial_by_serial);

        /* Disconnect every registered slot; 'true' presumably means the
         * destroy callback is invoked — TODO confirm netlink_slot_disconnect(). */
        while ((s = nl->slots)) {
                netlink_slot_disconnect(s, true);

        hashmap_free(nl->reply_callbacks);
        prioq_free(nl->reply_callbacks_prioq);

        /* Detach from any attached sd-event loop. */
        sd_event_source_unref(nl->io_event_source);
        sd_event_source_unref(nl->time_event_source);
        sd_event_unref(nl->event);

        hashmap_free(nl->broadcast_group_refs);

        genl_clear_family(nl);
/* Generates the public sd_netlink_ref()/sd_netlink_unref() reference-counting
 * functions, dispatching to netlink_free() when the last reference goes away. */
DEFINE_TRIVIAL_REF_UNREF_FUNC(sd_netlink, sd_netlink, netlink_free);
/* Seals and sends a single netlink message, optionally reporting the serial
 * assigned to it via *ret_serial.
 * NOTE(review): incomplete fragment — the opening line of the signature
 * (function name and first parameter), the declaration of r, the error check
 * after socket_write_message() and the return are missing from view. */
                sd_netlink_message *message,
                uint32_t *ret_serial) {

        assert_return(nl, -EINVAL);
        assert_return(!netlink_pid_changed(nl), -ECHILD);
        assert_return(message, -EINVAL);
        /* A sealed message was already sent once; refusing avoids serial reuse. */
        assert_return(!message->sealed, -EPERM);

        /* Assigns the next serial from nl and marks the message sealed. */
        netlink_seal_message(nl, message);

        r = socket_write_message(nl, message);

                *ret_serial = message_get_serial(message);
/* Pops one message from the receive queue, first trying to read a new message
 * off the socket when the queue is empty.
 * NOTE(review): incomplete fragment — the declaration of r, the non-ENOBUFS
 * error path, the empty-queue early return and the final *ret assignment and
 * return are missing from view. */
static int dispatch_rqueue(sd_netlink *nl, sd_netlink_message **ret) {
        sd_netlink_message *m;

        if (ordered_set_isempty(nl->rqueue)) {
                /* Try to read a new message */
                r = socket_read_message(nl);
                if (r == -ENOBUFS) /* FIXME: ignore buffer overruns for now */
                        log_debug_errno(r, "sd-netlink: Got ENOBUFS from netlink socket, ignoring.");

        /* Dispatch a queued message */
        m = ordered_set_steal_first(nl->rqueue);
        /* Also drop the serial index entry that pointed at this message (it held
         * a reference, which is released here). */
        sd_netlink_message_unref(hashmap_remove_value(nl->rqueue_by_serial, UINT32_TO_PTR(message_get_serial(m)), m));
/* Fires the earliest expired reply callback (if any) with a synthetic
 * -ETIMEDOUT error message.
 * NOTE(review): incomplete fragment — declarations of r/n, the "no callback /
 * not yet expired" early returns, error checks and the final return are
 * missing from view. */
static int process_timeout(sd_netlink *nl) {
        _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL;
        struct reply_callback *c;
        sd_netlink_slot *slot;

        /* The prioq is ordered by timeout (see timeout_compare()), so the head
         * is the next callback to expire. */
        c = prioq_peek(nl->reply_callbacks_prioq);

        n = now(CLOCK_MONOTONIC);

        /* Build a fake error reply carrying -ETIMEDOUT with the request's serial. */
        r = message_new_synthetic_error(nl, -ETIMEDOUT, c->serial, &m);

        /* Remove the callback from both indexes before invoking it. */
        assert_se(prioq_pop(nl->reply_callbacks_prioq) == c);
        hashmap_remove(nl->reply_callbacks, UINT32_TO_PTR(c->serial));

        slot = container_of(c, sd_netlink_slot, reply_callback);

        r = c->callback(nl, m, slot->userdata);
                log_debug_errno(r, "sd-netlink: timedout callback %s%s%sfailed: %m",
                                slot->description ? "'" : "",
                                strempty(slot->description),
                                slot->description ? "' " : "");

                /* Reply slots are one-shot: disconnect after delivery. */
                netlink_slot_disconnect(slot, true);
/* Routes an incoming message to the reply callback registered for its serial,
 * if any.
 * NOTE(review): incomplete fragment — declarations of serial/r/type, the
 * "no callback registered" early return, the NLMSG_DONE body (m is presumably
 * replaced by NULL for multi-part terminators — TODO confirm) and the final
 * return are missing from view. */
static int process_reply(sd_netlink *nl, sd_netlink_message *m) {
        struct reply_callback *c;
        sd_netlink_slot *slot;

        serial = message_get_serial(m);
        c = hashmap_remove(nl->reply_callbacks, UINT32_TO_PTR(serial));

        /* If the callback had a deadline it is also in the prioq; drop it there too. */
        if (c->timeout != USEC_INFINITY)
                prioq_remove(nl->reply_callbacks_prioq, c, &c->prioq_idx);

        r = sd_netlink_message_get_type(m, &type);

        if (type == NLMSG_DONE)

        slot = container_of(c, sd_netlink_slot, reply_callback);

        r = c->callback(nl, m, slot->userdata);
                log_debug_errno(r, "sd-netlink: reply callback %s%s%sfailed: %m",
                                slot->description ? "'" : "",
                                strempty(slot->description),
                                slot->description ? "' " : "");

        /* Reply slots are one-shot: disconnect after delivery. */
        netlink_slot_disconnect(slot, true);
/* Delivers a broadcast/multicast message to every registered match callback
 * whose type, command and multicast group filters accept it.
 * NOTE(review): incomplete fragment — declarations of r/type/cmd, the type
 * filter check, the "continue" statements, the group-match bookkeeping inside
 * the inner if, error checks and the final return are missing from view. */
static int process_match(sd_netlink *nl, sd_netlink_message *m) {
        r = sd_netlink_message_get_type(m, &type);

        /* Generic-netlink messages additionally carry a command ID matched below. */
        if (m->protocol == NETLINK_GENERIC) {
                r = sd_genl_message_get_command(nl, m, &cmd);

        LIST_FOREACH(match_callbacks, c, nl->match_callbacks) {
                sd_netlink_slot *slot;

                /* cmd == 0 means "any command". */
                if (c->cmd != 0 && c->cmd != cmd)

                /* Only deliver if the message arrived on one of the groups this
                 * match subscribed to. */
                for (size_t i = 0; i < c->n_groups; i++)
                        if (c->groups[i] == m->multicast_group) {

                slot = container_of(c, sd_netlink_slot, match_callback);

                r = c->callback(nl, m, slot->userdata);
                        log_debug_errno(r, "sd-netlink: match callback %s%s%sfailed: %m",
                                        slot->description ? "'" : "",
                                        strempty(slot->description),
                                        slot->description ? "' " : "");
/* One iteration of the processing loop: handle an expired timeout, then pop a
 * queued message and dispatch it either to match callbacks (broadcasts) or to
 * the reply callback for its serial.
 * NOTE(review): incomplete fragment — declaration of r, the error/early
 * returns after each step, the *ret hand-off and the final return are missing
 * from view. */
static int process_running(sd_netlink *nl, sd_netlink_message **ret) {
        _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL;

        r = process_timeout(nl);

        r = dispatch_rqueue(nl, &m);

        /* Broadcasts go to match subscribers; everything else is a reply to a request. */
        if (sd_netlink_message_is_broadcast(m))
                r = process_match(nl, m);
                r = process_reply(nl, m);
/* Public entry point: runs one processing iteration. Guards against re-entrant
 * calls via nl->processing, and NETLINK_DONT_DESTROY pins the object so a
 * callback dropping the last reference cannot free it mid-call.
 * NOTE(review): incomplete fragment — declaration of r and the final
 * `return r;` are missing from view. */
int sd_netlink_process(sd_netlink *nl, sd_netlink_message **ret) {
        NETLINK_DONT_DESTROY(nl);

        assert_return(nl, -EINVAL);
        assert_return(!netlink_pid_changed(nl), -ECHILD);
        /* Re-entrancy from within a callback is a caller bug. */
        assert_return(!nl->processing, -EBUSY);

        nl->processing = true;
        r = process_running(nl, ret);
        nl->processing = false;
/* Converts a relative timeout into an absolute CLOCK_MONOTONIC deadline.
 * On first use, resolves the default timeout, overridable via the
 * $SYSTEMD_NETLINK_DEFAULT_TIMEOUT environment variable.
 * NOTE(review): incomplete fragment — declarations of e/r, the NULL check on
 * e, the `usec == 0` guard before applying the default, and closing braces are
 * missing from view. */
static usec_t timespan_to_timestamp(usec_t usec) {
        /* Cached across calls; resolved lazily on first invocation. */
        static bool default_timeout_set = false;
        static usec_t default_timeout;

        if (!default_timeout_set) {
                default_timeout_set = true;
                default_timeout = NETLINK_DEFAULT_TIMEOUT_USEC;

                e = secure_getenv("SYSTEMD_NETLINK_DEFAULT_TIMEOUT");
                        r = parse_sec(e, &default_timeout);
                                /* Bad env value: keep the compiled-in default. */
                                log_debug_errno(r, "sd-netlink: Failed to parse $SYSTEMD_NETLINK_DEFAULT_TIMEOUT environment variable, ignoring: %m");

                usec = default_timeout;

        /* Saturating add: USEC_INFINITY stays infinite. */
        return usec_add(now(CLOCK_MONOTONIC), usec);
/* Waits for activity on the netlink fd, honouring both the caller's timeout
 * and the earliest pending reply-callback deadline.
 * NOTE(review): incomplete fragment — declarations of r/e/until, the
 * need_more branch bodies (forcing POLLIN vs. early return when events are
 * already pending), the r checks and the final return are missing from view. */
static int netlink_poll(sd_netlink *nl, bool need_more, usec_t timeout_usec) {
        usec_t m = USEC_INFINITY;

        e = sd_netlink_get_events(nl);

                /* Caller wants more data, and doesn't care about
                 * what's been read or any other timeouts. */

                /* Caller wants to process if there is something to
                 * process, but doesn't care otherwise */

                r = sd_netlink_get_timeout(nl, &until);

                        /* Convert the absolute deadline back to a relative wait. */
                        m = usec_sub_unsigned(until, now(CLOCK_MONOTONIC));

        /* Wait no longer than the tighter of the two timeouts. */
        r = fd_wait_for_event(nl->fd, e, MIN(m, timeout_usec));
/* Public wait: returns immediately if messages are already queued, otherwise
 * polls the socket up to timeout_usec.
 * NOTE(review): incomplete fragment — declaration of r, the early `return 0;`
 * body, the transient-error return value and the final return are missing
 * from view. */
int sd_netlink_wait(sd_netlink *nl, uint64_t timeout_usec) {
        assert_return(nl, -EINVAL);
        assert_return(!netlink_pid_changed(nl), -ECHILD);

        /* Something is already queued — no need to wait on the socket. */
        if (!ordered_set_isempty(nl->rqueue))

        r = netlink_poll(nl, false, timeout_usec);
        if (ERRNO_IS_NEG_TRANSIENT(r)) /* Convert EINTR to "something happened" and give user a chance to run some code before calling back into us */
462 static int timeout_compare(const void *a
, const void *b
) {
463 const struct reply_callback
*x
= a
, *y
= b
;
465 return CMP(x
->timeout
, y
->timeout
);
/* Returns the number of reply callbacks currently registered, i.e. requests
 * still awaiting an answer.
 * NOTE(review): incomplete fragment — the assert on nl and the closing brace
 * are missing from view. */
size_t netlink_get_reply_callback_count(sd_netlink *nl) {
        return hashmap_size(nl->reply_callbacks);
/* Sends @m and registers @callback to be invoked with the reply (or a
 * synthetic -ETIMEDOUT error once the deadline passes). On success the slot
 * optionally becomes accessible to the caller via *ret_slot.
 * NOTE(review): incomplete fragment — the nl/usec/userdata parameters,
 * declarations of r/k, every error-return check, the -EXFULL/-E2BIG(?)
 * overflow return body, the prioq failure unwind and the final
 * TAKE_PTR/return are missing from view. */
int sd_netlink_call_async(
                sd_netlink_slot **ret_slot,
                sd_netlink_message *m,
                sd_netlink_message_handler_t callback,
                sd_netlink_destroy_t destroy_callback,
                const char *description) {

        _cleanup_free_ sd_netlink_slot *slot = NULL;

        assert_return(nl, -EINVAL);
        assert_return(m, -EINVAL);
        assert_return(callback, -EINVAL);
        assert_return(!netlink_pid_changed(nl), -ECHILD);

        /* REPLY_CALLBACKS_MAX is a sanity cap against programming errors (see top of file). */
        if (hashmap_size(nl->reply_callbacks) >= REPLY_CALLBACKS_MAX)

        r = hashmap_ensure_allocated(&nl->reply_callbacks, &trivial_hash_ops);

        /* Only finite timeouts need the deadline prioq. */
        if (usec != UINT64_MAX) {
                r = prioq_ensure_allocated(&nl->reply_callbacks_prioq, timeout_compare);

        /* !ret_slot: a slot the caller never sees is "floating" and owned by nl. */
        r = netlink_slot_allocate(nl, !ret_slot, NETLINK_REPLY_CALLBACK, sizeof(struct reply_callback), userdata, description, &slot);

        slot->reply_callback.callback = callback;
        slot->reply_callback.timeout = timespan_to_timestamp(usec);

        /* Sending assigns the serial we key the reply on. */
        k = sd_netlink_send(nl, m, &slot->reply_callback.serial);

        r = hashmap_put(nl->reply_callbacks, UINT32_TO_PTR(slot->reply_callback.serial), &slot->reply_callback);

        if (slot->reply_callback.timeout != USEC_INFINITY) {
                r = prioq_put(nl->reply_callbacks_prioq, &slot->reply_callback, &slot->reply_callback.prioq_idx);
                /* Unwind the hashmap entry if the prioq insertion failed. */
                (void) hashmap_remove(nl->reply_callbacks, UINT32_TO_PTR(slot->reply_callback.serial));

        /* Set this at last. Otherwise, some failures in above would call destroy_callback but some would not. */
        slot->destroy_callback = destroy_callback;
/* Waits synchronously for the reply with the given serial, reading from the
 * socket and polling as needed until it arrives or the timeout expires.
 * NOTE(review): incomplete fragment — the function name and nl/serial/usec
 * parameters, the surrounding for(;;) loop, declarations of r/n/left/timeout/
 * type, the NLMSG_DONE/-error handling bodies, the -ETIMEDOUT path and all
 * returns are missing from view. */
                sd_netlink_message **ret) {

        assert_return(nl, -EINVAL);
        assert_return(!netlink_pid_changed(nl), -ECHILD);

        /* Convert the relative timeout into an absolute deadline once, up front. */
        timeout = timespan_to_timestamp(usec);

                _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL;

                /* Did the reply already arrive and get queued by serial? */
                m = hashmap_remove(nl->rqueue_by_serial, UINT32_TO_PTR(serial));

                        /* found a match, remove from rqueue and return it */
                        sd_netlink_message_unref(ordered_set_remove(nl->rqueue, m));

                        r = sd_netlink_message_get_errno(m);

                        r = sd_netlink_message_get_type(m, &type);

                        if (type == NLMSG_DONE) {

                r = socket_read_message(nl);
                        /* received message, so try to process straight away */

                if (timeout != USEC_INFINITY) {
                        n = now(CLOCK_MONOTONIC);

                        left = usec_sub_unsigned(timeout, n);
                        left = USEC_INFINITY;

                r = netlink_poll(nl, true, left);
/* Synchronous request/response: sends @message and blocks in sd_netlink_read()
 * until the reply with the matching serial arrives (or the timeout hits).
 * NOTE(review): incomplete fragment — the function name and nl/usec
 * parameters, declarations of r/serial and the error check after
 * sd_netlink_send() are missing from view. */
                sd_netlink_message *message,
                sd_netlink_message **ret) {

        assert_return(nl, -EINVAL);
        assert_return(!netlink_pid_changed(nl), -ECHILD);
        assert_return(message, -EINVAL);

        r = sd_netlink_send(nl, message, &serial);

        return sd_netlink_read(nl, serial, usec, ret);
629 int sd_netlink_get_events(sd_netlink
*nl
) {
630 assert_return(nl
, -EINVAL
);
631 assert_return(!netlink_pid_changed(nl
), -ECHILD
);
633 return ordered_set_isempty(nl
->rqueue
) ? POLLIN
: 0;
/* Reports the absolute CLOCK_MONOTONIC time by which the caller should invoke
 * sd_netlink_process() again: "now" (0) if messages are queued, else the
 * earliest reply-callback deadline.
 * NOTE(review): incomplete fragment — the *ret assignments, the "no pending
 * callback" infinite-timeout path and the returns are missing from view. */
int sd_netlink_get_timeout(sd_netlink *nl, uint64_t *ret) {
        struct reply_callback *c;

        assert_return(nl, -EINVAL);
        assert_return(ret, -EINVAL);
        assert_return(!netlink_pid_changed(nl), -ECHILD);

        /* Work is already queued: processing should happen immediately. */
        if (!ordered_set_isempty(nl->rqueue)) {

        /* Head of the prioq is the next deadline (see timeout_compare()). */
        c = prioq_peek(nl->reply_callbacks_prioq);
/* sd-event I/O handler for the netlink fd: drives one round of message
 * processing whenever the socket becomes readable.
 * NOTE(review): incomplete fragment — declaration of r, the error check and
 * the return are missing from view. */
static int io_callback(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
        sd_netlink *nl = ASSERT_PTR(userdata);

        r = sd_netlink_process(nl, NULL);
/* sd-event timer handler: fires when a reply-callback deadline is reached and
 * runs one round of processing so process_timeout() can deliver -ETIMEDOUT.
 * NOTE(review): incomplete fragment — declaration of r, the error check and
 * the return are missing from view. */
static int time_callback(sd_event_source *s, uint64_t usec, void *userdata) {
        sd_netlink *nl = ASSERT_PTR(userdata);

        r = sd_netlink_process(nl, NULL);
/* sd-event prepare hook: before each poll, re-arms the I/O event mask from
 * sd_netlink_get_events() and the one-shot timer from sd_netlink_get_timeout().
 * NOTE(review): incomplete fragment — declarations of r/enabled/until, the
 * error checks after each call and the final return are missing from view. */
static int prepare_callback(sd_event_source *s, void *userdata) {
        sd_netlink *nl = ASSERT_PTR(userdata);

        /* POLLIN or 0, depending on whether the rqueue is empty. */
        r = sd_netlink_get_events(nl);

        r = sd_event_source_set_io_events(nl->io_event_source, r);

        enabled = sd_netlink_get_timeout(nl, &until);

                r = sd_event_source_set_time(nl->time_event_source, until);

        /* Timer is one-shot: it only needs to fire once per armed deadline. */
        r = sd_event_source_set_enabled(nl->time_event_source,
                                        enabled > 0 ? SD_EVENT_ONESHOT : SD_EVENT_OFF);
/* Attaches the netlink object to an sd-event loop (the default loop when
 * @event is NULL), creating the I/O and timer event sources that drive
 * asynchronous processing.
 * NOTE(review): incomplete fragment — declaration of r, the if/else around
 * the event ref vs. sd_event_default(), every `if (r < 0) goto fail;` check,
 * the `return 0;` and the fail: label before the detach call are missing from
 * view (the trailing sd_netlink_detach_event() is presumably the error-unwind
 * path — TODO confirm). */
int sd_netlink_attach_event(sd_netlink *nl, sd_event *event, int64_t priority) {
        assert_return(nl, -EINVAL);
        assert_return(!nl->event, -EBUSY);

        assert(!nl->io_event_source);
        assert(!nl->time_event_source);

                nl->event = sd_event_ref(event);
                r = sd_event_default(&nl->event);

        r = sd_event_add_io(nl->event, &nl->io_event_source, nl->fd, 0, io_callback, nl);

        r = sd_event_source_set_priority(nl->io_event_source, priority);

        r = sd_event_source_set_description(nl->io_event_source, "netlink-receive-message");

        /* prepare_callback re-arms the IO mask and timer before each poll. */
        r = sd_event_source_set_prepare(nl->io_event_source, prepare_callback);

        r = sd_event_add_time(nl->event, &nl->time_event_source, CLOCK_MONOTONIC, 0, 0, time_callback, nl);

        r = sd_event_source_set_priority(nl->time_event_source, priority);

        r = sd_event_source_set_description(nl->time_event_source, "netlink-timer");

        sd_netlink_detach_event(nl);
/* Detaches the netlink object from its sd-event loop, releasing both event
 * sources and the event-loop reference.
 * NOTE(review): incomplete fragment — the `return 0;` and closing brace are
 * missing from view. */
int sd_netlink_detach_event(sd_netlink *nl) {
        assert_return(nl, -EINVAL);
        assert_return(nl->event, -ENXIO);

        /* unref functions return NULL, clearing the fields in the same statement. */
        nl->io_event_source = sd_event_source_unref(nl->io_event_source);

        nl->time_event_source = sd_event_source_unref(nl->time_event_source);

        nl->event = sd_event_unref(nl->event);
/* Accessor for the attached event loop.
 * NOTE(review): incomplete fragment — the `return nl->event;` and closing
 * brace are missing from view. */
sd_event* sd_netlink_get_event(sd_netlink *nl) {
        assert_return(nl, NULL);
/* Internal worker for match registration: subscribes the socket to each
 * multicast group, allocates a match slot and links it into the callback list.
 * NOTE(review): incomplete fragment — the nl/n_groups/type/cmd/userdata
 * parameters, declarations of r, the error checks, the -ENOMEM return for the
 * failed newdup(), and the final TAKE_PTR/return are missing from view. */
int netlink_add_match_internal(
                sd_netlink_slot **ret_slot,
                const uint32_t *groups,
                sd_netlink_message_handler_t callback,
                sd_netlink_destroy_t destroy_callback,
                const char *description) {

        _cleanup_free_ sd_netlink_slot *slot = NULL;

        assert(n_groups > 0);

        /* Reference-count each broadcast group on the socket. */
        for (size_t i = 0; i < n_groups; i++) {
                r = socket_broadcast_group_ref(nl, groups[i]);

        /* !ret_slot: a slot the caller never sees is "floating" and owned by nl. */
        r = netlink_slot_allocate(nl, !ret_slot, NETLINK_MATCH_CALLBACK, sizeof(struct match_callback),
                                  userdata, description, &slot);

        /* Keep a private copy of the group list; the caller's array may be transient. */
        slot->match_callback.groups = newdup(uint32_t, groups, n_groups);
        if (!slot->match_callback.groups)

        slot->match_callback.n_groups = n_groups;
        slot->match_callback.callback = callback;
        slot->match_callback.type = type;
        slot->match_callback.cmd = cmd;

        LIST_PREPEND(match_callbacks, nl->match_callbacks, &slot->match_callback);

        /* Set this at last. Otherwise, some failures in above call the destroy callback but some do not. */
        slot->destroy_callback = destroy_callback;
/* Public rtnetlink match registration: maps a message type to the rtnetlink
 * multicast groups that carry it, then delegates to
 * netlink_add_match_internal() (with cmd == 0, i.e. "any command").
 * NOTE(review): incomplete fragment — the rtnl/type/userdata parameters,
 * declaration of n_groups, the switch(type)/case labels selecting each group
 * set (including `groups = tc_groups;` for the TC case) and the default
 * -EOPNOTSUPP(?) branch are missing from view. */
int sd_netlink_add_match(
                sd_netlink_slot **ret_slot,
                sd_netlink_message_handler_t callback,
                sd_netlink_destroy_t destroy_callback,
                const char *description) {

        /* One static group table per supported rtnetlink object class. */
        static const uint32_t
                address_groups[] = { RTNLGRP_IPV4_IFADDR, RTNLGRP_IPV6_IFADDR, },
                link_groups[] = { RTNLGRP_LINK, },
                neighbor_groups[] = { RTNLGRP_NEIGH, },
                nexthop_groups[] = { RTNLGRP_NEXTHOP, },
                route_groups[] = { RTNLGRP_IPV4_ROUTE, RTNLGRP_IPV6_ROUTE, },
                rule_groups[] = { RTNLGRP_IPV4_RULE, RTNLGRP_IPV6_RULE, },
                tc_groups[] = { RTNLGRP_TC };
        const uint32_t *groups;

        assert_return(rtnl, -EINVAL);
        assert_return(callback, -EINVAL);
        assert_return(!netlink_pid_changed(rtnl), -ECHILD);

                groups = link_groups;
                n_groups = ELEMENTSOF(link_groups);

                groups = address_groups;
                n_groups = ELEMENTSOF(address_groups);

                groups = neighbor_groups;
                n_groups = ELEMENTSOF(neighbor_groups);

                groups = route_groups;
                n_groups = ELEMENTSOF(route_groups);

                groups = rule_groups;
                n_groups = ELEMENTSOF(rule_groups);

                groups = nexthop_groups;
                n_groups = ELEMENTSOF(nexthop_groups);

                n_groups = ELEMENTSOF(tc_groups);

        return netlink_add_match_internal(rtnl, ret_slot, groups, n_groups, type, 0, callback,
                                          destroy_callback, userdata, description);
/* Attaches a classic BPF filter to the netlink socket via SO_ATTACH_FILTER,
 * or detaches any installed filter when len == 0 (SO_DETACH_FILTER).
 * NOTE(review): truncated fragment — the `.len = len,` initializer of the
 * sock_fprog, the `return -errno;` error body, the success return and closing
 * brace lie beyond the visible source. */
int sd_netlink_attach_filter(sd_netlink *nl, size_t len, const struct sock_filter *filter) {
        assert_return(nl, -EINVAL);
        /* A non-empty program requires a filter array; len == 0 means detach. */
        assert_return(len == 0 || filter, -EINVAL);

        if (setsockopt(nl->fd, SOL_SOCKET,
                       len == 0 ? SO_DETACH_FILTER : SO_ATTACH_FILTER,
                       &(struct sock_fprog) {
                               /* Cast away const: the kernel API takes a non-const pointer. */
                               .filter = (struct sock_filter*) filter,
                       }, sizeof(struct sock_fprog)) < 0)