1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
5 #include "sd-netlink.h"
7 #include "alloc-util.h"
12 #include "netlink-genl.h"
13 #include "netlink-internal.h"
14 #include "netlink-slot.h"
15 #include "netlink-util.h"
16 #include "process-util.h"
17 #include "socket-util.h"
18 #include "string-util.h"
20 /* Some really high limit, to catch programming errors */
21 #define REPLY_CALLBACKS_MAX UINT16_MAX
23 static int netlink_new(sd_netlink
**ret
) {
24 _cleanup_(sd_netlink_unrefp
) sd_netlink
*nl
= NULL
;
26 assert_return(ret
, -EINVAL
);
28 nl
= new(sd_netlink
, 1);
35 .sockaddr
.nl
.nl_family
= AF_NETLINK
,
36 .original_pid
= getpid_cached(),
/* Kernel change notification messages have sequence number 0. We want to avoid that with our
 * own serials, in order not to get confused when matching up kernel replies to our earlier
 * requests.
 *
 * Moreover, when using netlink socket activation (i.e. where PID 1 binds an AF_NETLINK
 * socket for us and passes it to us across execve()) and we get restarted multiple times
 * while the socket sticks around we might get confused by replies from earlier runs coming
 * in late — which is pretty likely if we'd start our sequence numbers always from 1. Hence,
 * let's start with a value based on the system clock. This should make collisions much less
 * likely (though still theoretically possible). We use a 32-bit µs counter starting at boot
 * for this (and explicitly exclude the zero, see above). This counter will wrap around after
 * a bit more than 1h, but that's hopefully OK as the kernel shouldn't take that long to
 * reply to our requests.
 *
 * We only pick the initial start value this way. For each message we simply increase the
 * sequence number by 1. This means we could enqueue 1 netlink message per µs without risking
 * collisions, which should be OK.
 *
 * Note this means the serials will be in the range 1…UINT32_MAX here.
 *
 * (In an ideal world we'd attach the current serial counter to the netlink socket itself
 * somehow, to avoid all this, but I couldn't come up with a nice way to do this.) */
61 .serial
= (uint32_t) (now(CLOCK_MONOTONIC
) % UINT32_MAX
) + 1,
64 /* We guarantee that the read buffer has at least space for a message header */
65 if (!greedy_realloc((void**) &nl
->rbuffer
, sizeof(struct nlmsghdr
), sizeof(uint8_t)))
72 _public_
int sd_netlink_new_from_fd(sd_netlink
**ret
, int fd
) {
73 _cleanup_(sd_netlink_unrefp
) sd_netlink
*nl
= NULL
;
77 assert_return(ret
, -EINVAL
);
83 addrlen
= sizeof(nl
->sockaddr
);
85 if (getsockname(fd
, &nl
->sockaddr
.sa
, &addrlen
) < 0)
88 if (nl
->sockaddr
.nl
.nl_family
!= AF_NETLINK
)
97 _public_
int sd_netlink_open_fd(sd_netlink
**ret
, int fd
) {
98 _cleanup_(sd_netlink_unrefp
) sd_netlink
*nl
= NULL
;
101 assert_return(ret
, -EINVAL
);
102 assert_return(fd
>= 0, -EBADF
);
104 r
= netlink_new(&nl
);
108 r
= getsockopt_int(fd
, SOL_SOCKET
, SO_PROTOCOL
, &protocol
);
113 nl
->protocol
= protocol
;
115 r
= setsockopt_int(fd
, SOL_NETLINK
, NETLINK_EXT_ACK
, true);
117 log_debug_errno(r
, "sd-netlink: Failed to enable NETLINK_EXT_ACK option, ignoring: %m");
119 r
= setsockopt_int(fd
, SOL_NETLINK
, NETLINK_GET_STRICT_CHK
, true);
121 log_debug_errno(r
, "sd-netlink: Failed to enable NETLINK_GET_STRICT_CHK option, ignoring: %m");
125 nl
->fd
= -1; /* on failure, the caller remains owner of the fd, hence don't close it here */
135 _public_
int sd_netlink_open(sd_netlink
**ret
) {
136 return netlink_open_family(ret
, NETLINK_ROUTE
);
139 _public_
int sd_netlink_increase_rxbuf(sd_netlink
*nl
, size_t size
) {
140 assert_return(nl
, -EINVAL
);
141 assert_return(!netlink_pid_changed(nl
), -ECHILD
);
143 return fd_increase_rxbuf(nl
->fd
, size
);
146 static sd_netlink
*netlink_free(sd_netlink
*nl
) {
152 for (i
= 0; i
< nl
->rqueue_size
; i
++)
153 sd_netlink_message_unref(nl
->rqueue
[i
]);
156 for (i
= 0; i
< nl
->rqueue_partial_size
; i
++)
157 sd_netlink_message_unref(nl
->rqueue_partial
[i
]);
158 free(nl
->rqueue_partial
);
162 while ((s
= nl
->slots
)) {
164 netlink_slot_disconnect(s
, true);
166 hashmap_free(nl
->reply_callbacks
);
167 prioq_free(nl
->reply_callbacks_prioq
);
169 sd_event_source_unref(nl
->io_event_source
);
170 sd_event_source_unref(nl
->time_event_source
);
171 sd_event_unref(nl
->event
);
173 hashmap_free(nl
->broadcast_group_refs
);
175 genl_clear_family(nl
);
/* Instantiates the reference-counting helpers for sd_netlink, with netlink_free as the third
 * macro argument. NOTE(review): presumably this expands to sd_netlink_ref()/sd_netlink_unref()
 * and invokes netlink_free() once the last reference is dropped — the macro definition is not
 * visible here, confirm against it. */
181 DEFINE_TRIVIAL_REF_UNREF_FUNC(sd_netlink
, sd_netlink
, netlink_free
);
183 _public_
int sd_netlink_send(
185 sd_netlink_message
*message
,
190 assert_return(nl
, -EINVAL
);
191 assert_return(!netlink_pid_changed(nl
), -ECHILD
);
192 assert_return(message
, -EINVAL
);
193 assert_return(!message
->sealed
, -EPERM
);
195 netlink_seal_message(nl
, message
);
197 r
= socket_write_message(nl
, message
);
202 *serial
= message_get_serial(message
);
207 int netlink_rqueue_make_room(sd_netlink
*nl
) {
210 if (nl
->rqueue_size
>= NETLINK_RQUEUE_MAX
)
211 return log_debug_errno(SYNTHETIC_ERRNO(ENOBUFS
),
212 "sd-netlink: exhausted the read queue size (%d)",
215 if (!GREEDY_REALLOC(nl
->rqueue
, nl
->rqueue_size
+ 1))
221 int netlink_rqueue_partial_make_room(sd_netlink
*nl
) {
224 if (nl
->rqueue_partial_size
>= NETLINK_RQUEUE_MAX
)
225 return log_debug_errno(SYNTHETIC_ERRNO(ENOBUFS
),
226 "sd-netlink: exhausted the partial read queue size (%d)",
229 if (!GREEDY_REALLOC(nl
->rqueue_partial
, nl
->rqueue_partial_size
+ 1))
235 static int dispatch_rqueue(sd_netlink
*nl
, sd_netlink_message
**message
) {
241 if (nl
->rqueue_size
<= 0) {
242 /* Try to read a new message */
243 r
= socket_read_message(nl
);
244 if (r
== -ENOBUFS
) { /* FIXME: ignore buffer overruns for now */
245 log_debug_errno(r
, "sd-netlink: Got ENOBUFS from netlink socket, ignoring.");
252 /* Dispatch a queued message */
253 *message
= nl
->rqueue
[0];
255 memmove(nl
->rqueue
, nl
->rqueue
+ 1, sizeof(sd_netlink_message
*) * nl
->rqueue_size
);
260 static int process_timeout(sd_netlink
*nl
) {
261 _cleanup_(sd_netlink_message_unrefp
) sd_netlink_message
*m
= NULL
;
262 struct reply_callback
*c
;
263 sd_netlink_slot
*slot
;
269 c
= prioq_peek(nl
->reply_callbacks_prioq
);
273 n
= now(CLOCK_MONOTONIC
);
277 r
= message_new_synthetic_error(nl
, -ETIMEDOUT
, c
->serial
, &m
);
281 assert_se(prioq_pop(nl
->reply_callbacks_prioq
) == c
);
283 hashmap_remove(nl
->reply_callbacks
, UINT32_TO_PTR(c
->serial
));
285 slot
= container_of(c
, sd_netlink_slot
, reply_callback
);
287 r
= c
->callback(nl
, m
, slot
->userdata
);
289 log_debug_errno(r
, "sd-netlink: timedout callback %s%s%sfailed: %m",
290 slot
->description
? "'" : "",
291 strempty(slot
->description
),
292 slot
->description
? "' " : "");
295 netlink_slot_disconnect(slot
, true);
300 static int process_reply(sd_netlink
*nl
, sd_netlink_message
*m
) {
301 struct reply_callback
*c
;
302 sd_netlink_slot
*slot
;
310 serial
= message_get_serial(m
);
311 c
= hashmap_remove(nl
->reply_callbacks
, UINT32_TO_PTR(serial
));
315 if (c
->timeout
!= 0) {
316 prioq_remove(nl
->reply_callbacks_prioq
, c
, &c
->prioq_idx
);
320 r
= sd_netlink_message_get_type(m
, &type
);
324 if (type
== NLMSG_DONE
)
327 slot
= container_of(c
, sd_netlink_slot
, reply_callback
);
329 r
= c
->callback(nl
, m
, slot
->userdata
);
331 log_debug_errno(r
, "sd-netlink: reply callback %s%s%sfailed: %m",
332 slot
->description
? "'" : "",
333 strempty(slot
->description
),
334 slot
->description
? "' " : "");
337 netlink_slot_disconnect(slot
, true);
342 static int process_match(sd_netlink
*nl
, sd_netlink_message
*m
) {
350 r
= sd_netlink_message_get_type(m
, &type
);
354 if (m
->protocol
== NETLINK_GENERIC
) {
355 r
= sd_genl_message_get_command(nl
, m
, &cmd
);
361 LIST_FOREACH(match_callbacks
, c
, nl
->match_callbacks
) {
362 sd_netlink_slot
*slot
;
367 if (c
->cmd
!= 0 && c
->cmd
!= cmd
)
370 for (size_t i
= 0; i
< c
->n_groups
; i
++)
371 if (c
->groups
[i
] == m
->multicast_group
) {
379 slot
= container_of(c
, sd_netlink_slot
, match_callback
);
381 r
= c
->callback(nl
, m
, slot
->userdata
);
383 log_debug_errno(r
, "sd-netlink: match callback %s%s%sfailed: %m",
384 slot
->description
? "'" : "",
385 strempty(slot
->description
),
386 slot
->description
? "' " : "");
394 static int process_running(sd_netlink
*nl
, sd_netlink_message
**ret
) {
395 _cleanup_(sd_netlink_message_unrefp
) sd_netlink_message
*m
= NULL
;
400 r
= process_timeout(nl
);
404 r
= dispatch_rqueue(nl
, &m
);
410 if (sd_netlink_message_is_broadcast(m
))
411 r
= process_match(nl
, m
);
413 r
= process_reply(nl
, m
);
432 int sd_netlink_process(sd_netlink
*nl
, sd_netlink_message
**ret
) {
433 NETLINK_DONT_DESTROY(nl
);
436 assert_return(nl
, -EINVAL
);
437 assert_return(!netlink_pid_changed(nl
), -ECHILD
);
438 assert_return(!nl
->processing
, -EBUSY
);
440 nl
->processing
= true;
441 r
= process_running(nl
, ret
);
442 nl
->processing
= false;
447 static usec_t
calc_elapse(uint64_t usec
) {
448 if (usec
== UINT64_MAX
)
452 usec
= NETLINK_DEFAULT_TIMEOUT_USEC
;
454 return usec_add(now(CLOCK_MONOTONIC
), usec
);
457 static int netlink_poll(sd_netlink
*nl
, bool need_more
, usec_t timeout_usec
) {
458 usec_t m
= USEC_INFINITY
;
463 e
= sd_netlink_get_events(nl
);
468 /* Caller wants more data, and doesn't care about
469 * what's been read or any other timeouts. */
474 /* Caller wants to process if there is something to
475 * process, but doesn't care otherwise */
477 r
= sd_netlink_get_timeout(nl
, &until
);
481 m
= usec_sub_unsigned(until
, now(CLOCK_MONOTONIC
));
484 r
= fd_wait_for_event(nl
->fd
, e
, MIN(m
, timeout_usec
));
491 int sd_netlink_wait(sd_netlink
*nl
, uint64_t timeout_usec
) {
492 assert_return(nl
, -EINVAL
);
493 assert_return(!netlink_pid_changed(nl
), -ECHILD
);
495 if (nl
->rqueue_size
> 0)
498 return netlink_poll(nl
, false, timeout_usec
);
501 static int timeout_compare(const void *a
, const void *b
) {
502 const struct reply_callback
*x
= a
, *y
= b
;
504 if (x
->timeout
!= 0 && y
->timeout
== 0)
507 if (x
->timeout
== 0 && y
->timeout
!= 0)
510 return CMP(x
->timeout
, y
->timeout
);
513 _public_
int sd_netlink_call_async(
515 sd_netlink_slot
**ret_slot
,
516 sd_netlink_message
*m
,
517 sd_netlink_message_handler_t callback
,
518 sd_netlink_destroy_t destroy_callback
,
521 const char *description
) {
523 _cleanup_free_ sd_netlink_slot
*slot
= NULL
;
526 assert_return(nl
, -EINVAL
);
527 assert_return(m
, -EINVAL
);
528 assert_return(callback
, -EINVAL
);
529 assert_return(!netlink_pid_changed(nl
), -ECHILD
);
531 if (hashmap_size(nl
->reply_callbacks
) >= REPLY_CALLBACKS_MAX
)
534 r
= hashmap_ensure_allocated(&nl
->reply_callbacks
, &trivial_hash_ops
);
538 if (usec
!= UINT64_MAX
) {
539 r
= prioq_ensure_allocated(&nl
->reply_callbacks_prioq
, timeout_compare
);
544 r
= netlink_slot_allocate(nl
, !ret_slot
, NETLINK_REPLY_CALLBACK
, sizeof(struct reply_callback
), userdata
, description
, &slot
);
548 slot
->reply_callback
.callback
= callback
;
549 slot
->reply_callback
.timeout
= calc_elapse(usec
);
551 k
= sd_netlink_send(nl
, m
, &slot
->reply_callback
.serial
);
555 r
= hashmap_put(nl
->reply_callbacks
, UINT32_TO_PTR(slot
->reply_callback
.serial
), &slot
->reply_callback
);
559 if (slot
->reply_callback
.timeout
!= 0) {
560 r
= prioq_put(nl
->reply_callbacks_prioq
, &slot
->reply_callback
, &slot
->reply_callback
.prioq_idx
);
562 (void) hashmap_remove(nl
->reply_callbacks
, UINT32_TO_PTR(slot
->reply_callback
.serial
));
567 /* Set this at last. Otherwise, some failures in above call the destroy callback but some do not. */
568 slot
->destroy_callback
= destroy_callback
;
578 _public_
int sd_netlink_read(
582 sd_netlink_message
**ret
) {
587 assert_return(nl
, -EINVAL
);
588 assert_return(!netlink_pid_changed(nl
), -ECHILD
);
590 timeout
= calc_elapse(usec
);
595 for (unsigned i
= 0; i
< nl
->rqueue_size
; i
++) {
596 _cleanup_(sd_netlink_message_unrefp
) sd_netlink_message
*incoming
= NULL
;
597 uint32_t received_serial
;
600 received_serial
= message_get_serial(nl
->rqueue
[i
]);
601 if (received_serial
!= serial
)
604 incoming
= nl
->rqueue
[i
];
606 /* found a match, remove from rqueue and return it */
607 memmove(nl
->rqueue
+ i
, nl
->rqueue
+ i
+ 1,
608 sizeof(sd_netlink_message
*) * (nl
->rqueue_size
- i
- 1));
611 r
= sd_netlink_message_get_errno(incoming
);
615 r
= sd_netlink_message_get_type(incoming
, &type
);
619 if (type
== NLMSG_DONE
) {
625 *ret
= TAKE_PTR(incoming
);
629 r
= socket_read_message(nl
);
633 /* received message, so try to process straight away */
639 n
= now(CLOCK_MONOTONIC
);
643 left
= usec_sub_unsigned(timeout
, n
);
645 left
= USEC_INFINITY
;
647 r
= netlink_poll(nl
, true, left
);
655 _public_
int sd_netlink_call(
657 sd_netlink_message
*message
,
659 sd_netlink_message
**ret
) {
664 assert_return(nl
, -EINVAL
);
665 assert_return(!netlink_pid_changed(nl
), -ECHILD
);
666 assert_return(message
, -EINVAL
);
668 r
= sd_netlink_send(nl
, message
, &serial
);
672 return sd_netlink_read(nl
, serial
, usec
, ret
);
675 _public_
int sd_netlink_get_events(sd_netlink
*nl
) {
676 assert_return(nl
, -EINVAL
);
677 assert_return(!netlink_pid_changed(nl
), -ECHILD
);
679 return nl
->rqueue_size
== 0 ? POLLIN
: 0;
682 _public_
int sd_netlink_get_timeout(sd_netlink
*nl
, uint64_t *timeout_usec
) {
683 struct reply_callback
*c
;
685 assert_return(nl
, -EINVAL
);
686 assert_return(timeout_usec
, -EINVAL
);
687 assert_return(!netlink_pid_changed(nl
), -ECHILD
);
689 if (nl
->rqueue_size
> 0) {
694 c
= prioq_peek(nl
->reply_callbacks_prioq
);
696 *timeout_usec
= UINT64_MAX
;
700 *timeout_usec
= c
->timeout
;
705 static int io_callback(sd_event_source
*s
, int fd
, uint32_t revents
, void *userdata
) {
706 sd_netlink
*nl
= userdata
;
711 r
= sd_netlink_process(nl
, NULL
);
718 static int time_callback(sd_event_source
*s
, uint64_t usec
, void *userdata
) {
719 sd_netlink
*nl
= userdata
;
724 r
= sd_netlink_process(nl
, NULL
);
731 static int prepare_callback(sd_event_source
*s
, void *userdata
) {
732 sd_netlink
*nl
= userdata
;
739 e
= sd_netlink_get_events(nl
);
743 r
= sd_event_source_set_io_events(nl
->io_event_source
, e
);
747 r
= sd_netlink_get_timeout(nl
, &until
);
753 j
= sd_event_source_set_time(nl
->time_event_source
, until
);
758 r
= sd_event_source_set_enabled(nl
->time_event_source
, r
> 0);
765 _public_
int sd_netlink_attach_event(sd_netlink
*nl
, sd_event
*event
, int64_t priority
) {
768 assert_return(nl
, -EINVAL
);
769 assert_return(!nl
->event
, -EBUSY
);
771 assert(!nl
->io_event_source
);
772 assert(!nl
->time_event_source
);
775 nl
->event
= sd_event_ref(event
);
777 r
= sd_event_default(&nl
->event
);
782 r
= sd_event_add_io(nl
->event
, &nl
->io_event_source
, nl
->fd
, 0, io_callback
, nl
);
786 r
= sd_event_source_set_priority(nl
->io_event_source
, priority
);
790 r
= sd_event_source_set_description(nl
->io_event_source
, "netlink-receive-message");
794 r
= sd_event_source_set_prepare(nl
->io_event_source
, prepare_callback
);
798 r
= sd_event_add_time(nl
->event
, &nl
->time_event_source
, CLOCK_MONOTONIC
, 0, 0, time_callback
, nl
);
802 r
= sd_event_source_set_priority(nl
->time_event_source
, priority
);
806 r
= sd_event_source_set_description(nl
->time_event_source
, "netlink-timer");
813 sd_netlink_detach_event(nl
);
817 _public_
int sd_netlink_detach_event(sd_netlink
*nl
) {
818 assert_return(nl
, -EINVAL
);
819 assert_return(nl
->event
, -ENXIO
);
821 nl
->io_event_source
= sd_event_source_unref(nl
->io_event_source
);
823 nl
->time_event_source
= sd_event_source_unref(nl
->time_event_source
);
825 nl
->event
= sd_event_unref(nl
->event
);
830 int netlink_add_match_internal(
832 sd_netlink_slot
**ret_slot
,
833 const uint32_t *groups
,
837 sd_netlink_message_handler_t callback
,
838 sd_netlink_destroy_t destroy_callback
,
840 const char *description
) {
842 _cleanup_free_ sd_netlink_slot
*slot
= NULL
;
846 assert(n_groups
> 0);
848 for (size_t i
= 0; i
< n_groups
; i
++) {
849 r
= socket_broadcast_group_ref(nl
, groups
[i
]);
854 r
= netlink_slot_allocate(nl
, !ret_slot
, NETLINK_MATCH_CALLBACK
, sizeof(struct match_callback
),
855 userdata
, description
, &slot
);
859 slot
->match_callback
.groups
= newdup(uint32_t, groups
, n_groups
);
860 if (!slot
->match_callback
.groups
)
863 slot
->match_callback
.n_groups
= n_groups
;
864 slot
->match_callback
.callback
= callback
;
865 slot
->match_callback
.type
= type
;
866 slot
->match_callback
.cmd
= cmd
;
868 LIST_PREPEND(match_callbacks
, nl
->match_callbacks
, &slot
->match_callback
);
870 /* Set this at last. Otherwise, some failures in above call the destroy callback but some do not. */
871 slot
->destroy_callback
= destroy_callback
;
880 _public_
int sd_netlink_add_match(
882 sd_netlink_slot
**ret_slot
,
884 sd_netlink_message_handler_t callback
,
885 sd_netlink_destroy_t destroy_callback
,
887 const char *description
) {
889 static const uint32_t
890 address_groups
[] = { RTNLGRP_IPV4_IFADDR
, RTNLGRP_IPV6_IFADDR
, },
891 link_groups
[] = { RTNLGRP_LINK
, },
892 neighbor_groups
[] = { RTNLGRP_NEIGH
, },
893 nexthop_groups
[] = { RTNLGRP_NEXTHOP
, },
894 route_groups
[] = { RTNLGRP_IPV4_ROUTE
, RTNLGRP_IPV6_ROUTE
, },
895 rule_groups
[] = { RTNLGRP_IPV4_RULE
, RTNLGRP_IPV6_RULE
, },
896 tc_groups
[] = { RTNLGRP_TC
};
897 const uint32_t *groups
;
900 assert_return(rtnl
, -EINVAL
);
901 assert_return(callback
, -EINVAL
);
902 assert_return(!netlink_pid_changed(rtnl
), -ECHILD
);
907 groups
= link_groups
;
908 n_groups
= ELEMENTSOF(link_groups
);
912 groups
= address_groups
;
913 n_groups
= ELEMENTSOF(address_groups
);
917 groups
= neighbor_groups
;
918 n_groups
= ELEMENTSOF(neighbor_groups
);
922 groups
= route_groups
;
923 n_groups
= ELEMENTSOF(route_groups
);
927 groups
= rule_groups
;
928 n_groups
= ELEMENTSOF(rule_groups
);
932 groups
= nexthop_groups
;
933 n_groups
= ELEMENTSOF(nexthop_groups
);
940 n_groups
= ELEMENTSOF(tc_groups
);
946 return netlink_add_match_internal(rtnl
, ret_slot
, groups
, n_groups
, type
, 0, callback
,
947 destroy_callback
, userdata
, description
);
950 _public_
int sd_netlink_attach_filter(sd_netlink
*nl
, size_t len
, struct sock_filter
*filter
) {
951 assert_return(nl
, -EINVAL
);
952 assert_return(len
== 0 || filter
, -EINVAL
);
954 if (setsockopt(nl
->fd
, SOL_SOCKET
,
955 len
== 0 ? SO_DETACH_FILTER
: SO_ATTACH_FILTER
,
956 &(struct sock_fprog
) {
959 }, sizeof(struct sock_fprog
)) < 0)