/***
  This file is part of systemd.

  Copyright 2017 Lennart Poettering

  systemd is free software; you can redistribute it and/or modify it
  under the terms of the GNU Lesser General Public License as published by
  the Free Software Foundation; either version 2.1 of the License, or
  (at your option) any later version.

  systemd is distributed in the hope that it will be useful, but
  WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  Lesser General Public License for more details.

  You should have received a copy of the GNU Lesser General Public License
  along with systemd; If not, see <http://www.gnu.org/licenses/>.
***/
21 #include <selinux/selinux.h>
24 #include "alloc-util.h"
25 #include "audit-util.h"
26 #include "cgroup-util.h"
27 #include "journald-context.h"
28 #include "process-util.h"
29 #include "string-util.h"
30 #include "user-util.h"
32 /* This implements a metadata cache for clients, which are identified by their PID. Requesting metadata through /proc
33 * is expensive, hence let's cache the data if we can. Note that this means the metadata might be out-of-date when we
34 * store it, but it might already be anyway, as we request the data asynchronously from /proc at a different time the
35 * log entry was originally created. We hence just increase the "window of inaccuracy" a bit.
37 * The cache is indexed by the PID. Entries may be "pinned" in the cache, in which case the entries are not removed
38 * until they are unpinned. Unpinned entries are kept around until cache pressure is seen. Cache entries older than 5s
39 * are never used (a sad attempt to deal with the UNIX weakness of PIDs reuse), cache entries older than 1s are
40 * refreshed in an incremental way (meaning: data is reread from /proc, but any old data we can't refresh is not
41 * flushed out). Data newer than 1s is used immediately without refresh.
43 * Log stream clients (i.e. all clients using the AF_UNIX/SOCK_STREAM stdout/stderr transport) will pin a cache entry
44 * as long as their socket is connected. Note that cache entries are shared between different transports. That means a
45 * cache entry pinned for the stream connection logic may be reused for the syslog or native protocols.
47 * Caching metadata like this has two major benefits:
49 * 1. Reading metadata is expensive, and we can thus substantially speed up log processing under flood.
51 * 2. Because metadata caching is shared between stream and datagram transports and stream connections pin a cache
52 * entry there's a good chance we can properly map a substantial set of datagram log messages to their originating
53 * service, as all services (unless explicitly configured otherwise) will have their stdout/stderr connected to a
54 * stream connection. This should improve cases where a service process logs immediately before exiting and we
55 * previously had trouble associating the log message with the service.
 * NB: With and without the metadata cache: the implicitly added entry metadata in the journal (with the exception of
 * UID/PID/GID and SELinux label) must be understood as possibly slightly out of sync (i.e. sometimes slightly older
 * and sometimes slightly newer than what was current at the log event).
 */
62 /* We refresh every 1s */
63 #define REFRESH_USEC (1*USEC_PER_SEC)
65 /* Data older than 5s we flush out */
66 #define MAX_USEC (5*USEC_PER_SEC)
68 /* Keep at most 16K entries in the cache. (Note though that this limit may be violated if enough streams pin entries in
69 * the cache, in which case we *do* permit this limit to be breached. That's safe however, as the number of stream
70 * clients itself is limited.) */
71 #define CACHE_MAX (16*1024)
73 static int client_context_compare(const void *a
, const void *b
) {
74 const ClientContext
*x
= a
, *y
= b
;
76 if (x
->timestamp
< y
->timestamp
)
78 if (x
->timestamp
> y
->timestamp
)
89 static int client_context_new(Server
*s
, pid_t pid
, ClientContext
**ret
) {
94 assert(pid_is_valid(pid
));
97 r
= hashmap_ensure_allocated(&s
->client_contexts
, NULL
);
101 r
= prioq_ensure_allocated(&s
->client_contexts_lru
, client_context_compare
);
105 c
= new0(ClientContext
, 1);
111 c
->uid
= UID_INVALID
;
112 c
->gid
= GID_INVALID
;
113 c
->auditid
= AUDIT_SESSION_INVALID
;
114 c
->loginuid
= UID_INVALID
;
115 c
->owner_uid
= UID_INVALID
;
116 c
->lru_index
= PRIOQ_IDX_NULL
;
117 c
->timestamp
= USEC_INFINITY
;
119 r
= hashmap_put(s
->client_contexts
, PID_TO_PTR(pid
), c
);
129 static void client_context_reset(ClientContext
*c
) {
132 c
->timestamp
= USEC_INFINITY
;
134 c
->uid
= UID_INVALID
;
135 c
->gid
= GID_INVALID
;
137 c
->comm
= mfree(c
->comm
);
138 c
->exe
= mfree(c
->exe
);
139 c
->cmdline
= mfree(c
->cmdline
);
140 c
->capeff
= mfree(c
->capeff
);
142 c
->auditid
= AUDIT_SESSION_INVALID
;
143 c
->loginuid
= UID_INVALID
;
145 c
->cgroup
= mfree(c
->cgroup
);
146 c
->session
= mfree(c
->session
);
147 c
->owner_uid
= UID_INVALID
;
148 c
->unit
= mfree(c
->unit
);
149 c
->user_unit
= mfree(c
->user_unit
);
150 c
->slice
= mfree(c
->slice
);
151 c
->user_slice
= mfree(c
->user_slice
);
153 c
->invocation_id
= SD_ID128_NULL
;
155 c
->label
= mfree(c
->label
);
159 static ClientContext
* client_context_free(Server
*s
, ClientContext
*c
) {
165 assert_se(hashmap_remove(s
->client_contexts
, PID_TO_PTR(c
->pid
)) == c
);
168 assert_se(prioq_remove(s
->client_contexts_lru
, c
, &c
->lru_index
) >= 0);
170 client_context_reset(c
);
175 static void client_context_read_uid_gid(ClientContext
*c
, const struct ucred
*ucred
) {
177 assert(pid_is_valid(c
->pid
));
179 /* The ucred data passed in is always the most current and accurate, if we have any. Use it. */
180 if (ucred
&& uid_is_valid(ucred
->uid
))
183 (void) get_process_uid(c
->pid
, &c
->uid
);
185 if (ucred
&& gid_is_valid(ucred
->gid
))
188 (void) get_process_gid(c
->pid
, &c
->gid
);
191 static void client_context_read_basic(ClientContext
*c
) {
195 assert(pid_is_valid(c
->pid
));
197 if (get_process_comm(c
->pid
, &t
) >= 0)
198 free_and_replace(c
->comm
, t
);
200 if (get_process_exe(c
->pid
, &t
) >= 0)
201 free_and_replace(c
->exe
, t
);
203 if (get_process_cmdline(c
->pid
, 0, false, &t
) >= 0)
204 free_and_replace(c
->cmdline
, t
);
206 if (get_process_capeff(c
->pid
, &t
) >= 0)
207 free_and_replace(c
->capeff
, t
);
210 static int client_context_read_label(
212 const char *label
, size_t label_size
) {
215 assert(pid_is_valid(c
->pid
));
216 assert(label_size
== 0 || label
);
218 if (label_size
> 0) {
221 /* If we got an SELinux label passed in it counts. */
223 l
= newdup_suffix0(char, label
, label_size
);
227 free_and_replace(c
->label
, l
);
228 c
->label_size
= label_size
;
234 /* If we got no SELinux label passed in, let's try to acquire one */
236 if (getpidcon(c
->pid
, &con
) >= 0) {
237 free_and_replace(c
->label
, con
);
238 c
->label_size
= strlen(c
->label
);
246 static int client_context_read_cgroup(Server
*s
, ClientContext
*c
, const char *unit_id
) {
252 /* Try to acquire the current cgroup path */
253 r
= cg_pid_get_path_shifted(c
->pid
, s
->cgroup_root
, &t
);
256 /* If that didn't work, we use the unit ID passed in as fallback, if we have nothing cached yet */
257 if (unit_id
&& !c
->unit
) {
258 c
->unit
= strdup(unit_id
);
266 /* Let's shortcut this if the cgroup path didn't change */
267 if (streq_ptr(c
->cgroup
, t
)) {
272 free_and_replace(c
->cgroup
, t
);
274 (void) cg_path_get_session(c
->cgroup
, &t
);
275 free_and_replace(c
->session
, t
);
277 if (cg_path_get_owner_uid(c
->cgroup
, &c
->owner_uid
) < 0)
278 c
->owner_uid
= UID_INVALID
;
280 (void) cg_path_get_unit(c
->cgroup
, &t
);
281 free_and_replace(c
->unit
, t
);
283 (void) cg_path_get_user_unit(c
->cgroup
, &t
);
284 free_and_replace(c
->user_unit
, t
);
286 (void) cg_path_get_slice(c
->cgroup
, &t
);
287 free_and_replace(c
->slice
, t
);
289 (void) cg_path_get_user_slice(c
->cgroup
, &t
);
290 free_and_replace(c
->user_slice
, t
);
295 static int client_context_read_invocation_id(
299 _cleanup_free_
char *escaped
= NULL
, *slice_path
= NULL
;
300 char ids
[SD_ID128_STRING_MAX
];
307 /* Read the invocation ID of a unit off a unit. It's stored in the "trusted.invocation_id" extended attribute
308 * on the cgroup path. */
310 if (!c
->unit
|| !c
->slice
)
313 r
= cg_slice_to_path(c
->slice
, &slice_path
);
317 escaped
= cg_escape(c
->unit
);
321 p
= strjoina(s
->cgroup_root
, "/", slice_path
, "/", escaped
);
325 r
= cg_get_xattr(SYSTEMD_CGROUP_CONTROLLER
, p
, "trusted.invocation_id", ids
, 32);
332 return sd_id128_from_string(ids
, &c
->invocation_id
);
335 static void client_context_really_refresh(
338 const struct ucred
*ucred
,
339 const char *label
, size_t label_size
,
345 assert(pid_is_valid(c
->pid
));
347 if (timestamp
== USEC_INFINITY
)
348 timestamp
= now(CLOCK_MONOTONIC
);
350 client_context_read_uid_gid(c
, ucred
);
351 client_context_read_basic(c
);
352 (void) client_context_read_label(c
, label
, label_size
);
354 (void) audit_session_from_pid(c
->pid
, &c
->auditid
);
355 (void) audit_loginuid_from_pid(c
->pid
, &c
->loginuid
);
357 (void) client_context_read_cgroup(s
, c
, unit_id
);
358 (void) client_context_read_invocation_id(s
, c
);
360 c
->timestamp
= timestamp
;
363 assert(c
->n_ref
== 0);
364 assert_se(prioq_reshuffle(s
->client_contexts_lru
, c
, &c
->lru_index
) >= 0);
368 void client_context_maybe_refresh(
371 const struct ucred
*ucred
,
372 const char *label
, size_t label_size
,
379 if (timestamp
== USEC_INFINITY
)
380 timestamp
= now(CLOCK_MONOTONIC
);
382 /* No cached data so far? Let's fill it up */
383 if (c
->timestamp
== USEC_INFINITY
)
386 /* If the data isn't pinned and if the cashed data is older than the upper limit, we flush it out
387 * entirely. This follows the logic that as long as an entry is pinned the PID reuse is unlikely. */
388 if (c
->n_ref
== 0 && c
->timestamp
+ MAX_USEC
< timestamp
) {
389 client_context_reset(c
);
393 /* If the data is older than the lower limit, we refresh, but keep the old data for all we can't update */
394 if (c
->timestamp
+ REFRESH_USEC
< timestamp
)
397 /* If the data passed along doesn't match the cached data we also do a refresh */
398 if (ucred
&& uid_is_valid(ucred
->uid
) && c
->uid
!= ucred
->uid
)
401 if (ucred
&& gid_is_valid(ucred
->gid
) && c
->gid
!= ucred
->gid
)
404 if (label_size
> 0 && (label_size
!= c
->label_size
|| memcmp(label
, c
->label
, label_size
) != 0))
410 client_context_really_refresh(s
, c
, ucred
, label
, label_size
, unit_id
, timestamp
);
413 static void client_context_try_shrink_to(Server
*s
, size_t limit
) {
416 /* Bring the number of cache entries below the indicated limit, so that we can create a new entry without
417 * breaching the limit. Note that we only flush out entries that aren't pinned here. This means the number of
418 * cache entries may very well grow beyond the limit, if all entries stored remain pinned. */
420 while (hashmap_size(s
->client_contexts
) > limit
) {
423 c
= prioq_pop(s
->client_contexts_lru
);
425 break; /* All remaining entries are pinned, give up */
428 assert(c
->n_ref
== 0);
432 client_context_free(s
, c
);
436 void client_context_flush_all(Server
*s
) {
439 /* Flush out all remaining entries. This assumes all references are already dropped. */
441 s
->my_context
= client_context_release(s
, s
->my_context
);
442 s
->pid1_context
= client_context_release(s
, s
->pid1_context
);
444 client_context_try_shrink_to(s
, 0);
446 assert(prioq_size(s
->client_contexts_lru
) == 0);
447 assert(hashmap_size(s
->client_contexts
) == 0);
449 s
->client_contexts_lru
= prioq_free(s
->client_contexts_lru
);
450 s
->client_contexts
= hashmap_free(s
->client_contexts
);
453 static int client_context_get_internal(
456 const struct ucred
*ucred
,
457 const char *label
, size_t label_len
,
460 ClientContext
**ret
) {
468 if (!pid_is_valid(pid
))
471 c
= hashmap_get(s
->client_contexts
, PID_TO_PTR(pid
));
476 /* The entry wasn't pinned so far, let's remove it from the LRU list then */
477 assert(c
->n_ref
== 0);
478 assert_se(prioq_remove(s
->client_contexts_lru
, c
, &c
->lru_index
) >= 0);
485 client_context_maybe_refresh(s
, c
, ucred
, label
, label_len
, unit_id
, USEC_INFINITY
);
491 client_context_try_shrink_to(s
, CACHE_MAX
-1);
493 r
= client_context_new(s
, pid
, &c
);
500 r
= prioq_put(s
->client_contexts_lru
, c
, &c
->lru_index
);
502 client_context_free(s
, c
);
509 client_context_really_refresh(s
, c
, ucred
, label
, label_len
, unit_id
, USEC_INFINITY
);
515 int client_context_get(
518 const struct ucred
*ucred
,
519 const char *label
, size_t label_len
,
521 ClientContext
**ret
) {
523 return client_context_get_internal(s
, pid
, ucred
, label
, label_len
, unit_id
, false, ret
);
526 int client_context_acquire(
529 const struct ucred
*ucred
,
530 const char *label
, size_t label_len
,
532 ClientContext
**ret
) {
534 return client_context_get_internal(s
, pid
, ucred
, label
, label_len
, unit_id
, true, ret
);
537 ClientContext
*client_context_release(Server
*s
, ClientContext
*c
) {
543 assert(c
->n_ref
> 0);
550 /* The entry is not pinned anymore, let's add it to the LRU prioq if we can. If we can't we'll drop it
553 if (prioq_put(s
->client_contexts_lru
, c
, &c
->lru_index
) < 0)
554 client_context_free(s
, c
);
561 void client_context_acquire_default(Server
*s
) {
566 /* Ensure that our own and PID1's contexts are always pinned. Our own context is particularly useful to
567 * generate driver messages. */
569 if (!s
->my_context
) {
570 struct ucred ucred
= {
571 .pid
= getpid_cached(),
576 r
= client_context_acquire(s
, ucred
.pid
, &ucred
, NULL
, 0, NULL
, &s
->my_context
);
578 log_warning_errno(r
, "Failed to acquire our own context, ignoring: %m");
581 if (!s
->pid1_context
) {
583 r
= client_context_acquire(s
, 1, NULL
, NULL
, 0, NULL
, &s
->pid1_context
);
585 log_warning_errno(r
, "Failed to acquire PID1's context, ignoring: %m");