1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
4 #include <selinux/selinux.h>
7 #include "alloc-util.h"
8 #include "audit-util.h"
9 #include "cgroup-util.h"
14 #include "iovec-util.h"
15 #include "journal-internal.h"
16 #include "journal-util.h"
17 #include "journald-client.h"
18 #include "journald-context.h"
19 #include "parse-util.h"
20 #include "path-util.h"
21 #include "process-util.h"
22 #include "procfs-util.h"
23 #include "string-util.h"
24 #include "syslog-util.h"
25 #include "unaligned.h"
26 #include "user-util.h"
28 /* This implements a metadata cache for clients, which are identified by their PID. Requesting metadata through /proc
29 * is expensive, hence let's cache the data if we can. Note that this means the metadata might be out-of-date when we
30 * store it, but it might already be anyway, as we request the data asynchronously from /proc at a different time the
31 * log entry was originally created. We hence just increase the "window of inaccuracy" a bit.
33 * The cache is indexed by the PID. Entries may be "pinned" in the cache, in which case the entries are not removed
34 * until they are unpinned. Unpinned entries are kept around until cache pressure is seen. Cache entries older than 5s
35 * are never used (a sad attempt to deal with the UNIX weakness of PID reuse), cache entries older than 1s are
36 * refreshed in an incremental way (meaning: data is reread from /proc, but any old data we can't refresh is not
37 * flushed out). Data newer than 1s is used immediately without refresh.
39 * Log stream clients (i.e. all clients using the AF_UNIX/SOCK_STREAM stdout/stderr transport) will pin a cache entry
40 * as long as their socket is connected. Note that cache entries are shared between different transports. That means a
41 * cache entry pinned for the stream connection logic may be reused for the syslog or native protocols.
43 * Caching metadata like this has two major benefits:
45 * 1. Reading metadata is expensive, and we can thus substantially speed up log processing under flood.
47 * 2. Because metadata caching is shared between stream and datagram transports and stream connections pin a cache
48 * entry there's a good chance we can properly map a substantial set of datagram log messages to their originating
49 * service, as all services (unless explicitly configured otherwise) will have their stdout/stderr connected to a
50 * stream connection. This should improve cases where a service process logs immediately before exiting and we
51 * previously had trouble associating the log message with the service.
53 * NB: With and without the metadata cache: the implicitly added entry metadata in the journal (with the exception of
54 * UID/PID/GID and SELinux label) must be understood as possibly slightly out of sync (i.e. sometimes slightly older
55 * and sometimes slightly newer than what was current at the log event).
58 /* We refresh every 1s */
59 #define REFRESH_USEC (1*USEC_PER_SEC)
61 /* Data older than 5s we flush out */
62 #define MAX_USEC (5*USEC_PER_SEC)
64 /* Keep at most 16K entries in the cache. (Note though that this limit may be violated if enough streams pin entries in
65 * the cache, in which case we *do* permit this limit to be breached. That's safe however, as the number of stream
66 * clients itself is limited.) */
67 #define CACHE_MAX_FALLBACK 128U
68 #define CACHE_MAX_MAX (16*1024U)
69 #define CACHE_MAX_MIN 64U
71 static size_t cache_max(void) {
72 static size_t cached
= -1;
74 if (cached
== SIZE_MAX
) {
78 r
= procfs_memory_get(&mem_total
, NULL
);
80 log_warning_errno(r
, "Cannot query /proc/meminfo for MemTotal: %m");
81 cached
= CACHE_MAX_FALLBACK
;
83 /* Cache entries are usually a few kB, but the process cmdline is controlled by the
84 * user and can be up to _SC_ARG_MAX, usually 2MB. Let's say that approximately up to
85 * 1/8th of memory may be used by the cache.
87 * In the common case, this formula gives 64 cache entries for each GB of RAM.
89 cached
= CLAMP(mem_total
/ 8 / sc_arg_max(), CACHE_MAX_MIN
, CACHE_MAX_MAX
);
95 static int client_context_compare(const void *a
, const void *b
) {
96 const ClientContext
*x
= a
, *y
= b
;
99 r
= CMP(x
->timestamp
, y
->timestamp
);
103 return CMP(x
->pid
, y
->pid
);
106 static int client_context_new(Server
*s
, pid_t pid
, ClientContext
**ret
) {
107 _cleanup_free_ ClientContext
*c
= NULL
;
111 assert(pid_is_valid(pid
));
114 r
= prioq_ensure_allocated(&s
->client_contexts_lru
, client_context_compare
);
118 c
= new(ClientContext
, 1);
122 *c
= (ClientContext
) {
126 .auditid
= AUDIT_SESSION_INVALID
,
127 .loginuid
= UID_INVALID
,
128 .owner_uid
= UID_INVALID
,
129 .lru_index
= PRIOQ_IDX_NULL
,
130 .timestamp
= USEC_INFINITY
,
131 .extra_fields_mtime
= NSEC_INFINITY
,
133 .log_ratelimit_interval
= s
->ratelimit_interval
,
134 .log_ratelimit_burst
= s
->ratelimit_burst
,
137 r
= hashmap_ensure_put(&s
->client_contexts
, NULL
, PID_TO_PTR(pid
), c
);
145 static void client_context_reset(Server
*s
, ClientContext
*c
) {
149 c
->timestamp
= USEC_INFINITY
;
151 c
->uid
= UID_INVALID
;
152 c
->gid
= GID_INVALID
;
154 c
->comm
= mfree(c
->comm
);
155 c
->exe
= mfree(c
->exe
);
156 c
->cmdline
= mfree(c
->cmdline
);
157 c
->capeff
= mfree(c
->capeff
);
159 c
->auditid
= AUDIT_SESSION_INVALID
;
160 c
->loginuid
= UID_INVALID
;
162 c
->cgroup
= mfree(c
->cgroup
);
163 c
->session
= mfree(c
->session
);
164 c
->owner_uid
= UID_INVALID
;
165 c
->unit
= mfree(c
->unit
);
166 c
->user_unit
= mfree(c
->user_unit
);
167 c
->slice
= mfree(c
->slice
);
168 c
->user_slice
= mfree(c
->user_slice
);
170 c
->invocation_id
= SD_ID128_NULL
;
172 c
->label
= mfree(c
->label
);
175 c
->extra_fields_iovec
= mfree(c
->extra_fields_iovec
);
176 c
->extra_fields_n_iovec
= 0;
177 c
->extra_fields_data
= mfree(c
->extra_fields_data
);
178 c
->extra_fields_mtime
= NSEC_INFINITY
;
180 c
->log_level_max
= -1;
182 c
->log_ratelimit_interval
= s
->ratelimit_interval
;
183 c
->log_ratelimit_burst
= s
->ratelimit_burst
;
185 c
->log_filter_allowed_patterns
= set_free_free(c
->log_filter_allowed_patterns
);
186 c
->log_filter_denied_patterns
= set_free_free(c
->log_filter_denied_patterns
);
189 static ClientContext
* client_context_free(Server
*s
, ClientContext
*c
) {
195 assert_se(hashmap_remove(s
->client_contexts
, PID_TO_PTR(c
->pid
)) == c
);
198 assert_se(prioq_remove(s
->client_contexts_lru
, c
, &c
->lru_index
) >= 0);
200 client_context_reset(s
, c
);
205 static void client_context_read_uid_gid(ClientContext
*c
, const struct ucred
*ucred
) {
207 assert(pid_is_valid(c
->pid
));
209 /* The ucred data passed in is always the most current and accurate, if we have any. Use it. */
210 if (ucred
&& uid_is_valid(ucred
->uid
))
213 (void) pid_get_uid(c
->pid
, &c
->uid
);
215 if (ucred
&& gid_is_valid(ucred
->gid
))
218 (void) get_process_gid(c
->pid
, &c
->gid
);
221 static void client_context_read_basic(ClientContext
*c
) {
225 assert(pid_is_valid(c
->pid
));
227 if (pid_get_comm(c
->pid
, &t
) >= 0)
228 free_and_replace(c
->comm
, t
);
230 if (get_process_exe(c
->pid
, &t
) >= 0)
231 free_and_replace(c
->exe
, t
);
233 if (pid_get_cmdline(c
->pid
, SIZE_MAX
, PROCESS_CMDLINE_QUOTE
, &t
) >= 0)
234 free_and_replace(c
->cmdline
, t
);
236 if (get_process_capeff(c
->pid
, &t
) >= 0)
237 free_and_replace(c
->capeff
, t
);
240 static int client_context_read_label(
242 const char *label
, size_t label_size
) {
245 assert(pid_is_valid(c
->pid
));
246 assert(label_size
== 0 || label
);
248 if (label_size
> 0) {
251 /* If we got an SELinux label passed in it counts. */
253 l
= newdup_suffix0(char, label
, label_size
);
257 free_and_replace(c
->label
, l
);
258 c
->label_size
= label_size
;
264 /* If we got no SELinux label passed in, let's try to acquire one */
266 if (getpidcon(c
->pid
, &con
) >= 0 && con
) {
267 free_and_replace(c
->label
, con
);
268 c
->label_size
= strlen(c
->label
);
276 static int client_context_read_cgroup(Server
*s
, ClientContext
*c
, const char *unit_id
) {
277 _cleanup_free_
char *t
= NULL
;
282 /* Try to acquire the current cgroup path */
283 r
= cg_pid_get_path_shifted(c
->pid
, s
->cgroup_root
, &t
);
284 if (r
< 0 || empty_or_root(t
)) {
285 /* We use the unit ID passed in as fallback if we have nothing cached yet and cg_pid_get_path_shifted()
286 * failed or process is running in a root cgroup. Zombie processes are automatically migrated to root cgroup
287 * on cgroup v1 and we want to be able to map log messages from them too. */
288 if (unit_id
&& !c
->unit
) {
289 c
->unit
= strdup(unit_id
);
297 (void) client_context_read_log_filter_patterns(c
, t
);
299 /* Let's shortcut this if the cgroup path didn't change */
300 if (streq_ptr(c
->cgroup
, t
))
303 free_and_replace(c
->cgroup
, t
);
305 (void) cg_path_get_session(c
->cgroup
, &t
);
306 free_and_replace(c
->session
, t
);
308 if (cg_path_get_owner_uid(c
->cgroup
, &c
->owner_uid
) < 0)
309 c
->owner_uid
= UID_INVALID
;
311 (void) cg_path_get_unit(c
->cgroup
, &t
);
312 free_and_replace(c
->unit
, t
);
314 (void) cg_path_get_user_unit(c
->cgroup
, &t
);
315 free_and_replace(c
->user_unit
, t
);
317 (void) cg_path_get_slice(c
->cgroup
, &t
);
318 free_and_replace(c
->slice
, t
);
320 (void) cg_path_get_user_slice(c
->cgroup
, &t
);
321 free_and_replace(c
->user_slice
, t
);
326 static int client_context_read_invocation_id(
330 _cleanup_free_
char *p
= NULL
, *value
= NULL
;
336 /* Read the invocation ID of a unit off a unit.
337 * PID 1 stores it in a per-unit symlink in /run/systemd/units/
338 * User managers store it in a per-unit symlink under /run/user/<uid>/systemd/units/ */
344 r
= asprintf(&p
, "/run/user/" UID_FMT
"/systemd/units/invocation:%s", c
->owner_uid
, c
->user_unit
);
348 p
= strjoin("/run/systemd/units/invocation:", c
->unit
);
353 r
= readlink_malloc(p
, &value
);
357 return sd_id128_from_string(value
, &c
->invocation_id
);
360 static int client_context_read_log_level_max(
364 _cleanup_free_
char *value
= NULL
;
371 p
= strjoina("/run/systemd/units/log-level-max:", c
->unit
);
372 r
= readlink_malloc(p
, &value
);
376 ll
= log_level_from_string(value
);
380 c
->log_level_max
= ll
;
384 static int client_context_read_extra_fields(
388 _cleanup_free_
struct iovec
*iovec
= NULL
;
389 size_t size
= 0, n_iovec
= 0, left
;
390 _cleanup_free_
void *data
= NULL
;
391 _cleanup_fclose_
FILE *f
= NULL
;
400 p
= strjoina("/run/systemd/units/log-extra-fields:", c
->unit
);
402 if (c
->extra_fields_mtime
!= NSEC_INFINITY
) {
403 if (stat(p
, &st
) < 0) {
410 if (timespec_load_nsec(&st
.st_mtim
) == c
->extra_fields_mtime
)
422 if (fstat(fileno(f
), &st
) < 0) /* The file might have been replaced since the stat() above, let's get a new
423 * one, that matches the stuff we are reading */
426 r
= read_full_stream(f
, (char**) &data
, &size
);
430 q
= data
, left
= size
;
435 if (left
< sizeof(uint64_t))
438 v
= unaligned_read_le64(q
);
442 n
= sizeof(uint64_t) + v
;
446 field
= q
+ sizeof(uint64_t);
448 eq
= memchr(field
, '=', v
);
452 if (!journal_field_valid((const char *) field
, eq
- field
, false))
455 if (!GREEDY_REALLOC(iovec
, n_iovec
+1))
458 iovec
[n_iovec
++] = IOVEC_MAKE(field
, v
);
463 free(c
->extra_fields_iovec
);
464 free(c
->extra_fields_data
);
466 c
->extra_fields_iovec
= TAKE_PTR(iovec
);
467 c
->extra_fields_n_iovec
= n_iovec
;
468 c
->extra_fields_data
= TAKE_PTR(data
);
469 c
->extra_fields_mtime
= timespec_load_nsec(&st
.st_mtim
);
474 static int client_context_read_log_ratelimit_interval(ClientContext
*c
) {
475 _cleanup_free_
char *value
= NULL
;
484 p
= strjoina("/run/systemd/units/log-rate-limit-interval:", c
->unit
);
485 r
= readlink_malloc(p
, &value
);
489 return safe_atou64(value
, &c
->log_ratelimit_interval
);
492 static int client_context_read_log_ratelimit_burst(ClientContext
*c
) {
493 _cleanup_free_
char *value
= NULL
;
502 p
= strjoina("/run/systemd/units/log-rate-limit-burst:", c
->unit
);
503 r
= readlink_malloc(p
, &value
);
507 return safe_atou(value
, &c
->log_ratelimit_burst
);
510 static void client_context_really_refresh(
513 const struct ucred
*ucred
,
514 const char *label
, size_t label_size
,
520 assert(pid_is_valid(c
->pid
));
522 if (timestamp
== USEC_INFINITY
)
523 timestamp
= now(CLOCK_MONOTONIC
);
525 client_context_read_uid_gid(c
, ucred
);
526 client_context_read_basic(c
);
527 (void) client_context_read_label(c
, label
, label_size
);
529 (void) audit_session_from_pid(c
->pid
, &c
->auditid
);
530 (void) audit_loginuid_from_pid(c
->pid
, &c
->loginuid
);
532 (void) client_context_read_cgroup(s
, c
, unit_id
);
533 (void) client_context_read_invocation_id(s
, c
);
534 (void) client_context_read_log_level_max(s
, c
);
535 (void) client_context_read_extra_fields(s
, c
);
536 (void) client_context_read_log_ratelimit_interval(c
);
537 (void) client_context_read_log_ratelimit_burst(c
);
539 c
->timestamp
= timestamp
;
542 assert(c
->n_ref
== 0);
543 prioq_reshuffle(s
->client_contexts_lru
, c
, &c
->lru_index
);
547 void client_context_maybe_refresh(
550 const struct ucred
*ucred
,
551 const char *label
, size_t label_size
,
558 if (timestamp
== USEC_INFINITY
)
559 timestamp
= now(CLOCK_MONOTONIC
);
561 /* No cached data so far? Let's fill it up */
562 if (c
->timestamp
== USEC_INFINITY
)
565 /* If the data isn't pinned and if the cashed data is older than the upper limit, we flush it out
566 * entirely. This follows the logic that as long as an entry is pinned the PID reuse is unlikely. */
567 if (c
->n_ref
== 0 && c
->timestamp
+ MAX_USEC
< timestamp
) {
568 client_context_reset(s
, c
);
572 /* If the data is older than the lower limit, we refresh, but keep the old data for all we can't update */
573 if (c
->timestamp
+ REFRESH_USEC
< timestamp
)
576 /* If the data passed along doesn't match the cached data we also do a refresh */
577 if (ucred
&& uid_is_valid(ucred
->uid
) && c
->uid
!= ucred
->uid
)
580 if (ucred
&& gid_is_valid(ucred
->gid
) && c
->gid
!= ucred
->gid
)
583 if (label_size
> 0 && (label_size
!= c
->label_size
|| memcmp(label
, c
->label
, label_size
) != 0))
589 client_context_really_refresh(s
, c
, ucred
, label
, label_size
, unit_id
, timestamp
);
592 static void client_context_try_shrink_to(Server
*s
, size_t limit
) {
598 /* Flush any cache entries for PIDs that have already moved on. Don't do this
599 * too often, since it's a slow process. */
600 t
= now(CLOCK_MONOTONIC
);
601 if (s
->last_cache_pid_flush
+ MAX_USEC
< t
) {
602 unsigned n
= prioq_size(s
->client_contexts_lru
), idx
= 0;
604 /* We do a number of iterations based on the initial size of the prioq. When we remove an
605 * item, a new item is moved into its places, and items to the right might be reshuffled.
607 for (unsigned i
= 0; i
< n
; i
++) {
608 c
= prioq_peek_by_index(s
->client_contexts_lru
, idx
);
610 assert(c
->n_ref
== 0);
612 if (pid_is_unwaited(c
->pid
) == 0)
613 client_context_free(s
, c
);
618 s
->last_cache_pid_flush
= t
;
621 /* Bring the number of cache entries below the indicated limit, so that we can create a new entry without
622 * breaching the limit. Note that we only flush out entries that aren't pinned here. This means the number of
623 * cache entries may very well grow beyond the limit, if all entries stored remain pinned. */
625 while (hashmap_size(s
->client_contexts
) > limit
) {
626 c
= prioq_pop(s
->client_contexts_lru
);
628 break; /* All remaining entries are pinned, give up */
631 assert(c
->n_ref
== 0);
635 client_context_free(s
, c
);
639 void client_context_flush_regular(Server
*s
) {
640 client_context_try_shrink_to(s
, 0);
643 void client_context_flush_all(Server
*s
) {
646 /* Flush out all remaining entries. This assumes all references are already dropped. */
648 s
->my_context
= client_context_release(s
, s
->my_context
);
649 s
->pid1_context
= client_context_release(s
, s
->pid1_context
);
651 client_context_flush_regular(s
);
653 assert(prioq_isempty(s
->client_contexts_lru
));
654 assert(hashmap_isempty(s
->client_contexts
));
656 s
->client_contexts_lru
= prioq_free(s
->client_contexts_lru
);
657 s
->client_contexts
= hashmap_free(s
->client_contexts
);
660 static int client_context_get_internal(
663 const struct ucred
*ucred
,
664 const char *label
, size_t label_len
,
667 ClientContext
**ret
) {
675 if (!pid_is_valid(pid
))
678 c
= hashmap_get(s
->client_contexts
, PID_TO_PTR(pid
));
683 /* The entry wasn't pinned so far, let's remove it from the LRU list then */
684 assert(c
->n_ref
== 0);
685 assert_se(prioq_remove(s
->client_contexts_lru
, c
, &c
->lru_index
) >= 0);
692 client_context_maybe_refresh(s
, c
, ucred
, label
, label_len
, unit_id
, USEC_INFINITY
);
698 client_context_try_shrink_to(s
, cache_max()-1);
700 r
= client_context_new(s
, pid
, &c
);
707 r
= prioq_put(s
->client_contexts_lru
, c
, &c
->lru_index
);
709 client_context_free(s
, c
);
716 client_context_really_refresh(s
, c
, ucred
, label
, label_len
, unit_id
, USEC_INFINITY
);
722 int client_context_get(
725 const struct ucred
*ucred
,
726 const char *label
, size_t label_len
,
728 ClientContext
**ret
) {
730 return client_context_get_internal(s
, pid
, ucred
, label
, label_len
, unit_id
, false, ret
);
733 int client_context_acquire(
736 const struct ucred
*ucred
,
737 const char *label
, size_t label_len
,
739 ClientContext
**ret
) {
741 return client_context_get_internal(s
, pid
, ucred
, label
, label_len
, unit_id
, true, ret
);
744 ClientContext
*client_context_release(Server
*s
, ClientContext
*c
) {
750 assert(c
->n_ref
> 0);
757 /* The entry is not pinned anymore, let's add it to the LRU prioq if we can. If we can't we'll drop it
760 if (prioq_put(s
->client_contexts_lru
, c
, &c
->lru_index
) < 0)
761 client_context_free(s
, c
);
768 void client_context_acquire_default(Server
*s
) {
773 /* Ensure that our own and PID1's contexts are always pinned. Our own context is particularly useful to
774 * generate driver messages. */
776 if (!s
->my_context
) {
777 struct ucred ucred
= {
778 .pid
= getpid_cached(),
783 r
= client_context_acquire(s
, ucred
.pid
, &ucred
, NULL
, 0, NULL
, &s
->my_context
);
785 log_ratelimit_warning_errno(r
, JOURNAL_LOG_RATELIMIT
,
786 "Failed to acquire our own context, ignoring: %m");
789 if (!s
->namespace && !s
->pid1_context
) {
790 /* Acquire PID1's context, but only if we are in non-namespaced mode, since PID 1 is only
791 * going to log to the non-namespaced journal instance. */
793 r
= client_context_acquire(s
, 1, NULL
, NULL
, 0, NULL
, &s
->pid1_context
);
795 log_ratelimit_warning_errno(r
, JOURNAL_LOG_RATELIMIT
,
796 "Failed to acquire PID1's context, ignoring: %m");