]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/journal/journald-context.c
tree-wide: drop 'This file is part of systemd' blurb
[thirdparty/systemd.git] / src / journal / journald-context.c
1 /* SPDX-License-Identifier: LGPL-2.1+ */
2 /***
3 Copyright 2017 Lennart Poettering
4 ***/
5
6 #if HAVE_SELINUX
7 #include <selinux/selinux.h>
8 #endif
9
10 #include "alloc-util.h"
11 #include "audit-util.h"
12 #include "cgroup-util.h"
13 #include "fd-util.h"
14 #include "fileio.h"
15 #include "fs-util.h"
16 #include "io-util.h"
17 #include "journal-util.h"
18 #include "journald-context.h"
19 #include "process-util.h"
20 #include "string-util.h"
21 #include "syslog-util.h"
22 #include "unaligned.h"
23 #include "user-util.h"
24
25 /* This implements a metadata cache for clients, which are identified by their PID. Requesting metadata through /proc
26 * is expensive, hence let's cache the data if we can. Note that this means the metadata might be out-of-date when we
27 * store it, but it might already be anyway, as we request the data asynchronously from /proc at a different time the
28 * log entry was originally created. We hence just increase the "window of inaccuracy" a bit.
29 *
30 * The cache is indexed by the PID. Entries may be "pinned" in the cache, in which case the entries are not removed
31 * until they are unpinned. Unpinned entries are kept around until cache pressure is seen. Cache entries older than 5s
32 * are never used (a sad attempt to deal with the UNIX weakness of PIDs reuse), cache entries older than 1s are
33 * refreshed in an incremental way (meaning: data is reread from /proc, but any old data we can't refresh is not
34 * flushed out). Data newer than 1s is used immediately without refresh.
35 *
36 * Log stream clients (i.e. all clients using the AF_UNIX/SOCK_STREAM stdout/stderr transport) will pin a cache entry
37 * as long as their socket is connected. Note that cache entries are shared between different transports. That means a
38 * cache entry pinned for the stream connection logic may be reused for the syslog or native protocols.
39 *
40 * Caching metadata like this has two major benefits:
41 *
42 * 1. Reading metadata is expensive, and we can thus substantially speed up log processing under flood.
43 *
44 * 2. Because metadata caching is shared between stream and datagram transports and stream connections pin a cache
45 * entry there's a good chance we can properly map a substantial set of datagram log messages to their originating
46 * service, as all services (unless explicitly configured otherwise) will have their stdout/stderr connected to a
47 * stream connection. This should improve cases where a service process logs immediately before exiting and we
48 * previously had trouble associating the log message with the service.
49 *
50 * NB: With and without the metadata cache: the implicitly added entry metadata in the journal (with the exception of
51 * UID/PID/GID and SELinux label) must be understood as possibly slightly out of sync (i.e. sometimes slightly older
52 * and sometimes slightly newer than what was current at the log event).
53 */
54
55 /* We refresh every 1s */
56 #define REFRESH_USEC (1*USEC_PER_SEC)
57
58 /* Data older than 5s we flush out */
59 #define MAX_USEC (5*USEC_PER_SEC)
60
61 /* Keep at most 16K entries in the cache. (Note though that this limit may be violated if enough streams pin entries in
62 * the cache, in which case we *do* permit this limit to be breached. That's safe however, as the number of stream
63 * clients itself is limited.) */
64 #define CACHE_MAX (16*1024)
65
66 static int client_context_compare(const void *a, const void *b) {
67 const ClientContext *x = a, *y = b;
68
69 if (x->timestamp < y->timestamp)
70 return -1;
71 if (x->timestamp > y->timestamp)
72 return 1;
73
74 if (x->pid < y->pid)
75 return -1;
76 if (x->pid > y->pid)
77 return 1;
78
79 return 0;
80 }
81
82 static int client_context_new(Server *s, pid_t pid, ClientContext **ret) {
83 ClientContext *c;
84 int r;
85
86 assert(s);
87 assert(pid_is_valid(pid));
88 assert(ret);
89
90 r = hashmap_ensure_allocated(&s->client_contexts, NULL);
91 if (r < 0)
92 return r;
93
94 r = prioq_ensure_allocated(&s->client_contexts_lru, client_context_compare);
95 if (r < 0)
96 return r;
97
98 c = new0(ClientContext, 1);
99 if (!c)
100 return -ENOMEM;
101
102 c->pid = pid;
103
104 c->uid = UID_INVALID;
105 c->gid = GID_INVALID;
106 c->auditid = AUDIT_SESSION_INVALID;
107 c->loginuid = UID_INVALID;
108 c->owner_uid = UID_INVALID;
109 c->lru_index = PRIOQ_IDX_NULL;
110 c->timestamp = USEC_INFINITY;
111 c->extra_fields_mtime = NSEC_INFINITY;
112 c->log_level_max = -1;
113
114 r = hashmap_put(s->client_contexts, PID_TO_PTR(pid), c);
115 if (r < 0) {
116 free(c);
117 return r;
118 }
119
120 *ret = c;
121 return 0;
122 }
123
124 static void client_context_reset(ClientContext *c) {
125 assert(c);
126
127 c->timestamp = USEC_INFINITY;
128
129 c->uid = UID_INVALID;
130 c->gid = GID_INVALID;
131
132 c->comm = mfree(c->comm);
133 c->exe = mfree(c->exe);
134 c->cmdline = mfree(c->cmdline);
135 c->capeff = mfree(c->capeff);
136
137 c->auditid = AUDIT_SESSION_INVALID;
138 c->loginuid = UID_INVALID;
139
140 c->cgroup = mfree(c->cgroup);
141 c->session = mfree(c->session);
142 c->owner_uid = UID_INVALID;
143 c->unit = mfree(c->unit);
144 c->user_unit = mfree(c->user_unit);
145 c->slice = mfree(c->slice);
146 c->user_slice = mfree(c->user_slice);
147
148 c->invocation_id = SD_ID128_NULL;
149
150 c->label = mfree(c->label);
151 c->label_size = 0;
152
153 c->extra_fields_iovec = mfree(c->extra_fields_iovec);
154 c->extra_fields_n_iovec = 0;
155 c->extra_fields_data = mfree(c->extra_fields_data);
156 c->extra_fields_mtime = NSEC_INFINITY;
157
158 c->log_level_max = -1;
159 }
160
161 static ClientContext* client_context_free(Server *s, ClientContext *c) {
162 assert(s);
163
164 if (!c)
165 return NULL;
166
167 assert_se(hashmap_remove(s->client_contexts, PID_TO_PTR(c->pid)) == c);
168
169 if (c->in_lru)
170 assert_se(prioq_remove(s->client_contexts_lru, c, &c->lru_index) >= 0);
171
172 client_context_reset(c);
173
174 return mfree(c);
175 }
176
177 static void client_context_read_uid_gid(ClientContext *c, const struct ucred *ucred) {
178 assert(c);
179 assert(pid_is_valid(c->pid));
180
181 /* The ucred data passed in is always the most current and accurate, if we have any. Use it. */
182 if (ucred && uid_is_valid(ucred->uid))
183 c->uid = ucred->uid;
184 else
185 (void) get_process_uid(c->pid, &c->uid);
186
187 if (ucred && gid_is_valid(ucred->gid))
188 c->gid = ucred->gid;
189 else
190 (void) get_process_gid(c->pid, &c->gid);
191 }
192
193 static void client_context_read_basic(ClientContext *c) {
194 char *t;
195
196 assert(c);
197 assert(pid_is_valid(c->pid));
198
199 if (get_process_comm(c->pid, &t) >= 0)
200 free_and_replace(c->comm, t);
201
202 if (get_process_exe(c->pid, &t) >= 0)
203 free_and_replace(c->exe, t);
204
205 if (get_process_cmdline(c->pid, 0, false, &t) >= 0)
206 free_and_replace(c->cmdline, t);
207
208 if (get_process_capeff(c->pid, &t) >= 0)
209 free_and_replace(c->capeff, t);
210 }
211
212 static int client_context_read_label(
213 ClientContext *c,
214 const char *label, size_t label_size) {
215
216 assert(c);
217 assert(pid_is_valid(c->pid));
218 assert(label_size == 0 || label);
219
220 if (label_size > 0) {
221 char *l;
222
223 /* If we got an SELinux label passed in it counts. */
224
225 l = newdup_suffix0(char, label, label_size);
226 if (!l)
227 return -ENOMEM;
228
229 free_and_replace(c->label, l);
230 c->label_size = label_size;
231 }
232 #if HAVE_SELINUX
233 else {
234 char *con;
235
236 /* If we got no SELinux label passed in, let's try to acquire one */
237
238 if (getpidcon(c->pid, &con) >= 0) {
239 free_and_replace(c->label, con);
240 c->label_size = strlen(c->label);
241 }
242 }
243 #endif
244
245 return 0;
246 }
247
248 static int client_context_read_cgroup(Server *s, ClientContext *c, const char *unit_id) {
249 char *t = NULL;
250 int r;
251
252 assert(c);
253
254 /* Try to acquire the current cgroup path */
255 r = cg_pid_get_path_shifted(c->pid, s->cgroup_root, &t);
256 if (r < 0) {
257
258 /* If that didn't work, we use the unit ID passed in as fallback, if we have nothing cached yet */
259 if (unit_id && !c->unit) {
260 c->unit = strdup(unit_id);
261 if (c->unit)
262 return 0;
263 }
264
265 return r;
266 }
267
268 /* Let's shortcut this if the cgroup path didn't change */
269 if (streq_ptr(c->cgroup, t)) {
270 free(t);
271 return 0;
272 }
273
274 free_and_replace(c->cgroup, t);
275
276 (void) cg_path_get_session(c->cgroup, &t);
277 free_and_replace(c->session, t);
278
279 if (cg_path_get_owner_uid(c->cgroup, &c->owner_uid) < 0)
280 c->owner_uid = UID_INVALID;
281
282 (void) cg_path_get_unit(c->cgroup, &t);
283 free_and_replace(c->unit, t);
284
285 (void) cg_path_get_user_unit(c->cgroup, &t);
286 free_and_replace(c->user_unit, t);
287
288 (void) cg_path_get_slice(c->cgroup, &t);
289 free_and_replace(c->slice, t);
290
291 (void) cg_path_get_user_slice(c->cgroup, &t);
292 free_and_replace(c->user_slice, t);
293
294 return 0;
295 }
296
297 static int client_context_read_invocation_id(
298 Server *s,
299 ClientContext *c) {
300
301 _cleanup_free_ char *value = NULL;
302 const char *p;
303 int r;
304
305 assert(s);
306 assert(c);
307
308 /* Read the invocation ID of a unit off a unit. PID 1 stores it in a per-unit symlink in /run/systemd/units/ */
309
310 if (!c->unit)
311 return 0;
312
313 p = strjoina("/run/systemd/units/invocation:", c->unit);
314 r = readlink_malloc(p, &value);
315 if (r < 0)
316 return r;
317
318 return sd_id128_from_string(value, &c->invocation_id);
319 }
320
321 static int client_context_read_log_level_max(
322 Server *s,
323 ClientContext *c) {
324
325 _cleanup_free_ char *value = NULL;
326 const char *p;
327 int r, ll;
328
329 if (!c->unit)
330 return 0;
331
332 p = strjoina("/run/systemd/units/log-level-max:", c->unit);
333 r = readlink_malloc(p, &value);
334 if (r < 0)
335 return r;
336
337 ll = log_level_from_string(value);
338 if (ll < 0)
339 return -EINVAL;
340
341 c->log_level_max = ll;
342 return 0;
343 }
344
345 static int client_context_read_extra_fields(
346 Server *s,
347 ClientContext *c) {
348
349 size_t size = 0, n_iovec = 0, n_allocated = 0, left;
350 _cleanup_free_ struct iovec *iovec = NULL;
351 _cleanup_free_ void *data = NULL;
352 _cleanup_fclose_ FILE *f = NULL;
353 struct stat st;
354 const char *p;
355 uint8_t *q;
356 int r;
357
358 if (!c->unit)
359 return 0;
360
361 p = strjoina("/run/systemd/units/log-extra-fields:", c->unit);
362
363 if (c->extra_fields_mtime != NSEC_INFINITY) {
364 if (stat(p, &st) < 0) {
365 if (errno == ENOENT)
366 return 0;
367
368 return -errno;
369 }
370
371 if (timespec_load_nsec(&st.st_mtim) == c->extra_fields_mtime)
372 return 0;
373 }
374
375 f = fopen(p, "re");
376 if (!f) {
377 if (errno == ENOENT)
378 return 0;
379
380 return -errno;
381 }
382
383 if (fstat(fileno(f), &st) < 0) /* The file might have been replaced since the stat() above, let's get a new
384 * one, that matches the stuff we are reading */
385 return -errno;
386
387 r = read_full_stream(f, (char**) &data, &size);
388 if (r < 0)
389 return r;
390
391 q = data, left = size;
392 while (left > 0) {
393 uint8_t *field, *eq;
394 uint64_t v, n;
395
396 if (left < sizeof(uint64_t))
397 return -EBADMSG;
398
399 v = unaligned_read_le64(q);
400 if (v < 2)
401 return -EBADMSG;
402
403 n = sizeof(uint64_t) + v;
404 if (left < n)
405 return -EBADMSG;
406
407 field = q + sizeof(uint64_t);
408
409 eq = memchr(field, '=', v);
410 if (!eq)
411 return -EBADMSG;
412
413 if (!journal_field_valid((const char *) field, eq - field, false))
414 return -EBADMSG;
415
416 if (!GREEDY_REALLOC(iovec, n_allocated, n_iovec+1))
417 return -ENOMEM;
418
419 iovec[n_iovec++] = IOVEC_MAKE(field, v);
420
421 left -= n, q += n;
422 }
423
424 free(c->extra_fields_iovec);
425 free(c->extra_fields_data);
426
427 c->extra_fields_iovec = TAKE_PTR(iovec);
428 c->extra_fields_n_iovec = n_iovec;
429 c->extra_fields_data = TAKE_PTR(data);
430 c->extra_fields_mtime = timespec_load_nsec(&st.st_mtim);
431
432 return 0;
433 }
434
435 static void client_context_really_refresh(
436 Server *s,
437 ClientContext *c,
438 const struct ucred *ucred,
439 const char *label, size_t label_size,
440 const char *unit_id,
441 usec_t timestamp) {
442
443 assert(s);
444 assert(c);
445 assert(pid_is_valid(c->pid));
446
447 if (timestamp == USEC_INFINITY)
448 timestamp = now(CLOCK_MONOTONIC);
449
450 client_context_read_uid_gid(c, ucred);
451 client_context_read_basic(c);
452 (void) client_context_read_label(c, label, label_size);
453
454 (void) audit_session_from_pid(c->pid, &c->auditid);
455 (void) audit_loginuid_from_pid(c->pid, &c->loginuid);
456
457 (void) client_context_read_cgroup(s, c, unit_id);
458 (void) client_context_read_invocation_id(s, c);
459 (void) client_context_read_log_level_max(s, c);
460 (void) client_context_read_extra_fields(s, c);
461
462 c->timestamp = timestamp;
463
464 if (c->in_lru) {
465 assert(c->n_ref == 0);
466 assert_se(prioq_reshuffle(s->client_contexts_lru, c, &c->lru_index) >= 0);
467 }
468 }
469
470 void client_context_maybe_refresh(
471 Server *s,
472 ClientContext *c,
473 const struct ucred *ucred,
474 const char *label, size_t label_size,
475 const char *unit_id,
476 usec_t timestamp) {
477
478 assert(s);
479 assert(c);
480
481 if (timestamp == USEC_INFINITY)
482 timestamp = now(CLOCK_MONOTONIC);
483
484 /* No cached data so far? Let's fill it up */
485 if (c->timestamp == USEC_INFINITY)
486 goto refresh;
487
488 /* If the data isn't pinned and if the cashed data is older than the upper limit, we flush it out
489 * entirely. This follows the logic that as long as an entry is pinned the PID reuse is unlikely. */
490 if (c->n_ref == 0 && c->timestamp + MAX_USEC < timestamp) {
491 client_context_reset(c);
492 goto refresh;
493 }
494
495 /* If the data is older than the lower limit, we refresh, but keep the old data for all we can't update */
496 if (c->timestamp + REFRESH_USEC < timestamp)
497 goto refresh;
498
499 /* If the data passed along doesn't match the cached data we also do a refresh */
500 if (ucred && uid_is_valid(ucred->uid) && c->uid != ucred->uid)
501 goto refresh;
502
503 if (ucred && gid_is_valid(ucred->gid) && c->gid != ucred->gid)
504 goto refresh;
505
506 if (label_size > 0 && (label_size != c->label_size || memcmp(label, c->label, label_size) != 0))
507 goto refresh;
508
509 return;
510
511 refresh:
512 client_context_really_refresh(s, c, ucred, label, label_size, unit_id, timestamp);
513 }
514
515 static void client_context_try_shrink_to(Server *s, size_t limit) {
516 assert(s);
517
518 /* Bring the number of cache entries below the indicated limit, so that we can create a new entry without
519 * breaching the limit. Note that we only flush out entries that aren't pinned here. This means the number of
520 * cache entries may very well grow beyond the limit, if all entries stored remain pinned. */
521
522 while (hashmap_size(s->client_contexts) > limit) {
523 ClientContext *c;
524
525 c = prioq_pop(s->client_contexts_lru);
526 if (!c)
527 break; /* All remaining entries are pinned, give up */
528
529 assert(c->in_lru);
530 assert(c->n_ref == 0);
531
532 c->in_lru = false;
533
534 client_context_free(s, c);
535 }
536 }
537
538 void client_context_flush_all(Server *s) {
539 assert(s);
540
541 /* Flush out all remaining entries. This assumes all references are already dropped. */
542
543 s->my_context = client_context_release(s, s->my_context);
544 s->pid1_context = client_context_release(s, s->pid1_context);
545
546 client_context_try_shrink_to(s, 0);
547
548 assert(prioq_size(s->client_contexts_lru) == 0);
549 assert(hashmap_size(s->client_contexts) == 0);
550
551 s->client_contexts_lru = prioq_free(s->client_contexts_lru);
552 s->client_contexts = hashmap_free(s->client_contexts);
553 }
554
555 static int client_context_get_internal(
556 Server *s,
557 pid_t pid,
558 const struct ucred *ucred,
559 const char *label, size_t label_len,
560 const char *unit_id,
561 bool add_ref,
562 ClientContext **ret) {
563
564 ClientContext *c;
565 int r;
566
567 assert(s);
568 assert(ret);
569
570 if (!pid_is_valid(pid))
571 return -EINVAL;
572
573 c = hashmap_get(s->client_contexts, PID_TO_PTR(pid));
574 if (c) {
575
576 if (add_ref) {
577 if (c->in_lru) {
578 /* The entry wasn't pinned so far, let's remove it from the LRU list then */
579 assert(c->n_ref == 0);
580 assert_se(prioq_remove(s->client_contexts_lru, c, &c->lru_index) >= 0);
581 c->in_lru = false;
582 }
583
584 c->n_ref++;
585 }
586
587 client_context_maybe_refresh(s, c, ucred, label, label_len, unit_id, USEC_INFINITY);
588
589 *ret = c;
590 return 0;
591 }
592
593 client_context_try_shrink_to(s, CACHE_MAX-1);
594
595 r = client_context_new(s, pid, &c);
596 if (r < 0)
597 return r;
598
599 if (add_ref)
600 c->n_ref++;
601 else {
602 r = prioq_put(s->client_contexts_lru, c, &c->lru_index);
603 if (r < 0) {
604 client_context_free(s, c);
605 return r;
606 }
607
608 c->in_lru = true;
609 }
610
611 client_context_really_refresh(s, c, ucred, label, label_len, unit_id, USEC_INFINITY);
612
613 *ret = c;
614 return 0;
615 }
616
617 int client_context_get(
618 Server *s,
619 pid_t pid,
620 const struct ucred *ucred,
621 const char *label, size_t label_len,
622 const char *unit_id,
623 ClientContext **ret) {
624
625 return client_context_get_internal(s, pid, ucred, label, label_len, unit_id, false, ret);
626 }
627
628 int client_context_acquire(
629 Server *s,
630 pid_t pid,
631 const struct ucred *ucred,
632 const char *label, size_t label_len,
633 const char *unit_id,
634 ClientContext **ret) {
635
636 return client_context_get_internal(s, pid, ucred, label, label_len, unit_id, true, ret);
637 };
638
639 ClientContext *client_context_release(Server *s, ClientContext *c) {
640 assert(s);
641
642 if (!c)
643 return NULL;
644
645 assert(c->n_ref > 0);
646 assert(!c->in_lru);
647
648 c->n_ref--;
649 if (c->n_ref > 0)
650 return NULL;
651
652 /* The entry is not pinned anymore, let's add it to the LRU prioq if we can. If we can't we'll drop it
653 * right-away */
654
655 if (prioq_put(s->client_contexts_lru, c, &c->lru_index) < 0)
656 client_context_free(s, c);
657 else
658 c->in_lru = true;
659
660 return NULL;
661 }
662
663 void client_context_acquire_default(Server *s) {
664 int r;
665
666 assert(s);
667
668 /* Ensure that our own and PID1's contexts are always pinned. Our own context is particularly useful to
669 * generate driver messages. */
670
671 if (!s->my_context) {
672 struct ucred ucred = {
673 .pid = getpid_cached(),
674 .uid = getuid(),
675 .gid = getgid(),
676 };
677
678 r = client_context_acquire(s, ucred.pid, &ucred, NULL, 0, NULL, &s->my_context);
679 if (r < 0)
680 log_warning_errno(r, "Failed to acquire our own context, ignoring: %m");
681 }
682
683 if (!s->pid1_context) {
684
685 r = client_context_acquire(s, 1, NULL, NULL, 0, NULL, &s->pid1_context);
686 if (r < 0)
687 log_warning_errno(r, "Failed to acquire PID1's context, ignoring: %m");
688
689 }
690 }