]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/journal/journald-context.c
Add SPDX license identifiers to source files under the LGPL
[thirdparty/systemd.git] / src / journal / journald-context.c
CommitLineData
53e1b683 1/* SPDX-License-Identifier: LGPL-2.1+ */
22e3a02b
LP
2/***
3 This file is part of systemd.
4
5 Copyright 2017 Lennart Poettering
6
7 systemd is free software; you can redistribute it and/or modify it
8 under the terms of the GNU Lesser General Public License as published by
9 the Free Software Foundation; either version 2.1 of the License, or
10 (at your option) any later version.
11
12 systemd is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 Lesser General Public License for more details.
16
17 You should have received a copy of the GNU Lesser General Public License
18 along with systemd; If not, see <http://www.gnu.org/licenses/>.
19***/
20
349cc4a5 21#if HAVE_SELINUX
22e3a02b
LP
22#include <selinux/selinux.h>
23#endif
24
25#include "alloc-util.h"
26#include "audit-util.h"
27#include "cgroup-util.h"
d3070fbd
LP
28#include "fd-util.h"
29#include "fileio.h"
30#include "fs-util.h"
31#include "io-util.h"
32#include "journal-util.h"
22e3a02b
LP
33#include "journald-context.h"
34#include "process-util.h"
35#include "string-util.h"
d3070fbd
LP
36#include "syslog-util.h"
37#include "unaligned.h"
22e3a02b
LP
38#include "user-util.h"
39
40/* This implements a metadata cache for clients, which are identified by their PID. Requesting metadata through /proc
41 * is expensive, hence let's cache the data if we can. Note that this means the metadata might be out-of-date when we
42 * store it, but it might already be anyway, as we request the data asynchronously from /proc at a different time the
43 * log entry was originally created. We hence just increase the "window of inaccuracy" a bit.
44 *
45 * The cache is indexed by the PID. Entries may be "pinned" in the cache, in which case the entries are not removed
46 * until they are unpinned. Unpinned entries are kept around until cache pressure is seen. Cache entries older than 5s
47 * are never used (a sad attempt to deal with the UNIX weakness of PIDs reuse), cache entries older than 1s are
48 * refreshed in an incremental way (meaning: data is reread from /proc, but any old data we can't refresh is not
49 * flushed out). Data newer than 1s is used immediately without refresh.
50 *
51 * Log stream clients (i.e. all clients using the AF_UNIX/SOCK_STREAM stdout/stderr transport) will pin a cache entry
52 * as long as their socket is connected. Note that cache entries are shared between different transports. That means a
53 * cache entry pinned for the stream connection logic may be reused for the syslog or native protocols.
54 *
55 * Caching metadata like this has two major benefits:
56 *
57 * 1. Reading metadata is expensive, and we can thus substantially speed up log processing under flood.
58 *
59 * 2. Because metadata caching is shared between stream and datagram transports and stream connections pin a cache
60 * entry there's a good chance we can properly map a substantial set of datagram log messages to their originating
61 * service, as all services (unless explicitly configured otherwise) will have their stdout/stderr connected to a
62 * stream connection. This should improve cases where a service process logs immediately before exiting and we
63 * previously had trouble associating the log message with the service.
64 *
65 * NB: With and without the metadata cache: the implicitly added entry metadata in the journal (with the exception of
 * UID/PID/GID and SELinux label) must be understood as possibly slightly out of sync (i.e. sometimes slightly older
67 * and sometimes slightly newer than what was current at the log event).
68 */
69
/* Entries older than this (1s) are refreshed before use: data is reread from /proc, but stale
 * fields we cannot reacquire are kept. */
#define REFRESH_USEC (1*USEC_PER_SEC)

/* Unpinned entries older than this (5s) are flushed out entirely, as a (weak) defense against
 * PID reuse. */
#define MAX_USEC (5*USEC_PER_SEC)

/* Keep at most 16K entries in the cache. (Note though that this limit may be violated if enough streams pin entries in
 * the cache, in which case we *do* permit this limit to be breached. That's safe however, as the number of stream
 * clients itself is limited.) */
#define CACHE_MAX (16*1024)
80
81static int client_context_compare(const void *a, const void *b) {
82 const ClientContext *x = a, *y = b;
83
84 if (x->timestamp < y->timestamp)
85 return -1;
86 if (x->timestamp > y->timestamp)
87 return 1;
88
89 if (x->pid < y->pid)
90 return -1;
91 if (x->pid > y->pid)
92 return 1;
93
94 return 0;
95}
96
97static int client_context_new(Server *s, pid_t pid, ClientContext **ret) {
98 ClientContext *c;
99 int r;
100
101 assert(s);
102 assert(pid_is_valid(pid));
103 assert(ret);
104
105 r = hashmap_ensure_allocated(&s->client_contexts, NULL);
106 if (r < 0)
107 return r;
108
109 r = prioq_ensure_allocated(&s->client_contexts_lru, client_context_compare);
110 if (r < 0)
111 return r;
112
113 c = new0(ClientContext, 1);
114 if (!c)
115 return -ENOMEM;
116
117 c->pid = pid;
118
119 c->uid = UID_INVALID;
120 c->gid = GID_INVALID;
121 c->auditid = AUDIT_SESSION_INVALID;
122 c->loginuid = UID_INVALID;
123 c->owner_uid = UID_INVALID;
124 c->lru_index = PRIOQ_IDX_NULL;
125 c->timestamp = USEC_INFINITY;
d3070fbd
LP
126 c->extra_fields_mtime = NSEC_INFINITY;
127 c->log_level_max = -1;
22e3a02b
LP
128
129 r = hashmap_put(s->client_contexts, PID_TO_PTR(pid), c);
130 if (r < 0) {
131 free(c);
132 return r;
133 }
134
135 *ret = c;
136 return 0;
137}
138
/* Returns the entry to its pristine "nothing known" state, releasing all dynamically allocated
 * data it owns. The entry itself — and its registration in the lookup structures — survives. */
static void client_context_reset(ClientContext *c) {
        assert(c);

        c->timestamp = USEC_INFINITY;

        /* Credentials */
        c->uid = UID_INVALID;
        c->gid = GID_INVALID;

        /* /proc basics */
        c->comm = mfree(c->comm);
        c->exe = mfree(c->exe);
        c->cmdline = mfree(c->cmdline);
        c->capeff = mfree(c->capeff);

        /* Audit data */
        c->auditid = AUDIT_SESSION_INVALID;
        c->loginuid = UID_INVALID;

        /* Cgroup-derived data */
        c->cgroup = mfree(c->cgroup);
        c->session = mfree(c->session);
        c->owner_uid = UID_INVALID;
        c->unit = mfree(c->unit);
        c->user_unit = mfree(c->user_unit);
        c->slice = mfree(c->slice);
        c->user_slice = mfree(c->user_slice);

        c->invocation_id = SD_ID128_NULL;

        /* SELinux label */
        c->label = mfree(c->label);
        c->label_size = 0;

        /* Per-unit journal settings read from /run/systemd/units/ */
        c->extra_fields_iovec = mfree(c->extra_fields_iovec);
        c->extra_fields_n_iovec = 0;
        c->extra_fields_data = mfree(c->extra_fields_data);
        c->extra_fields_mtime = NSEC_INFINITY;

        c->log_level_max = -1;
}
175
/* Unregisters and destroys a cache entry. Accepts NULL; always returns NULL so callers can
 * write "c = client_context_free(s, c);". */
static ClientContext* client_context_free(Server *s, ClientContext *c) {
        assert(s);

        if (!c)
                return NULL;

        /* Every live context must be registered in the hashmap, hence removal cannot fail. */
        assert_se(hashmap_remove(s->client_contexts, PID_TO_PTR(c->pid)) == c);

        /* Unpinned entries also sit in the LRU prioq; drop them from there first. */
        if (c->in_lru)
                assert_se(prioq_remove(s->client_contexts_lru, c, &c->lru_index) >= 0);

        /* Release all owned allocations before freeing the entry itself. */
        client_context_reset(c);

        return mfree(c);
}
191
/* Fills in c->uid/c->gid. Kernel-provided credentials (ucred) are authoritative when valid;
 * otherwise we fall back to reading /proc, best-effort (failures keep the previous value). */
static void client_context_read_uid_gid(ClientContext *c, const struct ucred *ucred) {
        assert(c);
        assert(pid_is_valid(c->pid));

        /* The ucred data passed in is always the most current and accurate, if we have any. Use it. */
        if (ucred && uid_is_valid(ucred->uid))
                c->uid = ucred->uid;
        else
                (void) get_process_uid(c->pid, &c->uid);

        if (ucred && gid_is_valid(ucred->gid))
                c->gid = ucred->gid;
        else
                (void) get_process_gid(c->pid, &c->gid);
}
207
208static void client_context_read_basic(ClientContext *c) {
209 char *t;
210
211 assert(c);
212 assert(pid_is_valid(c->pid));
213
214 if (get_process_comm(c->pid, &t) >= 0)
215 free_and_replace(c->comm, t);
216
217 if (get_process_exe(c->pid, &t) >= 0)
218 free_and_replace(c->exe, t);
219
220 if (get_process_cmdline(c->pid, 0, false, &t) >= 0)
221 free_and_replace(c->cmdline, t);
222
223 if (get_process_capeff(c->pid, &t) >= 0)
224 free_and_replace(c->capeff, t);
225}
226
227static int client_context_read_label(
228 ClientContext *c,
229 const char *label, size_t label_size) {
230
231 assert(c);
232 assert(pid_is_valid(c->pid));
233 assert(label_size == 0 || label);
234
235 if (label_size > 0) {
236 char *l;
237
238 /* If we got an SELinux label passed in it counts. */
239
240 l = newdup_suffix0(char, label, label_size);
241 if (!l)
242 return -ENOMEM;
243
244 free_and_replace(c->label, l);
245 c->label_size = label_size;
246 }
349cc4a5 247#if HAVE_SELINUX
22e3a02b
LP
248 else {
249 char *con;
250
251 /* If we got no SELinux label passed in, let's try to acquire one */
252
253 if (getpidcon(c->pid, &con) >= 0) {
254 free_and_replace(c->label, con);
255 c->label_size = strlen(c->label);
256 }
257 }
258#endif
259
260 return 0;
261}
262
263static int client_context_read_cgroup(Server *s, ClientContext *c, const char *unit_id) {
264 char *t = NULL;
265 int r;
266
267 assert(c);
268
269 /* Try to acquire the current cgroup path */
270 r = cg_pid_get_path_shifted(c->pid, s->cgroup_root, &t);
271 if (r < 0) {
272
273 /* If that didn't work, we use the unit ID passed in as fallback, if we have nothing cached yet */
274 if (unit_id && !c->unit) {
275 c->unit = strdup(unit_id);
276 if (c->unit)
277 return 0;
278 }
279
280 return r;
281 }
282
283 /* Let's shortcut this if the cgroup path didn't change */
284 if (streq_ptr(c->cgroup, t)) {
285 free(t);
286 return 0;
287 }
288
289 free_and_replace(c->cgroup, t);
290
291 (void) cg_path_get_session(c->cgroup, &t);
292 free_and_replace(c->session, t);
293
294 if (cg_path_get_owner_uid(c->cgroup, &c->owner_uid) < 0)
295 c->owner_uid = UID_INVALID;
296
297 (void) cg_path_get_unit(c->cgroup, &t);
298 free_and_replace(c->unit, t);
299
300 (void) cg_path_get_user_unit(c->cgroup, &t);
301 free_and_replace(c->user_unit, t);
302
303 (void) cg_path_get_slice(c->cgroup, &t);
304 free_and_replace(c->slice, t);
305
306 (void) cg_path_get_user_slice(c->cgroup, &t);
307 free_and_replace(c->user_slice, t);
308
309 return 0;
310}
311
312static int client_context_read_invocation_id(
313 Server *s,
314 ClientContext *c) {
315
d3070fbd 316 _cleanup_free_ char *value = NULL;
22e3a02b
LP
317 const char *p;
318 int r;
319
320 assert(s);
321 assert(c);
322
d3070fbd 323 /* Read the invocation ID of a unit off a unit. PID 1 stores it in a per-unit symlink in /run/systemd/units/ */
22e3a02b 324
d3070fbd 325 if (!c->unit)
22e3a02b
LP
326 return 0;
327
d3070fbd
LP
328 p = strjoina("/run/systemd/units/invocation:", c->unit);
329 r = readlink_malloc(p, &value);
22e3a02b
LP
330 if (r < 0)
331 return r;
332
d3070fbd
LP
333 return sd_id128_from_string(value, &c->invocation_id);
334}
22e3a02b 335
d3070fbd
LP
336static int client_context_read_log_level_max(
337 Server *s,
338 ClientContext *c) {
22e3a02b 339
d3070fbd
LP
340 _cleanup_free_ char *value = NULL;
341 const char *p;
342 int r, ll;
343
344 if (!c->unit)
345 return 0;
346
347 p = strjoina("/run/systemd/units/log-level-max:", c->unit);
348 r = readlink_malloc(p, &value);
22e3a02b
LP
349 if (r < 0)
350 return r;
d3070fbd
LP
351
352 ll = log_level_from_string(value);
353 if (ll < 0)
22e3a02b 354 return -EINVAL;
22e3a02b 355
d3070fbd
LP
356 c->log_level_max = ll;
357 return 0;
358}
359
/* Reads the unit's extra journal fields file (written by PID 1 for LogExtraFields=) and caches
 * the parsed iovec array in the context. The file format is a sequence of
 * (le64 length, "FIELD=value" payload) records. Only rereads when the file's mtime changed
 * since the last successful parse. Returns 0 on success (including "no file"), negative
 * errno-style error otherwise. */
static int client_context_read_extra_fields(
                Server *s,
                ClientContext *c) {

        size_t size = 0, n_iovec = 0, n_allocated = 0, left;
        _cleanup_free_ struct iovec *iovec = NULL;
        _cleanup_free_ void *data = NULL;
        _cleanup_fclose_ FILE *f = NULL;
        struct stat st;
        const char *p;
        uint8_t *q;
        int r;

        if (!c->unit)
                return 0;

        p = strjoina("/run/systemd/units/log-extra-fields:", c->unit);

        /* If we parsed this file before, skip the work when its mtime is unchanged. */
        if (c->extra_fields_mtime != NSEC_INFINITY) {
                if (stat(p, &st) < 0) {
                        if (errno == ENOENT)
                                return 0;

                        return -errno;
                }

                if (timespec_load_nsec(&st.st_mtim) == c->extra_fields_mtime)
                        return 0;
        }

        f = fopen(p, "re");
        if (!f) {
                if (errno == ENOENT)
                        return 0;

                return -errno;
        }

        if (fstat(fileno(f), &st) < 0) /* The file might have been replaced since the stat() above, let's get a new
                                        * one, that matches the stuff we are reading */
                return -errno;

        r = read_full_stream(f, (char**) &data, &size);
        if (r < 0)
                return r;

        /* Walk the buffer record by record, validating each field and collecting iovecs that
         * point into 'data'. */
        q = data, left = size;
        while (left > 0) {
                uint8_t *field, *eq;
                uint64_t v, n;

                if (left < sizeof(uint64_t))
                        return -EBADMSG;

                v = unaligned_read_le64(q);
                if (v < 2) /* shortest valid payload is "X=" */
                        return -EBADMSG;

                /* NOTE(review): if v is within 8 of UINT64_MAX this addition wraps and the
                 * 'left < n' check below won't catch it — benign only because the file is
                 * written by PID 1, i.e. trusted. TODO confirm this is acceptable. */
                n = sizeof(uint64_t) + v;
                if (left < n)
                        return -EBADMSG;

                field = q + sizeof(uint64_t);

                /* Every record must look like "FIELD=value" with a valid journal field name. */
                eq = memchr(field, '=', v);
                if (!eq)
                        return -EBADMSG;

                if (!journal_field_valid((const char *) field, eq - field, false))
                        return -EBADMSG;

                if (!GREEDY_REALLOC(iovec, n_allocated, n_iovec+1))
                        return -ENOMEM;

                iovec[n_iovec++] = IOVEC_MAKE(field, v);

                left -= n, q += n;
        }

        /* Parsing succeeded in full: replace the cached copy. The iovecs point into 'data', so
         * both must be swapped together. */
        free(c->extra_fields_iovec);
        free(c->extra_fields_data);

        c->extra_fields_iovec = iovec;
        c->extra_fields_n_iovec = n_iovec;
        c->extra_fields_data = data;
        c->extra_fields_mtime = timespec_load_nsec(&st.st_mtim);

        /* Ownership transferred to the context; disarm the cleanup handlers. */
        iovec = NULL;
        data = NULL;

        return 0;
}
452
/* Unconditionally reacquires all metadata for this client: credentials, /proc basics, label,
 * audit data, cgroup-derived fields and the per-unit journal settings. Each acquisition is
 * best-effort. Finally stamps the entry with 'timestamp' (or the current monotonic time if
 * USEC_INFINITY) and reshuffles the LRU queue if the entry is unpinned. */
static void client_context_really_refresh(
                Server *s,
                ClientContext *c,
                const struct ucred *ucred,
                const char *label, size_t label_size,
                const char *unit_id,
                usec_t timestamp) {

        assert(s);
        assert(c);
        assert(pid_is_valid(c->pid));

        if (timestamp == USEC_INFINITY)
                timestamp = now(CLOCK_MONOTONIC);

        client_context_read_uid_gid(c, ucred);
        client_context_read_basic(c);
        (void) client_context_read_label(c, label, label_size);

        (void) audit_session_from_pid(c->pid, &c->auditid);
        (void) audit_loginuid_from_pid(c->pid, &c->loginuid);

        /* The cgroup read must precede the unit-keyed lookups below, since they use c->unit. */
        (void) client_context_read_cgroup(s, c, unit_id);
        (void) client_context_read_invocation_id(s, c);
        (void) client_context_read_log_level_max(s, c);
        (void) client_context_read_extra_fields(s, c);

        c->timestamp = timestamp;

        /* The timestamp is the prioq's primary sort key, hence reposition the entry. */
        if (c->in_lru) {
                assert(c->n_ref == 0);
                assert_se(prioq_reshuffle(s->client_contexts_lru, c, &c->lru_index) >= 0);
        }
}
487
/* Refreshes the cache entry if (and only if) it looks out of date: never filled in, expired
 * past MAX_USEC (unpinned entries are reset entirely first), older than REFRESH_USEC, or
 * contradicted by the credentials/label passed along with the current message. */
void client_context_maybe_refresh(
                Server *s,
                ClientContext *c,
                const struct ucred *ucred,
                const char *label, size_t label_size,
                const char *unit_id,
                usec_t timestamp) {

        assert(s);
        assert(c);

        if (timestamp == USEC_INFINITY)
                timestamp = now(CLOCK_MONOTONIC);

        /* No cached data so far? Let's fill it up */
        if (c->timestamp == USEC_INFINITY)
                goto refresh;

        /* If the data isn't pinned and if the cached data is older than the upper limit, we flush it out
         * entirely. This follows the logic that as long as an entry is pinned the PID reuse is unlikely. */
        if (c->n_ref == 0 && c->timestamp + MAX_USEC < timestamp) {
                client_context_reset(c);
                goto refresh;
        }

        /* If the data is older than the lower limit, we refresh, but keep the old data for all we can't update */
        if (c->timestamp + REFRESH_USEC < timestamp)
                goto refresh;

        /* If the data passed along doesn't match the cached data we also do a refresh */
        if (ucred && uid_is_valid(ucred->uid) && c->uid != ucred->uid)
                goto refresh;

        if (ucred && gid_is_valid(ucred->gid) && c->gid != ucred->gid)
                goto refresh;

        if (label_size > 0 && (label_size != c->label_size || memcmp(label, c->label, label_size) != 0))
                goto refresh;

        return;

refresh:
        client_context_really_refresh(s, c, ucred, label, label_size, unit_id, timestamp);
}
532
533static void client_context_try_shrink_to(Server *s, size_t limit) {
534 assert(s);
535
536 /* Bring the number of cache entries below the indicated limit, so that we can create a new entry without
537 * breaching the limit. Note that we only flush out entries that aren't pinned here. This means the number of
538 * cache entries may very well grow beyond the limit, if all entries stored remain pinned. */
539
540 while (hashmap_size(s->client_contexts) > limit) {
541 ClientContext *c;
542
543 c = prioq_pop(s->client_contexts_lru);
544 if (!c)
545 break; /* All remaining entries are pinned, give up */
546
547 assert(c->in_lru);
548 assert(c->n_ref == 0);
549
550 c->in_lru = false;
551
552 client_context_free(s, c);
553 }
554}
555
/* Tears down the whole cache, including the lookup structures. All external references must
 * have been dropped already; only the two internally pinned contexts are released here. */
void client_context_flush_all(Server *s) {
        assert(s);

        /* Flush out all remaining entries. This assumes all references are already dropped. */

        s->my_context = client_context_release(s, s->my_context);
        s->pid1_context = client_context_release(s, s->pid1_context);

        /* With the pins gone, shrinking to zero must empty the cache completely. */
        client_context_try_shrink_to(s, 0);

        assert(prioq_size(s->client_contexts_lru) == 0);
        assert(hashmap_size(s->client_contexts) == 0);

        s->client_contexts_lru = prioq_free(s->client_contexts_lru);
        s->client_contexts = hashmap_free(s->client_contexts);
}
572
/* Looks up (or creates) the cache entry for 'pid' and stores it in *ret. With add_ref=true the
 * entry is additionally pinned (taken off the LRU queue, refcount bumped); otherwise it stays
 * eligible for LRU eviction. Refreshes the entry's metadata as needed either way. Returns 0 on
 * success, negative errno-style error otherwise. */
static int client_context_get_internal(
                Server *s,
                pid_t pid,
                const struct ucred *ucred,
                const char *label, size_t label_len,
                const char *unit_id,
                bool add_ref,
                ClientContext **ret) {

        ClientContext *c;
        int r;

        assert(s);
        assert(ret);

        if (!pid_is_valid(pid))
                return -EINVAL;

        c = hashmap_get(s->client_contexts, PID_TO_PTR(pid));
        if (c) {

                if (add_ref) {
                        if (c->in_lru) {
                                /* The entry wasn't pinned so far, let's remove it from the LRU list then */
                                assert(c->n_ref == 0);
                                assert_se(prioq_remove(s->client_contexts_lru, c, &c->lru_index) >= 0);
                                c->in_lru = false;
                        }

                        c->n_ref++;
                }

                /* Existing entries are refreshed only when stale or contradicted by the message. */
                client_context_maybe_refresh(s, c, ucred, label, label_len, unit_id, USEC_INFINITY);

                *ret = c;
                return 0;
        }

        /* Make room before creating a new entry, so the cache stays within CACHE_MAX. */
        client_context_try_shrink_to(s, CACHE_MAX-1);

        r = client_context_new(s, pid, &c);
        if (r < 0)
                return r;

        if (add_ref)
                c->n_ref++;
        else {
                /* Unpinned entries go onto the LRU queue; if that fails, drop the entry again. */
                r = prioq_put(s->client_contexts_lru, c, &c->lru_index);
                if (r < 0) {
                        client_context_free(s, c);
                        return r;
                }

                c->in_lru = true;
        }

        /* Fresh entries always get a full, unconditional refresh. */
        client_context_really_refresh(s, c, ucred, label, label_len, unit_id, USEC_INFINITY);

        *ret = c;
        return 0;
}
634
/* Looks up/creates the cache entry for 'pid' without pinning it; the entry remains subject to
 * LRU eviction. See client_context_get_internal() for details. */
int client_context_get(
                Server *s,
                pid_t pid,
                const struct ucred *ucred,
                const char *label, size_t label_len,
                const char *unit_id,
                ClientContext **ret) {

        return client_context_get_internal(s, pid, ucred, label, label_len, unit_id, false, ret);
}
645
646int client_context_acquire(
647 Server *s,
648 pid_t pid,
649 const struct ucred *ucred,
650 const char *label, size_t label_len,
651 const char *unit_id,
652 ClientContext **ret) {
653
654 return client_context_get_internal(s, pid, ucred, label, label_len, unit_id, true, ret);
655};
656
657ClientContext *client_context_release(Server *s, ClientContext *c) {
658 assert(s);
659
660 if (!c)
661 return NULL;
662
663 assert(c->n_ref > 0);
664 assert(!c->in_lru);
665
666 c->n_ref--;
667 if (c->n_ref > 0)
668 return NULL;
669
670 /* The entry is not pinned anymore, let's add it to the LRU prioq if we can. If we can't we'll drop it
671 * right-away */
672
673 if (prioq_put(s->client_contexts_lru, c, &c->lru_index) < 0)
674 client_context_free(s, c);
675 else
676 c->in_lru = true;
677
678 return NULL;
679}
680
/* Pins the cache entries for our own process and for PID 1, so they are always available —
 * our own context in particular is needed for generating driver messages. Both acquisitions
 * are best-effort: failures are logged and ignored. */
void client_context_acquire_default(Server *s) {
        int r;

        assert(s);

        /* Ensure that our own and PID1's contexts are always pinned. Our own context is particularly useful to
         * generate driver messages. */

        if (!s->my_context) {
                /* We know our own credentials, so pass them in directly instead of reading /proc. */
                struct ucred ucred = {
                        .pid = getpid_cached(),
                        .uid = getuid(),
                        .gid = getgid(),
                };

                r = client_context_acquire(s, ucred.pid, &ucred, NULL, 0, NULL, &s->my_context);
                if (r < 0)
                        log_warning_errno(r, "Failed to acquire our own context, ignoring: %m");
        }

        if (!s->pid1_context) {

                r = client_context_acquire(s, 1, NULL, NULL, 0, NULL, &s->pid1_context);
                if (r < 0)
                        log_warning_errno(r, "Failed to acquire PID1's context, ignoring: %m");

        }
}