]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/journal/journald-context.c
Merge pull request #7388 from keszybz/doc-tweak
[thirdparty/systemd.git] / src / journal / journald-context.c
CommitLineData
22e3a02b
LP
1/***
2 This file is part of systemd.
3
4 Copyright 2017 Lennart Poettering
5
6 systemd is free software; you can redistribute it and/or modify it
7 under the terms of the GNU Lesser General Public License as published by
8 the Free Software Foundation; either version 2.1 of the License, or
9 (at your option) any later version.
10
11 systemd is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public License
17 along with systemd; If not, see <http://www.gnu.org/licenses/>.
18***/
19
349cc4a5 20#if HAVE_SELINUX
22e3a02b
LP
21#include <selinux/selinux.h>
22#endif
23
24#include "alloc-util.h"
25#include "audit-util.h"
26#include "cgroup-util.h"
d3070fbd
LP
27#include "fd-util.h"
28#include "fileio.h"
29#include "fs-util.h"
30#include "io-util.h"
31#include "journal-util.h"
22e3a02b
LP
32#include "journald-context.h"
33#include "process-util.h"
34#include "string-util.h"
d3070fbd
LP
35#include "syslog-util.h"
36#include "unaligned.h"
22e3a02b
LP
37#include "user-util.h"
38
39/* This implements a metadata cache for clients, which are identified by their PID. Requesting metadata through /proc
40 * is expensive, hence let's cache the data if we can. Note that this means the metadata might be out-of-date when we
41 * store it, but it might already be anyway, as we request the data asynchronously from /proc at a different time the
42 * log entry was originally created. We hence just increase the "window of inaccuracy" a bit.
43 *
44 * The cache is indexed by the PID. Entries may be "pinned" in the cache, in which case the entries are not removed
45 * until they are unpinned. Unpinned entries are kept around until cache pressure is seen. Cache entries older than 5s
46 * are never used (a sad attempt to deal with the UNIX weakness of PIDs reuse), cache entries older than 1s are
47 * refreshed in an incremental way (meaning: data is reread from /proc, but any old data we can't refresh is not
48 * flushed out). Data newer than 1s is used immediately without refresh.
49 *
50 * Log stream clients (i.e. all clients using the AF_UNIX/SOCK_STREAM stdout/stderr transport) will pin a cache entry
51 * as long as their socket is connected. Note that cache entries are shared between different transports. That means a
52 * cache entry pinned for the stream connection logic may be reused for the syslog or native protocols.
53 *
54 * Caching metadata like this has two major benefits:
55 *
56 * 1. Reading metadata is expensive, and we can thus substantially speed up log processing under flood.
57 *
58 * 2. Because metadata caching is shared between stream and datagram transports and stream connections pin a cache
59 * entry there's a good chance we can properly map a substantial set of datagram log messages to their originating
60 * service, as all services (unless explicitly configured otherwise) will have their stdout/stderr connected to a
61 * stream connection. This should improve cases where a service process logs immediately before exiting and we
62 * previously had trouble associating the log message with the service.
63 *
64 * NB: With and without the metadata cache: the implicitly added entry metadata in the journal (with the exception of
 65 * UID/PID/GID and SELinux label) must be understood as possibly slightly out of sync (i.e. sometimes slightly older
66 * and sometimes slightly newer than what was current at the log event).
67 */
68
69/* We refresh every 1s */
70#define REFRESH_USEC (1*USEC_PER_SEC)
71
72/* Data older than 5s we flush out */
73#define MAX_USEC (5*USEC_PER_SEC)
74
75/* Keep at most 16K entries in the cache. (Note though that this limit may be violated if enough streams pin entries in
76 * the cache, in which case we *do* permit this limit to be breached. That's safe however, as the number of stream
77 * clients itself is limited.) */
78#define CACHE_MAX (16*1024)
79
80static int client_context_compare(const void *a, const void *b) {
81 const ClientContext *x = a, *y = b;
82
83 if (x->timestamp < y->timestamp)
84 return -1;
85 if (x->timestamp > y->timestamp)
86 return 1;
87
88 if (x->pid < y->pid)
89 return -1;
90 if (x->pid > y->pid)
91 return 1;
92
93 return 0;
94}
95
96static int client_context_new(Server *s, pid_t pid, ClientContext **ret) {
97 ClientContext *c;
98 int r;
99
100 assert(s);
101 assert(pid_is_valid(pid));
102 assert(ret);
103
104 r = hashmap_ensure_allocated(&s->client_contexts, NULL);
105 if (r < 0)
106 return r;
107
108 r = prioq_ensure_allocated(&s->client_contexts_lru, client_context_compare);
109 if (r < 0)
110 return r;
111
112 c = new0(ClientContext, 1);
113 if (!c)
114 return -ENOMEM;
115
116 c->pid = pid;
117
118 c->uid = UID_INVALID;
119 c->gid = GID_INVALID;
120 c->auditid = AUDIT_SESSION_INVALID;
121 c->loginuid = UID_INVALID;
122 c->owner_uid = UID_INVALID;
123 c->lru_index = PRIOQ_IDX_NULL;
124 c->timestamp = USEC_INFINITY;
d3070fbd
LP
125 c->extra_fields_mtime = NSEC_INFINITY;
126 c->log_level_max = -1;
22e3a02b
LP
127
128 r = hashmap_put(s->client_contexts, PID_TO_PTR(pid), c);
129 if (r < 0) {
130 free(c);
131 return r;
132 }
133
134 *ret = c;
135 return 0;
136}
137
138static void client_context_reset(ClientContext *c) {
139 assert(c);
140
141 c->timestamp = USEC_INFINITY;
142
143 c->uid = UID_INVALID;
144 c->gid = GID_INVALID;
145
146 c->comm = mfree(c->comm);
147 c->exe = mfree(c->exe);
148 c->cmdline = mfree(c->cmdline);
149 c->capeff = mfree(c->capeff);
150
151 c->auditid = AUDIT_SESSION_INVALID;
152 c->loginuid = UID_INVALID;
153
154 c->cgroup = mfree(c->cgroup);
155 c->session = mfree(c->session);
156 c->owner_uid = UID_INVALID;
157 c->unit = mfree(c->unit);
158 c->user_unit = mfree(c->user_unit);
159 c->slice = mfree(c->slice);
160 c->user_slice = mfree(c->user_slice);
161
162 c->invocation_id = SD_ID128_NULL;
163
164 c->label = mfree(c->label);
165 c->label_size = 0;
d3070fbd
LP
166
167 c->extra_fields_iovec = mfree(c->extra_fields_iovec);
168 c->extra_fields_n_iovec = 0;
169 c->extra_fields_data = mfree(c->extra_fields_data);
170 c->extra_fields_mtime = NSEC_INFINITY;
171
172 c->log_level_max = -1;
22e3a02b
LP
173}
174
175static ClientContext* client_context_free(Server *s, ClientContext *c) {
176 assert(s);
177
178 if (!c)
179 return NULL;
180
181 assert_se(hashmap_remove(s->client_contexts, PID_TO_PTR(c->pid)) == c);
182
183 if (c->in_lru)
184 assert_se(prioq_remove(s->client_contexts_lru, c, &c->lru_index) >= 0);
185
186 client_context_reset(c);
187
188 return mfree(c);
189}
190
191static void client_context_read_uid_gid(ClientContext *c, const struct ucred *ucred) {
192 assert(c);
193 assert(pid_is_valid(c->pid));
194
195 /* The ucred data passed in is always the most current and accurate, if we have any. Use it. */
196 if (ucred && uid_is_valid(ucred->uid))
197 c->uid = ucred->uid;
198 else
199 (void) get_process_uid(c->pid, &c->uid);
200
201 if (ucred && gid_is_valid(ucred->gid))
202 c->gid = ucred->gid;
203 else
204 (void) get_process_gid(c->pid, &c->gid);
205}
206
207static void client_context_read_basic(ClientContext *c) {
208 char *t;
209
210 assert(c);
211 assert(pid_is_valid(c->pid));
212
213 if (get_process_comm(c->pid, &t) >= 0)
214 free_and_replace(c->comm, t);
215
216 if (get_process_exe(c->pid, &t) >= 0)
217 free_and_replace(c->exe, t);
218
219 if (get_process_cmdline(c->pid, 0, false, &t) >= 0)
220 free_and_replace(c->cmdline, t);
221
222 if (get_process_capeff(c->pid, &t) >= 0)
223 free_and_replace(c->capeff, t);
224}
225
226static int client_context_read_label(
227 ClientContext *c,
228 const char *label, size_t label_size) {
229
230 assert(c);
231 assert(pid_is_valid(c->pid));
232 assert(label_size == 0 || label);
233
234 if (label_size > 0) {
235 char *l;
236
237 /* If we got an SELinux label passed in it counts. */
238
239 l = newdup_suffix0(char, label, label_size);
240 if (!l)
241 return -ENOMEM;
242
243 free_and_replace(c->label, l);
244 c->label_size = label_size;
245 }
349cc4a5 246#if HAVE_SELINUX
22e3a02b
LP
247 else {
248 char *con;
249
250 /* If we got no SELinux label passed in, let's try to acquire one */
251
252 if (getpidcon(c->pid, &con) >= 0) {
253 free_and_replace(c->label, con);
254 c->label_size = strlen(c->label);
255 }
256 }
257#endif
258
259 return 0;
260}
261
262static int client_context_read_cgroup(Server *s, ClientContext *c, const char *unit_id) {
263 char *t = NULL;
264 int r;
265
266 assert(c);
267
268 /* Try to acquire the current cgroup path */
269 r = cg_pid_get_path_shifted(c->pid, s->cgroup_root, &t);
270 if (r < 0) {
271
272 /* If that didn't work, we use the unit ID passed in as fallback, if we have nothing cached yet */
273 if (unit_id && !c->unit) {
274 c->unit = strdup(unit_id);
275 if (c->unit)
276 return 0;
277 }
278
279 return r;
280 }
281
282 /* Let's shortcut this if the cgroup path didn't change */
283 if (streq_ptr(c->cgroup, t)) {
284 free(t);
285 return 0;
286 }
287
288 free_and_replace(c->cgroup, t);
289
290 (void) cg_path_get_session(c->cgroup, &t);
291 free_and_replace(c->session, t);
292
293 if (cg_path_get_owner_uid(c->cgroup, &c->owner_uid) < 0)
294 c->owner_uid = UID_INVALID;
295
296 (void) cg_path_get_unit(c->cgroup, &t);
297 free_and_replace(c->unit, t);
298
299 (void) cg_path_get_user_unit(c->cgroup, &t);
300 free_and_replace(c->user_unit, t);
301
302 (void) cg_path_get_slice(c->cgroup, &t);
303 free_and_replace(c->slice, t);
304
305 (void) cg_path_get_user_slice(c->cgroup, &t);
306 free_and_replace(c->user_slice, t);
307
308 return 0;
309}
310
311static int client_context_read_invocation_id(
312 Server *s,
313 ClientContext *c) {
314
d3070fbd 315 _cleanup_free_ char *value = NULL;
22e3a02b
LP
316 const char *p;
317 int r;
318
319 assert(s);
320 assert(c);
321
d3070fbd 322 /* Read the invocation ID of a unit off a unit. PID 1 stores it in a per-unit symlink in /run/systemd/units/ */
22e3a02b 323
d3070fbd 324 if (!c->unit)
22e3a02b
LP
325 return 0;
326
d3070fbd
LP
327 p = strjoina("/run/systemd/units/invocation:", c->unit);
328 r = readlink_malloc(p, &value);
22e3a02b
LP
329 if (r < 0)
330 return r;
331
d3070fbd
LP
332 return sd_id128_from_string(value, &c->invocation_id);
333}
22e3a02b 334
d3070fbd
LP
335static int client_context_read_log_level_max(
336 Server *s,
337 ClientContext *c) {
22e3a02b 338
d3070fbd
LP
339 _cleanup_free_ char *value = NULL;
340 const char *p;
341 int r, ll;
342
343 if (!c->unit)
344 return 0;
345
346 p = strjoina("/run/systemd/units/log-level-max:", c->unit);
347 r = readlink_malloc(p, &value);
22e3a02b
LP
348 if (r < 0)
349 return r;
d3070fbd
LP
350
351 ll = log_level_from_string(value);
352 if (ll < 0)
22e3a02b 353 return -EINVAL;
22e3a02b 354
d3070fbd
LP
355 c->log_level_max = ll;
356 return 0;
357}
358
359static int client_context_read_extra_fields(
360 Server *s,
361 ClientContext *c) {
362
363 size_t size = 0, n_iovec = 0, n_allocated = 0, left;
364 _cleanup_free_ struct iovec *iovec = NULL;
365 _cleanup_free_ void *data = NULL;
366 _cleanup_fclose_ FILE *f = NULL;
367 struct stat st;
368 const char *p;
369 uint8_t *q;
370 int r;
371
372 if (!c->unit)
373 return 0;
374
375 p = strjoina("/run/systemd/units/log-extra-fields:", c->unit);
376
377 if (c->extra_fields_mtime != NSEC_INFINITY) {
378 if (stat(p, &st) < 0) {
379 if (errno == ENOENT)
380 return 0;
381
382 return -errno;
383 }
384
385 if (timespec_load_nsec(&st.st_mtim) == c->extra_fields_mtime)
386 return 0;
387 }
388
389 f = fopen(p, "re");
390 if (!f) {
391 if (errno == ENOENT)
392 return 0;
393
394 return -errno;
395 }
396
397 if (fstat(fileno(f), &st) < 0) /* The file might have been replaced since the stat() above, let's get a new
398 * one, that matches the stuff we are reading */
399 return -errno;
400
401 r = read_full_stream(f, (char**) &data, &size);
402 if (r < 0)
403 return r;
404
405 q = data, left = size;
406 while (left > 0) {
407 uint8_t *field, *eq;
408 uint64_t v, n;
409
410 if (left < sizeof(uint64_t))
411 return -EBADMSG;
412
413 v = unaligned_read_le64(q);
414 if (v < 2)
415 return -EBADMSG;
416
417 n = sizeof(uint64_t) + v;
418 if (left < n)
419 return -EBADMSG;
420
421 field = q + sizeof(uint64_t);
422
423 eq = memchr(field, '=', v);
424 if (!eq)
425 return -EBADMSG;
426
427 if (!journal_field_valid((const char *) field, eq - field, false))
428 return -EBADMSG;
429
430 if (!GREEDY_REALLOC(iovec, n_allocated, n_iovec+1))
431 return -ENOMEM;
432
433 iovec[n_iovec++] = IOVEC_MAKE(field, v);
434
435 left -= n, q += n;
436 }
437
438 free(c->extra_fields_iovec);
439 free(c->extra_fields_data);
440
441 c->extra_fields_iovec = iovec;
442 c->extra_fields_n_iovec = n_iovec;
443 c->extra_fields_data = data;
444 c->extra_fields_mtime = timespec_load_nsec(&st.st_mtim);
445
446 iovec = NULL;
447 data = NULL;
448
449 return 0;
22e3a02b
LP
450}
451
452static void client_context_really_refresh(
453 Server *s,
454 ClientContext *c,
455 const struct ucred *ucred,
456 const char *label, size_t label_size,
457 const char *unit_id,
458 usec_t timestamp) {
459
460 assert(s);
461 assert(c);
462 assert(pid_is_valid(c->pid));
463
464 if (timestamp == USEC_INFINITY)
465 timestamp = now(CLOCK_MONOTONIC);
466
467 client_context_read_uid_gid(c, ucred);
468 client_context_read_basic(c);
469 (void) client_context_read_label(c, label, label_size);
470
471 (void) audit_session_from_pid(c->pid, &c->auditid);
472 (void) audit_loginuid_from_pid(c->pid, &c->loginuid);
473
474 (void) client_context_read_cgroup(s, c, unit_id);
475 (void) client_context_read_invocation_id(s, c);
d3070fbd
LP
476 (void) client_context_read_log_level_max(s, c);
477 (void) client_context_read_extra_fields(s, c);
22e3a02b
LP
478
479 c->timestamp = timestamp;
480
481 if (c->in_lru) {
482 assert(c->n_ref == 0);
483 assert_se(prioq_reshuffle(s->client_contexts_lru, c, &c->lru_index) >= 0);
484 }
485}
486
487void client_context_maybe_refresh(
488 Server *s,
489 ClientContext *c,
490 const struct ucred *ucred,
491 const char *label, size_t label_size,
492 const char *unit_id,
493 usec_t timestamp) {
494
495 assert(s);
496 assert(c);
497
498 if (timestamp == USEC_INFINITY)
499 timestamp = now(CLOCK_MONOTONIC);
500
501 /* No cached data so far? Let's fill it up */
502 if (c->timestamp == USEC_INFINITY)
503 goto refresh;
504
505 /* If the data isn't pinned and if the cashed data is older than the upper limit, we flush it out
506 * entirely. This follows the logic that as long as an entry is pinned the PID reuse is unlikely. */
507 if (c->n_ref == 0 && c->timestamp + MAX_USEC < timestamp) {
508 client_context_reset(c);
509 goto refresh;
510 }
511
512 /* If the data is older than the lower limit, we refresh, but keep the old data for all we can't update */
513 if (c->timestamp + REFRESH_USEC < timestamp)
514 goto refresh;
515
516 /* If the data passed along doesn't match the cached data we also do a refresh */
517 if (ucred && uid_is_valid(ucred->uid) && c->uid != ucred->uid)
518 goto refresh;
519
520 if (ucred && gid_is_valid(ucred->gid) && c->gid != ucred->gid)
521 goto refresh;
522
523 if (label_size > 0 && (label_size != c->label_size || memcmp(label, c->label, label_size) != 0))
524 goto refresh;
525
526 return;
527
528refresh:
529 client_context_really_refresh(s, c, ucred, label, label_size, unit_id, timestamp);
530}
531
532static void client_context_try_shrink_to(Server *s, size_t limit) {
533 assert(s);
534
535 /* Bring the number of cache entries below the indicated limit, so that we can create a new entry without
536 * breaching the limit. Note that we only flush out entries that aren't pinned here. This means the number of
537 * cache entries may very well grow beyond the limit, if all entries stored remain pinned. */
538
539 while (hashmap_size(s->client_contexts) > limit) {
540 ClientContext *c;
541
542 c = prioq_pop(s->client_contexts_lru);
543 if (!c)
544 break; /* All remaining entries are pinned, give up */
545
546 assert(c->in_lru);
547 assert(c->n_ref == 0);
548
549 c->in_lru = false;
550
551 client_context_free(s, c);
552 }
553}
554
555void client_context_flush_all(Server *s) {
556 assert(s);
557
558 /* Flush out all remaining entries. This assumes all references are already dropped. */
559
560 s->my_context = client_context_release(s, s->my_context);
561 s->pid1_context = client_context_release(s, s->pid1_context);
562
563 client_context_try_shrink_to(s, 0);
564
565 assert(prioq_size(s->client_contexts_lru) == 0);
566 assert(hashmap_size(s->client_contexts) == 0);
567
568 s->client_contexts_lru = prioq_free(s->client_contexts_lru);
569 s->client_contexts = hashmap_free(s->client_contexts);
570}
571
572static int client_context_get_internal(
573 Server *s,
574 pid_t pid,
575 const struct ucred *ucred,
576 const char *label, size_t label_len,
577 const char *unit_id,
578 bool add_ref,
579 ClientContext **ret) {
580
581 ClientContext *c;
582 int r;
583
584 assert(s);
585 assert(ret);
586
587 if (!pid_is_valid(pid))
588 return -EINVAL;
589
590 c = hashmap_get(s->client_contexts, PID_TO_PTR(pid));
591 if (c) {
592
593 if (add_ref) {
594 if (c->in_lru) {
595 /* The entry wasn't pinned so far, let's remove it from the LRU list then */
596 assert(c->n_ref == 0);
597 assert_se(prioq_remove(s->client_contexts_lru, c, &c->lru_index) >= 0);
598 c->in_lru = false;
599 }
600
601 c->n_ref++;
602 }
603
604 client_context_maybe_refresh(s, c, ucred, label, label_len, unit_id, USEC_INFINITY);
605
606 *ret = c;
607 return 0;
608 }
609
610 client_context_try_shrink_to(s, CACHE_MAX-1);
611
612 r = client_context_new(s, pid, &c);
613 if (r < 0)
614 return r;
615
616 if (add_ref)
617 c->n_ref++;
618 else {
619 r = prioq_put(s->client_contexts_lru, c, &c->lru_index);
620 if (r < 0) {
621 client_context_free(s, c);
622 return r;
623 }
624
625 c->in_lru = true;
626 }
627
628 client_context_really_refresh(s, c, ucred, label, label_len, unit_id, USEC_INFINITY);
629
630 *ret = c;
631 return 0;
632}
633
634int client_context_get(
635 Server *s,
636 pid_t pid,
637 const struct ucred *ucred,
638 const char *label, size_t label_len,
639 const char *unit_id,
640 ClientContext **ret) {
641
642 return client_context_get_internal(s, pid, ucred, label, label_len, unit_id, false, ret);
643}
644
645int client_context_acquire(
646 Server *s,
647 pid_t pid,
648 const struct ucred *ucred,
649 const char *label, size_t label_len,
650 const char *unit_id,
651 ClientContext **ret) {
652
653 return client_context_get_internal(s, pid, ucred, label, label_len, unit_id, true, ret);
654};
655
656ClientContext *client_context_release(Server *s, ClientContext *c) {
657 assert(s);
658
659 if (!c)
660 return NULL;
661
662 assert(c->n_ref > 0);
663 assert(!c->in_lru);
664
665 c->n_ref--;
666 if (c->n_ref > 0)
667 return NULL;
668
669 /* The entry is not pinned anymore, let's add it to the LRU prioq if we can. If we can't we'll drop it
670 * right-away */
671
672 if (prioq_put(s->client_contexts_lru, c, &c->lru_index) < 0)
673 client_context_free(s, c);
674 else
675 c->in_lru = true;
676
677 return NULL;
678}
679
680void client_context_acquire_default(Server *s) {
681 int r;
682
683 assert(s);
684
685 /* Ensure that our own and PID1's contexts are always pinned. Our own context is particularly useful to
686 * generate driver messages. */
687
688 if (!s->my_context) {
689 struct ucred ucred = {
690 .pid = getpid_cached(),
691 .uid = getuid(),
692 .gid = getgid(),
693 };
694
695 r = client_context_acquire(s, ucred.pid, &ucred, NULL, 0, NULL, &s->my_context);
696 if (r < 0)
697 log_warning_errno(r, "Failed to acquire our own context, ignoring: %m");
698 }
699
700 if (!s->pid1_context) {
701
702 r = client_context_acquire(s, 1, NULL, NULL, 0, NULL, &s->pid1_context);
703 if (r < 0)
704 log_warning_errno(r, "Failed to acquire PID1's context, ignoring: %m");
705
706 }
707}