]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/journal/journald-context.c
tree-wide: use TAKE_PTR() and TAKE_FD() macros
[thirdparty/systemd.git] / src / journal / journald-context.c
1 /* SPDX-License-Identifier: LGPL-2.1+ */
2 /***
3 This file is part of systemd.
4
5 Copyright 2017 Lennart Poettering
6
7 systemd is free software; you can redistribute it and/or modify it
8 under the terms of the GNU Lesser General Public License as published by
9 the Free Software Foundation; either version 2.1 of the License, or
10 (at your option) any later version.
11
12 systemd is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 Lesser General Public License for more details.
16
17 You should have received a copy of the GNU Lesser General Public License
18 along with systemd; If not, see <http://www.gnu.org/licenses/>.
19 ***/
20
21 #if HAVE_SELINUX
22 #include <selinux/selinux.h>
23 #endif
24
25 #include "alloc-util.h"
26 #include "audit-util.h"
27 #include "cgroup-util.h"
28 #include "fd-util.h"
29 #include "fileio.h"
30 #include "fs-util.h"
31 #include "io-util.h"
32 #include "journal-util.h"
33 #include "journald-context.h"
34 #include "process-util.h"
35 #include "string-util.h"
36 #include "syslog-util.h"
37 #include "unaligned.h"
38 #include "user-util.h"
39
40 /* This implements a metadata cache for clients, which are identified by their PID. Requesting metadata through /proc
41 * is expensive, hence let's cache the data if we can. Note that this means the metadata might be out-of-date when we
42 * store it, but it might already be anyway, as we request the data asynchronously from /proc at a different time the
43 * log entry was originally created. We hence just increase the "window of inaccuracy" a bit.
44 *
45 * The cache is indexed by the PID. Entries may be "pinned" in the cache, in which case the entries are not removed
46 * until they are unpinned. Unpinned entries are kept around until cache pressure is seen. Cache entries older than 5s
47 * are never used (a sad attempt to deal with the UNIX weakness of PIDs reuse), cache entries older than 1s are
48 * refreshed in an incremental way (meaning: data is reread from /proc, but any old data we can't refresh is not
49 * flushed out). Data newer than 1s is used immediately without refresh.
50 *
51 * Log stream clients (i.e. all clients using the AF_UNIX/SOCK_STREAM stdout/stderr transport) will pin a cache entry
52 * as long as their socket is connected. Note that cache entries are shared between different transports. That means a
53 * cache entry pinned for the stream connection logic may be reused for the syslog or native protocols.
54 *
55 * Caching metadata like this has two major benefits:
56 *
57 * 1. Reading metadata is expensive, and we can thus substantially speed up log processing under flood.
58 *
59 * 2. Because metadata caching is shared between stream and datagram transports and stream connections pin a cache
60 * entry there's a good chance we can properly map a substantial set of datagram log messages to their originating
61 * service, as all services (unless explicitly configured otherwise) will have their stdout/stderr connected to a
62 * stream connection. This should improve cases where a service process logs immediately before exiting and we
63 * previously had trouble associating the log message with the service.
64 *
65 * NB: With and without the metadata cache: the implicitly added entry metadata in the journal (with the exception of
 * UID/PID/GID and SELinux label) must be understood as possibly slightly out of sync (i.e. sometimes slightly older
67 * and sometimes slightly newer than what was current at the log event).
68 */
69
70 /* We refresh every 1s */
71 #define REFRESH_USEC (1*USEC_PER_SEC)
72
73 /* Data older than 5s we flush out */
74 #define MAX_USEC (5*USEC_PER_SEC)
75
76 /* Keep at most 16K entries in the cache. (Note though that this limit may be violated if enough streams pin entries in
77 * the cache, in which case we *do* permit this limit to be breached. That's safe however, as the number of stream
78 * clients itself is limited.) */
79 #define CACHE_MAX (16*1024)
80
81 static int client_context_compare(const void *a, const void *b) {
82 const ClientContext *x = a, *y = b;
83
84 if (x->timestamp < y->timestamp)
85 return -1;
86 if (x->timestamp > y->timestamp)
87 return 1;
88
89 if (x->pid < y->pid)
90 return -1;
91 if (x->pid > y->pid)
92 return 1;
93
94 return 0;
95 }
96
/* Allocates and registers a fresh cache entry for the given PID.
 *
 * The entry is inserted into s->client_contexts (keyed by PID) but NOT into
 * the LRU prioq: the caller decides whether to pin it (bump n_ref) or queue
 * it for eviction. All metadata fields start out "unset", so a refresh is
 * needed before the entry carries useful data.
 *
 * Returns 0 and stores the new entry in *ret, or a negative errno. */
static int client_context_new(Server *s, pid_t pid, ClientContext **ret) {
        ClientContext *c;
        int r;

        assert(s);
        assert(pid_is_valid(pid));
        assert(ret);

        /* Lazily allocate the lookup structures before creating the entry */
        r = hashmap_ensure_allocated(&s->client_contexts, NULL);
        if (r < 0)
                return r;

        r = prioq_ensure_allocated(&s->client_contexts_lru, client_context_compare);
        if (r < 0)
                return r;

        c = new0(ClientContext, 1);
        if (!c)
                return -ENOMEM;

        c->pid = pid;

        /* Fields whose "unset" representation is not all-zeroes need explicit
         * initialization on top of the zeroing new0() already did */
        c->uid = UID_INVALID;
        c->gid = GID_INVALID;
        c->auditid = AUDIT_SESSION_INVALID;
        c->loginuid = UID_INVALID;
        c->owner_uid = UID_INVALID;
        c->lru_index = PRIOQ_IDX_NULL;
        c->timestamp = USEC_INFINITY;
        c->extra_fields_mtime = NSEC_INFINITY;
        c->log_level_max = -1;

        r = hashmap_put(s->client_contexts, PID_TO_PTR(pid), c);
        if (r < 0) {
                /* Nothing else owns c yet, hence a plain free() suffices */
                free(c);
                return r;
        }

        *ret = c;
        return 0;
}
138
/* Wipes all cached metadata from an entry, releasing every owned allocation
 * and returning the fields to the same "unset" state client_context_new()
 * establishes. The PID, LRU index, pin count and in_lru flag are deliberately
 * left alone — this resets the *data*, not the entry's cache bookkeeping. */
static void client_context_reset(ClientContext *c) {
        assert(c);

        c->timestamp = USEC_INFINITY;

        c->uid = UID_INVALID;
        c->gid = GID_INVALID;

        c->comm = mfree(c->comm);
        c->exe = mfree(c->exe);
        c->cmdline = mfree(c->cmdline);
        c->capeff = mfree(c->capeff);

        c->auditid = AUDIT_SESSION_INVALID;
        c->loginuid = UID_INVALID;

        c->cgroup = mfree(c->cgroup);
        c->session = mfree(c->session);
        c->owner_uid = UID_INVALID;
        c->unit = mfree(c->unit);
        c->user_unit = mfree(c->user_unit);
        c->slice = mfree(c->slice);
        c->user_slice = mfree(c->user_slice);

        c->invocation_id = SD_ID128_NULL;

        c->label = mfree(c->label);
        c->label_size = 0;

        /* The iovec array and its backing buffer are owned by the entry and
         * freed together; the count must be zeroed alongside them */
        c->extra_fields_iovec = mfree(c->extra_fields_iovec);
        c->extra_fields_n_iovec = 0;
        c->extra_fields_data = mfree(c->extra_fields_data);
        c->extra_fields_mtime = NSEC_INFINITY;

        c->log_level_max = -1;
}
175
/* Unregisters and destroys a cache entry. Accepts NULL (no-op) and always
 * returns NULL, so callers can write `c = client_context_free(s, c);`. */
static ClientContext* client_context_free(Server *s, ClientContext *c) {
        assert(s);

        if (!c)
                return NULL;

        /* The entry must still be registered under its PID */
        assert_se(hashmap_remove(s->client_contexts, PID_TO_PTR(c->pid)) == c);

        /* If it is queued for LRU eviction, dequeue it first */
        if (c->in_lru)
                assert_se(prioq_remove(s->client_contexts_lru, c, &c->lru_index) >= 0);

        /* Release all strings/buffers hanging off the entry, then the entry itself */
        client_context_reset(c);

        return mfree(c);
}
191
/* Fills in the client's UID/GID, preferring the kernel-verified ucred data
 * (if valid) over a best-effort read from /proc. Failures of the /proc reads
 * are deliberately ignored: the fields then simply stay at their old value. */
static void client_context_read_uid_gid(ClientContext *c, const struct ucred *ucred) {
        assert(c);
        assert(pid_is_valid(c->pid));

        /* The ucred data passed in is always the most current and accurate, if we have any. Use it. */
        if (ucred && uid_is_valid(ucred->uid))
                c->uid = ucred->uid;
        else
                (void) get_process_uid(c->pid, &c->uid);

        if (ucred && gid_is_valid(ucred->gid))
                c->gid = ucred->gid;
        else
                (void) get_process_gid(c->pid, &c->gid);
}
207
/* Best-effort refresh of the basic per-process metadata (comm, exe, cmdline,
 * effective capabilities) from /proc. Each field is only replaced when its
 * read succeeds; on failure the previously cached value survives. */
static void client_context_read_basic(ClientContext *c) {
        char *t;

        assert(c);
        assert(pid_is_valid(c->pid));

        if (get_process_comm(c->pid, &t) >= 0)
                free_and_replace(c->comm, t);

        if (get_process_exe(c->pid, &t) >= 0)
                free_and_replace(c->exe, t);

        if (get_process_cmdline(c->pid, 0, false, &t) >= 0)
                free_and_replace(c->cmdline, t);

        if (get_process_capeff(c->pid, &t) >= 0)
                free_and_replace(c->capeff, t);
}
226
/* Stores the client's security label. A label passed in by the caller (e.g.
 * from SCM_SECURITY) takes precedence; without one, and when built with
 * SELinux support, the label is queried from the kernel via getpidcon().
 * Returns 0 on success (including the "no label available" case), -ENOMEM on
 * allocation failure. */
static int client_context_read_label(
                ClientContext *c,
                const char *label, size_t label_size) {

        assert(c);
        assert(pid_is_valid(c->pid));
        assert(label_size == 0 || label);

        if (label_size > 0) {
                char *l;

                /* If we got an SELinux label passed in it counts. */

                /* The incoming label is not necessarily NUL-terminated; make a
                 * terminated private copy of exactly label_size bytes */
                l = newdup_suffix0(char, label, label_size);
                if (!l)
                        return -ENOMEM;

                free_and_replace(c->label, l);
                c->label_size = label_size;
        }
#if HAVE_SELINUX
        else {
                char *con;

                /* If we got no SELinux label passed in, let's try to acquire one */

                if (getpidcon(c->pid, &con) >= 0) {
                        free_and_replace(c->label, con);
                        c->label_size = strlen(c->label);
                }
        }
#endif

        return 0;
}
262
/* Refreshes the cgroup path of the client and everything derived from it
 * (session, owner UID, system/user unit, system/user slice). If the cgroup
 * can't be determined, the unit_id supplied by the client is used as a
 * fallback for c->unit — but only if nothing is cached yet, since the
 * client-supplied value is unauthenticated. */
static int client_context_read_cgroup(Server *s, ClientContext *c, const char *unit_id) {
        char *t = NULL;
        int r;

        assert(c);

        /* Try to acquire the current cgroup path */
        r = cg_pid_get_path_shifted(c->pid, s->cgroup_root, &t);
        if (r < 0) {

                /* If that didn't work, we use the unit ID passed in as fallback, if we have nothing cached yet */
                if (unit_id && !c->unit) {
                        c->unit = strdup(unit_id);
                        if (c->unit)
                                return 0;
                }

                return r;
        }

        /* Let's shortcut this if the cgroup path didn't change */
        if (streq_ptr(c->cgroup, t)) {
                free(t);
                return 0;
        }

        free_and_replace(c->cgroup, t);

        /* NOTE(review): the pattern below relies on free_and_replace() resetting
         * its source pointer to NULL, so that when a cg_path_get_*() call fails
         * (and leaves t untouched) the following free_and_replace() clears the
         * cached field rather than adopting a stale pointer — confirm the macro
         * semantics in alloc-util.h. */
        (void) cg_path_get_session(c->cgroup, &t);
        free_and_replace(c->session, t);

        if (cg_path_get_owner_uid(c->cgroup, &c->owner_uid) < 0)
                c->owner_uid = UID_INVALID;

        (void) cg_path_get_unit(c->cgroup, &t);
        free_and_replace(c->unit, t);

        (void) cg_path_get_user_unit(c->cgroup, &t);
        free_and_replace(c->user_unit, t);

        (void) cg_path_get_slice(c->cgroup, &t);
        free_and_replace(c->slice, t);

        (void) cg_path_get_user_slice(c->cgroup, &t);
        free_and_replace(c->user_slice, t);

        return 0;
}
311
/* Reads the invocation ID for the client's unit. PID 1 publishes it as the
 * target of a per-unit symlink under /run/systemd/units/; parse that target
 * into c->invocation_id. A client without a known unit is not an error.
 * Returns 0 on success, a negative errno if the symlink can't be read or its
 * target is not a valid 128-bit ID. */
static int client_context_read_invocation_id(
                Server *s,
                ClientContext *c) {

        _cleanup_free_ char *value = NULL;
        const char *p;
        int r;

        assert(s);
        assert(c);

        /* Read the invocation ID of a unit off a unit. PID 1 stores it in a per-unit symlink in /run/systemd/units/ */

        if (!c->unit)
                return 0;

        p = strjoina("/run/systemd/units/invocation:", c->unit);
        r = readlink_malloc(p, &value);
        if (r < 0)
                return r;

        return sd_id128_from_string(value, &c->invocation_id);
}
335
336 static int client_context_read_log_level_max(
337 Server *s,
338 ClientContext *c) {
339
340 _cleanup_free_ char *value = NULL;
341 const char *p;
342 int r, ll;
343
344 if (!c->unit)
345 return 0;
346
347 p = strjoina("/run/systemd/units/log-level-max:", c->unit);
348 r = readlink_malloc(p, &value);
349 if (r < 0)
350 return r;
351
352 ll = log_level_from_string(value);
353 if (ll < 0)
354 return -EINVAL;
355
356 c->log_level_max = ll;
357 return 0;
358 }
359
360 static int client_context_read_extra_fields(
361 Server *s,
362 ClientContext *c) {
363
364 size_t size = 0, n_iovec = 0, n_allocated = 0, left;
365 _cleanup_free_ struct iovec *iovec = NULL;
366 _cleanup_free_ void *data = NULL;
367 _cleanup_fclose_ FILE *f = NULL;
368 struct stat st;
369 const char *p;
370 uint8_t *q;
371 int r;
372
373 if (!c->unit)
374 return 0;
375
376 p = strjoina("/run/systemd/units/log-extra-fields:", c->unit);
377
378 if (c->extra_fields_mtime != NSEC_INFINITY) {
379 if (stat(p, &st) < 0) {
380 if (errno == ENOENT)
381 return 0;
382
383 return -errno;
384 }
385
386 if (timespec_load_nsec(&st.st_mtim) == c->extra_fields_mtime)
387 return 0;
388 }
389
390 f = fopen(p, "re");
391 if (!f) {
392 if (errno == ENOENT)
393 return 0;
394
395 return -errno;
396 }
397
398 if (fstat(fileno(f), &st) < 0) /* The file might have been replaced since the stat() above, let's get a new
399 * one, that matches the stuff we are reading */
400 return -errno;
401
402 r = read_full_stream(f, (char**) &data, &size);
403 if (r < 0)
404 return r;
405
406 q = data, left = size;
407 while (left > 0) {
408 uint8_t *field, *eq;
409 uint64_t v, n;
410
411 if (left < sizeof(uint64_t))
412 return -EBADMSG;
413
414 v = unaligned_read_le64(q);
415 if (v < 2)
416 return -EBADMSG;
417
418 n = sizeof(uint64_t) + v;
419 if (left < n)
420 return -EBADMSG;
421
422 field = q + sizeof(uint64_t);
423
424 eq = memchr(field, '=', v);
425 if (!eq)
426 return -EBADMSG;
427
428 if (!journal_field_valid((const char *) field, eq - field, false))
429 return -EBADMSG;
430
431 if (!GREEDY_REALLOC(iovec, n_allocated, n_iovec+1))
432 return -ENOMEM;
433
434 iovec[n_iovec++] = IOVEC_MAKE(field, v);
435
436 left -= n, q += n;
437 }
438
439 free(c->extra_fields_iovec);
440 free(c->extra_fields_data);
441
442 c->extra_fields_iovec = TAKE_PTR(iovec);
443 c->extra_fields_n_iovec = n_iovec;
444 c->extra_fields_data = TAKE_PTR(data);
445 c->extra_fields_mtime = timespec_load_nsec(&st.st_mtim);
446
447 return 0;
448 }
449
/* Unconditionally re-reads all metadata for the entry and stamps it with the
 * given timestamp (or the current monotonic time if none was supplied).
 * Ordering matters: the cgroup read populates c->unit, which the invocation
 * ID, max-log-level and extra-fields readers below all depend on. */
static void client_context_really_refresh(
                Server *s,
                ClientContext *c,
                const struct ucred *ucred,
                const char *label, size_t label_size,
                const char *unit_id,
                usec_t timestamp) {

        assert(s);
        assert(c);
        assert(pid_is_valid(c->pid));

        if (timestamp == USEC_INFINITY)
                timestamp = now(CLOCK_MONOTONIC);

        client_context_read_uid_gid(c, ucred);
        client_context_read_basic(c);
        (void) client_context_read_label(c, label, label_size);

        (void) audit_session_from_pid(c->pid, &c->auditid);
        (void) audit_loginuid_from_pid(c->pid, &c->loginuid);

        /* Must run before the three readers below, which consult c->unit */
        (void) client_context_read_cgroup(s, c, unit_id);
        (void) client_context_read_invocation_id(s, c);
        (void) client_context_read_log_level_max(s, c);
        (void) client_context_read_extra_fields(s, c);

        c->timestamp = timestamp;

        if (c->in_lru) {
                /* The timestamp is the prioq sort key, hence reposition the entry */
                assert(c->n_ref == 0);
                assert_se(prioq_reshuffle(s->client_contexts_lru, c, &c->lru_index) >= 0);
        }
}
484
/* Refreshes the cached metadata of an entry if it is missing, too old, or
 * visibly inconsistent with the credential/label data accompanying the
 * current message; fresh-enough consistent data is used as-is. */
void client_context_maybe_refresh(
                Server *s,
                ClientContext *c,
                const struct ucred *ucred,
                const char *label, size_t label_size,
                const char *unit_id,
                usec_t timestamp) {

        assert(s);
        assert(c);

        if (timestamp == USEC_INFINITY)
                timestamp = now(CLOCK_MONOTONIC);

        /* No cached data so far? Let's fill it up */
        if (c->timestamp == USEC_INFINITY)
                goto refresh;

        /* If the data isn't pinned and if the cached data is older than the upper limit, we flush it out
         * entirely. This follows the logic that as long as an entry is pinned the PID reuse is unlikely. */
        if (c->n_ref == 0 && c->timestamp + MAX_USEC < timestamp) {
                client_context_reset(c);
                goto refresh;
        }

        /* If the data is older than the lower limit, we refresh, but keep the old data for all we can't update */
        if (c->timestamp + REFRESH_USEC < timestamp)
                goto refresh;

        /* If the data passed along doesn't match the cached data we also do a refresh */
        if (ucred && uid_is_valid(ucred->uid) && c->uid != ucred->uid)
                goto refresh;

        if (ucred && gid_is_valid(ucred->gid) && c->gid != ucred->gid)
                goto refresh;

        if (label_size > 0 && (label_size != c->label_size || memcmp(label, c->label, label_size) != 0))
                goto refresh;

        return;

refresh:
        client_context_really_refresh(s, c, ucred, label, label_size, unit_id, timestamp);
}
529
530 static void client_context_try_shrink_to(Server *s, size_t limit) {
531 assert(s);
532
533 /* Bring the number of cache entries below the indicated limit, so that we can create a new entry without
534 * breaching the limit. Note that we only flush out entries that aren't pinned here. This means the number of
535 * cache entries may very well grow beyond the limit, if all entries stored remain pinned. */
536
537 while (hashmap_size(s->client_contexts) > limit) {
538 ClientContext *c;
539
540 c = prioq_pop(s->client_contexts_lru);
541 if (!c)
542 break; /* All remaining entries are pinned, give up */
543
544 assert(c->in_lru);
545 assert(c->n_ref == 0);
546
547 c->in_lru = false;
548
549 client_context_free(s, c);
550 }
551 }
552
/* Tears down the whole metadata cache, including the pinned contexts for
 * journald itself and PID 1. Assumes all other references have already been
 * dropped; any remaining pinned entry would trip the size assertions below. */
void client_context_flush_all(Server *s) {
        assert(s);

        /* Flush out all remaining entries. This assumes all references are already dropped. */

        s->my_context = client_context_release(s, s->my_context);
        s->pid1_context = client_context_release(s, s->pid1_context);

        /* With all pins gone, shrinking to zero empties the cache entirely */
        client_context_try_shrink_to(s, 0);

        assert(prioq_size(s->client_contexts_lru) == 0);
        assert(hashmap_size(s->client_contexts) == 0);

        s->client_contexts_lru = prioq_free(s->client_contexts_lru);
        s->client_contexts = hashmap_free(s->client_contexts);
}
569
/* Core lookup: returns the (possibly newly created) context for 'pid',
 * refreshed as needed. With add_ref the entry is pinned — taken off the LRU
 * eviction queue and its reference counter bumped — until released via
 * client_context_release(); otherwise it stays subject to LRU eviction.
 * Returns 0 and the entry in *ret, or a negative errno. */
static int client_context_get_internal(
                Server *s,
                pid_t pid,
                const struct ucred *ucred,
                const char *label, size_t label_len,
                const char *unit_id,
                bool add_ref,
                ClientContext **ret) {

        ClientContext *c;
        int r;

        assert(s);
        assert(ret);

        if (!pid_is_valid(pid))
                return -EINVAL;

        /* Cache hit? */
        c = hashmap_get(s->client_contexts, PID_TO_PTR(pid));
        if (c) {

                if (add_ref) {
                        if (c->in_lru) {
                                /* The entry wasn't pinned so far, let's remove it from the LRU list then */
                                assert(c->n_ref == 0);
                                assert_se(prioq_remove(s->client_contexts_lru, c, &c->lru_index) >= 0);
                                c->in_lru = false;
                        }

                        c->n_ref++;
                }

                client_context_maybe_refresh(s, c, ucred, label, label_len, unit_id, USEC_INFINITY);

                *ret = c;
                return 0;
        }

        /* Cache miss: make room for the new entry first, then create it */
        client_context_try_shrink_to(s, CACHE_MAX-1);

        r = client_context_new(s, pid, &c);
        if (r < 0)
                return r;

        if (add_ref)
                c->n_ref++;
        else {
                /* Unpinned entries go onto the LRU queue so cache pressure can reclaim them */
                r = prioq_put(s->client_contexts_lru, c, &c->lru_index);
                if (r < 0) {
                        client_context_free(s, c);
                        return r;
                }

                c->in_lru = true;
        }

        /* The entry is brand new, hence fill it in unconditionally */
        client_context_really_refresh(s, c, ucred, label, label_len, unit_id, USEC_INFINITY);

        *ret = c;
        return 0;
}
631
/* Looks up (or creates) the context for the specified PID without pinning
 * it: the entry remains eligible for LRU eviction. */
int client_context_get(
                Server *s,
                pid_t pid,
                const struct ucred *ucred,
                const char *label, size_t label_len,
                const char *unit_id,
                ClientContext **ret) {

        return client_context_get_internal(s, pid, ucred, label, label_len, unit_id, false, ret);
}
642
643 int client_context_acquire(
644 Server *s,
645 pid_t pid,
646 const struct ucred *ucred,
647 const char *label, size_t label_len,
648 const char *unit_id,
649 ClientContext **ret) {
650
651 return client_context_get_internal(s, pid, ucred, label, label_len, unit_id, true, ret);
652 };
653
654 ClientContext *client_context_release(Server *s, ClientContext *c) {
655 assert(s);
656
657 if (!c)
658 return NULL;
659
660 assert(c->n_ref > 0);
661 assert(!c->in_lru);
662
663 c->n_ref--;
664 if (c->n_ref > 0)
665 return NULL;
666
667 /* The entry is not pinned anymore, let's add it to the LRU prioq if we can. If we can't we'll drop it
668 * right-away */
669
670 if (prioq_put(s->client_contexts_lru, c, &c->lru_index) < 0)
671 client_context_free(s, c);
672 else
673 c->in_lru = true;
674
675 return NULL;
676 }
677
678 void client_context_acquire_default(Server *s) {
679 int r;
680
681 assert(s);
682
683 /* Ensure that our own and PID1's contexts are always pinned. Our own context is particularly useful to
684 * generate driver messages. */
685
686 if (!s->my_context) {
687 struct ucred ucred = {
688 .pid = getpid_cached(),
689 .uid = getuid(),
690 .gid = getgid(),
691 };
692
693 r = client_context_acquire(s, ucred.pid, &ucred, NULL, 0, NULL, &s->my_context);
694 if (r < 0)
695 log_warning_errno(r, "Failed to acquire our own context, ignoring: %m");
696 }
697
698 if (!s->pid1_context) {
699
700 r = client_context_acquire(s, 1, NULL, NULL, 0, NULL, &s->pid1_context);
701 if (r < 0)
702 log_warning_errno(r, "Failed to acquire PID1's context, ignoring: %m");
703
704 }
705 }