]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/journal/journald-context.c
Merge pull request #10334 from keszybz/nomempool
[thirdparty/systemd.git] / src / journal / journald-context.c
1 /* SPDX-License-Identifier: LGPL-2.1+ */
2
3 #if HAVE_SELINUX
4 #include <selinux/selinux.h>
5 #endif
6
7 #include "alloc-util.h"
8 #include "audit-util.h"
9 #include "cgroup-util.h"
10 #include "fd-util.h"
11 #include "fileio.h"
12 #include "fs-util.h"
13 #include "io-util.h"
14 #include "journal-util.h"
15 #include "journald-context.h"
16 #include "process-util.h"
17 #include "string-util.h"
18 #include "syslog-util.h"
19 #include "unaligned.h"
20 #include "user-util.h"
21
22 /* This implements a metadata cache for clients, which are identified by their PID. Requesting metadata through /proc
23 * is expensive, hence let's cache the data if we can. Note that this means the metadata might be out-of-date when we
24 * store it, but it might already be anyway, as we request the data asynchronously from /proc at a different time than the
25 * log entry was originally created. We hence just increase the "window of inaccuracy" a bit.
26 *
27 * The cache is indexed by the PID. Entries may be "pinned" in the cache, in which case the entries are not removed
28 * until they are unpinned. Unpinned entries are kept around until cache pressure is seen. Cache entries older than 5s
29 * are never used (a sad attempt to deal with the UNIX weakness of PID reuse), cache entries older than 1s are
30 * refreshed in an incremental way (meaning: data is reread from /proc, but any old data we can't refresh is not
31 * flushed out). Data newer than 1s is used immediately without refresh.
32 *
33 * Log stream clients (i.e. all clients using the AF_UNIX/SOCK_STREAM stdout/stderr transport) will pin a cache entry
34 * as long as their socket is connected. Note that cache entries are shared between different transports. That means a
35 * cache entry pinned for the stream connection logic may be reused for the syslog or native protocols.
36 *
37 * Caching metadata like this has two major benefits:
38 *
39 * 1. Reading metadata is expensive, and we can thus substantially speed up log processing under flood.
40 *
41 * 2. Because metadata caching is shared between stream and datagram transports and stream connections pin a cache
42 * entry there's a good chance we can properly map a substantial set of datagram log messages to their originating
43 * service, as all services (unless explicitly configured otherwise) will have their stdout/stderr connected to a
44 * stream connection. This should improve cases where a service process logs immediately before exiting and we
45 * previously had trouble associating the log message with the service.
46 *
47 * NB: With and without the metadata cache: the implicitly added entry metadata in the journal (with the exception of
48 * UID/PID/GID and SELinux label) must be understood as possibly slightly out of sync (i.e. sometimes slightly older
49 * and sometimes slightly newer than what was current at the log event).
50 */
51
/* Cached metadata that is at most this old (1s) is used as-is; anything older is re-read before use. */
#define REFRESH_USEC (1 * USEC_PER_SEC)

/* Cached metadata of an unpinned entry that is older than this (5s) is thrown away entirely before re-reading. */
#define MAX_USEC (5 * USEC_PER_SEC)

/* Upper bound (16K) on the number of cache entries. The bound is soft: entries pinned by stream clients are never
 * evicted, hence the cache may grow beyond it. That is safe, since the number of stream clients is limited
 * itself. */
#define CACHE_MAX (16 * 1024)
62
63 static int client_context_compare(const void *a, const void *b) {
64 const ClientContext *x = a, *y = b;
65 int r;
66
67 r = CMP(x->timestamp, y->timestamp);
68 if (r != 0)
69 return r;
70
71 return CMP(x->pid, y->pid);
72 }
73
74 static int client_context_new(Server *s, pid_t pid, ClientContext **ret) {
75 ClientContext *c;
76 int r;
77
78 assert(s);
79 assert(pid_is_valid(pid));
80 assert(ret);
81
82 r = hashmap_ensure_allocated(&s->client_contexts, NULL);
83 if (r < 0)
84 return r;
85
86 r = prioq_ensure_allocated(&s->client_contexts_lru, client_context_compare);
87 if (r < 0)
88 return r;
89
90 c = new0(ClientContext, 1);
91 if (!c)
92 return -ENOMEM;
93
94 c->pid = pid;
95
96 c->uid = UID_INVALID;
97 c->gid = GID_INVALID;
98 c->auditid = AUDIT_SESSION_INVALID;
99 c->loginuid = UID_INVALID;
100 c->owner_uid = UID_INVALID;
101 c->lru_index = PRIOQ_IDX_NULL;
102 c->timestamp = USEC_INFINITY;
103 c->extra_fields_mtime = NSEC_INFINITY;
104 c->log_level_max = -1;
105
106 r = hashmap_put(s->client_contexts, PID_TO_PTR(pid), c);
107 if (r < 0) {
108 free(c);
109 return r;
110 }
111
112 *ret = c;
113 return 0;
114 }
115
116 static void client_context_reset(ClientContext *c) {
117 assert(c);
118
119 c->timestamp = USEC_INFINITY;
120
121 c->uid = UID_INVALID;
122 c->gid = GID_INVALID;
123
124 c->comm = mfree(c->comm);
125 c->exe = mfree(c->exe);
126 c->cmdline = mfree(c->cmdline);
127 c->capeff = mfree(c->capeff);
128
129 c->auditid = AUDIT_SESSION_INVALID;
130 c->loginuid = UID_INVALID;
131
132 c->cgroup = mfree(c->cgroup);
133 c->session = mfree(c->session);
134 c->owner_uid = UID_INVALID;
135 c->unit = mfree(c->unit);
136 c->user_unit = mfree(c->user_unit);
137 c->slice = mfree(c->slice);
138 c->user_slice = mfree(c->user_slice);
139
140 c->invocation_id = SD_ID128_NULL;
141
142 c->label = mfree(c->label);
143 c->label_size = 0;
144
145 c->extra_fields_iovec = mfree(c->extra_fields_iovec);
146 c->extra_fields_n_iovec = 0;
147 c->extra_fields_data = mfree(c->extra_fields_data);
148 c->extra_fields_mtime = NSEC_INFINITY;
149
150 c->log_level_max = -1;
151 }
152
153 static ClientContext* client_context_free(Server *s, ClientContext *c) {
154 assert(s);
155
156 if (!c)
157 return NULL;
158
159 assert_se(hashmap_remove(s->client_contexts, PID_TO_PTR(c->pid)) == c);
160
161 if (c->in_lru)
162 assert_se(prioq_remove(s->client_contexts_lru, c, &c->lru_index) >= 0);
163
164 client_context_reset(c);
165
166 return mfree(c);
167 }
168
169 static void client_context_read_uid_gid(ClientContext *c, const struct ucred *ucred) {
170 assert(c);
171 assert(pid_is_valid(c->pid));
172
173 /* The ucred data passed in is always the most current and accurate, if we have any. Use it. */
174 if (ucred && uid_is_valid(ucred->uid))
175 c->uid = ucred->uid;
176 else
177 (void) get_process_uid(c->pid, &c->uid);
178
179 if (ucred && gid_is_valid(ucred->gid))
180 c->gid = ucred->gid;
181 else
182 (void) get_process_gid(c->pid, &c->gid);
183 }
184
185 static void client_context_read_basic(ClientContext *c) {
186 char *t;
187
188 assert(c);
189 assert(pid_is_valid(c->pid));
190
191 if (get_process_comm(c->pid, &t) >= 0)
192 free_and_replace(c->comm, t);
193
194 if (get_process_exe(c->pid, &t) >= 0)
195 free_and_replace(c->exe, t);
196
197 if (get_process_cmdline(c->pid, 0, false, &t) >= 0)
198 free_and_replace(c->cmdline, t);
199
200 if (get_process_capeff(c->pid, &t) >= 0)
201 free_and_replace(c->capeff, t);
202 }
203
204 static int client_context_read_label(
205 ClientContext *c,
206 const char *label, size_t label_size) {
207
208 assert(c);
209 assert(pid_is_valid(c->pid));
210 assert(label_size == 0 || label);
211
212 if (label_size > 0) {
213 char *l;
214
215 /* If we got an SELinux label passed in it counts. */
216
217 l = newdup_suffix0(char, label, label_size);
218 if (!l)
219 return -ENOMEM;
220
221 free_and_replace(c->label, l);
222 c->label_size = label_size;
223 }
224 #if HAVE_SELINUX
225 else {
226 char *con;
227
228 /* If we got no SELinux label passed in, let's try to acquire one */
229
230 if (getpidcon(c->pid, &con) >= 0) {
231 free_and_replace(c->label, con);
232 c->label_size = strlen(c->label);
233 }
234 }
235 #endif
236
237 return 0;
238 }
239
240 static int client_context_read_cgroup(Server *s, ClientContext *c, const char *unit_id) {
241 char *t = NULL;
242 int r;
243
244 assert(c);
245
246 /* Try to acquire the current cgroup path */
247 r = cg_pid_get_path_shifted(c->pid, s->cgroup_root, &t);
248 if (r < 0) {
249
250 /* If that didn't work, we use the unit ID passed in as fallback, if we have nothing cached yet */
251 if (unit_id && !c->unit) {
252 c->unit = strdup(unit_id);
253 if (c->unit)
254 return 0;
255 }
256
257 return r;
258 }
259
260 /* Let's shortcut this if the cgroup path didn't change */
261 if (streq_ptr(c->cgroup, t)) {
262 free(t);
263 return 0;
264 }
265
266 free_and_replace(c->cgroup, t);
267
268 (void) cg_path_get_session(c->cgroup, &t);
269 free_and_replace(c->session, t);
270
271 if (cg_path_get_owner_uid(c->cgroup, &c->owner_uid) < 0)
272 c->owner_uid = UID_INVALID;
273
274 (void) cg_path_get_unit(c->cgroup, &t);
275 free_and_replace(c->unit, t);
276
277 (void) cg_path_get_user_unit(c->cgroup, &t);
278 free_and_replace(c->user_unit, t);
279
280 (void) cg_path_get_slice(c->cgroup, &t);
281 free_and_replace(c->slice, t);
282
283 (void) cg_path_get_user_slice(c->cgroup, &t);
284 free_and_replace(c->user_slice, t);
285
286 return 0;
287 }
288
289 static int client_context_read_invocation_id(
290 Server *s,
291 ClientContext *c) {
292
293 _cleanup_free_ char *value = NULL;
294 const char *p;
295 int r;
296
297 assert(s);
298 assert(c);
299
300 /* Read the invocation ID of a unit off a unit. PID 1 stores it in a per-unit symlink in /run/systemd/units/ */
301
302 if (!c->unit)
303 return 0;
304
305 p = strjoina("/run/systemd/units/invocation:", c->unit);
306 r = readlink_malloc(p, &value);
307 if (r < 0)
308 return r;
309
310 return sd_id128_from_string(value, &c->invocation_id);
311 }
312
313 static int client_context_read_log_level_max(
314 Server *s,
315 ClientContext *c) {
316
317 _cleanup_free_ char *value = NULL;
318 const char *p;
319 int r, ll;
320
321 if (!c->unit)
322 return 0;
323
324 p = strjoina("/run/systemd/units/log-level-max:", c->unit);
325 r = readlink_malloc(p, &value);
326 if (r < 0)
327 return r;
328
329 ll = log_level_from_string(value);
330 if (ll < 0)
331 return -EINVAL;
332
333 c->log_level_max = ll;
334 return 0;
335 }
336
337 static int client_context_read_extra_fields(
338 Server *s,
339 ClientContext *c) {
340
341 size_t size = 0, n_iovec = 0, n_allocated = 0, left;
342 _cleanup_free_ struct iovec *iovec = NULL;
343 _cleanup_free_ void *data = NULL;
344 _cleanup_fclose_ FILE *f = NULL;
345 struct stat st;
346 const char *p;
347 uint8_t *q;
348 int r;
349
350 if (!c->unit)
351 return 0;
352
353 p = strjoina("/run/systemd/units/log-extra-fields:", c->unit);
354
355 if (c->extra_fields_mtime != NSEC_INFINITY) {
356 if (stat(p, &st) < 0) {
357 if (errno == ENOENT)
358 return 0;
359
360 return -errno;
361 }
362
363 if (timespec_load_nsec(&st.st_mtim) == c->extra_fields_mtime)
364 return 0;
365 }
366
367 f = fopen(p, "re");
368 if (!f) {
369 if (errno == ENOENT)
370 return 0;
371
372 return -errno;
373 }
374
375 if (fstat(fileno(f), &st) < 0) /* The file might have been replaced since the stat() above, let's get a new
376 * one, that matches the stuff we are reading */
377 return -errno;
378
379 r = read_full_stream(f, (char**) &data, &size);
380 if (r < 0)
381 return r;
382
383 q = data, left = size;
384 while (left > 0) {
385 uint8_t *field, *eq;
386 uint64_t v, n;
387
388 if (left < sizeof(uint64_t))
389 return -EBADMSG;
390
391 v = unaligned_read_le64(q);
392 if (v < 2)
393 return -EBADMSG;
394
395 n = sizeof(uint64_t) + v;
396 if (left < n)
397 return -EBADMSG;
398
399 field = q + sizeof(uint64_t);
400
401 eq = memchr(field, '=', v);
402 if (!eq)
403 return -EBADMSG;
404
405 if (!journal_field_valid((const char *) field, eq - field, false))
406 return -EBADMSG;
407
408 if (!GREEDY_REALLOC(iovec, n_allocated, n_iovec+1))
409 return -ENOMEM;
410
411 iovec[n_iovec++] = IOVEC_MAKE(field, v);
412
413 left -= n, q += n;
414 }
415
416 free(c->extra_fields_iovec);
417 free(c->extra_fields_data);
418
419 c->extra_fields_iovec = TAKE_PTR(iovec);
420 c->extra_fields_n_iovec = n_iovec;
421 c->extra_fields_data = TAKE_PTR(data);
422 c->extra_fields_mtime = timespec_load_nsec(&st.st_mtim);
423
424 return 0;
425 }
426
427 static void client_context_really_refresh(
428 Server *s,
429 ClientContext *c,
430 const struct ucred *ucred,
431 const char *label, size_t label_size,
432 const char *unit_id,
433 usec_t timestamp) {
434
435 assert(s);
436 assert(c);
437 assert(pid_is_valid(c->pid));
438
439 if (timestamp == USEC_INFINITY)
440 timestamp = now(CLOCK_MONOTONIC);
441
442 client_context_read_uid_gid(c, ucred);
443 client_context_read_basic(c);
444 (void) client_context_read_label(c, label, label_size);
445
446 (void) audit_session_from_pid(c->pid, &c->auditid);
447 (void) audit_loginuid_from_pid(c->pid, &c->loginuid);
448
449 (void) client_context_read_cgroup(s, c, unit_id);
450 (void) client_context_read_invocation_id(s, c);
451 (void) client_context_read_log_level_max(s, c);
452 (void) client_context_read_extra_fields(s, c);
453
454 c->timestamp = timestamp;
455
456 if (c->in_lru) {
457 assert(c->n_ref == 0);
458 assert_se(prioq_reshuffle(s->client_contexts_lru, c, &c->lru_index) >= 0);
459 }
460 }
461
462 void client_context_maybe_refresh(
463 Server *s,
464 ClientContext *c,
465 const struct ucred *ucred,
466 const char *label, size_t label_size,
467 const char *unit_id,
468 usec_t timestamp) {
469
470 assert(s);
471 assert(c);
472
473 if (timestamp == USEC_INFINITY)
474 timestamp = now(CLOCK_MONOTONIC);
475
476 /* No cached data so far? Let's fill it up */
477 if (c->timestamp == USEC_INFINITY)
478 goto refresh;
479
480 /* If the data isn't pinned and if the cashed data is older than the upper limit, we flush it out
481 * entirely. This follows the logic that as long as an entry is pinned the PID reuse is unlikely. */
482 if (c->n_ref == 0 && c->timestamp + MAX_USEC < timestamp) {
483 client_context_reset(c);
484 goto refresh;
485 }
486
487 /* If the data is older than the lower limit, we refresh, but keep the old data for all we can't update */
488 if (c->timestamp + REFRESH_USEC < timestamp)
489 goto refresh;
490
491 /* If the data passed along doesn't match the cached data we also do a refresh */
492 if (ucred && uid_is_valid(ucred->uid) && c->uid != ucred->uid)
493 goto refresh;
494
495 if (ucred && gid_is_valid(ucred->gid) && c->gid != ucred->gid)
496 goto refresh;
497
498 if (label_size > 0 && (label_size != c->label_size || memcmp(label, c->label, label_size) != 0))
499 goto refresh;
500
501 return;
502
503 refresh:
504 client_context_really_refresh(s, c, ucred, label, label_size, unit_id, timestamp);
505 }
506
507 static void client_context_try_shrink_to(Server *s, size_t limit) {
508 assert(s);
509
510 /* Bring the number of cache entries below the indicated limit, so that we can create a new entry without
511 * breaching the limit. Note that we only flush out entries that aren't pinned here. This means the number of
512 * cache entries may very well grow beyond the limit, if all entries stored remain pinned. */
513
514 while (hashmap_size(s->client_contexts) > limit) {
515 ClientContext *c;
516
517 c = prioq_pop(s->client_contexts_lru);
518 if (!c)
519 break; /* All remaining entries are pinned, give up */
520
521 assert(c->in_lru);
522 assert(c->n_ref == 0);
523
524 c->in_lru = false;
525
526 client_context_free(s, c);
527 }
528 }
529
530 void client_context_flush_all(Server *s) {
531 assert(s);
532
533 /* Flush out all remaining entries. This assumes all references are already dropped. */
534
535 s->my_context = client_context_release(s, s->my_context);
536 s->pid1_context = client_context_release(s, s->pid1_context);
537
538 client_context_try_shrink_to(s, 0);
539
540 assert(prioq_size(s->client_contexts_lru) == 0);
541 assert(hashmap_size(s->client_contexts) == 0);
542
543 s->client_contexts_lru = prioq_free(s->client_contexts_lru);
544 s->client_contexts = hashmap_free(s->client_contexts);
545 }
546
547 static int client_context_get_internal(
548 Server *s,
549 pid_t pid,
550 const struct ucred *ucred,
551 const char *label, size_t label_len,
552 const char *unit_id,
553 bool add_ref,
554 ClientContext **ret) {
555
556 ClientContext *c;
557 int r;
558
559 assert(s);
560 assert(ret);
561
562 if (!pid_is_valid(pid))
563 return -EINVAL;
564
565 c = hashmap_get(s->client_contexts, PID_TO_PTR(pid));
566 if (c) {
567
568 if (add_ref) {
569 if (c->in_lru) {
570 /* The entry wasn't pinned so far, let's remove it from the LRU list then */
571 assert(c->n_ref == 0);
572 assert_se(prioq_remove(s->client_contexts_lru, c, &c->lru_index) >= 0);
573 c->in_lru = false;
574 }
575
576 c->n_ref++;
577 }
578
579 client_context_maybe_refresh(s, c, ucred, label, label_len, unit_id, USEC_INFINITY);
580
581 *ret = c;
582 return 0;
583 }
584
585 client_context_try_shrink_to(s, CACHE_MAX-1);
586
587 r = client_context_new(s, pid, &c);
588 if (r < 0)
589 return r;
590
591 if (add_ref)
592 c->n_ref++;
593 else {
594 r = prioq_put(s->client_contexts_lru, c, &c->lru_index);
595 if (r < 0) {
596 client_context_free(s, c);
597 return r;
598 }
599
600 c->in_lru = true;
601 }
602
603 client_context_really_refresh(s, c, ucred, label, label_len, unit_id, USEC_INFINITY);
604
605 *ret = c;
606 return 0;
607 }
608
609 int client_context_get(
610 Server *s,
611 pid_t pid,
612 const struct ucred *ucred,
613 const char *label, size_t label_len,
614 const char *unit_id,
615 ClientContext **ret) {
616
617 return client_context_get_internal(s, pid, ucred, label, label_len, unit_id, false, ret);
618 }
619
620 int client_context_acquire(
621 Server *s,
622 pid_t pid,
623 const struct ucred *ucred,
624 const char *label, size_t label_len,
625 const char *unit_id,
626 ClientContext **ret) {
627
628 return client_context_get_internal(s, pid, ucred, label, label_len, unit_id, true, ret);
629 };
630
631 ClientContext *client_context_release(Server *s, ClientContext *c) {
632 assert(s);
633
634 if (!c)
635 return NULL;
636
637 assert(c->n_ref > 0);
638 assert(!c->in_lru);
639
640 c->n_ref--;
641 if (c->n_ref > 0)
642 return NULL;
643
644 /* The entry is not pinned anymore, let's add it to the LRU prioq if we can. If we can't we'll drop it
645 * right-away */
646
647 if (prioq_put(s->client_contexts_lru, c, &c->lru_index) < 0)
648 client_context_free(s, c);
649 else
650 c->in_lru = true;
651
652 return NULL;
653 }
654
655 void client_context_acquire_default(Server *s) {
656 int r;
657
658 assert(s);
659
660 /* Ensure that our own and PID1's contexts are always pinned. Our own context is particularly useful to
661 * generate driver messages. */
662
663 if (!s->my_context) {
664 struct ucred ucred = {
665 .pid = getpid_cached(),
666 .uid = getuid(),
667 .gid = getgid(),
668 };
669
670 r = client_context_acquire(s, ucred.pid, &ucred, NULL, 0, NULL, &s->my_context);
671 if (r < 0)
672 log_warning_errno(r, "Failed to acquire our own context, ignoring: %m");
673 }
674
675 if (!s->pid1_context) {
676
677 r = client_context_acquire(s, 1, NULL, NULL, 0, NULL, &s->pid1_context);
678 if (r < 0)
679 log_warning_errno(r, "Failed to acquire PID1's context, ignoring: %m");
680
681 }
682 }