]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/journal/journald-context.c
Merge pull request #8417 from brauner/2018-03-09/add_bind_mount_fallback_to_private_d...
[thirdparty/systemd.git] / src / journal / journald-context.c
1 /* SPDX-License-Identifier: LGPL-2.1+ */
2 /***
3 This file is part of systemd.
4
5 Copyright 2017 Lennart Poettering
6 ***/
7
8 #if HAVE_SELINUX
9 #include <selinux/selinux.h>
10 #endif
11
12 #include "alloc-util.h"
13 #include "audit-util.h"
14 #include "cgroup-util.h"
15 #include "fd-util.h"
16 #include "fileio.h"
17 #include "fs-util.h"
18 #include "io-util.h"
19 #include "journal-util.h"
20 #include "journald-context.h"
21 #include "process-util.h"
22 #include "string-util.h"
23 #include "syslog-util.h"
24 #include "unaligned.h"
25 #include "user-util.h"
26
27 /* This implements a metadata cache for clients, which are identified by their PID. Requesting metadata through /proc
28 * is expensive, hence let's cache the data if we can. Note that this means the metadata might be out-of-date when we
29 * store it, but it might already be anyway, as we request the data asynchronously from /proc at a different time than the
30 * log entry was originally created. We hence just increase the "window of inaccuracy" a bit.
31 *
32 * The cache is indexed by the PID. Entries may be "pinned" in the cache, in which case the entries are not removed
33 * until they are unpinned. Unpinned entries are kept around until cache pressure is seen. Cache entries older than 5s
34 * are never used (a sad attempt to deal with the UNIX weakness of PIDs reuse), cache entries older than 1s are
35 * refreshed in an incremental way (meaning: data is reread from /proc, but any old data we can't refresh is not
36 * flushed out). Data newer than 1s is used immediately without refresh.
37 *
38 * Log stream clients (i.e. all clients using the AF_UNIX/SOCK_STREAM stdout/stderr transport) will pin a cache entry
39 * as long as their socket is connected. Note that cache entries are shared between different transports. That means a
40 * cache entry pinned for the stream connection logic may be reused for the syslog or native protocols.
41 *
42 * Caching metadata like this has two major benefits:
43 *
44 * 1. Reading metadata is expensive, and we can thus substantially speed up log processing under flood.
45 *
46 * 2. Because metadata caching is shared between stream and datagram transports and stream connections pin a cache
47 * entry there's a good chance we can properly map a substantial set of datagram log messages to their originating
48 * service, as all services (unless explicitly configured otherwise) will have their stdout/stderr connected to a
49 * stream connection. This should improve cases where a service process logs immediately before exiting and we
50 * previously had trouble associating the log message with the service.
51 *
52 * NB: With and without the metadata cache: the implicitly added entry metadata in the journal (with the exception of
53 * UID/PID/GID and SELinux label) must be understood as possibly slightly out of sync (i.e. sometimes slightly older
54 * and sometimes slightly newer than what was current at the log event).
55 */
56
/* Cached metadata older than this (1s) is re-read from the system before it is used again. */
#define REFRESH_USEC (1 * USEC_PER_SEC)

/* Cached metadata of unpinned entries older than this (5s) is discarded entirely rather than refreshed. */
#define MAX_USEC (5 * USEC_PER_SEC)

/* Upper bound (16K) on the number of cache entries. Pinned entries are never evicted, hence the cache may grow beyond
 * this if enough stream clients pin entries. That is acceptable, since the number of stream clients is itself
 * limited. */
#define CACHE_MAX (16 * 1024)
67
68 static int client_context_compare(const void *a, const void *b) {
69 const ClientContext *x = a, *y = b;
70
71 if (x->timestamp < y->timestamp)
72 return -1;
73 if (x->timestamp > y->timestamp)
74 return 1;
75
76 if (x->pid < y->pid)
77 return -1;
78 if (x->pid > y->pid)
79 return 1;
80
81 return 0;
82 }
83
84 static int client_context_new(Server *s, pid_t pid, ClientContext **ret) {
85 ClientContext *c;
86 int r;
87
88 assert(s);
89 assert(pid_is_valid(pid));
90 assert(ret);
91
92 r = hashmap_ensure_allocated(&s->client_contexts, NULL);
93 if (r < 0)
94 return r;
95
96 r = prioq_ensure_allocated(&s->client_contexts_lru, client_context_compare);
97 if (r < 0)
98 return r;
99
100 c = new0(ClientContext, 1);
101 if (!c)
102 return -ENOMEM;
103
104 c->pid = pid;
105
106 c->uid = UID_INVALID;
107 c->gid = GID_INVALID;
108 c->auditid = AUDIT_SESSION_INVALID;
109 c->loginuid = UID_INVALID;
110 c->owner_uid = UID_INVALID;
111 c->lru_index = PRIOQ_IDX_NULL;
112 c->timestamp = USEC_INFINITY;
113 c->extra_fields_mtime = NSEC_INFINITY;
114 c->log_level_max = -1;
115
116 r = hashmap_put(s->client_contexts, PID_TO_PTR(pid), c);
117 if (r < 0) {
118 free(c);
119 return r;
120 }
121
122 *ret = c;
123 return 0;
124 }
125
126 static void client_context_reset(ClientContext *c) {
127 assert(c);
128
129 c->timestamp = USEC_INFINITY;
130
131 c->uid = UID_INVALID;
132 c->gid = GID_INVALID;
133
134 c->comm = mfree(c->comm);
135 c->exe = mfree(c->exe);
136 c->cmdline = mfree(c->cmdline);
137 c->capeff = mfree(c->capeff);
138
139 c->auditid = AUDIT_SESSION_INVALID;
140 c->loginuid = UID_INVALID;
141
142 c->cgroup = mfree(c->cgroup);
143 c->session = mfree(c->session);
144 c->owner_uid = UID_INVALID;
145 c->unit = mfree(c->unit);
146 c->user_unit = mfree(c->user_unit);
147 c->slice = mfree(c->slice);
148 c->user_slice = mfree(c->user_slice);
149
150 c->invocation_id = SD_ID128_NULL;
151
152 c->label = mfree(c->label);
153 c->label_size = 0;
154
155 c->extra_fields_iovec = mfree(c->extra_fields_iovec);
156 c->extra_fields_n_iovec = 0;
157 c->extra_fields_data = mfree(c->extra_fields_data);
158 c->extra_fields_mtime = NSEC_INFINITY;
159
160 c->log_level_max = -1;
161 }
162
163 static ClientContext* client_context_free(Server *s, ClientContext *c) {
164 assert(s);
165
166 if (!c)
167 return NULL;
168
169 assert_se(hashmap_remove(s->client_contexts, PID_TO_PTR(c->pid)) == c);
170
171 if (c->in_lru)
172 assert_se(prioq_remove(s->client_contexts_lru, c, &c->lru_index) >= 0);
173
174 client_context_reset(c);
175
176 return mfree(c);
177 }
178
179 static void client_context_read_uid_gid(ClientContext *c, const struct ucred *ucred) {
180 assert(c);
181 assert(pid_is_valid(c->pid));
182
183 /* The ucred data passed in is always the most current and accurate, if we have any. Use it. */
184 if (ucred && uid_is_valid(ucred->uid))
185 c->uid = ucred->uid;
186 else
187 (void) get_process_uid(c->pid, &c->uid);
188
189 if (ucred && gid_is_valid(ucred->gid))
190 c->gid = ucred->gid;
191 else
192 (void) get_process_gid(c->pid, &c->gid);
193 }
194
195 static void client_context_read_basic(ClientContext *c) {
196 char *t;
197
198 assert(c);
199 assert(pid_is_valid(c->pid));
200
201 if (get_process_comm(c->pid, &t) >= 0)
202 free_and_replace(c->comm, t);
203
204 if (get_process_exe(c->pid, &t) >= 0)
205 free_and_replace(c->exe, t);
206
207 if (get_process_cmdline(c->pid, 0, false, &t) >= 0)
208 free_and_replace(c->cmdline, t);
209
210 if (get_process_capeff(c->pid, &t) >= 0)
211 free_and_replace(c->capeff, t);
212 }
213
214 static int client_context_read_label(
215 ClientContext *c,
216 const char *label, size_t label_size) {
217
218 assert(c);
219 assert(pid_is_valid(c->pid));
220 assert(label_size == 0 || label);
221
222 if (label_size > 0) {
223 char *l;
224
225 /* If we got an SELinux label passed in it counts. */
226
227 l = newdup_suffix0(char, label, label_size);
228 if (!l)
229 return -ENOMEM;
230
231 free_and_replace(c->label, l);
232 c->label_size = label_size;
233 }
234 #if HAVE_SELINUX
235 else {
236 char *con;
237
238 /* If we got no SELinux label passed in, let's try to acquire one */
239
240 if (getpidcon(c->pid, &con) >= 0) {
241 free_and_replace(c->label, con);
242 c->label_size = strlen(c->label);
243 }
244 }
245 #endif
246
247 return 0;
248 }
249
250 static int client_context_read_cgroup(Server *s, ClientContext *c, const char *unit_id) {
251 char *t = NULL;
252 int r;
253
254 assert(c);
255
256 /* Try to acquire the current cgroup path */
257 r = cg_pid_get_path_shifted(c->pid, s->cgroup_root, &t);
258 if (r < 0) {
259
260 /* If that didn't work, we use the unit ID passed in as fallback, if we have nothing cached yet */
261 if (unit_id && !c->unit) {
262 c->unit = strdup(unit_id);
263 if (c->unit)
264 return 0;
265 }
266
267 return r;
268 }
269
270 /* Let's shortcut this if the cgroup path didn't change */
271 if (streq_ptr(c->cgroup, t)) {
272 free(t);
273 return 0;
274 }
275
276 free_and_replace(c->cgroup, t);
277
278 (void) cg_path_get_session(c->cgroup, &t);
279 free_and_replace(c->session, t);
280
281 if (cg_path_get_owner_uid(c->cgroup, &c->owner_uid) < 0)
282 c->owner_uid = UID_INVALID;
283
284 (void) cg_path_get_unit(c->cgroup, &t);
285 free_and_replace(c->unit, t);
286
287 (void) cg_path_get_user_unit(c->cgroup, &t);
288 free_and_replace(c->user_unit, t);
289
290 (void) cg_path_get_slice(c->cgroup, &t);
291 free_and_replace(c->slice, t);
292
293 (void) cg_path_get_user_slice(c->cgroup, &t);
294 free_and_replace(c->user_slice, t);
295
296 return 0;
297 }
298
299 static int client_context_read_invocation_id(
300 Server *s,
301 ClientContext *c) {
302
303 _cleanup_free_ char *value = NULL;
304 const char *p;
305 int r;
306
307 assert(s);
308 assert(c);
309
310 /* Read the invocation ID of a unit off a unit. PID 1 stores it in a per-unit symlink in /run/systemd/units/ */
311
312 if (!c->unit)
313 return 0;
314
315 p = strjoina("/run/systemd/units/invocation:", c->unit);
316 r = readlink_malloc(p, &value);
317 if (r < 0)
318 return r;
319
320 return sd_id128_from_string(value, &c->invocation_id);
321 }
322
323 static int client_context_read_log_level_max(
324 Server *s,
325 ClientContext *c) {
326
327 _cleanup_free_ char *value = NULL;
328 const char *p;
329 int r, ll;
330
331 if (!c->unit)
332 return 0;
333
334 p = strjoina("/run/systemd/units/log-level-max:", c->unit);
335 r = readlink_malloc(p, &value);
336 if (r < 0)
337 return r;
338
339 ll = log_level_from_string(value);
340 if (ll < 0)
341 return -EINVAL;
342
343 c->log_level_max = ll;
344 return 0;
345 }
346
347 static int client_context_read_extra_fields(
348 Server *s,
349 ClientContext *c) {
350
351 size_t size = 0, n_iovec = 0, n_allocated = 0, left;
352 _cleanup_free_ struct iovec *iovec = NULL;
353 _cleanup_free_ void *data = NULL;
354 _cleanup_fclose_ FILE *f = NULL;
355 struct stat st;
356 const char *p;
357 uint8_t *q;
358 int r;
359
360 if (!c->unit)
361 return 0;
362
363 p = strjoina("/run/systemd/units/log-extra-fields:", c->unit);
364
365 if (c->extra_fields_mtime != NSEC_INFINITY) {
366 if (stat(p, &st) < 0) {
367 if (errno == ENOENT)
368 return 0;
369
370 return -errno;
371 }
372
373 if (timespec_load_nsec(&st.st_mtim) == c->extra_fields_mtime)
374 return 0;
375 }
376
377 f = fopen(p, "re");
378 if (!f) {
379 if (errno == ENOENT)
380 return 0;
381
382 return -errno;
383 }
384
385 if (fstat(fileno(f), &st) < 0) /* The file might have been replaced since the stat() above, let's get a new
386 * one, that matches the stuff we are reading */
387 return -errno;
388
389 r = read_full_stream(f, (char**) &data, &size);
390 if (r < 0)
391 return r;
392
393 q = data, left = size;
394 while (left > 0) {
395 uint8_t *field, *eq;
396 uint64_t v, n;
397
398 if (left < sizeof(uint64_t))
399 return -EBADMSG;
400
401 v = unaligned_read_le64(q);
402 if (v < 2)
403 return -EBADMSG;
404
405 n = sizeof(uint64_t) + v;
406 if (left < n)
407 return -EBADMSG;
408
409 field = q + sizeof(uint64_t);
410
411 eq = memchr(field, '=', v);
412 if (!eq)
413 return -EBADMSG;
414
415 if (!journal_field_valid((const char *) field, eq - field, false))
416 return -EBADMSG;
417
418 if (!GREEDY_REALLOC(iovec, n_allocated, n_iovec+1))
419 return -ENOMEM;
420
421 iovec[n_iovec++] = IOVEC_MAKE(field, v);
422
423 left -= n, q += n;
424 }
425
426 free(c->extra_fields_iovec);
427 free(c->extra_fields_data);
428
429 c->extra_fields_iovec = TAKE_PTR(iovec);
430 c->extra_fields_n_iovec = n_iovec;
431 c->extra_fields_data = TAKE_PTR(data);
432 c->extra_fields_mtime = timespec_load_nsec(&st.st_mtim);
433
434 return 0;
435 }
436
437 static void client_context_really_refresh(
438 Server *s,
439 ClientContext *c,
440 const struct ucred *ucred,
441 const char *label, size_t label_size,
442 const char *unit_id,
443 usec_t timestamp) {
444
445 assert(s);
446 assert(c);
447 assert(pid_is_valid(c->pid));
448
449 if (timestamp == USEC_INFINITY)
450 timestamp = now(CLOCK_MONOTONIC);
451
452 client_context_read_uid_gid(c, ucred);
453 client_context_read_basic(c);
454 (void) client_context_read_label(c, label, label_size);
455
456 (void) audit_session_from_pid(c->pid, &c->auditid);
457 (void) audit_loginuid_from_pid(c->pid, &c->loginuid);
458
459 (void) client_context_read_cgroup(s, c, unit_id);
460 (void) client_context_read_invocation_id(s, c);
461 (void) client_context_read_log_level_max(s, c);
462 (void) client_context_read_extra_fields(s, c);
463
464 c->timestamp = timestamp;
465
466 if (c->in_lru) {
467 assert(c->n_ref == 0);
468 assert_se(prioq_reshuffle(s->client_contexts_lru, c, &c->lru_index) >= 0);
469 }
470 }
471
472 void client_context_maybe_refresh(
473 Server *s,
474 ClientContext *c,
475 const struct ucred *ucred,
476 const char *label, size_t label_size,
477 const char *unit_id,
478 usec_t timestamp) {
479
480 assert(s);
481 assert(c);
482
483 if (timestamp == USEC_INFINITY)
484 timestamp = now(CLOCK_MONOTONIC);
485
486 /* No cached data so far? Let's fill it up */
487 if (c->timestamp == USEC_INFINITY)
488 goto refresh;
489
490 /* If the data isn't pinned and if the cashed data is older than the upper limit, we flush it out
491 * entirely. This follows the logic that as long as an entry is pinned the PID reuse is unlikely. */
492 if (c->n_ref == 0 && c->timestamp + MAX_USEC < timestamp) {
493 client_context_reset(c);
494 goto refresh;
495 }
496
497 /* If the data is older than the lower limit, we refresh, but keep the old data for all we can't update */
498 if (c->timestamp + REFRESH_USEC < timestamp)
499 goto refresh;
500
501 /* If the data passed along doesn't match the cached data we also do a refresh */
502 if (ucred && uid_is_valid(ucred->uid) && c->uid != ucred->uid)
503 goto refresh;
504
505 if (ucred && gid_is_valid(ucred->gid) && c->gid != ucred->gid)
506 goto refresh;
507
508 if (label_size > 0 && (label_size != c->label_size || memcmp(label, c->label, label_size) != 0))
509 goto refresh;
510
511 return;
512
513 refresh:
514 client_context_really_refresh(s, c, ucred, label, label_size, unit_id, timestamp);
515 }
516
517 static void client_context_try_shrink_to(Server *s, size_t limit) {
518 assert(s);
519
520 /* Bring the number of cache entries below the indicated limit, so that we can create a new entry without
521 * breaching the limit. Note that we only flush out entries that aren't pinned here. This means the number of
522 * cache entries may very well grow beyond the limit, if all entries stored remain pinned. */
523
524 while (hashmap_size(s->client_contexts) > limit) {
525 ClientContext *c;
526
527 c = prioq_pop(s->client_contexts_lru);
528 if (!c)
529 break; /* All remaining entries are pinned, give up */
530
531 assert(c->in_lru);
532 assert(c->n_ref == 0);
533
534 c->in_lru = false;
535
536 client_context_free(s, c);
537 }
538 }
539
540 void client_context_flush_all(Server *s) {
541 assert(s);
542
543 /* Flush out all remaining entries. This assumes all references are already dropped. */
544
545 s->my_context = client_context_release(s, s->my_context);
546 s->pid1_context = client_context_release(s, s->pid1_context);
547
548 client_context_try_shrink_to(s, 0);
549
550 assert(prioq_size(s->client_contexts_lru) == 0);
551 assert(hashmap_size(s->client_contexts) == 0);
552
553 s->client_contexts_lru = prioq_free(s->client_contexts_lru);
554 s->client_contexts = hashmap_free(s->client_contexts);
555 }
556
557 static int client_context_get_internal(
558 Server *s,
559 pid_t pid,
560 const struct ucred *ucred,
561 const char *label, size_t label_len,
562 const char *unit_id,
563 bool add_ref,
564 ClientContext **ret) {
565
566 ClientContext *c;
567 int r;
568
569 assert(s);
570 assert(ret);
571
572 if (!pid_is_valid(pid))
573 return -EINVAL;
574
575 c = hashmap_get(s->client_contexts, PID_TO_PTR(pid));
576 if (c) {
577
578 if (add_ref) {
579 if (c->in_lru) {
580 /* The entry wasn't pinned so far, let's remove it from the LRU list then */
581 assert(c->n_ref == 0);
582 assert_se(prioq_remove(s->client_contexts_lru, c, &c->lru_index) >= 0);
583 c->in_lru = false;
584 }
585
586 c->n_ref++;
587 }
588
589 client_context_maybe_refresh(s, c, ucred, label, label_len, unit_id, USEC_INFINITY);
590
591 *ret = c;
592 return 0;
593 }
594
595 client_context_try_shrink_to(s, CACHE_MAX-1);
596
597 r = client_context_new(s, pid, &c);
598 if (r < 0)
599 return r;
600
601 if (add_ref)
602 c->n_ref++;
603 else {
604 r = prioq_put(s->client_contexts_lru, c, &c->lru_index);
605 if (r < 0) {
606 client_context_free(s, c);
607 return r;
608 }
609
610 c->in_lru = true;
611 }
612
613 client_context_really_refresh(s, c, ucred, label, label_len, unit_id, USEC_INFINITY);
614
615 *ret = c;
616 return 0;
617 }
618
619 int client_context_get(
620 Server *s,
621 pid_t pid,
622 const struct ucred *ucred,
623 const char *label, size_t label_len,
624 const char *unit_id,
625 ClientContext **ret) {
626
627 return client_context_get_internal(s, pid, ucred, label, label_len, unit_id, false, ret);
628 }
629
630 int client_context_acquire(
631 Server *s,
632 pid_t pid,
633 const struct ucred *ucred,
634 const char *label, size_t label_len,
635 const char *unit_id,
636 ClientContext **ret) {
637
638 return client_context_get_internal(s, pid, ucred, label, label_len, unit_id, true, ret);
639 };
640
641 ClientContext *client_context_release(Server *s, ClientContext *c) {
642 assert(s);
643
644 if (!c)
645 return NULL;
646
647 assert(c->n_ref > 0);
648 assert(!c->in_lru);
649
650 c->n_ref--;
651 if (c->n_ref > 0)
652 return NULL;
653
654 /* The entry is not pinned anymore, let's add it to the LRU prioq if we can. If we can't we'll drop it
655 * right-away */
656
657 if (prioq_put(s->client_contexts_lru, c, &c->lru_index) < 0)
658 client_context_free(s, c);
659 else
660 c->in_lru = true;
661
662 return NULL;
663 }
664
665 void client_context_acquire_default(Server *s) {
666 int r;
667
668 assert(s);
669
670 /* Ensure that our own and PID1's contexts are always pinned. Our own context is particularly useful to
671 * generate driver messages. */
672
673 if (!s->my_context) {
674 struct ucred ucred = {
675 .pid = getpid_cached(),
676 .uid = getuid(),
677 .gid = getgid(),
678 };
679
680 r = client_context_acquire(s, ucred.pid, &ucred, NULL, 0, NULL, &s->my_context);
681 if (r < 0)
682 log_warning_errno(r, "Failed to acquire our own context, ignoring: %m");
683 }
684
685 if (!s->pid1_context) {
686
687 r = client_context_acquire(s, 1, NULL, NULL, 0, NULL, &s->pid1_context);
688 if (r < 0)
689 log_warning_errno(r, "Failed to acquire PID1's context, ignoring: %m");
690
691 }
692 }