/* SPDX-License-Identifier: LGPL-2.1+ */

#if HAVE_SELINUX
#include <selinux/selinux.h>
#endif

#include "alloc-util.h"
#include "audit-util.h"
#include "cgroup-util.h"
#include "fd-util.h"
#include "fileio.h"
#include "fs-util.h"
#include "io-util.h"
#include "journal-util.h"
#include "journald-context.h"
#include "parse-util.h"
#include "process-util.h"
#include "string-util.h"
#include "syslog-util.h"
#include "unaligned.h"
#include "user-util.h"

/* This implements a metadata cache for clients, which are identified by their PID. Requesting metadata through /proc
 * is expensive, hence let's cache the data if we can. Note that this means the metadata might be out-of-date when we
 * store it, but it might already be anyway, as we request the data asynchronously from /proc at a different time than
 * when the log entry was originally created. We hence just increase the "window of inaccuracy" a bit.
 *
 * The cache is indexed by the PID. Entries may be "pinned" in the cache, in which case the entries are not removed
 * until they are unpinned. Unpinned entries are kept around until cache pressure is seen. Cache entries older than 5s
 * are never used (a sad attempt to deal with the UNIX weakness of PID reuse), cache entries older than 1s are
 * refreshed in an incremental way (meaning: data is reread from /proc, but any old data we can't refresh is not
 * flushed out). Data newer than 1s is used immediately without refresh.
 *
 * Log stream clients (i.e. all clients using the AF_UNIX/SOCK_STREAM stdout/stderr transport) will pin a cache entry
 * as long as their socket is connected. Note that cache entries are shared between different transports. That means a
 * cache entry pinned for the stream connection logic may be reused for the syslog or native protocols.
 *
 * Caching metadata like this has two major benefits:
 *
 * 1. Reading metadata is expensive, and we can thus substantially speed up log processing under flood.
 *
 * 2. Because metadata caching is shared between stream and datagram transports and stream connections pin a cache
 *    entry, there's a good chance we can properly map a substantial set of datagram log messages to their originating
 *    service, as all services (unless explicitly configured otherwise) will have their stdout/stderr connected to a
 *    stream connection. This should improve cases where a service process logs immediately before exiting and we
 *    previously had trouble associating the log message with the service.
 *
 * NB: With or without the metadata cache, the implicitly added entry metadata in the journal (with the exception of
 * UID/PID/GID and SELinux label) must be understood as possibly slightly out of sync (i.e. sometimes slightly older
 * and sometimes slightly newer than what was current at the log event).
 */
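
/* Rough usage sketch (illustrative only; the real call sites live in journald's transport code, not in this file):
 *
 *     ClientContext *context = NULL;
 *
 *     // datagram transports (syslog/native): look up an unpinned, LRU-managed entry
 *     r = client_context_get(s, pid, ucred, label, label_len, NULL, &context);
 *
 *     // stream transport: pin the entry for the lifetime of the connection ...
 *     r = client_context_acquire(s, pid, ucred, label, label_len, NULL, &context);
 *     // ... and drop the pin again when the connection goes away
 *     context = client_context_release(s, context);
 */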

/* We refresh every 1s */
#define REFRESH_USEC (1*USEC_PER_SEC)

/* Data older than 5s we flush out */
#define MAX_USEC (5*USEC_PER_SEC)

/* Keep at most 16K entries in the cache. (Note though that this limit may be violated if enough streams pin entries in
 * the cache, in which case we *do* permit this limit to be breached. That's safe however, as the number of stream
 * clients itself is limited.) */
#define CACHE_MAX (16*1024)

static int client_context_compare(const void *a, const void *b) {
        const ClientContext *x = a, *y = b;
        int r;

        r = CMP(x->timestamp, y->timestamp);
        if (r != 0)
                return r;

        return CMP(x->pid, y->pid);
}

static int client_context_new(Server *s, pid_t pid, ClientContext **ret) {
        ClientContext *c;
        int r;

        assert(s);
        assert(pid_is_valid(pid));
        assert(ret);

        r = hashmap_ensure_allocated(&s->client_contexts, NULL);
        if (r < 0)
                return r;

        r = prioq_ensure_allocated(&s->client_contexts_lru, client_context_compare);
        if (r < 0)
                return r;

        c = new0(ClientContext, 1);
        if (!c)
                return -ENOMEM;

        c->pid = pid;

        c->uid = UID_INVALID;
        c->gid = GID_INVALID;
        c->auditid = AUDIT_SESSION_INVALID;
        c->loginuid = UID_INVALID;
        c->owner_uid = UID_INVALID;
        c->lru_index = PRIOQ_IDX_NULL;
        c->timestamp = USEC_INFINITY;
        c->extra_fields_mtime = NSEC_INFINITY;
        c->log_level_max = -1;
        c->log_rate_limit_interval = s->rate_limit_interval;
        c->log_rate_limit_burst = s->rate_limit_burst;

        r = hashmap_put(s->client_contexts, PID_TO_PTR(pid), c);
        if (r < 0) {
                free(c);
                return r;
        }

        *ret = c;
        return 0;
}
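
/* Drops all cached metadata from an entry, returning it to the pristine state client_context_new() leaves it in,
 * without removing it from the hashmap or the LRU queue. */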
static void client_context_reset(Server *s, ClientContext *c) {
        assert(s);
        assert(c);

        c->timestamp = USEC_INFINITY;

        c->uid = UID_INVALID;
        c->gid = GID_INVALID;

        c->comm = mfree(c->comm);
        c->exe = mfree(c->exe);
        c->cmdline = mfree(c->cmdline);
        c->capeff = mfree(c->capeff);

        c->auditid = AUDIT_SESSION_INVALID;
        c->loginuid = UID_INVALID;

        c->cgroup = mfree(c->cgroup);
        c->session = mfree(c->session);
        c->owner_uid = UID_INVALID;
        c->unit = mfree(c->unit);
        c->user_unit = mfree(c->user_unit);
        c->slice = mfree(c->slice);
        c->user_slice = mfree(c->user_slice);

        c->invocation_id = SD_ID128_NULL;

        c->label = mfree(c->label);
        c->label_size = 0;

        c->extra_fields_iovec = mfree(c->extra_fields_iovec);
        c->extra_fields_n_iovec = 0;
        c->extra_fields_data = mfree(c->extra_fields_data);
        c->extra_fields_mtime = NSEC_INFINITY;

        c->log_level_max = -1;

        c->log_rate_limit_interval = s->rate_limit_interval;
        c->log_rate_limit_burst = s->rate_limit_burst;
}

static ClientContext* client_context_free(Server *s, ClientContext *c) {
        assert(s);

        if (!c)
                return NULL;

        assert_se(hashmap_remove(s->client_contexts, PID_TO_PTR(c->pid)) == c);

        if (c->in_lru)
                assert_se(prioq_remove(s->client_contexts_lru, c, &c->lru_index) >= 0);

        client_context_reset(s, c);

        return mfree(c);
}

static void client_context_read_uid_gid(ClientContext *c, const struct ucred *ucred) {
        assert(c);
        assert(pid_is_valid(c->pid));

        /* The ucred data passed in is always the most current and accurate, if we have any. Use it. */
        if (ucred && uid_is_valid(ucred->uid))
                c->uid = ucred->uid;
        else
                (void) get_process_uid(c->pid, &c->uid);

        if (ucred && gid_is_valid(ucred->gid))
                c->gid = ucred->gid;
        else
                (void) get_process_gid(c->pid, &c->gid);
}

static void client_context_read_basic(ClientContext *c) {
        char *t;

        assert(c);
        assert(pid_is_valid(c->pid));

        if (get_process_comm(c->pid, &t) >= 0)
                free_and_replace(c->comm, t);

        if (get_process_exe(c->pid, &t) >= 0)
                free_and_replace(c->exe, t);

        if (get_process_cmdline(c->pid, 0, false, &t) >= 0)
                free_and_replace(c->cmdline, t);

        if (get_process_capeff(c->pid, &t) >= 0)
                free_and_replace(c->capeff, t);
}

static int client_context_read_label(
                ClientContext *c,
                const char *label, size_t label_size) {

        assert(c);
        assert(pid_is_valid(c->pid));
        assert(label_size == 0 || label);

        if (label_size > 0) {
                char *l;

                /* If we got an SELinux label passed in it counts. */

                l = newdup_suffix0(char, label, label_size);
                if (!l)
                        return -ENOMEM;

                free_and_replace(c->label, l);
                c->label_size = label_size;
        }
#if HAVE_SELINUX
        else {
                char *con;

                /* If we got no SELinux label passed in, let's try to acquire one */

                if (getpidcon(c->pid, &con) >= 0) {
                        free_and_replace(c->label, con);
                        c->label_size = strlen(c->label);
                }
        }
#endif

        return 0;
}

static int client_context_read_cgroup(Server *s, ClientContext *c, const char *unit_id) {
        char *t = NULL;
        int r;

        assert(c);

        /* Try to acquire the current cgroup path */
        r = cg_pid_get_path_shifted(c->pid, s->cgroup_root, &t);
        if (r < 0) {

                /* If that didn't work, we use the unit ID passed in as fallback, if we have nothing cached yet */
                if (unit_id && !c->unit) {
                        c->unit = strdup(unit_id);
                        if (c->unit)
                                return 0;
                }

                return r;
        }

        /* Let's shortcut this if the cgroup path didn't change */
        if (streq_ptr(c->cgroup, t)) {
                free(t);
                return 0;
        }

        free_and_replace(c->cgroup, t);

        (void) cg_path_get_session(c->cgroup, &t);
        free_and_replace(c->session, t);

        if (cg_path_get_owner_uid(c->cgroup, &c->owner_uid) < 0)
                c->owner_uid = UID_INVALID;

        (void) cg_path_get_unit(c->cgroup, &t);
        free_and_replace(c->unit, t);

        (void) cg_path_get_user_unit(c->cgroup, &t);
        free_and_replace(c->user_unit, t);

        (void) cg_path_get_slice(c->cgroup, &t);
        free_and_replace(c->slice, t);

        (void) cg_path_get_user_slice(c->cgroup, &t);
        free_and_replace(c->user_slice, t);

        return 0;
}

static int client_context_read_invocation_id(
                Server *s,
                ClientContext *c) {

        _cleanup_free_ char *value = NULL;
        const char *p;
        int r;

        assert(s);
        assert(c);

        /* Read the invocation ID of the unit the client belongs to. PID 1 stores it in a per-unit symlink in
         * /run/systemd/units/. */

        if (!c->unit)
                return 0;

        p = strjoina("/run/systemd/units/invocation:", c->unit);
        r = readlink_malloc(p, &value);
        if (r < 0)
                return r;

        return sd_id128_from_string(value, &c->invocation_id);
}
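
/* PID 1 exports a unit's LogLevelMax= setting as a /run/systemd/units/log-level-max:<unit> symlink whose target is
 * the level name; read it back here and translate it into a numeric log level. */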
static int client_context_read_log_level_max(
                Server *s,
                ClientContext *c) {

        _cleanup_free_ char *value = NULL;
        const char *p;
        int r, ll;

        if (!c->unit)
                return 0;

        p = strjoina("/run/systemd/units/log-level-max:", c->unit);
        r = readlink_malloc(p, &value);
        if (r < 0)
                return r;

        ll = log_level_from_string(value);
        if (ll < 0)
                return -EINVAL;

        c->log_level_max = ll;
        return 0;
}

static int client_context_read_extra_fields(
                Server *s,
                ClientContext *c) {

        size_t size = 0, n_iovec = 0, n_allocated = 0, left;
        _cleanup_free_ struct iovec *iovec = NULL;
        _cleanup_free_ void *data = NULL;
        _cleanup_fclose_ FILE *f = NULL;
        struct stat st;
        const char *p;
        uint8_t *q;
        int r;

        if (!c->unit)
                return 0;

        p = strjoina("/run/systemd/units/log-extra-fields:", c->unit);

        if (c->extra_fields_mtime != NSEC_INFINITY) {
                if (stat(p, &st) < 0) {
                        if (errno == ENOENT)
                                return 0;

                        return -errno;
                }

                if (timespec_load_nsec(&st.st_mtim) == c->extra_fields_mtime)
                        return 0;
        }

        f = fopen(p, "re");
        if (!f) {
                if (errno == ENOENT)
                        return 0;

                return -errno;
        }

        if (fstat(fileno(f), &st) < 0) /* The file might have been replaced since the stat() above, let's get a new
                                        * one, that matches the stuff we are reading */
                return -errno;

        r = read_full_stream(f, (char**) &data, &size);
        if (r < 0)
                return r;

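        /* The file is a sequence of serialized journal fields: each record is a little-endian 64-bit length followed
         * by that many bytes of "FIELD=value" payload (hence the minimum size of 2 enforced below). */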
        q = data, left = size;
        while (left > 0) {
                uint8_t *field, *eq;
                uint64_t v, n;

                if (left < sizeof(uint64_t))
                        return -EBADMSG;

                v = unaligned_read_le64(q);
                if (v < 2)
                        return -EBADMSG;

                n = sizeof(uint64_t) + v;
                if (left < n)
                        return -EBADMSG;

                field = q + sizeof(uint64_t);

                eq = memchr(field, '=', v);
                if (!eq)
                        return -EBADMSG;

                if (!journal_field_valid((const char *) field, eq - field, false))
                        return -EBADMSG;

                if (!GREEDY_REALLOC(iovec, n_allocated, n_iovec+1))
                        return -ENOMEM;

                iovec[n_iovec++] = IOVEC_MAKE(field, v);

                left -= n, q += n;
        }

        free(c->extra_fields_iovec);
        free(c->extra_fields_data);

        c->extra_fields_iovec = TAKE_PTR(iovec);
        c->extra_fields_n_iovec = n_iovec;
        c->extra_fields_data = TAKE_PTR(data);
        c->extra_fields_mtime = timespec_load_nsec(&st.st_mtim);

        return 0;
}
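
/* Analogously to the log-level-max: and invocation: symlinks above, PID 1 exports each unit's journal rate limit
 * settings (LogRateLimitIntervalSec=/LogRateLimitBurst=) as /run/systemd/units/log-rate-limit-interval:<unit> and
 * log-rate-limit-burst:<unit> symlinks; reading them here lets journald apply per-unit limits instead of the
 * server-wide defaults copied into the context by client_context_new(). */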
static int client_context_read_log_rate_limit_interval(ClientContext *c) {
        _cleanup_free_ char *value = NULL;
        const char *p;
        int r;

        assert(c);

        if (!c->unit)
                return 0;

        p = strjoina("/run/systemd/units/log-rate-limit-interval:", c->unit);
        r = readlink_malloc(p, &value);
        if (r < 0)
                return r;

        return safe_atou64(value, &c->log_rate_limit_interval);
}

static int client_context_read_log_rate_limit_burst(ClientContext *c) {
        _cleanup_free_ char *value = NULL;
        const char *p;
        int r;

        assert(c);

        if (!c->unit)
                return 0;

        p = strjoina("/run/systemd/units/log-rate-limit-burst:", c->unit);
        r = readlink_malloc(p, &value);
        if (r < 0)
                return r;

        return safe_atou(value, &c->log_rate_limit_burst);
}

static void client_context_really_refresh(
                Server *s,
                ClientContext *c,
                const struct ucred *ucred,
                const char *label, size_t label_size,
                const char *unit_id,
                usec_t timestamp) {

        assert(s);
        assert(c);
        assert(pid_is_valid(c->pid));

        if (timestamp == USEC_INFINITY)
                timestamp = now(CLOCK_MONOTONIC);

        client_context_read_uid_gid(c, ucred);
        client_context_read_basic(c);
        (void) client_context_read_label(c, label, label_size);

        (void) audit_session_from_pid(c->pid, &c->auditid);
        (void) audit_loginuid_from_pid(c->pid, &c->loginuid);

        (void) client_context_read_cgroup(s, c, unit_id);
        (void) client_context_read_invocation_id(s, c);
        (void) client_context_read_log_level_max(s, c);
        (void) client_context_read_extra_fields(s, c);
        (void) client_context_read_log_rate_limit_interval(c);
        (void) client_context_read_log_rate_limit_burst(c);

        c->timestamp = timestamp;

        if (c->in_lru) {
                assert(c->n_ref == 0);
                assert_se(prioq_reshuffle(s->client_contexts_lru, c, &c->lru_index) >= 0);
        }
}
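
/* Decides whether the cached metadata can be used as-is: data that is stale (see REFRESH_USEC/MAX_USEC above) or
 * that contradicts the credentials/label passed in is refreshed via client_context_really_refresh(). */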
void client_context_maybe_refresh(
                Server *s,
                ClientContext *c,
                const struct ucred *ucred,
                const char *label, size_t label_size,
                const char *unit_id,
                usec_t timestamp) {

        assert(s);
        assert(c);

        if (timestamp == USEC_INFINITY)
                timestamp = now(CLOCK_MONOTONIC);

        /* No cached data so far? Let's fill it up */
        if (c->timestamp == USEC_INFINITY)
                goto refresh;

        /* If the data isn't pinned and the cached data is older than the upper limit, we flush it out
         * entirely. This follows the logic that as long as an entry is pinned, PID reuse is unlikely. */
        if (c->n_ref == 0 && c->timestamp + MAX_USEC < timestamp) {
                client_context_reset(s, c);
                goto refresh;
        }

        /* If the data is older than the lower limit, we refresh, but keep the old data for all we can't update */
        if (c->timestamp + REFRESH_USEC < timestamp)
                goto refresh;

        /* If the data passed along doesn't match the cached data we also do a refresh */
        if (ucred && uid_is_valid(ucred->uid) && c->uid != ucred->uid)
                goto refresh;

        if (ucred && gid_is_valid(ucred->gid) && c->gid != ucred->gid)
                goto refresh;

        if (label_size > 0 && (label_size != c->label_size || memcmp(label, c->label, label_size) != 0))
                goto refresh;

        return;

refresh:
        client_context_really_refresh(s, c, ucred, label, label_size, unit_id, timestamp);
}

static void client_context_try_shrink_to(Server *s, size_t limit) {
        assert(s);

        /* Bring the number of cache entries below the indicated limit, so that we can create a new entry without
         * breaching the limit. Note that we only flush out entries that aren't pinned here. This means the number of
         * cache entries may very well grow beyond the limit, if all entries stored remain pinned. */

        while (hashmap_size(s->client_contexts) > limit) {
                ClientContext *c;

                c = prioq_pop(s->client_contexts_lru);
                if (!c)
                        break; /* All remaining entries are pinned, give up */

                assert(c->in_lru);
                assert(c->n_ref == 0);

                c->in_lru = false;

                client_context_free(s, c);
        }
}

void client_context_flush_all(Server *s) {
        assert(s);

        /* Flush out all remaining entries. This assumes all references are already dropped. */

        s->my_context = client_context_release(s, s->my_context);
        s->pid1_context = client_context_release(s, s->pid1_context);

        client_context_try_shrink_to(s, 0);

        assert(prioq_size(s->client_contexts_lru) == 0);
        assert(hashmap_size(s->client_contexts) == 0);

        s->client_contexts_lru = prioq_free(s->client_contexts_lru);
        s->client_contexts = hashmap_free(s->client_contexts);
}

static int client_context_get_internal(
                Server *s,
                pid_t pid,
                const struct ucred *ucred,
                const char *label, size_t label_len,
                const char *unit_id,
                bool add_ref,
                ClientContext **ret) {

        ClientContext *c;
        int r;

        assert(s);
        assert(ret);

        if (!pid_is_valid(pid))
                return -EINVAL;

        c = hashmap_get(s->client_contexts, PID_TO_PTR(pid));
        if (c) {

                if (add_ref) {
                        if (c->in_lru) {
                                /* The entry wasn't pinned so far, let's remove it from the LRU list then */
                                assert(c->n_ref == 0);
                                assert_se(prioq_remove(s->client_contexts_lru, c, &c->lru_index) >= 0);
                                c->in_lru = false;
                        }

                        c->n_ref++;
                }

                client_context_maybe_refresh(s, c, ucred, label, label_len, unit_id, USEC_INFINITY);

                *ret = c;
                return 0;
        }

        client_context_try_shrink_to(s, CACHE_MAX-1);

        r = client_context_new(s, pid, &c);
        if (r < 0)
                return r;

        if (add_ref)
                c->n_ref++;
        else {
                r = prioq_put(s->client_contexts_lru, c, &c->lru_index);
                if (r < 0) {
                        client_context_free(s, c);
                        return r;
                }

                c->in_lru = true;
        }

        client_context_really_refresh(s, c, ucred, label, label_len, unit_id, USEC_INFINITY);

        *ret = c;
        return 0;
}
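
/* client_context_get() returns an unpinned entry, subject to LRU eviction; client_context_acquire() returns a pinned
 * one and must eventually be balanced by client_context_release(). */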
int client_context_get(
                Server *s,
                pid_t pid,
                const struct ucred *ucred,
                const char *label, size_t label_len,
                const char *unit_id,
                ClientContext **ret) {

        return client_context_get_internal(s, pid, ucred, label, label_len, unit_id, false, ret);
}

int client_context_acquire(
                Server *s,
                pid_t pid,
                const struct ucred *ucred,
                const char *label, size_t label_len,
                const char *unit_id,
                ClientContext **ret) {

        return client_context_get_internal(s, pid, ucred, label, label_len, unit_id, true, ret);
}

ClientContext *client_context_release(Server *s, ClientContext *c) {
        assert(s);

        if (!c)
                return NULL;

        assert(c->n_ref > 0);
        assert(!c->in_lru);

        c->n_ref--;
        if (c->n_ref > 0)
                return NULL;

        /* The entry is not pinned anymore, let's add it to the LRU prioq if we can. If we can't we'll drop it
         * right away. */

        if (prioq_put(s->client_contexts_lru, c, &c->lru_index) < 0)
                client_context_free(s, c);
        else
                c->in_lru = true;

        return NULL;
}

void client_context_acquire_default(Server *s) {
        int r;

        assert(s);

        /* Ensure that our own and PID1's contexts are always pinned. Our own context is particularly useful to
         * generate driver messages. */

        if (!s->my_context) {
                struct ucred ucred = {
                        .pid = getpid_cached(),
                        .uid = getuid(),
                        .gid = getgid(),
                };

                r = client_context_acquire(s, ucred.pid, &ucred, NULL, 0, NULL, &s->my_context);
                if (r < 0)
                        log_warning_errno(r, "Failed to acquire our own context, ignoring: %m");
        }

        if (!s->pid1_context) {
                r = client_context_acquire(s, 1, NULL, NULL, 0, NULL, &s->pid1_context);
                if (r < 0)
                        log_warning_errno(r, "Failed to acquire PID1's context, ignoring: %m");
        }
}