]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/journal/journald-context.c
build-sys: use #if Y instead of #ifdef Y everywhere
[thirdparty/systemd.git] / src / journal / journald-context.c
1 /***
2 This file is part of systemd.
3
4 Copyright 2017 Lennart Poettering
5
6 systemd is free software; you can redistribute it and/or modify it
7 under the terms of the GNU Lesser General Public License as published by
8 the Free Software Foundation; either version 2.1 of the License, or
9 (at your option) any later version.
10
11 systemd is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public License
17 along with systemd; If not, see <http://www.gnu.org/licenses/>.
18 ***/
19
20 #if HAVE_SELINUX
21 #include <selinux/selinux.h>
22 #endif
23
24 #include "alloc-util.h"
25 #include "audit-util.h"
26 #include "cgroup-util.h"
27 #include "journald-context.h"
28 #include "process-util.h"
29 #include "string-util.h"
30 #include "user-util.h"
31
32 /* This implements a metadata cache for clients, which are identified by their PID. Requesting metadata through /proc
33 * is expensive, hence let's cache the data if we can. Note that this means the metadata might be out-of-date when we
34 * store it, but it might already be anyway, as we request the data asynchronously from /proc at a different time the
35 * log entry was originally created. We hence just increase the "window of inaccuracy" a bit.
36 *
37 * The cache is indexed by the PID. Entries may be "pinned" in the cache, in which case the entries are not removed
38 * until they are unpinned. Unpinned entries are kept around until cache pressure is seen. Cache entries older than 5s
39 * are never used (a sad attempt to deal with the UNIX weakness of PIDs reuse), cache entries older than 1s are
40 * refreshed in an incremental way (meaning: data is reread from /proc, but any old data we can't refresh is not
41 * flushed out). Data newer than 1s is used immediately without refresh.
42 *
43 * Log stream clients (i.e. all clients using the AF_UNIX/SOCK_STREAM stdout/stderr transport) will pin a cache entry
44 * as long as their socket is connected. Note that cache entries are shared between different transports. That means a
45 * cache entry pinned for the stream connection logic may be reused for the syslog or native protocols.
46 *
47 * Caching metadata like this has two major benefits:
48 *
49 * 1. Reading metadata is expensive, and we can thus substantially speed up log processing under flood.
50 *
51 * 2. Because metadata caching is shared between stream and datagram transports and stream connections pin a cache
52 * entry there's a good chance we can properly map a substantial set of datagram log messages to their originating
53 * service, as all services (unless explicitly configured otherwise) will have their stdout/stderr connected to a
54 * stream connection. This should improve cases where a service process logs immediately before exiting and we
55 * previously had trouble associating the log message with the service.
56 *
57 * NB: With and without the metadata cache: the implicitly added entry metadata in the journal (with the exception of
58 UID/PID/GID and SELinux label) must be understood as possibly slightly out of sync (i.e. sometimes slightly older
59 * and sometimes slightly newer than what was current at the log event).
60 */
61
62 /* We refresh every 1s */
63 #define REFRESH_USEC (1*USEC_PER_SEC)
64
65 /* Data older than 5s we flush out */
66 #define MAX_USEC (5*USEC_PER_SEC)
67
68 /* Keep at most 16K entries in the cache. (Note though that this limit may be violated if enough streams pin entries in
69 * the cache, in which case we *do* permit this limit to be breached. That's safe however, as the number of stream
70 * clients itself is limited.) */
71 #define CACHE_MAX (16*1024)
72
73 static int client_context_compare(const void *a, const void *b) {
74 const ClientContext *x = a, *y = b;
75
76 if (x->timestamp < y->timestamp)
77 return -1;
78 if (x->timestamp > y->timestamp)
79 return 1;
80
81 if (x->pid < y->pid)
82 return -1;
83 if (x->pid > y->pid)
84 return 1;
85
86 return 0;
87 }
88
89 static int client_context_new(Server *s, pid_t pid, ClientContext **ret) {
90 ClientContext *c;
91 int r;
92
93 assert(s);
94 assert(pid_is_valid(pid));
95 assert(ret);
96
97 r = hashmap_ensure_allocated(&s->client_contexts, NULL);
98 if (r < 0)
99 return r;
100
101 r = prioq_ensure_allocated(&s->client_contexts_lru, client_context_compare);
102 if (r < 0)
103 return r;
104
105 c = new0(ClientContext, 1);
106 if (!c)
107 return -ENOMEM;
108
109 c->pid = pid;
110
111 c->uid = UID_INVALID;
112 c->gid = GID_INVALID;
113 c->auditid = AUDIT_SESSION_INVALID;
114 c->loginuid = UID_INVALID;
115 c->owner_uid = UID_INVALID;
116 c->lru_index = PRIOQ_IDX_NULL;
117 c->timestamp = USEC_INFINITY;
118
119 r = hashmap_put(s->client_contexts, PID_TO_PTR(pid), c);
120 if (r < 0) {
121 free(c);
122 return r;
123 }
124
125 *ret = c;
126 return 0;
127 }
128
129 static void client_context_reset(ClientContext *c) {
130 assert(c);
131
132 c->timestamp = USEC_INFINITY;
133
134 c->uid = UID_INVALID;
135 c->gid = GID_INVALID;
136
137 c->comm = mfree(c->comm);
138 c->exe = mfree(c->exe);
139 c->cmdline = mfree(c->cmdline);
140 c->capeff = mfree(c->capeff);
141
142 c->auditid = AUDIT_SESSION_INVALID;
143 c->loginuid = UID_INVALID;
144
145 c->cgroup = mfree(c->cgroup);
146 c->session = mfree(c->session);
147 c->owner_uid = UID_INVALID;
148 c->unit = mfree(c->unit);
149 c->user_unit = mfree(c->user_unit);
150 c->slice = mfree(c->slice);
151 c->user_slice = mfree(c->user_slice);
152
153 c->invocation_id = SD_ID128_NULL;
154
155 c->label = mfree(c->label);
156 c->label_size = 0;
157 }
158
/* Unregisters the entry from the hashmap (and, if queued, the LRU prioq), releases all cached
 * metadata and frees the entry itself. NULL is accepted and ignored; always returns NULL so
 * callers can write "c = client_context_free(s, c);". */
static ClientContext* client_context_free(Server *s, ClientContext *c) {
        assert(s);

        if (!c)
                return NULL;

        /* Every live entry is registered under its PID, hence removal must succeed and yield
         * exactly this entry. */
        assert_se(hashmap_remove(s->client_contexts, PID_TO_PTR(c->pid)) == c);

        /* Only unpinned entries sit in the LRU prioq */
        if (c->in_lru)
                assert_se(prioq_remove(s->client_contexts_lru, c, &c->lru_index) >= 0);

        client_context_reset(c);

        return mfree(c);
}
174
175 static void client_context_read_uid_gid(ClientContext *c, const struct ucred *ucred) {
176 assert(c);
177 assert(pid_is_valid(c->pid));
178
179 /* The ucred data passed in is always the most current and accurate, if we have any. Use it. */
180 if (ucred && uid_is_valid(ucred->uid))
181 c->uid = ucred->uid;
182 else
183 (void) get_process_uid(c->pid, &c->uid);
184
185 if (ucred && gid_is_valid(ucred->gid))
186 c->gid = ucred->gid;
187 else
188 (void) get_process_gid(c->pid, &c->gid);
189 }
190
/* Refreshes the basic /proc-derived process metadata: comm, executable path, command line and
 * effective capability set. Each field is best-effort — on failure the previously cached value
 * (if any) is left untouched. */
static void client_context_read_basic(ClientContext *c) {
        char *t;

        assert(c);
        assert(pid_is_valid(c->pid));

        if (get_process_comm(c->pid, &t) >= 0)
                free_and_replace(c->comm, t);

        if (get_process_exe(c->pid, &t) >= 0)
                free_and_replace(c->exe, t);

        if (get_process_cmdline(c->pid, 0, false, &t) >= 0)
                free_and_replace(c->cmdline, t);

        if (get_process_capeff(c->pid, &t) >= 0)
                free_and_replace(c->capeff, t);
}
209
/* Updates the cached SELinux label of the client. A label passed in explicitly (e.g. received
 * along with the log datagram) takes precedence; otherwise, when compiled with SELinux support,
 * we try to acquire it directly from the client PID (best effort). Returns 0 on success — also
 * when no label could be determined — and -ENOMEM on allocation failure. */
static int client_context_read_label(
                ClientContext *c,
                const char *label, size_t label_size) {

        assert(c);
        assert(pid_is_valid(c->pid));
        assert(label_size == 0 || label);

        if (label_size > 0) {
                char *l;

                /* If we got an SELinux label passed in it counts. */

                /* The passed-in label need not be NUL-terminated, hence duplicate with a
                 * terminating NUL appended */
                l = newdup_suffix0(char, label, label_size);
                if (!l)
                        return -ENOMEM;

                free_and_replace(c->label, l);
                c->label_size = label_size;
        }
#if HAVE_SELINUX
        else {
                char *con;

                /* If we got no SELinux label passed in, let's try to acquire one */

                if (getpidcon(c->pid, &con) >= 0) {
                        free_and_replace(c->label, con);
                        c->label_size = strlen(c->label);
                }
        }
#endif

        return 0;
}
245
246 static int client_context_read_cgroup(Server *s, ClientContext *c, const char *unit_id) {
247 char *t = NULL;
248 int r;
249
250 assert(c);
251
252 /* Try to acquire the current cgroup path */
253 r = cg_pid_get_path_shifted(c->pid, s->cgroup_root, &t);
254 if (r < 0) {
255
256 /* If that didn't work, we use the unit ID passed in as fallback, if we have nothing cached yet */
257 if (unit_id && !c->unit) {
258 c->unit = strdup(unit_id);
259 if (c->unit)
260 return 0;
261 }
262
263 return r;
264 }
265
266 /* Let's shortcut this if the cgroup path didn't change */
267 if (streq_ptr(c->cgroup, t)) {
268 free(t);
269 return 0;
270 }
271
272 free_and_replace(c->cgroup, t);
273
274 (void) cg_path_get_session(c->cgroup, &t);
275 free_and_replace(c->session, t);
276
277 if (cg_path_get_owner_uid(c->cgroup, &c->owner_uid) < 0)
278 c->owner_uid = UID_INVALID;
279
280 (void) cg_path_get_unit(c->cgroup, &t);
281 free_and_replace(c->unit, t);
282
283 (void) cg_path_get_user_unit(c->cgroup, &t);
284 free_and_replace(c->user_unit, t);
285
286 (void) cg_path_get_slice(c->cgroup, &t);
287 free_and_replace(c->slice, t);
288
289 (void) cg_path_get_user_slice(c->cgroup, &t);
290 free_and_replace(c->user_slice, t);
291
292 return 0;
293 }
294
295 static int client_context_read_invocation_id(
296 Server *s,
297 ClientContext *c) {
298
299 _cleanup_free_ char *escaped = NULL, *slice_path = NULL;
300 char ids[SD_ID128_STRING_MAX];
301 const char *p;
302 int r;
303
304 assert(s);
305 assert(c);
306
307 /* Read the invocation ID of a unit off a unit. It's stored in the "trusted.invocation_id" extended attribute
308 * on the cgroup path. */
309
310 if (!c->unit || !c->slice)
311 return 0;
312
313 r = cg_slice_to_path(c->slice, &slice_path);
314 if (r < 0)
315 return r;
316
317 escaped = cg_escape(c->unit);
318 if (!escaped)
319 return -ENOMEM;
320
321 p = strjoina(s->cgroup_root, "/", slice_path, "/", escaped);
322 if (!p)
323 return -ENOMEM;
324
325 r = cg_get_xattr(SYSTEMD_CGROUP_CONTROLLER, p, "trusted.invocation_id", ids, 32);
326 if (r < 0)
327 return r;
328 if (r != 32)
329 return -EINVAL;
330 ids[32] = 0;
331
332 return sd_id128_from_string(ids, &c->invocation_id);
333 }
334
/* Unconditionally (re)reads all metadata of the entry: credentials, basic /proc data, security
 * label, audit fields, cgroup data and the unit's invocation ID. Most of it is best-effort — a
 * failed read leaves the previously cached value in place. Finally stamps the entry with the
 * given timestamp (or the current monotonic time if USEC_INFINITY was passed) and, for unpinned
 * entries, reshuffles the LRU prioq to reflect the new timestamp. */
static void client_context_really_refresh(
                Server *s,
                ClientContext *c,
                const struct ucred *ucred,
                const char *label, size_t label_size,
                const char *unit_id,
                usec_t timestamp) {

        assert(s);
        assert(c);
        assert(pid_is_valid(c->pid));

        if (timestamp == USEC_INFINITY)
                timestamp = now(CLOCK_MONOTONIC);

        client_context_read_uid_gid(c, ucred);
        client_context_read_basic(c);
        (void) client_context_read_label(c, label, label_size);

        (void) audit_session_from_pid(c->pid, &c->auditid);
        (void) audit_loginuid_from_pid(c->pid, &c->loginuid);

        /* Must come before the invocation ID lookup, which relies on the unit/slice fields */
        (void) client_context_read_cgroup(s, c, unit_id);
        (void) client_context_read_invocation_id(s, c);

        c->timestamp = timestamp;

        /* The prioq is ordered by timestamp, hence the new stamp may move the entry */
        if (c->in_lru) {
                assert(c->n_ref == 0);
                assert_se(prioq_reshuffle(s->client_contexts_lru, c, &c->lru_index) >= 0);
        }
}
367
/* Refreshes the cached metadata if it is missing, stale, or contradicted by the credential/label
 * data passed along with the current log message. Implements the caching policy described at the
 * top of this file (MAX_USEC flush limit, REFRESH_USEC refresh limit). */
void client_context_maybe_refresh(
                Server *s,
                ClientContext *c,
                const struct ucred *ucred,
                const char *label, size_t label_size,
                const char *unit_id,
                usec_t timestamp) {

        assert(s);
        assert(c);

        if (timestamp == USEC_INFINITY)
                timestamp = now(CLOCK_MONOTONIC);

        /* No cached data so far? Let's fill it up */
        if (c->timestamp == USEC_INFINITY)
                goto refresh;

        /* If the data isn't pinned and if the cached data is older than the upper limit, we flush it out
         * entirely. This follows the logic that as long as an entry is pinned the PID reuse is unlikely. */
        if (c->n_ref == 0 && c->timestamp + MAX_USEC < timestamp) {
                client_context_reset(c);
                goto refresh;
        }

        /* If the data is older than the lower limit, we refresh, but keep the old data for all we can't update */
        if (c->timestamp + REFRESH_USEC < timestamp)
                goto refresh;

        /* If the data passed along doesn't match the cached data we also do a refresh */
        if (ucred && uid_is_valid(ucred->uid) && c->uid != ucred->uid)
                goto refresh;

        if (ucred && gid_is_valid(ucred->gid) && c->gid != ucred->gid)
                goto refresh;

        if (label_size > 0 && (label_size != c->label_size || memcmp(label, c->label, label_size) != 0))
                goto refresh;

        return;

refresh:
        client_context_really_refresh(s, c, ucred, label, label_size, unit_id, timestamp);
}
412
413 static void client_context_try_shrink_to(Server *s, size_t limit) {
414 assert(s);
415
416 /* Bring the number of cache entries below the indicated limit, so that we can create a new entry without
417 * breaching the limit. Note that we only flush out entries that aren't pinned here. This means the number of
418 * cache entries may very well grow beyond the limit, if all entries stored remain pinned. */
419
420 while (hashmap_size(s->client_contexts) > limit) {
421 ClientContext *c;
422
423 c = prioq_pop(s->client_contexts_lru);
424 if (!c)
425 break; /* All remaining entries are pinned, give up */
426
427 assert(c->in_lru);
428 assert(c->n_ref == 0);
429
430 c->in_lru = false;
431
432 client_context_free(s, c);
433 }
434 }
435
/* Drops the two default pinned contexts, evicts everything else and frees the backing data
 * structures. Intended for shutdown; assumes all stream references were dropped already. */
void client_context_flush_all(Server *s) {
        assert(s);

        /* Flush out all remaining entries. This assumes all references are already dropped. */

        s->my_context = client_context_release(s, s->my_context);
        s->pid1_context = client_context_release(s, s->pid1_context);

        client_context_try_shrink_to(s, 0);

        /* Shrinking to zero can only leave pinned entries behind — there must be none by now */
        assert(prioq_size(s->client_contexts_lru) == 0);
        assert(hashmap_size(s->client_contexts) == 0);

        s->client_contexts_lru = prioq_free(s->client_contexts_lru);
        s->client_contexts = hashmap_free(s->client_contexts);
}
452
/* Looks up the cache entry for the given PID, creating one if necessary, and refreshes its
 * metadata as the caching policy requires. With add_ref=true the entry is pinned: it is removed
 * from the LRU prioq (if queued there) and its reference counter is bumped — the caller must
 * eventually drop the reference via client_context_release(). With add_ref=false the entry stays
 * (or is put) on the LRU prioq, subject to eviction under cache pressure. Returns 0 and the
 * entry in *ret on success, negative errno on failure. */
static int client_context_get_internal(
                Server *s,
                pid_t pid,
                const struct ucred *ucred,
                const char *label, size_t label_len,
                const char *unit_id,
                bool add_ref,
                ClientContext **ret) {

        ClientContext *c;
        int r;

        assert(s);
        assert(ret);

        if (!pid_is_valid(pid))
                return -EINVAL;

        c = hashmap_get(s->client_contexts, PID_TO_PTR(pid));
        if (c) {

                if (add_ref) {
                        if (c->in_lru) {
                                /* The entry wasn't pinned so far, let's remove it from the LRU list then */
                                assert(c->n_ref == 0);
                                assert_se(prioq_remove(s->client_contexts_lru, c, &c->lru_index) >= 0);
                                c->in_lru = false;
                        }

                        c->n_ref++;
                }

                client_context_maybe_refresh(s, c, ucred, label, label_len, unit_id, USEC_INFINITY);

                *ret = c;
                return 0;
        }

        /* Make room for the new entry; only unpinned entries are evicted, see CACHE_MAX above */
        client_context_try_shrink_to(s, CACHE_MAX-1);

        r = client_context_new(s, pid, &c);
        if (r < 0)
                return r;

        if (add_ref)
                c->n_ref++;
        else {
                /* Unpinned entries live on the LRU prioq, ready for eviction */
                r = prioq_put(s->client_contexts_lru, c, &c->lru_index);
                if (r < 0) {
                        client_context_free(s, c);
                        return r;
                }

                c->in_lru = true;
        }

        /* Brand-new entry, hence fill in all metadata unconditionally */
        client_context_really_refresh(s, c, ucred, label, label_len, unit_id, USEC_INFINITY);

        *ret = c;
        return 0;
}
514
/* Retrieves (creating if necessary) the metadata context for a PID without pinning it; the entry
 * remains subject to LRU eviction. See client_context_get_internal() for details. */
int client_context_get(
                Server *s,
                pid_t pid,
                const struct ucred *ucred,
                const char *label, size_t label_len,
                const char *unit_id,
                ClientContext **ret) {

        return client_context_get_internal(s, pid, ucred, label, label_len, unit_id, false, ret);
}
525
526 int client_context_acquire(
527 Server *s,
528 pid_t pid,
529 const struct ucred *ucred,
530 const char *label, size_t label_len,
531 const char *unit_id,
532 ClientContext **ret) {
533
534 return client_context_get_internal(s, pid, ucred, label, label_len, unit_id, true, ret);
535 };
536
537 ClientContext *client_context_release(Server *s, ClientContext *c) {
538 assert(s);
539
540 if (!c)
541 return NULL;
542
543 assert(c->n_ref > 0);
544 assert(!c->in_lru);
545
546 c->n_ref--;
547 if (c->n_ref > 0)
548 return NULL;
549
550 /* The entry is not pinned anymore, let's add it to the LRU prioq if we can. If we can't we'll drop it
551 * right-away */
552
553 if (prioq_put(s->client_contexts_lru, c, &c->lru_index) < 0)
554 client_context_free(s, c);
555 else
556 c->in_lru = true;
557
558 return NULL;
559 }
560
561 void client_context_acquire_default(Server *s) {
562 int r;
563
564 assert(s);
565
566 /* Ensure that our own and PID1's contexts are always pinned. Our own context is particularly useful to
567 * generate driver messages. */
568
569 if (!s->my_context) {
570 struct ucred ucred = {
571 .pid = getpid_cached(),
572 .uid = getuid(),
573 .gid = getgid(),
574 };
575
576 r = client_context_acquire(s, ucred.pid, &ucred, NULL, 0, NULL, &s->my_context);
577 if (r < 0)
578 log_warning_errno(r, "Failed to acquire our own context, ignoring: %m");
579 }
580
581 if (!s->pid1_context) {
582
583 r = client_context_acquire(s, 1, NULL, NULL, 0, NULL, &s->pid1_context);
584 if (r < 0)
585 log_warning_errno(r, "Failed to acquire PID1's context, ignoring: %m");
586
587 }
588 }