src/journal/journald-context.c

   1 /* SPDX-License-Identifier: LGPL-2.1+ */
   2 /***
   3   This file is part of systemd.
   4
   5   Copyright 2017 Lennart Poettering
   6 ***/
   7
   8 #if HAVE_SELINUX
   9 #include <selinux/selinux.h>
  10 #endif
  11
  12 #include "alloc-util.h"
  13 #include "audit-util.h"
  14 #include "cgroup-util.h"
  15 #include "fd-util.h"
  16 #include "fileio.h"
  17 #include "fs-util.h"
  18 #include "io-util.h"
  19 #include "journal-util.h"
  20 #include "journald-context.h"
  21 #include "process-util.h"
  22 #include "string-util.h"
  23 #include "syslog-util.h"
  24 #include "unaligned.h"
  25 #include "user-util.h"
  26
  27 /* This implements a metadata cache for clients, which are identified by their PID. Requesting metadata through /proc
  28  * is expensive, hence let's cache the data if we can. Note that this means the metadata might be out-of-date when we
  29  * store it, but it might already be anyway, as we request the data asynchronously from /proc at a different time than the
  30  * log entry was originally created. We hence just increase the "window of inaccuracy" a bit.
  31  *
  32  * The cache is indexed by the PID. Entries may be "pinned" in the cache, in which case the entries are not removed
  33  * until they are unpinned. Unpinned entries are kept around until cache pressure is seen. Cache entries older than 5s
  34  * are never used (a sad attempt to deal with the UNIX weakness of PID reuse), cache entries older than 1s are
  35  * refreshed in an incremental way (meaning: data is reread from /proc, but any old data we can't refresh is not
  36  * flushed out). Data newer than 1s is used immediately without refresh.
  37  *
  38  * Log stream clients (i.e. all clients using the AF_UNIX/SOCK_STREAM stdout/stderr transport) will pin a cache entry
  39  * as long as their socket is connected. Note that cache entries are shared between different transports. That means a
  40  * cache entry pinned for the stream connection logic may be reused for the syslog or native protocols.
  41  *
  42  * Caching metadata like this has two major benefits:
  43  *
  44  * 1. Reading metadata is expensive, and we can thus substantially speed up log processing under flood.
  45  *
  46  * 2. Because metadata caching is shared between stream and datagram transports and stream connections pin a cache
  47  *    entry there's a good chance we can properly map a substantial set of datagram log messages to their originating
  48  *    service, as all services (unless explicitly configured otherwise) will have their stdout/stderr connected to a
  49  *    stream connection. This should improve cases where a service process logs immediately before exiting and we
  50  *    previously had trouble associating the log message with the service.
  51  *
  52  * NB: With and without the metadata cache: the implicitly added entry metadata in the journal (with the exception of
  53  *     UID/PID/GID and SELinux label) must be understood as possibly slightly out of sync (i.e. sometimes slightly older
  54  *     and sometimes slightly newer than what was current at the log event).
  55  */
  56
  57 /* We refresh every 1s */
  58 #define REFRESH_USEC (1*USEC_PER_SEC)
  59
  60 /* Data older than 5s we flush out */
  61 #define MAX_USEC (5*USEC_PER_SEC)
  62
  63 /* Keep at most 16K entries in the cache. (Note though that this limit may be violated if enough streams pin entries in
  64  * the cache, in which case we *do* permit this limit to be breached. That's safe however, as the number of stream
  65  * clients itself is limited.) */
  66 #define CACHE_MAX (16*1024)
  67
  68 static int client_context_compare(const void *a, const void *b) {
  69         const ClientContext *x = a, *y = b;
  70
  71         if (x->timestamp < y->timestamp)
  72                 return -1;
  73         if (x->timestamp > y->timestamp)
  74                 return 1;
  75
  76         if (x->pid < y->pid)
  77                 return -1;
  78         if (x->pid > y->pid)
  79                 return 1;
  80
  81         return 0;
  82 }
  83
  84 static int client_context_new(Server *s, pid_t pid, ClientContext **ret) {
  85         ClientContext *c;
  86         int r;
  87
  88         assert(s);
  89         assert(pid_is_valid(pid));
  90         assert(ret);
  91
  92         r = hashmap_ensure_allocated(&s->client_contexts, NULL);
  93         if (r < 0)
  94                 return r;
  95
  96         r = prioq_ensure_allocated(&s->client_contexts_lru, client_context_compare);
  97         if (r < 0)
  98                 return r;
  99
 100         c = new0(ClientContext, 1);
 101         if (!c)
 102                 return -ENOMEM;
 103
 104         c->pid = pid;
 105
 106         c->uid = UID_INVALID;
 107         c->gid = GID_INVALID;
 108         c->auditid = AUDIT_SESSION_INVALID;
 109         c->loginuid = UID_INVALID;
 110         c->owner_uid = UID_INVALID;
 111         c->lru_index = PRIOQ_IDX_NULL;
 112         c->timestamp = USEC_INFINITY;
 113         c->extra_fields_mtime = NSEC_INFINITY;
 114         c->log_level_max = -1;
 115
 116         r = hashmap_put(s->client_contexts, PID_TO_PTR(pid), c);
 117         if (r < 0) {
 118                 free(c);
 119                 return r;
 120         }
 121
 122         *ret = c;
 123         return 0;
 124 }
 125
 126 static void client_context_reset(ClientContext *c) {
 127         assert(c);
 128
 129         c->timestamp = USEC_INFINITY;
 130
 131         c->uid = UID_INVALID;
 132         c->gid = GID_INVALID;
 133
 134         c->comm = mfree(c->comm);
 135         c->exe = mfree(c->exe);
 136         c->cmdline = mfree(c->cmdline);
 137         c->capeff = mfree(c->capeff);
 138
 139         c->auditid = AUDIT_SESSION_INVALID;
 140         c->loginuid = UID_INVALID;
 141
 142         c->cgroup = mfree(c->cgroup);
 143         c->session = mfree(c->session);
 144         c->owner_uid = UID_INVALID;
 145         c->unit = mfree(c->unit);
 146         c->user_unit = mfree(c->user_unit);
 147         c->slice = mfree(c->slice);
 148         c->user_slice = mfree(c->user_slice);
 149
 150         c->invocation_id = SD_ID128_NULL;
 151
 152         c->label = mfree(c->label);
 153         c->label_size = 0;
 154
 155         c->extra_fields_iovec = mfree(c->extra_fields_iovec);
 156         c->extra_fields_n_iovec = 0;
 157         c->extra_fields_data = mfree(c->extra_fields_data);
 158         c->extra_fields_mtime = NSEC_INFINITY;
 159
 160         c->log_level_max = -1;
 161 }
 162
 163 static ClientContext* client_context_free(Server *s, ClientContext *c) {
 164         assert(s);
 165
 166         if (!c)
 167                 return NULL;
 168
 169         assert_se(hashmap_remove(s->client_contexts, PID_TO_PTR(c->pid)) == c);
 170
 171         if (c->in_lru)
 172                 assert_se(prioq_remove(s->client_contexts_lru, c, &c->lru_index) >= 0);
 173
 174         client_context_reset(c);
 175
 176         return mfree(c);
 177 }
 178
 179 static void client_context_read_uid_gid(ClientContext *c, const struct ucred *ucred) {
 180         assert(c);
 181         assert(pid_is_valid(c->pid));
 182
 183         /* The ucred data passed in is always the most current and accurate, if we have any. Use it. */
 184         if (ucred && uid_is_valid(ucred->uid))
 185                 c->uid = ucred->uid;
 186         else
 187                 (void) get_process_uid(c->pid, &c->uid);
 188
 189         if (ucred && gid_is_valid(ucred->gid))
 190                 c->gid = ucred->gid;
 191         else
 192                 (void) get_process_gid(c->pid, &c->gid);
 193 }
 194
 195 static void client_context_read_basic(ClientContext *c) {
 196         char *t;
 197
 198         assert(c);
 199         assert(pid_is_valid(c->pid));
 200
 201         if (get_process_comm(c->pid, &t) >= 0)
 202                 free_and_replace(c->comm, t);
 203
 204         if (get_process_exe(c->pid, &t) >= 0)
 205                 free_and_replace(c->exe, t);
 206
 207         if (get_process_cmdline(c->pid, 0, false, &t) >= 0)
 208                 free_and_replace(c->cmdline, t);
 209
 210         if (get_process_capeff(c->pid, &t) >= 0)
 211                 free_and_replace(c->capeff, t);
 212 }
 213
 214 static int client_context_read_label(
 215                 ClientContext *c,
 216                 const char *label, size_t label_size) {
 217
 218         assert(c);
 219         assert(pid_is_valid(c->pid));
 220         assert(label_size == 0 || label);
 221
 222         if (label_size > 0) {
 223                 char *l;
 224
 225                 /* If we got an SELinux label passed in it counts. */
 226
 227                 l = newdup_suffix0(char, label, label_size);
 228                 if (!l)
 229                         return -ENOMEM;
 230
 231                 free_and_replace(c->label, l);
 232                 c->label_size = label_size;
 233         }
 234 #if HAVE_SELINUX
 235         else {
 236                 char *con;
 237
 238                 /* If we got no SELinux label passed in, let's try to acquire one */
 239
 240                 if (getpidcon(c->pid, &con) >= 0) {
 241                         free_and_replace(c->label, con);
 242                         c->label_size = strlen(c->label);
 243                 }
 244         }
 245 #endif
 246
 247         return 0;
 248 }
 249
 250 static int client_context_read_cgroup(Server *s, ClientContext *c, const char *unit_id) {
 251         char *t = NULL;
 252         int r;
 253
 254         assert(c);
 255
 256         /* Try to acquire the current cgroup path */
 257         r = cg_pid_get_path_shifted(c->pid, s->cgroup_root, &t);
 258         if (r < 0) {
 259
 260                 /* If that didn't work, we use the unit ID passed in as fallback, if we have nothing cached yet */
 261                 if (unit_id && !c->unit) {
 262                         c->unit = strdup(unit_id);
 263                         if (c->unit)
 264                                 return 0;
 265                 }
 266
 267                 return r;
 268         }
 269
 270         /* Let's shortcut this if the cgroup path didn't change */
 271         if (streq_ptr(c->cgroup, t)) {
 272                 free(t);
 273                 return 0;
 274         }
 275
 276         free_and_replace(c->cgroup, t);
 277
 278         (void) cg_path_get_session(c->cgroup, &t);
 279         free_and_replace(c->session, t);
 280
 281         if (cg_path_get_owner_uid(c->cgroup, &c->owner_uid) < 0)
 282                 c->owner_uid = UID_INVALID;
 283
 284         (void) cg_path_get_unit(c->cgroup, &t);
 285         free_and_replace(c->unit, t);
 286
 287         (void) cg_path_get_user_unit(c->cgroup, &t);
 288         free_and_replace(c->user_unit, t);
 289
 290         (void) cg_path_get_slice(c->cgroup, &t);
 291         free_and_replace(c->slice, t);
 292
 293         (void) cg_path_get_user_slice(c->cgroup, &t);
 294         free_and_replace(c->user_slice, t);
 295
 296         return 0;
 297 }
 298
 299 static int client_context_read_invocation_id(
 300                 Server *s,
 301                 ClientContext *c) {
 302
 303         _cleanup_free_ char *value = NULL;
 304         const char *p;
 305         int r;
 306
 307         assert(s);
 308         assert(c);
 309
 310         /* Read the invocation ID of a unit off a unit. PID 1 stores it in a per-unit symlink in /run/systemd/units/ */
 311
 312         if (!c->unit)
 313                 return 0;
 314
 315         p = strjoina("/run/systemd/units/invocation:", c->unit);
 316         r = readlink_malloc(p, &value);
 317         if (r < 0)
 318                 return r;
 319
 320         return sd_id128_from_string(value, &c->invocation_id);
 321 }
 322
 323 static int client_context_read_log_level_max(
 324                 Server *s,
 325                 ClientContext *c) {
 326
 327         _cleanup_free_ char *value = NULL;
 328         const char *p;
 329         int r, ll;
 330
 331         if (!c->unit)
 332                 return 0;
 333
 334         p = strjoina("/run/systemd/units/log-level-max:", c->unit);
 335         r = readlink_malloc(p, &value);
 336         if (r < 0)
 337                 return r;
 338
 339         ll = log_level_from_string(value);
 340         if (ll < 0)
 341                 return -EINVAL;
 342
 343         c->log_level_max = ll;
 344         return 0;
 345 }
 346
 347 static int client_context_read_extra_fields(
 348                 Server *s,
 349                 ClientContext *c) {
 350
 351         size_t size = 0, n_iovec = 0, n_allocated = 0, left;
 352         _cleanup_free_ struct iovec *iovec = NULL;
 353         _cleanup_free_ void *data = NULL;
 354         _cleanup_fclose_ FILE *f = NULL;
 355         struct stat st;
 356         const char *p;
 357         uint8_t *q;
 358         int r;
 359
 360         if (!c->unit)
 361                 return 0;
 362
 363         p = strjoina("/run/systemd/units/log-extra-fields:", c->unit);
 364
 365         if (c->extra_fields_mtime != NSEC_INFINITY) {
 366                 if (stat(p, &st) < 0) {
 367                         if (errno == ENOENT)
 368                                 return 0;
 369
 370                         return -errno;
 371                 }
 372
 373                 if (timespec_load_nsec(&st.st_mtim) == c->extra_fields_mtime)
 374                         return 0;
 375         }
 376
 377         f = fopen(p, "re");
 378         if (!f) {
 379                 if (errno == ENOENT)
 380                         return 0;
 381
 382                 return -errno;
 383         }
 384
 385         if (fstat(fileno(f), &st) < 0) /* The file might have been replaced since the stat() above, let's get a new
 386                                         * one, that matches the stuff we are reading */
 387                 return -errno;
 388
 389         r = read_full_stream(f, (char**) &data, &size);
 390         if (r < 0)
 391                 return r;
 392
 393         q = data, left = size;
 394         while (left > 0) {
 395                 uint8_t *field, *eq;
 396                 uint64_t v, n;
 397
 398                 if (left < sizeof(uint64_t))
 399                         return -EBADMSG;
 400
 401                 v = unaligned_read_le64(q);
 402                 if (v < 2)
 403                         return -EBADMSG;
 404
 405                 n = sizeof(uint64_t) + v;
 406                 if (left < n)
 407                         return -EBADMSG;
 408
 409                 field = q + sizeof(uint64_t);
 410
 411                 eq = memchr(field, '=', v);
 412                 if (!eq)
 413                         return -EBADMSG;
 414
 415                 if (!journal_field_valid((const char *) field, eq - field, false))
 416                         return -EBADMSG;
 417
 418                 if (!GREEDY_REALLOC(iovec, n_allocated, n_iovec+1))
 419                         return -ENOMEM;
 420
 421                 iovec[n_iovec++] = IOVEC_MAKE(field, v);
 422
 423                 left -= n, q += n;
 424         }
 425
 426         free(c->extra_fields_iovec);
 427         free(c->extra_fields_data);
 428
 429         c->extra_fields_iovec = TAKE_PTR(iovec);
 430         c->extra_fields_n_iovec = n_iovec;
 431         c->extra_fields_data = TAKE_PTR(data);
 432         c->extra_fields_mtime = timespec_load_nsec(&st.st_mtim);
 433
 434         return 0;
 435 }
 436
 437 static void client_context_really_refresh(
 438                 Server *s,
 439                 ClientContext *c,
 440                 const struct ucred *ucred,
 441                 const char *label, size_t label_size,
 442                 const char *unit_id,
 443                 usec_t timestamp) {
 444
 445         assert(s);
 446         assert(c);
 447         assert(pid_is_valid(c->pid));
 448
 449         if (timestamp == USEC_INFINITY)
 450                 timestamp = now(CLOCK_MONOTONIC);
 451
 452         client_context_read_uid_gid(c, ucred);
 453         client_context_read_basic(c);
 454         (void) client_context_read_label(c, label, label_size);
 455
 456         (void) audit_session_from_pid(c->pid, &c->auditid);
 457         (void) audit_loginuid_from_pid(c->pid, &c->loginuid);
 458
 459         (void) client_context_read_cgroup(s, c, unit_id);
 460         (void) client_context_read_invocation_id(s, c);
 461         (void) client_context_read_log_level_max(s, c);
 462         (void) client_context_read_extra_fields(s, c);
 463
 464         c->timestamp = timestamp;
 465
 466         if (c->in_lru) {
 467                 assert(c->n_ref == 0);
 468                 assert_se(prioq_reshuffle(s->client_contexts_lru, c, &c->lru_index) >= 0);
 469         }
 470 }
 471
 472 void client_context_maybe_refresh(
 473                 Server *s,
 474                 ClientContext *c,
 475                 const struct ucred *ucred,
 476                 const char *label, size_t label_size,
 477                 const char *unit_id,
 478                 usec_t timestamp) {
 479
 480         assert(s);
 481         assert(c);
 482
 483         if (timestamp == USEC_INFINITY)
 484                 timestamp = now(CLOCK_MONOTONIC);
 485
 486         /* No cached data so far? Let's fill it up */
 487         if (c->timestamp == USEC_INFINITY)
 488                 goto refresh;
 489
 490         /* If the data isn't pinned and if the cashed data is older than the upper limit, we flush it out
 491          * entirely. This follows the logic that as long as an entry is pinned the PID reuse is unlikely. */
 492         if (c->n_ref == 0 && c->timestamp + MAX_USEC < timestamp) {
 493                 client_context_reset(c);
 494                 goto refresh;
 495         }
 496
 497         /* If the data is older than the lower limit, we refresh, but keep the old data for all we can't update */
 498         if (c->timestamp + REFRESH_USEC < timestamp)
 499                 goto refresh;
 500
 501         /* If the data passed along doesn't match the cached data we also do a refresh */
 502         if (ucred && uid_is_valid(ucred->uid) && c->uid != ucred->uid)
 503                 goto refresh;
 504
 505         if (ucred && gid_is_valid(ucred->gid) && c->gid != ucred->gid)
 506                 goto refresh;
 507
 508         if (label_size > 0 && (label_size != c->label_size || memcmp(label, c->label, label_size) != 0))
 509                 goto refresh;
 510
 511         return;
 512
 513 refresh:
 514         client_context_really_refresh(s, c, ucred, label, label_size, unit_id, timestamp);
 515 }
 516
 517 static void client_context_try_shrink_to(Server *s, size_t limit) {
 518         assert(s);
 519
 520         /* Bring the number of cache entries below the indicated limit, so that we can create a new entry without
 521          * breaching the limit. Note that we only flush out entries that aren't pinned here. This means the number of
 522          * cache entries may very well grow beyond the limit, if all entries stored remain pinned. */
 523
 524         while (hashmap_size(s->client_contexts) > limit) {
 525                 ClientContext *c;
 526
 527                 c = prioq_pop(s->client_contexts_lru);
 528                 if (!c)
 529                         break; /* All remaining entries are pinned, give up */
 530
 531                 assert(c->in_lru);
 532                 assert(c->n_ref == 0);
 533
 534                 c->in_lru = false;
 535
 536                 client_context_free(s, c);
 537         }
 538 }
 539
 540 void client_context_flush_all(Server *s) {
 541         assert(s);
 542
 543         /* Flush out all remaining entries. This assumes all references are already dropped. */
 544
 545         s->my_context = client_context_release(s, s->my_context);
 546         s->pid1_context = client_context_release(s, s->pid1_context);
 547
 548         client_context_try_shrink_to(s, 0);
 549
 550         assert(prioq_size(s->client_contexts_lru) == 0);
 551         assert(hashmap_size(s->client_contexts) == 0);
 552
 553         s->client_contexts_lru = prioq_free(s->client_contexts_lru);
 554         s->client_contexts = hashmap_free(s->client_contexts);
 555 }
 556
 557 static int client_context_get_internal(
 558                 Server *s,
 559                 pid_t pid,
 560                 const struct ucred *ucred,
 561                 const char *label, size_t label_len,
 562                 const char *unit_id,
 563                 bool add_ref,
 564                 ClientContext **ret) {
 565
 566         ClientContext *c;
 567         int r;
 568
 569         assert(s);
 570         assert(ret);
 571
 572         if (!pid_is_valid(pid))
 573                 return -EINVAL;
 574
 575         c = hashmap_get(s->client_contexts, PID_TO_PTR(pid));
 576         if (c) {
 577
 578                 if (add_ref) {
 579                         if (c->in_lru) {
 580                                 /* The entry wasn't pinned so far, let's remove it from the LRU list then */
 581                                 assert(c->n_ref == 0);
 582                                 assert_se(prioq_remove(s->client_contexts_lru, c, &c->lru_index) >= 0);
 583                                 c->in_lru = false;
 584                         }
 585
 586                         c->n_ref++;
 587                 }
 588
 589                 client_context_maybe_refresh(s, c, ucred, label, label_len, unit_id, USEC_INFINITY);
 590
 591                 *ret = c;
 592                 return 0;
 593         }
 594
 595         client_context_try_shrink_to(s, CACHE_MAX-1);
 596
 597         r = client_context_new(s, pid, &c);
 598         if (r < 0)
 599                 return r;
 600
 601         if (add_ref)
 602                 c->n_ref++;
 603         else {
 604                 r = prioq_put(s->client_contexts_lru, c, &c->lru_index);
 605                 if (r < 0) {
 606                         client_context_free(s, c);
 607                         return r;
 608                 }
 609
 610                 c->in_lru = true;
 611         }
 612
 613         client_context_really_refresh(s, c, ucred, label, label_len, unit_id, USEC_INFINITY);
 614
 615         *ret = c;
 616         return 0;
 617 }
 618
 619 int client_context_get(
 620                 Server *s,
 621                 pid_t pid,
 622                 const struct ucred *ucred,
 623                 const char *label, size_t label_len,
 624                 const char *unit_id,
 625                 ClientContext **ret) {
 626
 627         return client_context_get_internal(s, pid, ucred, label, label_len, unit_id, false, ret);
 628 }
 629
 630 int client_context_acquire(
 631                 Server *s,
 632                 pid_t pid,
 633                 const struct ucred *ucred,
 634                 const char *label, size_t label_len,
 635                 const char *unit_id,
 636                 ClientContext **ret) {
 637
 638         return client_context_get_internal(s, pid, ucred, label, label_len, unit_id, true, ret);
 639 };
 640
 641 ClientContext *client_context_release(Server *s, ClientContext *c) {
 642         assert(s);
 643
 644         if (!c)
 645                 return NULL;
 646
 647         assert(c->n_ref > 0);
 648         assert(!c->in_lru);
 649
 650         c->n_ref--;
 651         if (c->n_ref > 0)
 652                 return NULL;
 653
 654         /* The entry is not pinned anymore, let's add it to the LRU prioq if we can. If we can't we'll drop it
 655          * right-away */
 656
 657         if (prioq_put(s->client_contexts_lru, c, &c->lru_index) < 0)
 658                 client_context_free(s, c);
 659         else
 660                 c->in_lru = true;
 661
 662         return NULL;
 663 }
 664
 665 void client_context_acquire_default(Server *s) {
 666         int r;
 667
 668         assert(s);
 669
 670         /* Ensure that our own and PID1's contexts are always pinned. Our own context is particularly useful to
 671          * generate driver messages. */
 672
 673         if (!s->my_context) {
 674                 struct ucred ucred = {
 675                         .pid = getpid_cached(),
 676                         .uid = getuid(),
 677                         .gid = getgid(),
 678                 };
 679
 680                 r = client_context_acquire(s, ucred.pid, &ucred, NULL, 0, NULL, &s->my_context);
 681                 if (r < 0)
 682                         log_warning_errno(r, "Failed to acquire our own context, ignoring: %m");
 683         }
 684
 685         if (!s->pid1_context) {
 686
 687                 r = client_context_acquire(s, 1, NULL, NULL, 0, NULL, &s->pid1_context);
 688                 if (r < 0)
 689                         log_warning_errno(r, "Failed to acquire PID1's context, ignoring: %m");
 690
 691         }
 692 }