src/journal/journald-context.c

   1 /* SPDX-License-Identifier: LGPL-2.1+ */
   2 /***
   3   Copyright 2017 Lennart Poettering
   4 ***/
   5
   6 #if HAVE_SELINUX
   7 #include <selinux/selinux.h>
   8 #endif
   9
  10 #include "alloc-util.h"
  11 #include "audit-util.h"
  12 #include "cgroup-util.h"
  13 #include "fd-util.h"
  14 #include "fileio.h"
  15 #include "fs-util.h"
  16 #include "io-util.h"
  17 #include "journal-util.h"
  18 #include "journald-context.h"
  19 #include "process-util.h"
  20 #include "string-util.h"
  21 #include "syslog-util.h"
  22 #include "unaligned.h"
  23 #include "user-util.h"
  24
  25 /* This implements a metadata cache for clients, which are identified by their PID. Requesting metadata through /proc
  26  * is expensive, hence let's cache the data if we can. Note that this means the metadata might be out-of-date when we
  27  * store it, but it might already be anyway, as we request the data asynchronously from /proc at a different time than
  28  * the log entry was originally created. We hence just increase the "window of inaccuracy" a bit.
  29  *
  30  * The cache is indexed by the PID. Entries may be "pinned" in the cache, in which case the entries are not removed
  31  * until they are unpinned. Unpinned entries are kept around until cache pressure is seen. Cache entries older than 5s
  32  * are never used (a sad attempt to deal with the UNIX weakness of PID reuse), cache entries older than 1s are
  33  * refreshed in an incremental way (meaning: data is reread from /proc, but any old data we can't refresh is not
  34  * flushed out). Data newer than 1s is used immediately without refresh.
  35  *
  36  * Log stream clients (i.e. all clients using the AF_UNIX/SOCK_STREAM stdout/stderr transport) will pin a cache entry
  37  * as long as their socket is connected. Note that cache entries are shared between different transports. That means a
  38  * cache entry pinned for the stream connection logic may be reused for the syslog or native protocols.
  39  *
  40  * Caching metadata like this has two major benefits:
  41  *
  42  * 1. Reading metadata is expensive, and we can thus substantially speed up log processing under flood.
  43  *
  44  * 2. Because metadata caching is shared between stream and datagram transports and stream connections pin a cache
  45  *    entry there's a good chance we can properly map a substantial set of datagram log messages to their originating
  46  *    service, as all services (unless explicitly configured otherwise) will have their stdout/stderr connected to a
  47  *    stream connection. This should improve cases where a service process logs immediately before exiting and we
  48  *    previously had trouble associating the log message with the service.
  49  *
  50  * NB: With and without the metadata cache: the implicitly added entry metadata in the journal (with the exception of
  51  *     UID/PID/GID and SELinux label) must be understood as possibly slightly out of sync (i.e. sometimes slightly older
  52  *     and sometimes slightly newer than what was current at the log event).
  53  */
  54
  /* Cached data older than this (1s) is refreshed before it is handed out again. */
  #define REFRESH_USEC (1 * USEC_PER_SEC)

  /* Cached data of unpinned entries older than this (5s) is thrown away entirely and rebuilt. */
  #define MAX_USEC (5 * USEC_PER_SEC)

  /* Upper bound on the number of cache entries: 16K. Pinned entries are never evicted, hence the cache may grow
   * beyond this bound if enough streams pin entries. That is acceptable, since the number of stream clients is
   * limited on its own. */
  #define CACHE_MAX (16 * 1024)
  65
  66 static int client_context_compare(const void *a, const void *b) {
  67         const ClientContext *x = a, *y = b;
  68
  69         if (x->timestamp < y->timestamp)
  70                 return -1;
  71         if (x->timestamp > y->timestamp)
  72                 return 1;
  73
  74         if (x->pid < y->pid)
  75                 return -1;
  76         if (x->pid > y->pid)
  77                 return 1;
  78
  79         return 0;
  80 }
  81
  82 static int client_context_new(Server *s, pid_t pid, ClientContext **ret) {
  83         ClientContext *c;
  84         int r;
  85
  86         assert(s);
  87         assert(pid_is_valid(pid));
  88         assert(ret);
  89
  90         r = hashmap_ensure_allocated(&s->client_contexts, NULL);
  91         if (r < 0)
  92                 return r;
  93
  94         r = prioq_ensure_allocated(&s->client_contexts_lru, client_context_compare);
  95         if (r < 0)
  96                 return r;
  97
  98         c = new0(ClientContext, 1);
  99         if (!c)
 100                 return -ENOMEM;
 101
 102         c->pid = pid;
 103
 104         c->uid = UID_INVALID;
 105         c->gid = GID_INVALID;
 106         c->auditid = AUDIT_SESSION_INVALID;
 107         c->loginuid = UID_INVALID;
 108         c->owner_uid = UID_INVALID;
 109         c->lru_index = PRIOQ_IDX_NULL;
 110         c->timestamp = USEC_INFINITY;
 111         c->extra_fields_mtime = NSEC_INFINITY;
 112         c->log_level_max = -1;
 113
 114         r = hashmap_put(s->client_contexts, PID_TO_PTR(pid), c);
 115         if (r < 0) {
 116                 free(c);
 117                 return r;
 118         }
 119
 120         *ret = c;
 121         return 0;
 122 }
 123
 124 static void client_context_reset(ClientContext *c) {
 125         assert(c);
 126
 127         c->timestamp = USEC_INFINITY;
 128
 129         c->uid = UID_INVALID;
 130         c->gid = GID_INVALID;
 131
 132         c->comm = mfree(c->comm);
 133         c->exe = mfree(c->exe);
 134         c->cmdline = mfree(c->cmdline);
 135         c->capeff = mfree(c->capeff);
 136
 137         c->auditid = AUDIT_SESSION_INVALID;
 138         c->loginuid = UID_INVALID;
 139
 140         c->cgroup = mfree(c->cgroup);
 141         c->session = mfree(c->session);
 142         c->owner_uid = UID_INVALID;
 143         c->unit = mfree(c->unit);
 144         c->user_unit = mfree(c->user_unit);
 145         c->slice = mfree(c->slice);
 146         c->user_slice = mfree(c->user_slice);
 147
 148         c->invocation_id = SD_ID128_NULL;
 149
 150         c->label = mfree(c->label);
 151         c->label_size = 0;
 152
 153         c->extra_fields_iovec = mfree(c->extra_fields_iovec);
 154         c->extra_fields_n_iovec = 0;
 155         c->extra_fields_data = mfree(c->extra_fields_data);
 156         c->extra_fields_mtime = NSEC_INFINITY;
 157
 158         c->log_level_max = -1;
 159 }
 160
 161 static ClientContext* client_context_free(Server *s, ClientContext *c) {
 162         assert(s);
 163
 164         if (!c)
 165                 return NULL;
 166
 167         assert_se(hashmap_remove(s->client_contexts, PID_TO_PTR(c->pid)) == c);
 168
 169         if (c->in_lru)
 170                 assert_se(prioq_remove(s->client_contexts_lru, c, &c->lru_index) >= 0);
 171
 172         client_context_reset(c);
 173
 174         return mfree(c);
 175 }
 176
 177 static void client_context_read_uid_gid(ClientContext *c, const struct ucred *ucred) {
 178         assert(c);
 179         assert(pid_is_valid(c->pid));
 180
 181         /* The ucred data passed in is always the most current and accurate, if we have any. Use it. */
 182         if (ucred && uid_is_valid(ucred->uid))
 183                 c->uid = ucred->uid;
 184         else
 185                 (void) get_process_uid(c->pid, &c->uid);
 186
 187         if (ucred && gid_is_valid(ucred->gid))
 188                 c->gid = ucred->gid;
 189         else
 190                 (void) get_process_gid(c->pid, &c->gid);
 191 }
 192
 193 static void client_context_read_basic(ClientContext *c) {
 194         char *t;
 195
 196         assert(c);
 197         assert(pid_is_valid(c->pid));
 198
 199         if (get_process_comm(c->pid, &t) >= 0)
 200                 free_and_replace(c->comm, t);
 201
 202         if (get_process_exe(c->pid, &t) >= 0)
 203                 free_and_replace(c->exe, t);
 204
 205         if (get_process_cmdline(c->pid, 0, false, &t) >= 0)
 206                 free_and_replace(c->cmdline, t);
 207
 208         if (get_process_capeff(c->pid, &t) >= 0)
 209                 free_and_replace(c->capeff, t);
 210 }
 211
 212 static int client_context_read_label(
 213                 ClientContext *c,
 214                 const char *label, size_t label_size) {
 215
 216         assert(c);
 217         assert(pid_is_valid(c->pid));
 218         assert(label_size == 0 || label);
 219
 220         if (label_size > 0) {
 221                 char *l;
 222
 223                 /* If we got an SELinux label passed in it counts. */
 224
 225                 l = newdup_suffix0(char, label, label_size);
 226                 if (!l)
 227                         return -ENOMEM;
 228
 229                 free_and_replace(c->label, l);
 230                 c->label_size = label_size;
 231         }
 232 #if HAVE_SELINUX
 233         else {
 234                 char *con;
 235
 236                 /* If we got no SELinux label passed in, let's try to acquire one */
 237
 238                 if (getpidcon(c->pid, &con) >= 0) {
 239                         free_and_replace(c->label, con);
 240                         c->label_size = strlen(c->label);
 241                 }
 242         }
 243 #endif
 244
 245         return 0;
 246 }
 247
 248 static int client_context_read_cgroup(Server *s, ClientContext *c, const char *unit_id) {
 249         char *t = NULL;
 250         int r;
 251
 252         assert(c);
 253
 254         /* Try to acquire the current cgroup path */
 255         r = cg_pid_get_path_shifted(c->pid, s->cgroup_root, &t);
 256         if (r < 0) {
 257
 258                 /* If that didn't work, we use the unit ID passed in as fallback, if we have nothing cached yet */
 259                 if (unit_id && !c->unit) {
 260                         c->unit = strdup(unit_id);
 261                         if (c->unit)
 262                                 return 0;
 263                 }
 264
 265                 return r;
 266         }
 267
 268         /* Let's shortcut this if the cgroup path didn't change */
 269         if (streq_ptr(c->cgroup, t)) {
 270                 free(t);
 271                 return 0;
 272         }
 273
 274         free_and_replace(c->cgroup, t);
 275
 276         (void) cg_path_get_session(c->cgroup, &t);
 277         free_and_replace(c->session, t);
 278
 279         if (cg_path_get_owner_uid(c->cgroup, &c->owner_uid) < 0)
 280                 c->owner_uid = UID_INVALID;
 281
 282         (void) cg_path_get_unit(c->cgroup, &t);
 283         free_and_replace(c->unit, t);
 284
 285         (void) cg_path_get_user_unit(c->cgroup, &t);
 286         free_and_replace(c->user_unit, t);
 287
 288         (void) cg_path_get_slice(c->cgroup, &t);
 289         free_and_replace(c->slice, t);
 290
 291         (void) cg_path_get_user_slice(c->cgroup, &t);
 292         free_and_replace(c->user_slice, t);
 293
 294         return 0;
 295 }
 296
 297 static int client_context_read_invocation_id(
 298                 Server *s,
 299                 ClientContext *c) {
 300
 301         _cleanup_free_ char *value = NULL;
 302         const char *p;
 303         int r;
 304
 305         assert(s);
 306         assert(c);
 307
 308         /* Read the invocation ID of a unit off a unit. PID 1 stores it in a per-unit symlink in /run/systemd/units/ */
 309
 310         if (!c->unit)
 311                 return 0;
 312
 313         p = strjoina("/run/systemd/units/invocation:", c->unit);
 314         r = readlink_malloc(p, &value);
 315         if (r < 0)
 316                 return r;
 317
 318         return sd_id128_from_string(value, &c->invocation_id);
 319 }
 320
 321 static int client_context_read_log_level_max(
 322                 Server *s,
 323                 ClientContext *c) {
 324
 325         _cleanup_free_ char *value = NULL;
 326         const char *p;
 327         int r, ll;
 328
 329         if (!c->unit)
 330                 return 0;
 331
 332         p = strjoina("/run/systemd/units/log-level-max:", c->unit);
 333         r = readlink_malloc(p, &value);
 334         if (r < 0)
 335                 return r;
 336
 337         ll = log_level_from_string(value);
 338         if (ll < 0)
 339                 return -EINVAL;
 340
 341         c->log_level_max = ll;
 342         return 0;
 343 }
 344
 345 static int client_context_read_extra_fields(
 346                 Server *s,
 347                 ClientContext *c) {
 348
 349         size_t size = 0, n_iovec = 0, n_allocated = 0, left;
 350         _cleanup_free_ struct iovec *iovec = NULL;
 351         _cleanup_free_ void *data = NULL;
 352         _cleanup_fclose_ FILE *f = NULL;
 353         struct stat st;
 354         const char *p;
 355         uint8_t *q;
 356         int r;
 357
 358         if (!c->unit)
 359                 return 0;
 360
 361         p = strjoina("/run/systemd/units/log-extra-fields:", c->unit);
 362
 363         if (c->extra_fields_mtime != NSEC_INFINITY) {
 364                 if (stat(p, &st) < 0) {
 365                         if (errno == ENOENT)
 366                                 return 0;
 367
 368                         return -errno;
 369                 }
 370
 371                 if (timespec_load_nsec(&st.st_mtim) == c->extra_fields_mtime)
 372                         return 0;
 373         }
 374
 375         f = fopen(p, "re");
 376         if (!f) {
 377                 if (errno == ENOENT)
 378                         return 0;
 379
 380                 return -errno;
 381         }
 382
 383         if (fstat(fileno(f), &st) < 0) /* The file might have been replaced since the stat() above, let's get a new
 384                                         * one, that matches the stuff we are reading */
 385                 return -errno;
 386
 387         r = read_full_stream(f, (char**) &data, &size);
 388         if (r < 0)
 389                 return r;
 390
 391         q = data, left = size;
 392         while (left > 0) {
 393                 uint8_t *field, *eq;
 394                 uint64_t v, n;
 395
 396                 if (left < sizeof(uint64_t))
 397                         return -EBADMSG;
 398
 399                 v = unaligned_read_le64(q);
 400                 if (v < 2)
 401                         return -EBADMSG;
 402
 403                 n = sizeof(uint64_t) + v;
 404                 if (left < n)
 405                         return -EBADMSG;
 406
 407                 field = q + sizeof(uint64_t);
 408
 409                 eq = memchr(field, '=', v);
 410                 if (!eq)
 411                         return -EBADMSG;
 412
 413                 if (!journal_field_valid((const char *) field, eq - field, false))
 414                         return -EBADMSG;
 415
 416                 if (!GREEDY_REALLOC(iovec, n_allocated, n_iovec+1))
 417                         return -ENOMEM;
 418
 419                 iovec[n_iovec++] = IOVEC_MAKE(field, v);
 420
 421                 left -= n, q += n;
 422         }
 423
 424         free(c->extra_fields_iovec);
 425         free(c->extra_fields_data);
 426
 427         c->extra_fields_iovec = TAKE_PTR(iovec);
 428         c->extra_fields_n_iovec = n_iovec;
 429         c->extra_fields_data = TAKE_PTR(data);
 430         c->extra_fields_mtime = timespec_load_nsec(&st.st_mtim);
 431
 432         return 0;
 433 }
 434
 435 static void client_context_really_refresh(
 436                 Server *s,
 437                 ClientContext *c,
 438                 const struct ucred *ucred,
 439                 const char *label, size_t label_size,
 440                 const char *unit_id,
 441                 usec_t timestamp) {
 442
 443         assert(s);
 444         assert(c);
 445         assert(pid_is_valid(c->pid));
 446
 447         if (timestamp == USEC_INFINITY)
 448                 timestamp = now(CLOCK_MONOTONIC);
 449
 450         client_context_read_uid_gid(c, ucred);
 451         client_context_read_basic(c);
 452         (void) client_context_read_label(c, label, label_size);
 453
 454         (void) audit_session_from_pid(c->pid, &c->auditid);
 455         (void) audit_loginuid_from_pid(c->pid, &c->loginuid);
 456
 457         (void) client_context_read_cgroup(s, c, unit_id);
 458         (void) client_context_read_invocation_id(s, c);
 459         (void) client_context_read_log_level_max(s, c);
 460         (void) client_context_read_extra_fields(s, c);
 461
 462         c->timestamp = timestamp;
 463
 464         if (c->in_lru) {
 465                 assert(c->n_ref == 0);
 466                 assert_se(prioq_reshuffle(s->client_contexts_lru, c, &c->lru_index) >= 0);
 467         }
 468 }
 469
 470 void client_context_maybe_refresh(
 471                 Server *s,
 472                 ClientContext *c,
 473                 const struct ucred *ucred,
 474                 const char *label, size_t label_size,
 475                 const char *unit_id,
 476                 usec_t timestamp) {
 477
 478         assert(s);
 479         assert(c);
 480
 481         if (timestamp == USEC_INFINITY)
 482                 timestamp = now(CLOCK_MONOTONIC);
 483
 484         /* No cached data so far? Let's fill it up */
 485         if (c->timestamp == USEC_INFINITY)
 486                 goto refresh;
 487
 488         /* If the data isn't pinned and if the cashed data is older than the upper limit, we flush it out
 489          * entirely. This follows the logic that as long as an entry is pinned the PID reuse is unlikely. */
 490         if (c->n_ref == 0 && c->timestamp + MAX_USEC < timestamp) {
 491                 client_context_reset(c);
 492                 goto refresh;
 493         }
 494
 495         /* If the data is older than the lower limit, we refresh, but keep the old data for all we can't update */
 496         if (c->timestamp + REFRESH_USEC < timestamp)
 497                 goto refresh;
 498
 499         /* If the data passed along doesn't match the cached data we also do a refresh */
 500         if (ucred && uid_is_valid(ucred->uid) && c->uid != ucred->uid)
 501                 goto refresh;
 502
 503         if (ucred && gid_is_valid(ucred->gid) && c->gid != ucred->gid)
 504                 goto refresh;
 505
 506         if (label_size > 0 && (label_size != c->label_size || memcmp(label, c->label, label_size) != 0))
 507                 goto refresh;
 508
 509         return;
 510
 511 refresh:
 512         client_context_really_refresh(s, c, ucred, label, label_size, unit_id, timestamp);
 513 }
 514
 515 static void client_context_try_shrink_to(Server *s, size_t limit) {
 516         assert(s);
 517
 518         /* Bring the number of cache entries below the indicated limit, so that we can create a new entry without
 519          * breaching the limit. Note that we only flush out entries that aren't pinned here. This means the number of
 520          * cache entries may very well grow beyond the limit, if all entries stored remain pinned. */
 521
 522         while (hashmap_size(s->client_contexts) > limit) {
 523                 ClientContext *c;
 524
 525                 c = prioq_pop(s->client_contexts_lru);
 526                 if (!c)
 527                         break; /* All remaining entries are pinned, give up */
 528
 529                 assert(c->in_lru);
 530                 assert(c->n_ref == 0);
 531
 532                 c->in_lru = false;
 533
 534                 client_context_free(s, c);
 535         }
 536 }
 537
 538 void client_context_flush_all(Server *s) {
 539         assert(s);
 540
 541         /* Flush out all remaining entries. This assumes all references are already dropped. */
 542
 543         s->my_context = client_context_release(s, s->my_context);
 544         s->pid1_context = client_context_release(s, s->pid1_context);
 545
 546         client_context_try_shrink_to(s, 0);
 547
 548         assert(prioq_size(s->client_contexts_lru) == 0);
 549         assert(hashmap_size(s->client_contexts) == 0);
 550
 551         s->client_contexts_lru = prioq_free(s->client_contexts_lru);
 552         s->client_contexts = hashmap_free(s->client_contexts);
 553 }
 554
 555 static int client_context_get_internal(
 556                 Server *s,
 557                 pid_t pid,
 558                 const struct ucred *ucred,
 559                 const char *label, size_t label_len,
 560                 const char *unit_id,
 561                 bool add_ref,
 562                 ClientContext **ret) {
 563
 564         ClientContext *c;
 565         int r;
 566
 567         assert(s);
 568         assert(ret);
 569
 570         if (!pid_is_valid(pid))
 571                 return -EINVAL;
 572
 573         c = hashmap_get(s->client_contexts, PID_TO_PTR(pid));
 574         if (c) {
 575
 576                 if (add_ref) {
 577                         if (c->in_lru) {
 578                                 /* The entry wasn't pinned so far, let's remove it from the LRU list then */
 579                                 assert(c->n_ref == 0);
 580                                 assert_se(prioq_remove(s->client_contexts_lru, c, &c->lru_index) >= 0);
 581                                 c->in_lru = false;
 582                         }
 583
 584                         c->n_ref++;
 585                 }
 586
 587                 client_context_maybe_refresh(s, c, ucred, label, label_len, unit_id, USEC_INFINITY);
 588
 589                 *ret = c;
 590                 return 0;
 591         }
 592
 593         client_context_try_shrink_to(s, CACHE_MAX-1);
 594
 595         r = client_context_new(s, pid, &c);
 596         if (r < 0)
 597                 return r;
 598
 599         if (add_ref)
 600                 c->n_ref++;
 601         else {
 602                 r = prioq_put(s->client_contexts_lru, c, &c->lru_index);
 603                 if (r < 0) {
 604                         client_context_free(s, c);
 605                         return r;
 606                 }
 607
 608                 c->in_lru = true;
 609         }
 610
 611         client_context_really_refresh(s, c, ucred, label, label_len, unit_id, USEC_INFINITY);
 612
 613         *ret = c;
 614         return 0;
 615 }
 616
 617 int client_context_get(
 618                 Server *s,
 619                 pid_t pid,
 620                 const struct ucred *ucred,
 621                 const char *label, size_t label_len,
 622                 const char *unit_id,
 623                 ClientContext **ret) {
 624
 625         return client_context_get_internal(s, pid, ucred, label, label_len, unit_id, false, ret);
 626 }
 627
 628 int client_context_acquire(
 629                 Server *s,
 630                 pid_t pid,
 631                 const struct ucred *ucred,
 632                 const char *label, size_t label_len,
 633                 const char *unit_id,
 634                 ClientContext **ret) {
 635
 636         return client_context_get_internal(s, pid, ucred, label, label_len, unit_id, true, ret);
 637 };
 638
 639 ClientContext *client_context_release(Server *s, ClientContext *c) {
 640         assert(s);
 641
 642         if (!c)
 643                 return NULL;
 644
 645         assert(c->n_ref > 0);
 646         assert(!c->in_lru);
 647
 648         c->n_ref--;
 649         if (c->n_ref > 0)
 650                 return NULL;
 651
 652         /* The entry is not pinned anymore, let's add it to the LRU prioq if we can. If we can't we'll drop it
 653          * right-away */
 654
 655         if (prioq_put(s->client_contexts_lru, c, &c->lru_index) < 0)
 656                 client_context_free(s, c);
 657         else
 658                 c->in_lru = true;
 659
 660         return NULL;
 661 }
 662
 663 void client_context_acquire_default(Server *s) {
 664         int r;
 665
 666         assert(s);
 667
 668         /* Ensure that our own and PID1's contexts are always pinned. Our own context is particularly useful to
 669          * generate driver messages. */
 670
 671         if (!s->my_context) {
 672                 struct ucred ucred = {
 673                         .pid = getpid_cached(),
 674                         .uid = getuid(),
 675                         .gid = getgid(),
 676                 };
 677
 678                 r = client_context_acquire(s, ucred.pid, &ucred, NULL, 0, NULL, &s->my_context);
 679                 if (r < 0)
 680                         log_warning_errno(r, "Failed to acquire our own context, ignoring: %m");
 681         }
 682
 683         if (!s->pid1_context) {
 684
 685                 r = client_context_acquire(s, 1, NULL, NULL, 0, NULL, &s->pid1_context);
 686                 if (r < 0)
 687                         log_warning_errno(r, "Failed to acquire PID1's context, ignoring: %m");
 688
 689         }
 690 }