src/journal/journald-context.c

   1 /* SPDX-License-Identifier: LGPL-2.1+ */
   2
   3 #if HAVE_SELINUX
   4 #include <selinux/selinux.h>
   5 #endif
   6
   7 #include "alloc-util.h"
   8 #include "audit-util.h"
   9 #include "cgroup-util.h"
  10 #include "fd-util.h"
  11 #include "fileio.h"
  12 #include "fs-util.h"
  13 #include "io-util.h"
  14 #include "journal-util.h"
  15 #include "journald-context.h"
  16 #include "process-util.h"
  17 #include "string-util.h"
  18 #include "syslog-util.h"
  19 #include "unaligned.h"
  20 #include "user-util.h"
  21
  22 /* This implements a metadata cache for clients, which are identified by their PID. Requesting metadata through /proc
  23  * is expensive, hence let's cache the data if we can. Note that this means the metadata might be out-of-date when we
   24  * store it, but it might already be anyway, as we request the data asynchronously from /proc at a different time than
   25  * the log entry was originally created. We hence just increase the "window of inaccuracy" a bit.
  26  *
  27  * The cache is indexed by the PID. Entries may be "pinned" in the cache, in which case the entries are not removed
  28  * until they are unpinned. Unpinned entries are kept around until cache pressure is seen. Cache entries older than 5s
   29  * are never used (a sad attempt to deal with the UNIX weakness of PID reuse), cache entries older than 1s are
  30  * refreshed in an incremental way (meaning: data is reread from /proc, but any old data we can't refresh is not
  31  * flushed out). Data newer than 1s is used immediately without refresh.
  32  *
  33  * Log stream clients (i.e. all clients using the AF_UNIX/SOCK_STREAM stdout/stderr transport) will pin a cache entry
  34  * as long as their socket is connected. Note that cache entries are shared between different transports. That means a
  35  * cache entry pinned for the stream connection logic may be reused for the syslog or native protocols.
  36  *
  37  * Caching metadata like this has two major benefits:
  38  *
  39  * 1. Reading metadata is expensive, and we can thus substantially speed up log processing under flood.
  40  *
  41  * 2. Because metadata caching is shared between stream and datagram transports and stream connections pin a cache
  42  *    entry there's a good chance we can properly map a substantial set of datagram log messages to their originating
  43  *    service, as all services (unless explicitly configured otherwise) will have their stdout/stderr connected to a
  44  *    stream connection. This should improve cases where a service process logs immediately before exiting and we
  45  *    previously had trouble associating the log message with the service.
  46  *
  47  * NB: With and without the metadata cache: the implicitly added entry metadata in the journal (with the exception of
   48  *     UID/PID/GID and SELinux label) must be understood as possibly slightly out of sync (i.e. sometimes slightly older
  49  *     and sometimes slightly newer than what was current at the log event).
  50  */
  51
  52 /* We refresh every 1s */
  53 #define REFRESH_USEC (1*USEC_PER_SEC)
  54
  55 /* Data older than 5s we flush out */
  56 #define MAX_USEC (5*USEC_PER_SEC)
  57
  58 /* Keep at most 16K entries in the cache. (Note though that this limit may be violated if enough streams pin entries in
  59  * the cache, in which case we *do* permit this limit to be breached. That's safe however, as the number of stream
  60  * clients itself is limited.) */
  61 #define CACHE_MAX (16*1024)
  62
  63 static int client_context_compare(const void *a, const void *b) {
  64         const ClientContext *x = a, *y = b;
  65         int r;
  66
  67         r = CMP(x->timestamp, y->timestamp);
  68         if (r != 0)
  69                 return r;
  70
  71         return CMP(x->pid, y->pid);
  72 }
  73
  74 static int client_context_new(Server *s, pid_t pid, ClientContext **ret) {
  75         ClientContext *c;
  76         int r;
  77
  78         assert(s);
  79         assert(pid_is_valid(pid));
  80         assert(ret);
  81
  82         r = hashmap_ensure_allocated(&s->client_contexts, NULL);
  83         if (r < 0)
  84                 return r;
  85
  86         r = prioq_ensure_allocated(&s->client_contexts_lru, client_context_compare);
  87         if (r < 0)
  88                 return r;
  89
  90         c = new0(ClientContext, 1);
  91         if (!c)
  92                 return -ENOMEM;
  93
  94         c->pid = pid;
  95
  96         c->uid = UID_INVALID;
  97         c->gid = GID_INVALID;
  98         c->auditid = AUDIT_SESSION_INVALID;
  99         c->loginuid = UID_INVALID;
 100         c->owner_uid = UID_INVALID;
 101         c->lru_index = PRIOQ_IDX_NULL;
 102         c->timestamp = USEC_INFINITY;
 103         c->extra_fields_mtime = NSEC_INFINITY;
 104         c->log_level_max = -1;
 105
 106         r = hashmap_put(s->client_contexts, PID_TO_PTR(pid), c);
 107         if (r < 0) {
 108                 free(c);
 109                 return r;
 110         }
 111
 112         *ret = c;
 113         return 0;
 114 }
 115
 116 static void client_context_reset(ClientContext *c) {
 117         assert(c);
 118
 119         c->timestamp = USEC_INFINITY;
 120
 121         c->uid = UID_INVALID;
 122         c->gid = GID_INVALID;
 123
 124         c->comm = mfree(c->comm);
 125         c->exe = mfree(c->exe);
 126         c->cmdline = mfree(c->cmdline);
 127         c->capeff = mfree(c->capeff);
 128
 129         c->auditid = AUDIT_SESSION_INVALID;
 130         c->loginuid = UID_INVALID;
 131
 132         c->cgroup = mfree(c->cgroup);
 133         c->session = mfree(c->session);
 134         c->owner_uid = UID_INVALID;
 135         c->unit = mfree(c->unit);
 136         c->user_unit = mfree(c->user_unit);
 137         c->slice = mfree(c->slice);
 138         c->user_slice = mfree(c->user_slice);
 139
 140         c->invocation_id = SD_ID128_NULL;
 141
 142         c->label = mfree(c->label);
 143         c->label_size = 0;
 144
 145         c->extra_fields_iovec = mfree(c->extra_fields_iovec);
 146         c->extra_fields_n_iovec = 0;
 147         c->extra_fields_data = mfree(c->extra_fields_data);
 148         c->extra_fields_mtime = NSEC_INFINITY;
 149
 150         c->log_level_max = -1;
 151 }
 152
 153 static ClientContext* client_context_free(Server *s, ClientContext *c) {
 154         assert(s);
 155
 156         if (!c)
 157                 return NULL;
 158
 159         assert_se(hashmap_remove(s->client_contexts, PID_TO_PTR(c->pid)) == c);
 160
 161         if (c->in_lru)
 162                 assert_se(prioq_remove(s->client_contexts_lru, c, &c->lru_index) >= 0);
 163
 164         client_context_reset(c);
 165
 166         return mfree(c);
 167 }
 168
 169 static void client_context_read_uid_gid(ClientContext *c, const struct ucred *ucred) {
 170         assert(c);
 171         assert(pid_is_valid(c->pid));
 172
 173         /* The ucred data passed in is always the most current and accurate, if we have any. Use it. */
 174         if (ucred && uid_is_valid(ucred->uid))
 175                 c->uid = ucred->uid;
 176         else
 177                 (void) get_process_uid(c->pid, &c->uid);
 178
 179         if (ucred && gid_is_valid(ucred->gid))
 180                 c->gid = ucred->gid;
 181         else
 182                 (void) get_process_gid(c->pid, &c->gid);
 183 }
 184
 185 static void client_context_read_basic(ClientContext *c) {
 186         char *t;
 187
 188         assert(c);
 189         assert(pid_is_valid(c->pid));
 190
 191         if (get_process_comm(c->pid, &t) >= 0)
 192                 free_and_replace(c->comm, t);
 193
 194         if (get_process_exe(c->pid, &t) >= 0)
 195                 free_and_replace(c->exe, t);
 196
 197         if (get_process_cmdline(c->pid, 0, false, &t) >= 0)
 198                 free_and_replace(c->cmdline, t);
 199
 200         if (get_process_capeff(c->pid, &t) >= 0)
 201                 free_and_replace(c->capeff, t);
 202 }
 203
 204 static int client_context_read_label(
 205                 ClientContext *c,
 206                 const char *label, size_t label_size) {
 207
 208         assert(c);
 209         assert(pid_is_valid(c->pid));
 210         assert(label_size == 0 || label);
 211
 212         if (label_size > 0) {
 213                 char *l;
 214
 215                 /* If we got an SELinux label passed in it counts. */
 216
 217                 l = newdup_suffix0(char, label, label_size);
 218                 if (!l)
 219                         return -ENOMEM;
 220
 221                 free_and_replace(c->label, l);
 222                 c->label_size = label_size;
 223         }
 224 #if HAVE_SELINUX
 225         else {
 226                 char *con;
 227
 228                 /* If we got no SELinux label passed in, let's try to acquire one */
 229
 230                 if (getpidcon(c->pid, &con) >= 0) {
 231                         free_and_replace(c->label, con);
 232                         c->label_size = strlen(c->label);
 233                 }
 234         }
 235 #endif
 236
 237         return 0;
 238 }
 239
 240 static int client_context_read_cgroup(Server *s, ClientContext *c, const char *unit_id) {
 241         char *t = NULL;
 242         int r;
 243
 244         assert(c);
 245
 246         /* Try to acquire the current cgroup path */
 247         r = cg_pid_get_path_shifted(c->pid, s->cgroup_root, &t);
 248         if (r < 0) {
 249
 250                 /* If that didn't work, we use the unit ID passed in as fallback, if we have nothing cached yet */
 251                 if (unit_id && !c->unit) {
 252                         c->unit = strdup(unit_id);
 253                         if (c->unit)
 254                                 return 0;
 255                 }
 256
 257                 return r;
 258         }
 259
 260         /* Let's shortcut this if the cgroup path didn't change */
 261         if (streq_ptr(c->cgroup, t)) {
 262                 free(t);
 263                 return 0;
 264         }
 265
 266         free_and_replace(c->cgroup, t);
 267
 268         (void) cg_path_get_session(c->cgroup, &t);
 269         free_and_replace(c->session, t);
 270
 271         if (cg_path_get_owner_uid(c->cgroup, &c->owner_uid) < 0)
 272                 c->owner_uid = UID_INVALID;
 273
 274         (void) cg_path_get_unit(c->cgroup, &t);
 275         free_and_replace(c->unit, t);
 276
 277         (void) cg_path_get_user_unit(c->cgroup, &t);
 278         free_and_replace(c->user_unit, t);
 279
 280         (void) cg_path_get_slice(c->cgroup, &t);
 281         free_and_replace(c->slice, t);
 282
 283         (void) cg_path_get_user_slice(c->cgroup, &t);
 284         free_and_replace(c->user_slice, t);
 285
 286         return 0;
 287 }
 288
 289 static int client_context_read_invocation_id(
 290                 Server *s,
 291                 ClientContext *c) {
 292
 293         _cleanup_free_ char *value = NULL;
 294         const char *p;
 295         int r;
 296
 297         assert(s);
 298         assert(c);
 299
 300         /* Read the invocation ID of a unit off a unit. PID 1 stores it in a per-unit symlink in /run/systemd/units/ */
 301
 302         if (!c->unit)
 303                 return 0;
 304
 305         p = strjoina("/run/systemd/units/invocation:", c->unit);
 306         r = readlink_malloc(p, &value);
 307         if (r < 0)
 308                 return r;
 309
 310         return sd_id128_from_string(value, &c->invocation_id);
 311 }
 312
 313 static int client_context_read_log_level_max(
 314                 Server *s,
 315                 ClientContext *c) {
 316
 317         _cleanup_free_ char *value = NULL;
 318         const char *p;
 319         int r, ll;
 320
 321         if (!c->unit)
 322                 return 0;
 323
 324         p = strjoina("/run/systemd/units/log-level-max:", c->unit);
 325         r = readlink_malloc(p, &value);
 326         if (r < 0)
 327                 return r;
 328
 329         ll = log_level_from_string(value);
 330         if (ll < 0)
 331                 return -EINVAL;
 332
 333         c->log_level_max = ll;
 334         return 0;
 335 }
 336
 337 static int client_context_read_extra_fields(
 338                 Server *s,
 339                 ClientContext *c) {
 340
 341         size_t size = 0, n_iovec = 0, n_allocated = 0, left;
 342         _cleanup_free_ struct iovec *iovec = NULL;
 343         _cleanup_free_ void *data = NULL;
 344         _cleanup_fclose_ FILE *f = NULL;
 345         struct stat st;
 346         const char *p;
 347         uint8_t *q;
 348         int r;
 349
 350         if (!c->unit)
 351                 return 0;
 352
 353         p = strjoina("/run/systemd/units/log-extra-fields:", c->unit);
 354
 355         if (c->extra_fields_mtime != NSEC_INFINITY) {
 356                 if (stat(p, &st) < 0) {
 357                         if (errno == ENOENT)
 358                                 return 0;
 359
 360                         return -errno;
 361                 }
 362
 363                 if (timespec_load_nsec(&st.st_mtim) == c->extra_fields_mtime)
 364                         return 0;
 365         }
 366
 367         f = fopen(p, "re");
 368         if (!f) {
 369                 if (errno == ENOENT)
 370                         return 0;
 371
 372                 return -errno;
 373         }
 374
 375         if (fstat(fileno(f), &st) < 0) /* The file might have been replaced since the stat() above, let's get a new
 376                                         * one, that matches the stuff we are reading */
 377                 return -errno;
 378
 379         r = read_full_stream(f, (char**) &data, &size);
 380         if (r < 0)
 381                 return r;
 382
 383         q = data, left = size;
 384         while (left > 0) {
 385                 uint8_t *field, *eq;
 386                 uint64_t v, n;
 387
 388                 if (left < sizeof(uint64_t))
 389                         return -EBADMSG;
 390
 391                 v = unaligned_read_le64(q);
 392                 if (v < 2)
 393                         return -EBADMSG;
 394
 395                 n = sizeof(uint64_t) + v;
 396                 if (left < n)
 397                         return -EBADMSG;
 398
 399                 field = q + sizeof(uint64_t);
 400
 401                 eq = memchr(field, '=', v);
 402                 if (!eq)
 403                         return -EBADMSG;
 404
 405                 if (!journal_field_valid((const char *) field, eq - field, false))
 406                         return -EBADMSG;
 407
 408                 if (!GREEDY_REALLOC(iovec, n_allocated, n_iovec+1))
 409                         return -ENOMEM;
 410
 411                 iovec[n_iovec++] = IOVEC_MAKE(field, v);
 412
 413                 left -= n, q += n;
 414         }
 415
 416         free(c->extra_fields_iovec);
 417         free(c->extra_fields_data);
 418
 419         c->extra_fields_iovec = TAKE_PTR(iovec);
 420         c->extra_fields_n_iovec = n_iovec;
 421         c->extra_fields_data = TAKE_PTR(data);
 422         c->extra_fields_mtime = timespec_load_nsec(&st.st_mtim);
 423
 424         return 0;
 425 }
 426
 427 static void client_context_really_refresh(
 428                 Server *s,
 429                 ClientContext *c,
 430                 const struct ucred *ucred,
 431                 const char *label, size_t label_size,
 432                 const char *unit_id,
 433                 usec_t timestamp) {
 434
 435         assert(s);
 436         assert(c);
 437         assert(pid_is_valid(c->pid));
 438
 439         if (timestamp == USEC_INFINITY)
 440                 timestamp = now(CLOCK_MONOTONIC);
 441
 442         client_context_read_uid_gid(c, ucred);
 443         client_context_read_basic(c);
 444         (void) client_context_read_label(c, label, label_size);
 445
 446         (void) audit_session_from_pid(c->pid, &c->auditid);
 447         (void) audit_loginuid_from_pid(c->pid, &c->loginuid);
 448
 449         (void) client_context_read_cgroup(s, c, unit_id);
 450         (void) client_context_read_invocation_id(s, c);
 451         (void) client_context_read_log_level_max(s, c);
 452         (void) client_context_read_extra_fields(s, c);
 453
 454         c->timestamp = timestamp;
 455
 456         if (c->in_lru) {
 457                 assert(c->n_ref == 0);
 458                 assert_se(prioq_reshuffle(s->client_contexts_lru, c, &c->lru_index) >= 0);
 459         }
 460 }
 461
 462 void client_context_maybe_refresh(
 463                 Server *s,
 464                 ClientContext *c,
 465                 const struct ucred *ucred,
 466                 const char *label, size_t label_size,
 467                 const char *unit_id,
 468                 usec_t timestamp) {
 469
 470         assert(s);
 471         assert(c);
 472
 473         if (timestamp == USEC_INFINITY)
 474                 timestamp = now(CLOCK_MONOTONIC);
 475
 476         /* No cached data so far? Let's fill it up */
 477         if (c->timestamp == USEC_INFINITY)
 478                 goto refresh;
 479
 480         /* If the data isn't pinned and if the cashed data is older than the upper limit, we flush it out
 481          * entirely. This follows the logic that as long as an entry is pinned the PID reuse is unlikely. */
 482         if (c->n_ref == 0 && c->timestamp + MAX_USEC < timestamp) {
 483                 client_context_reset(c);
 484                 goto refresh;
 485         }
 486
 487         /* If the data is older than the lower limit, we refresh, but keep the old data for all we can't update */
 488         if (c->timestamp + REFRESH_USEC < timestamp)
 489                 goto refresh;
 490
 491         /* If the data passed along doesn't match the cached data we also do a refresh */
 492         if (ucred && uid_is_valid(ucred->uid) && c->uid != ucred->uid)
 493                 goto refresh;
 494
 495         if (ucred && gid_is_valid(ucred->gid) && c->gid != ucred->gid)
 496                 goto refresh;
 497
 498         if (label_size > 0 && (label_size != c->label_size || memcmp(label, c->label, label_size) != 0))
 499                 goto refresh;
 500
 501         return;
 502
 503 refresh:
 504         client_context_really_refresh(s, c, ucred, label, label_size, unit_id, timestamp);
 505 }
 506
 507 static void client_context_try_shrink_to(Server *s, size_t limit) {
 508         assert(s);
 509
 510         /* Bring the number of cache entries below the indicated limit, so that we can create a new entry without
 511          * breaching the limit. Note that we only flush out entries that aren't pinned here. This means the number of
 512          * cache entries may very well grow beyond the limit, if all entries stored remain pinned. */
 513
 514         while (hashmap_size(s->client_contexts) > limit) {
 515                 ClientContext *c;
 516
 517                 c = prioq_pop(s->client_contexts_lru);
 518                 if (!c)
 519                         break; /* All remaining entries are pinned, give up */
 520
 521                 assert(c->in_lru);
 522                 assert(c->n_ref == 0);
 523
 524                 c->in_lru = false;
 525
 526                 client_context_free(s, c);
 527         }
 528 }
 529
 530 void client_context_flush_all(Server *s) {
 531         assert(s);
 532
 533         /* Flush out all remaining entries. This assumes all references are already dropped. */
 534
 535         s->my_context = client_context_release(s, s->my_context);
 536         s->pid1_context = client_context_release(s, s->pid1_context);
 537
 538         client_context_try_shrink_to(s, 0);
 539
 540         assert(prioq_size(s->client_contexts_lru) == 0);
 541         assert(hashmap_size(s->client_contexts) == 0);
 542
 543         s->client_contexts_lru = prioq_free(s->client_contexts_lru);
 544         s->client_contexts = hashmap_free(s->client_contexts);
 545 }
 546
 547 static int client_context_get_internal(
 548                 Server *s,
 549                 pid_t pid,
 550                 const struct ucred *ucred,
 551                 const char *label, size_t label_len,
 552                 const char *unit_id,
 553                 bool add_ref,
 554                 ClientContext **ret) {
 555
 556         ClientContext *c;
 557         int r;
 558
 559         assert(s);
 560         assert(ret);
 561
 562         if (!pid_is_valid(pid))
 563                 return -EINVAL;
 564
 565         c = hashmap_get(s->client_contexts, PID_TO_PTR(pid));
 566         if (c) {
 567
 568                 if (add_ref) {
 569                         if (c->in_lru) {
 570                                 /* The entry wasn't pinned so far, let's remove it from the LRU list then */
 571                                 assert(c->n_ref == 0);
 572                                 assert_se(prioq_remove(s->client_contexts_lru, c, &c->lru_index) >= 0);
 573                                 c->in_lru = false;
 574                         }
 575
 576                         c->n_ref++;
 577                 }
 578
 579                 client_context_maybe_refresh(s, c, ucred, label, label_len, unit_id, USEC_INFINITY);
 580
 581                 *ret = c;
 582                 return 0;
 583         }
 584
 585         client_context_try_shrink_to(s, CACHE_MAX-1);
 586
 587         r = client_context_new(s, pid, &c);
 588         if (r < 0)
 589                 return r;
 590
 591         if (add_ref)
 592                 c->n_ref++;
 593         else {
 594                 r = prioq_put(s->client_contexts_lru, c, &c->lru_index);
 595                 if (r < 0) {
 596                         client_context_free(s, c);
 597                         return r;
 598                 }
 599
 600                 c->in_lru = true;
 601         }
 602
 603         client_context_really_refresh(s, c, ucred, label, label_len, unit_id, USEC_INFINITY);
 604
 605         *ret = c;
 606         return 0;
 607 }
 608
 609 int client_context_get(
 610                 Server *s,
 611                 pid_t pid,
 612                 const struct ucred *ucred,
 613                 const char *label, size_t label_len,
 614                 const char *unit_id,
 615                 ClientContext **ret) {
 616
 617         return client_context_get_internal(s, pid, ucred, label, label_len, unit_id, false, ret);
 618 }
 619
 620 int client_context_acquire(
 621                 Server *s,
 622                 pid_t pid,
 623                 const struct ucred *ucred,
 624                 const char *label, size_t label_len,
 625                 const char *unit_id,
 626                 ClientContext **ret) {
 627
 628         return client_context_get_internal(s, pid, ucred, label, label_len, unit_id, true, ret);
 629 };
 630
 631 ClientContext *client_context_release(Server *s, ClientContext *c) {
 632         assert(s);
 633
 634         if (!c)
 635                 return NULL;
 636
 637         assert(c->n_ref > 0);
 638         assert(!c->in_lru);
 639
 640         c->n_ref--;
 641         if (c->n_ref > 0)
 642                 return NULL;
 643
 644         /* The entry is not pinned anymore, let's add it to the LRU prioq if we can. If we can't we'll drop it
 645          * right-away */
 646
 647         if (prioq_put(s->client_contexts_lru, c, &c->lru_index) < 0)
 648                 client_context_free(s, c);
 649         else
 650                 c->in_lru = true;
 651
 652         return NULL;
 653 }
 654
 655 void client_context_acquire_default(Server *s) {
 656         int r;
 657
 658         assert(s);
 659
 660         /* Ensure that our own and PID1's contexts are always pinned. Our own context is particularly useful to
 661          * generate driver messages. */
 662
 663         if (!s->my_context) {
 664                 struct ucred ucred = {
 665                         .pid = getpid_cached(),
 666                         .uid = getuid(),
 667                         .gid = getgid(),
 668                 };
 669
 670                 r = client_context_acquire(s, ucred.pid, &ucred, NULL, 0, NULL, &s->my_context);
 671                 if (r < 0)
 672                         log_warning_errno(r, "Failed to acquire our own context, ignoring: %m");
 673         }
 674
 675         if (!s->pid1_context) {
 676
 677                 r = client_context_acquire(s, 1, NULL, NULL, 0, NULL, &s->pid1_context);
 678                 if (r < 0)
 679                         log_warning_errno(r, "Failed to acquire PID1's context, ignoring: %m");
 680
 681         }
 682 }