]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/journal/journald-context.c
tree-wide: use TAKE_PTR() and TAKE_FD() macros
[thirdparty/systemd.git] / src / journal / journald-context.c
1 /* SPDX-License-Identifier: LGPL-2.1+ */
2 /***
3 This file is part of systemd.
4
5 Copyright 2017 Lennart Poettering
6
7 systemd is free software; you can redistribute it and/or modify it
8 under the terms of the GNU Lesser General Public License as published by
9 the Free Software Foundation; either version 2.1 of the License, or
10 (at your option) any later version.
11
12 systemd is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 Lesser General Public License for more details.
16
17 You should have received a copy of the GNU Lesser General Public License
18 along with systemd; If not, see <http://www.gnu.org/licenses/>.
19 ***/
20
21 #if HAVE_SELINUX
22 #include <selinux/selinux.h>
23 #endif
24
25 #include "alloc-util.h"
26 #include "audit-util.h"
27 #include "cgroup-util.h"
28 #include "fd-util.h"
29 #include "fileio.h"
30 #include "fs-util.h"
31 #include "io-util.h"
32 #include "journal-util.h"
33 #include "journald-context.h"
34 #include "process-util.h"
35 #include "string-util.h"
36 #include "syslog-util.h"
37 #include "unaligned.h"
38 #include "user-util.h"
39
40 /* This implements a metadata cache for clients, which are identified by their PID. Requesting metadata through /proc
41 * is expensive, hence let's cache the data if we can. Note that this means the metadata might be out-of-date when we
42 * store it, but it might already be anyway, as we request the data asynchronously from /proc at a different time the
43 * log entry was originally created. We hence just increase the "window of inaccuracy" a bit.
44 *
45 * The cache is indexed by the PID. Entries may be "pinned" in the cache, in which case the entries are not removed
46 * until they are unpinned. Unpinned entries are kept around until cache pressure is seen. Cache entries older than 5s
47 * are never used (a sad attempt to deal with the UNIX weakness of PIDs reuse), cache entries older than 1s are
48 * refreshed in an incremental way (meaning: data is reread from /proc, but any old data we can't refresh is not
49 * flushed out). Data newer than 1s is used immediately without refresh.
50 *
51 * Log stream clients (i.e. all clients using the AF_UNIX/SOCK_STREAM stdout/stderr transport) will pin a cache entry
52 * as long as their socket is connected. Note that cache entries are shared between different transports. That means a
53 * cache entry pinned for the stream connection logic may be reused for the syslog or native protocols.
54 *
55 * Caching metadata like this has two major benefits:
56 *
57 * 1. Reading metadata is expensive, and we can thus substantially speed up log processing under flood.
58 *
59 * 2. Because metadata caching is shared between stream and datagram transports and stream connections pin a cache
60 * entry there's a good chance we can properly map a substantial set of datagram log messages to their originating
61 * service, as all services (unless explicitly configured otherwise) will have their stdout/stderr connected to a
62 * stream connection. This should improve cases where a service process logs immediately before exiting and we
63 * previously had trouble associating the log message with the service.
64 *
65 * NB: With and without the metadata cache: the implicitly added entry metadata in the journal (with the exception of
 * UID/PID/GID and SELinux label) must be understood as possibly slightly out of sync (i.e. sometimes slightly older
67 * and sometimes slightly newer than what was current at the log event).
68 */
69
70 /* We refresh every 1s */
71 #define REFRESH_USEC (1*USEC_PER_SEC)
72
73 /* Data older than 5s we flush out */
74 #define MAX_USEC (5*USEC_PER_SEC)
75
76 /* Keep at most 16K entries in the cache. (Note though that this limit may be violated if enough streams pin entries in
77 * the cache, in which case we *do* permit this limit to be breached. That's safe however, as the number of stream
78 * clients itself is limited.) */
79 #define CACHE_MAX (16*1024)
80
81 static int client_context_compare(const void *a, const void *b) {
82 const ClientContext *x = a, *y = b;
83
84 if (x->timestamp < y->timestamp)
85 return -1;
86 if (x->timestamp > y->timestamp)
87 return 1;
88
89 if (x->pid < y->pid)
90 return -1;
91 if (x->pid > y->pid)
92 return 1;
93
94 return 0;
95 }
96
/* Allocates and registers a fresh cache entry for the given PID.
 *
 * The entry is inserted into s->client_contexts (keyed by PID) but NOT into
 * the LRU prioq: the caller decides whether to pin it (bump n_ref) or queue
 * it for eviction. All metadata fields start out "unset", so a refresh is
 * needed before the entry carries useful data.
 *
 * Returns 0 and stores the new entry in *ret, or a negative errno. */
static int client_context_new(Server *s, pid_t pid, ClientContext **ret) {
        ClientContext *c;
        int r;

        assert(s);
        assert(pid_is_valid(pid));
        assert(ret);

        /* Lazily allocate the lookup structures before creating the entry */
        r = hashmap_ensure_allocated(&s->client_contexts, NULL);
        if (r < 0)
                return r;

        r = prioq_ensure_allocated(&s->client_contexts_lru, client_context_compare);
        if (r < 0)
                return r;

        c = new0(ClientContext, 1);
        if (!c)
                return -ENOMEM;

        c->pid = pid;

        /* Fields whose "unset" representation is not all-zeroes need explicit
         * initialization on top of the zeroing new0() already did */
        c->uid = UID_INVALID;
        c->gid = GID_INVALID;
        c->auditid = AUDIT_SESSION_INVALID;
        c->loginuid = UID_INVALID;
        c->owner_uid = UID_INVALID;
        c->lru_index = PRIOQ_IDX_NULL;
        c->timestamp = USEC_INFINITY;
        c->extra_fields_mtime = NSEC_INFINITY;
        c->log_level_max = -1;

        r = hashmap_put(s->client_contexts, PID_TO_PTR(pid), c);
        if (r < 0) {
                /* Nothing else owns c yet, hence a plain free() suffices */
                free(c);
                return r;
        }

        *ret = c;
        return 0;
}
138
/* Wipes all cached metadata from an entry, releasing every owned allocation
 * and returning the fields to the same "unset" state client_context_new()
 * establishes. The PID, LRU index, pin count and in_lru flag are deliberately
 * left alone — this resets the *data*, not the entry's cache bookkeeping. */
static void client_context_reset(ClientContext *c) {
        assert(c);

        c->timestamp = USEC_INFINITY;

        c->uid = UID_INVALID;
        c->gid = GID_INVALID;

        c->comm = mfree(c->comm);
        c->exe = mfree(c->exe);
        c->cmdline = mfree(c->cmdline);
        c->capeff = mfree(c->capeff);

        c->auditid = AUDIT_SESSION_INVALID;
        c->loginuid = UID_INVALID;

        c->cgroup = mfree(c->cgroup);
        c->session = mfree(c->session);
        c->owner_uid = UID_INVALID;
        c->unit = mfree(c->unit);
        c->user_unit = mfree(c->user_unit);
        c->slice = mfree(c->slice);
        c->user_slice = mfree(c->user_slice);

        c->invocation_id = SD_ID128_NULL;

        c->label = mfree(c->label);
        c->label_size = 0;

        /* The iovec array and its backing buffer are owned by the entry and
         * freed together; the count must be zeroed alongside them */
        c->extra_fields_iovec = mfree(c->extra_fields_iovec);
        c->extra_fields_n_iovec = 0;
        c->extra_fields_data = mfree(c->extra_fields_data);
        c->extra_fields_mtime = NSEC_INFINITY;

        c->log_level_max = -1;
}
175
/* Unregisters and destroys a cache entry. Accepts NULL (no-op) and always
 * returns NULL, so callers can write `c = client_context_free(s, c);`. */
static ClientContext* client_context_free(Server *s, ClientContext *c) {
        assert(s);

        if (!c)
                return NULL;

        /* The entry must still be registered under its PID */
        assert_se(hashmap_remove(s->client_contexts, PID_TO_PTR(c->pid)) == c);

        /* If it is queued for LRU eviction, dequeue it first */
        if (c->in_lru)
                assert_se(prioq_remove(s->client_contexts_lru, c, &c->lru_index) >= 0);

        /* Release all strings/buffers hanging off the entry, then the entry itself */
        client_context_reset(c);

        return mfree(c);
}
191
/* Fills in the client's UID/GID, preferring the kernel-verified ucred data
 * (if valid) over a best-effort read from /proc. Failures of the /proc reads
 * are deliberately ignored: the fields then simply stay at their old value. */
static void client_context_read_uid_gid(ClientContext *c, const struct ucred *ucred) {
        assert(c);
        assert(pid_is_valid(c->pid));

        /* The ucred data passed in is always the most current and accurate, if we have any. Use it. */
        if (ucred && uid_is_valid(ucred->uid))
                c->uid = ucred->uid;
        else
                (void) get_process_uid(c->pid, &c->uid);

        if (ucred && gid_is_valid(ucred->gid))
                c->gid = ucred->gid;
        else
                (void) get_process_gid(c->pid, &c->gid);
}
207
/* Best-effort refresh of the basic per-process metadata (comm, exe, cmdline,
 * effective capabilities) from /proc. Each field is only replaced when its
 * read succeeds; on failure the previously cached value survives. */
static void client_context_read_basic(ClientContext *c) {
        char *t;

        assert(c);
        assert(pid_is_valid(c->pid));

        if (get_process_comm(c->pid, &t) >= 0)
                free_and_replace(c->comm, t);

        if (get_process_exe(c->pid, &t) >= 0)
                free_and_replace(c->exe, t);

        if (get_process_cmdline(c->pid, 0, false, &t) >= 0)
                free_and_replace(c->cmdline, t);

        if (get_process_capeff(c->pid, &t) >= 0)
                free_and_replace(c->capeff, t);
}
226
/* Stores the client's security label. A label passed in by the caller (e.g.
 * from SCM_SECURITY) takes precedence; without one, and when built with
 * SELinux support, the label is queried from the kernel via getpidcon().
 * Returns 0 on success (including the "no label available" case), -ENOMEM on
 * allocation failure. */
static int client_context_read_label(
                ClientContext *c,
                const char *label, size_t label_size) {

        assert(c);
        assert(pid_is_valid(c->pid));
        assert(label_size == 0 || label);

        if (label_size > 0) {
                char *l;

                /* If we got an SELinux label passed in it counts. */

                /* The incoming label is not necessarily NUL-terminated; make a
                 * terminated private copy of exactly label_size bytes */
                l = newdup_suffix0(char, label, label_size);
                if (!l)
                        return -ENOMEM;

                free_and_replace(c->label, l);
                c->label_size = label_size;
        }
#if HAVE_SELINUX
        else {
                char *con;

                /* If we got no SELinux label passed in, let's try to acquire one */

                if (getpidcon(c->pid, &con) >= 0) {
                        free_and_replace(c->label, con);
                        c->label_size = strlen(c->label);
                }
        }
#endif

        return 0;
}
262
/* Refreshes the cgroup path of the client and everything derived from it
 * (session, owner UID, system/user unit, system/user slice). If the cgroup
 * can't be determined, the unit_id supplied by the client is used as a
 * fallback for c->unit — but only if nothing is cached yet, since the
 * client-supplied value is unauthenticated. */
static int client_context_read_cgroup(Server *s, ClientContext *c, const char *unit_id) {
        char *t = NULL;
        int r;

        assert(c);

        /* Try to acquire the current cgroup path */
        r = cg_pid_get_path_shifted(c->pid, s->cgroup_root, &t);
        if (r < 0) {

                /* If that didn't work, we use the unit ID passed in as fallback, if we have nothing cached yet */
                if (unit_id && !c->unit) {
                        c->unit = strdup(unit_id);
                        if (c->unit)
                                return 0;
                }

                return r;
        }

        /* Let's shortcut this if the cgroup path didn't change */
        if (streq_ptr(c->cgroup, t)) {
                free(t);
                return 0;
        }

        free_and_replace(c->cgroup, t);

        /* NOTE(review): the pattern below relies on free_and_replace() resetting
         * its source pointer to NULL, so that when a cg_path_get_*() call fails
         * (and leaves t untouched) the following free_and_replace() clears the
         * cached field rather than adopting a stale pointer — confirm the macro
         * semantics in alloc-util.h. */
        (void) cg_path_get_session(c->cgroup, &t);
        free_and_replace(c->session, t);

        if (cg_path_get_owner_uid(c->cgroup, &c->owner_uid) < 0)
                c->owner_uid = UID_INVALID;

        (void) cg_path_get_unit(c->cgroup, &t);
        free_and_replace(c->unit, t);

        (void) cg_path_get_user_unit(c->cgroup, &t);
        free_and_replace(c->user_unit, t);

        (void) cg_path_get_slice(c->cgroup, &t);
        free_and_replace(c->slice, t);

        (void) cg_path_get_user_slice(c->cgroup, &t);
        free_and_replace(c->user_slice, t);

        return 0;
}
311
/* Reads the invocation ID for the client's unit. PID 1 publishes it as the
 * target of a per-unit symlink under /run/systemd/units/; parse that target
 * into c->invocation_id. A client without a known unit is not an error.
 * Returns 0 on success, a negative errno if the symlink can't be read or its
 * target is not a valid 128-bit ID. */
static int client_context_read_invocation_id(
                Server *s,
                ClientContext *c) {

        _cleanup_free_ char *value = NULL;
        const char *p;
        int r;

        assert(s);
        assert(c);

        /* Read the invocation ID of a unit off a unit. PID 1 stores it in a per-unit symlink in /run/systemd/units/ */

        if (!c->unit)
                return 0;

        p = strjoina("/run/systemd/units/invocation:", c->unit);
        r = readlink_malloc(p, &value);
        if (r < 0)
                return r;

        return sd_id128_from_string(value, &c->invocation_id);
}
335
336 static int client_context_read_log_level_max(
337 Server *s,
338 ClientContext *c) {
339
340 _cleanup_free_ char *value = NULL;
341 const char *p;
342 int r, ll;
343
344 if (!c->unit)
345 return 0;
346
347 p = strjoina("/run/systemd/units/log-level-max:", c->unit);
348 r = readlink_malloc(p, &value);
349 if (r < 0)
350 return r;
351
352 ll = log_level_from_string(value);
353 if (ll < 0)
354 return -EINVAL;
355
356 c->log_level_max = ll;
357 return 0;
358 }
359
360 static int client_context_read_extra_fields(
361 Server *s,
362 ClientContext *c) {
363
364 size_t size = 0, n_iovec = 0, n_allocated = 0, left;
365 _cleanup_free_ struct iovec *iovec = NULL;
366 _cleanup_free_ void *data = NULL;
367 _cleanup_fclose_ FILE *f = NULL;
368 struct stat st;
369 const char *p;
370 uint8_t *q;
371 int r;
372
373 if (!c->unit)
374 return 0;
375
376 p = strjoina("/run/systemd/units/log-extra-fields:", c->unit);
377
378 if (c->extra_fields_mtime != NSEC_INFINITY) {
379 if (stat(p, &st) < 0) {
380 if (errno == ENOENT)
381 return 0;
382
383 return -errno;
384 }
385
386 if (timespec_load_nsec(&st.st_mtim) == c->extra_fields_mtime)
387 return 0;
388 }
389
390 f = fopen(p, "re");
391 if (!f) {
392 if (errno == ENOENT)
393 return 0;
394
395 return -errno;
396 }
397
398 if (fstat(fileno(f), &st) < 0) /* The file might have been replaced since the stat() above, let's get a new
399 * one, that matches the stuff we are reading */
400 return -errno;
401
402 r = read_full_stream(f, (char**) &data, &size);
403 if (r < 0)
404 return r;
405
406 q = data, left = size;
407 while (left > 0) {
408 uint8_t *field, *eq;
409 uint64_t v, n;
410
411 if (left < sizeof(uint64_t))
412 return -EBADMSG;
413
414 v = unaligned_read_le64(q);
415 if (v < 2)
416 return -EBADMSG;
417
418 n = sizeof(uint64_t) + v;
419 if (left < n)
420 return -EBADMSG;
421
422 field = q + sizeof(uint64_t);
423
424 eq = memchr(field, '=', v);
425 if (!eq)
426 return -EBADMSG;
427
428 if (!journal_field_valid((const char *) field, eq - field, false))
429 return -EBADMSG;
430
431 if (!GREEDY_REALLOC(iovec, n_allocated, n_iovec+1))
432 return -ENOMEM;
433
434 iovec[n_iovec++] = IOVEC_MAKE(field, v);
435
436 left -= n, q += n;
437 }
438
439 free(c->extra_fields_iovec);
440 free(c->extra_fields_data);
441
442 c->extra_fields_iovec = TAKE_PTR(iovec);
443 c->extra_fields_n_iovec = n_iovec;
444 c->extra_fields_data = TAKE_PTR(data);
445 c->extra_fields_mtime = timespec_load_nsec(&st.st_mtim);
446
447 return 0;
448 }
449
/* Unconditionally re-reads all metadata for the entry and stamps it with the
 * given timestamp (or the current monotonic time if none was supplied).
 * Ordering matters: the cgroup read populates c->unit, which the invocation
 * ID, max-log-level and extra-fields readers below all depend on. */
static void client_context_really_refresh(
                Server *s,
                ClientContext *c,
                const struct ucred *ucred,
                const char *label, size_t label_size,
                const char *unit_id,
                usec_t timestamp) {

        assert(s);
        assert(c);
        assert(pid_is_valid(c->pid));

        if (timestamp == USEC_INFINITY)
                timestamp = now(CLOCK_MONOTONIC);

        client_context_read_uid_gid(c, ucred);
        client_context_read_basic(c);
        (void) client_context_read_label(c, label, label_size);

        (void) audit_session_from_pid(c->pid, &c->auditid);
        (void) audit_loginuid_from_pid(c->pid, &c->loginuid);

        /* Must run before the three readers below, which consult c->unit */
        (void) client_context_read_cgroup(s, c, unit_id);
        (void) client_context_read_invocation_id(s, c);
        (void) client_context_read_log_level_max(s, c);
        (void) client_context_read_extra_fields(s, c);

        c->timestamp = timestamp;

        if (c->in_lru) {
                /* The timestamp is the prioq sort key, hence reposition the entry */
                assert(c->n_ref == 0);
                assert_se(prioq_reshuffle(s->client_contexts_lru, c, &c->lru_index) >= 0);
        }
}
484
/* Refreshes the cached metadata of an entry if it is missing, too old, or
 * visibly inconsistent with the credential/label data accompanying the
 * current message; fresh-enough consistent data is used as-is. */
void client_context_maybe_refresh(
                Server *s,
                ClientContext *c,
                const struct ucred *ucred,
                const char *label, size_t label_size,
                const char *unit_id,
                usec_t timestamp) {

        assert(s);
        assert(c);

        if (timestamp == USEC_INFINITY)
                timestamp = now(CLOCK_MONOTONIC);

        /* No cached data so far? Let's fill it up */
        if (c->timestamp == USEC_INFINITY)
                goto refresh;

        /* If the data isn't pinned and if the cached data is older than the upper limit, we flush it out
         * entirely. This follows the logic that as long as an entry is pinned the PID reuse is unlikely. */
        if (c->n_ref == 0 && c->timestamp + MAX_USEC < timestamp) {
                client_context_reset(c);
                goto refresh;
        }

        /* If the data is older than the lower limit, we refresh, but keep the old data for all we can't update */
        if (c->timestamp + REFRESH_USEC < timestamp)
                goto refresh;

        /* If the data passed along doesn't match the cached data we also do a refresh */
        if (ucred && uid_is_valid(ucred->uid) && c->uid != ucred->uid)
                goto refresh;

        if (ucred && gid_is_valid(ucred->gid) && c->gid != ucred->gid)
                goto refresh;

        if (label_size > 0 && (label_size != c->label_size || memcmp(label, c->label, label_size) != 0))
                goto refresh;

        return;

refresh:
        client_context_really_refresh(s, c, ucred, label, label_size, unit_id, timestamp);
}
529
530 static void client_context_try_shrink_to(Server *s, size_t limit) {
531 assert(s);
532
533 /* Bring the number of cache entries below the indicated limit, so that we can create a new entry without
534 * breaching the limit. Note that we only flush out entries that aren't pinned here. This means the number of
535 * cache entries may very well grow beyond the limit, if all entries stored remain pinned. */
536
537 while (hashmap_size(s->client_contexts) > limit) {
538 ClientContext *c;
539
540 c = prioq_pop(s->client_contexts_lru);
541 if (!c)
542 break; /* All remaining entries are pinned, give up */
543
544 assert(c->in_lru);
545 assert(c->n_ref == 0);
546
547 c->in_lru = false;
548
549 client_context_free(s, c);
550 }
551 }
552
/* Tears down the whole metadata cache, including the pinned contexts for
 * journald itself and PID 1. Assumes all other references have already been
 * dropped; any remaining pinned entry would trip the size assertions below. */
void client_context_flush_all(Server *s) {
        assert(s);

        /* Flush out all remaining entries. This assumes all references are already dropped. */

        s->my_context = client_context_release(s, s->my_context);
        s->pid1_context = client_context_release(s, s->pid1_context);

        /* With all pins gone, shrinking to zero empties the cache entirely */
        client_context_try_shrink_to(s, 0);

        assert(prioq_size(s->client_contexts_lru) == 0);
        assert(hashmap_size(s->client_contexts) == 0);

        s->client_contexts_lru = prioq_free(s->client_contexts_lru);
        s->client_contexts = hashmap_free(s->client_contexts);
}
569
/* Core lookup: returns the (possibly newly created) context for 'pid',
 * refreshed as needed. With add_ref the entry is pinned — taken off the LRU
 * eviction queue and its reference counter bumped — until released via
 * client_context_release(); otherwise it stays subject to LRU eviction.
 * Returns 0 and the entry in *ret, or a negative errno. */
static int client_context_get_internal(
                Server *s,
                pid_t pid,
                const struct ucred *ucred,
                const char *label, size_t label_len,
                const char *unit_id,
                bool add_ref,
                ClientContext **ret) {

        ClientContext *c;
        int r;

        assert(s);
        assert(ret);

        if (!pid_is_valid(pid))
                return -EINVAL;

        /* Cache hit? */
        c = hashmap_get(s->client_contexts, PID_TO_PTR(pid));
        if (c) {

                if (add_ref) {
                        if (c->in_lru) {
                                /* The entry wasn't pinned so far, let's remove it from the LRU list then */
                                assert(c->n_ref == 0);
                                assert_se(prioq_remove(s->client_contexts_lru, c, &c->lru_index) >= 0);
                                c->in_lru = false;
                        }

                        c->n_ref++;
                }

                client_context_maybe_refresh(s, c, ucred, label, label_len, unit_id, USEC_INFINITY);

                *ret = c;
                return 0;
        }

        /* Cache miss: make room for the new entry first, then create it */
        client_context_try_shrink_to(s, CACHE_MAX-1);

        r = client_context_new(s, pid, &c);
        if (r < 0)
                return r;

        if (add_ref)
                c->n_ref++;
        else {
                /* Unpinned entries go onto the LRU queue so cache pressure can reclaim them */
                r = prioq_put(s->client_contexts_lru, c, &c->lru_index);
                if (r < 0) {
                        client_context_free(s, c);
                        return r;
                }

                c->in_lru = true;
        }

        /* The entry is brand new, hence fill it in unconditionally */
        client_context_really_refresh(s, c, ucred, label, label_len, unit_id, USEC_INFINITY);

        *ret = c;
        return 0;
}
631
/* Looks up (or creates) the context for the specified PID without pinning
 * it: the entry remains eligible for LRU eviction. */
int client_context_get(
                Server *s,
                pid_t pid,
                const struct ucred *ucred,
                const char *label, size_t label_len,
                const char *unit_id,
                ClientContext **ret) {

        return client_context_get_internal(s, pid, ucred, label, label_len, unit_id, false, ret);
}
642
643 int client_context_acquire(
644 Server *s,
645 pid_t pid,
646 const struct ucred *ucred,
647 const char *label, size_t label_len,
648 const char *unit_id,
649 ClientContext **ret) {
650
651 return client_context_get_internal(s, pid, ucred, label, label_len, unit_id, true, ret);
652 };
653
654 ClientContext *client_context_release(Server *s, ClientContext *c) {
655 assert(s);
656
657 if (!c)
658 return NULL;
659
660 assert(c->n_ref > 0);
661 assert(!c->in_lru);
662
663 c->n_ref--;
664 if (c->n_ref > 0)
665 return NULL;
666
667 /* The entry is not pinned anymore, let's add it to the LRU prioq if we can. If we can't we'll drop it
668 * right-away */
669
670 if (prioq_put(s->client_contexts_lru, c, &c->lru_index) < 0)
671 client_context_free(s, c);
672 else
673 c->in_lru = true;
674
675 return NULL;
676 }
677
678 void client_context_acquire_default(Server *s) {
679 int r;
680
681 assert(s);
682
683 /* Ensure that our own and PID1's contexts are always pinned. Our own context is particularly useful to
684 * generate driver messages. */
685
686 if (!s->my_context) {
687 struct ucred ucred = {
688 .pid = getpid_cached(),
689 .uid = getuid(),
690 .gid = getgid(),
691 };
692
693 r = client_context_acquire(s, ucred.pid, &ucred, NULL, 0, NULL, &s->my_context);
694 if (r < 0)
695 log_warning_errno(r, "Failed to acquire our own context, ignoring: %m");
696 }
697
698 if (!s->pid1_context) {
699
700 r = client_context_acquire(s, 1, NULL, NULL, 0, NULL, &s->pid1_context);
701 if (r < 0)
702 log_warning_errno(r, "Failed to acquire PID1's context, ignoring: %m");
703
704 }
705 }