src/journal/journald-context.c

   1 /* SPDX-License-Identifier: LGPL-2.1+ */
   2 /***
   3   This file is part of systemd.
   4
   5   Copyright 2017 Lennart Poettering
   6
   7   systemd is free software; you can redistribute it and/or modify it
   8   under the terms of the GNU Lesser General Public License as published by
   9   the Free Software Foundation; either version 2.1 of the License, or
  10   (at your option) any later version.
  11
  12   systemd is distributed in the hope that it will be useful, but
  13   WITHOUT ANY WARRANTY; without even the implied warranty of
  14   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  15   Lesser General Public License for more details.
  16
  17   You should have received a copy of the GNU Lesser General Public License
  18   along with systemd; If not, see <http://www.gnu.org/licenses/>.
  19 ***/
  20
  21 #if HAVE_SELINUX
  22 #include <selinux/selinux.h>
  23 #endif
  24
  25 #include "alloc-util.h"
  26 #include "audit-util.h"
  27 #include "cgroup-util.h"
  28 #include "fd-util.h"
  29 #include "fileio.h"
  30 #include "fs-util.h"
  31 #include "io-util.h"
  32 #include "journal-util.h"
  33 #include "journald-context.h"
  34 #include "process-util.h"
  35 #include "string-util.h"
  36 #include "syslog-util.h"
  37 #include "unaligned.h"
  38 #include "user-util.h"
  39
  40 /* This implements a metadata cache for clients, which are identified by their PID. Requesting metadata through /proc
  41  * is expensive, hence let's cache the data if we can. Note that this means the metadata might be out-of-date when we
  42  * store it, but it might already be anyway, as we request the data asynchronously from /proc at a different time the
  43  * log entry was originally created. We hence just increase the "window of inaccuracy" a bit.
  44  *
  45  * The cache is indexed by the PID. Entries may be "pinned" in the cache, in which case the entries are not removed
  46  * until they are unpinned. Unpinned entries are kept around until cache pressure is seen. Cache entries older than 5s
  47  * are never used (a sad attempt to deal with the UNIX weakness of PIDs reuse), cache entries older than 1s are
  48  * refreshed in an incremental way (meaning: data is reread from /proc, but any old data we can't refresh is not
  49  * flushed out). Data newer than 1s is used immediately without refresh.
  50  *
  51  * Log stream clients (i.e. all clients using the AF_UNIX/SOCK_STREAM stdout/stderr transport) will pin a cache entry
  52  * as long as their socket is connected. Note that cache entries are shared between different transports. That means a
  53  * cache entry pinned for the stream connection logic may be reused for the syslog or native protocols.
  54  *
  55  * Caching metadata like this has two major benefits:
  56  *
  57  * 1. Reading metadata is expensive, and we can thus substantially speed up log processing under flood.
  58  *
  59  * 2. Because metadata caching is shared between stream and datagram transports and stream connections pin a cache
  60  *    entry there's a good chance we can properly map a substantial set of datagram log messages to their originating
  61  *    service, as all services (unless explicitly configured otherwise) will have their stdout/stderr connected to a
  62  *    stream connection. This should improve cases where a service process logs immediately before exiting and we
  63  *    previously had trouble associating the log message with the service.
  64  *
  65  * NB: With and without the metadata cache: the implicitly added entry metadata in the journal (with the exception of
  66  *     UID/PID/GID and SELinux label) must be understood as possibly slightly out of sync (i.e. sometimes slighly older
  67  *     and sometimes slightly newer than what was current at the log event).
  68  */
  69
  70 /* We refresh every 1s */
  71 #define REFRESH_USEC (1*USEC_PER_SEC)
  72
  73 /* Data older than 5s we flush out */
  74 #define MAX_USEC (5*USEC_PER_SEC)
  75
  76 /* Keep at most 16K entries in the cache. (Note though that this limit may be violated if enough streams pin entries in
  77  * the cache, in which case we *do* permit this limit to be breached. That's safe however, as the number of stream
  78  * clients itself is limited.) */
  79 #define CACHE_MAX (16*1024)
  80
  81 static int client_context_compare(const void *a, const void *b) {
  82         const ClientContext *x = a, *y = b;
  83
  84         if (x->timestamp < y->timestamp)
  85                 return -1;
  86         if (x->timestamp > y->timestamp)
  87                 return 1;
  88
  89         if (x->pid < y->pid)
  90                 return -1;
  91         if (x->pid > y->pid)
  92                 return 1;
  93
  94         return 0;
  95 }
  96
  97 static int client_context_new(Server *s, pid_t pid, ClientContext **ret) {
  98         ClientContext *c;
  99         int r;
 100
 101         assert(s);
 102         assert(pid_is_valid(pid));
 103         assert(ret);
 104
 105         r = hashmap_ensure_allocated(&s->client_contexts, NULL);
 106         if (r < 0)
 107                 return r;
 108
 109         r = prioq_ensure_allocated(&s->client_contexts_lru, client_context_compare);
 110         if (r < 0)
 111                 return r;
 112
 113         c = new0(ClientContext, 1);
 114         if (!c)
 115                 return -ENOMEM;
 116
 117         c->pid = pid;
 118
 119         c->uid = UID_INVALID;
 120         c->gid = GID_INVALID;
 121         c->auditid = AUDIT_SESSION_INVALID;
 122         c->loginuid = UID_INVALID;
 123         c->owner_uid = UID_INVALID;
 124         c->lru_index = PRIOQ_IDX_NULL;
 125         c->timestamp = USEC_INFINITY;
 126         c->extra_fields_mtime = NSEC_INFINITY;
 127         c->log_level_max = -1;
 128
 129         r = hashmap_put(s->client_contexts, PID_TO_PTR(pid), c);
 130         if (r < 0) {
 131                 free(c);
 132                 return r;
 133         }
 134
 135         *ret = c;
 136         return 0;
 137 }
 138
 139 static void client_context_reset(ClientContext *c) {
 140         assert(c);
 141
 142         c->timestamp = USEC_INFINITY;
 143
 144         c->uid = UID_INVALID;
 145         c->gid = GID_INVALID;
 146
 147         c->comm = mfree(c->comm);
 148         c->exe = mfree(c->exe);
 149         c->cmdline = mfree(c->cmdline);
 150         c->capeff = mfree(c->capeff);
 151
 152         c->auditid = AUDIT_SESSION_INVALID;
 153         c->loginuid = UID_INVALID;
 154
 155         c->cgroup = mfree(c->cgroup);
 156         c->session = mfree(c->session);
 157         c->owner_uid = UID_INVALID;
 158         c->unit = mfree(c->unit);
 159         c->user_unit = mfree(c->user_unit);
 160         c->slice = mfree(c->slice);
 161         c->user_slice = mfree(c->user_slice);
 162
 163         c->invocation_id = SD_ID128_NULL;
 164
 165         c->label = mfree(c->label);
 166         c->label_size = 0;
 167
 168         c->extra_fields_iovec = mfree(c->extra_fields_iovec);
 169         c->extra_fields_n_iovec = 0;
 170         c->extra_fields_data = mfree(c->extra_fields_data);
 171         c->extra_fields_mtime = NSEC_INFINITY;
 172
 173         c->log_level_max = -1;
 174 }
 175
 176 static ClientContext* client_context_free(Server *s, ClientContext *c) {
 177         assert(s);
 178
 179         if (!c)
 180                 return NULL;
 181
 182         assert_se(hashmap_remove(s->client_contexts, PID_TO_PTR(c->pid)) == c);
 183
 184         if (c->in_lru)
 185                 assert_se(prioq_remove(s->client_contexts_lru, c, &c->lru_index) >= 0);
 186
 187         client_context_reset(c);
 188
 189         return mfree(c);
 190 }
 191
 192 static void client_context_read_uid_gid(ClientContext *c, const struct ucred *ucred) {
 193         assert(c);
 194         assert(pid_is_valid(c->pid));
 195
 196         /* The ucred data passed in is always the most current and accurate, if we have any. Use it. */
 197         if (ucred && uid_is_valid(ucred->uid))
 198                 c->uid = ucred->uid;
 199         else
 200                 (void) get_process_uid(c->pid, &c->uid);
 201
 202         if (ucred && gid_is_valid(ucred->gid))
 203                 c->gid = ucred->gid;
 204         else
 205                 (void) get_process_gid(c->pid, &c->gid);
 206 }
 207
 208 static void client_context_read_basic(ClientContext *c) {
 209         char *t;
 210
 211         assert(c);
 212         assert(pid_is_valid(c->pid));
 213
 214         if (get_process_comm(c->pid, &t) >= 0)
 215                 free_and_replace(c->comm, t);
 216
 217         if (get_process_exe(c->pid, &t) >= 0)
 218                 free_and_replace(c->exe, t);
 219
 220         if (get_process_cmdline(c->pid, 0, false, &t) >= 0)
 221                 free_and_replace(c->cmdline, t);
 222
 223         if (get_process_capeff(c->pid, &t) >= 0)
 224                 free_and_replace(c->capeff, t);
 225 }
 226
 227 static int client_context_read_label(
 228                 ClientContext *c,
 229                 const char *label, size_t label_size) {
 230
 231         assert(c);
 232         assert(pid_is_valid(c->pid));
 233         assert(label_size == 0 || label);
 234
 235         if (label_size > 0) {
 236                 char *l;
 237
 238                 /* If we got an SELinux label passed in it counts. */
 239
 240                 l = newdup_suffix0(char, label, label_size);
 241                 if (!l)
 242                         return -ENOMEM;
 243
 244                 free_and_replace(c->label, l);
 245                 c->label_size = label_size;
 246         }
 247 #if HAVE_SELINUX
 248         else {
 249                 char *con;
 250
 251                 /* If we got no SELinux label passed in, let's try to acquire one */
 252
 253                 if (getpidcon(c->pid, &con) >= 0) {
 254                         free_and_replace(c->label, con);
 255                         c->label_size = strlen(c->label);
 256                 }
 257         }
 258 #endif
 259
 260         return 0;
 261 }
 262
 263 static int client_context_read_cgroup(Server *s, ClientContext *c, const char *unit_id) {
 264         char *t = NULL;
 265         int r;
 266
 267         assert(c);
 268
 269         /* Try to acquire the current cgroup path */
 270         r = cg_pid_get_path_shifted(c->pid, s->cgroup_root, &t);
 271         if (r < 0) {
 272
 273                 /* If that didn't work, we use the unit ID passed in as fallback, if we have nothing cached yet */
 274                 if (unit_id && !c->unit) {
 275                         c->unit = strdup(unit_id);
 276                         if (c->unit)
 277                                 return 0;
 278                 }
 279
 280                 return r;
 281         }
 282
 283         /* Let's shortcut this if the cgroup path didn't change */
 284         if (streq_ptr(c->cgroup, t)) {
 285                 free(t);
 286                 return 0;
 287         }
 288
 289         free_and_replace(c->cgroup, t);
 290
 291         (void) cg_path_get_session(c->cgroup, &t);
 292         free_and_replace(c->session, t);
 293
 294         if (cg_path_get_owner_uid(c->cgroup, &c->owner_uid) < 0)
 295                 c->owner_uid = UID_INVALID;
 296
 297         (void) cg_path_get_unit(c->cgroup, &t);
 298         free_and_replace(c->unit, t);
 299
 300         (void) cg_path_get_user_unit(c->cgroup, &t);
 301         free_and_replace(c->user_unit, t);
 302
 303         (void) cg_path_get_slice(c->cgroup, &t);
 304         free_and_replace(c->slice, t);
 305
 306         (void) cg_path_get_user_slice(c->cgroup, &t);
 307         free_and_replace(c->user_slice, t);
 308
 309         return 0;
 310 }
 311
 312 static int client_context_read_invocation_id(
 313                 Server *s,
 314                 ClientContext *c) {
 315
 316         _cleanup_free_ char *value = NULL;
 317         const char *p;
 318         int r;
 319
 320         assert(s);
 321         assert(c);
 322
 323         /* Read the invocation ID of a unit off a unit. PID 1 stores it in a per-unit symlink in /run/systemd/units/ */
 324
 325         if (!c->unit)
 326                 return 0;
 327
 328         p = strjoina("/run/systemd/units/invocation:", c->unit);
 329         r = readlink_malloc(p, &value);
 330         if (r < 0)
 331                 return r;
 332
 333         return sd_id128_from_string(value, &c->invocation_id);
 334 }
 335
 336 static int client_context_read_log_level_max(
 337                 Server *s,
 338                 ClientContext *c) {
 339
 340         _cleanup_free_ char *value = NULL;
 341         const char *p;
 342         int r, ll;
 343
 344         if (!c->unit)
 345                 return 0;
 346
 347         p = strjoina("/run/systemd/units/log-level-max:", c->unit);
 348         r = readlink_malloc(p, &value);
 349         if (r < 0)
 350                 return r;
 351
 352         ll = log_level_from_string(value);
 353         if (ll < 0)
 354                 return -EINVAL;
 355
 356         c->log_level_max = ll;
 357         return 0;
 358 }
 359
 360 static int client_context_read_extra_fields(
 361                 Server *s,
 362                 ClientContext *c) {
 363
 364         size_t size = 0, n_iovec = 0, n_allocated = 0, left;
 365         _cleanup_free_ struct iovec *iovec = NULL;
 366         _cleanup_free_ void *data = NULL;
 367         _cleanup_fclose_ FILE *f = NULL;
 368         struct stat st;
 369         const char *p;
 370         uint8_t *q;
 371         int r;
 372
 373         if (!c->unit)
 374                 return 0;
 375
 376         p = strjoina("/run/systemd/units/log-extra-fields:", c->unit);
 377
 378         if (c->extra_fields_mtime != NSEC_INFINITY) {
 379                 if (stat(p, &st) < 0) {
 380                         if (errno == ENOENT)
 381                                 return 0;
 382
 383                         return -errno;
 384                 }
 385
 386                 if (timespec_load_nsec(&st.st_mtim) == c->extra_fields_mtime)
 387                         return 0;
 388         }
 389
 390         f = fopen(p, "re");
 391         if (!f) {
 392                 if (errno == ENOENT)
 393                         return 0;
 394
 395                 return -errno;
 396         }
 397
 398         if (fstat(fileno(f), &st) < 0) /* The file might have been replaced since the stat() above, let's get a new
 399                                         * one, that matches the stuff we are reading */
 400                 return -errno;
 401
 402         r = read_full_stream(f, (char**) &data, &size);
 403         if (r < 0)
 404                 return r;
 405
 406         q = data, left = size;
 407         while (left > 0) {
 408                 uint8_t *field, *eq;
 409                 uint64_t v, n;
 410
 411                 if (left < sizeof(uint64_t))
 412                         return -EBADMSG;
 413
 414                 v = unaligned_read_le64(q);
 415                 if (v < 2)
 416                         return -EBADMSG;
 417
 418                 n = sizeof(uint64_t) + v;
 419                 if (left < n)
 420                         return -EBADMSG;
 421
 422                 field = q + sizeof(uint64_t);
 423
 424                 eq = memchr(field, '=', v);
 425                 if (!eq)
 426                         return -EBADMSG;
 427
 428                 if (!journal_field_valid((const char *) field, eq - field, false))
 429                         return -EBADMSG;
 430
 431                 if (!GREEDY_REALLOC(iovec, n_allocated, n_iovec+1))
 432                         return -ENOMEM;
 433
 434                 iovec[n_iovec++] = IOVEC_MAKE(field, v);
 435
 436                 left -= n, q += n;
 437         }
 438
 439         free(c->extra_fields_iovec);
 440         free(c->extra_fields_data);
 441
 442         c->extra_fields_iovec = iovec;
 443         c->extra_fields_n_iovec = n_iovec;
 444         c->extra_fields_data = data;
 445         c->extra_fields_mtime = timespec_load_nsec(&st.st_mtim);
 446
 447         iovec = NULL;
 448         data = NULL;
 449
 450         return 0;
 451 }
 452
 453 static void client_context_really_refresh(
 454                 Server *s,
 455                 ClientContext *c,
 456                 const struct ucred *ucred,
 457                 const char *label, size_t label_size,
 458                 const char *unit_id,
 459                 usec_t timestamp) {
 460
 461         assert(s);
 462         assert(c);
 463         assert(pid_is_valid(c->pid));
 464
 465         if (timestamp == USEC_INFINITY)
 466                 timestamp = now(CLOCK_MONOTONIC);
 467
 468         client_context_read_uid_gid(c, ucred);
 469         client_context_read_basic(c);
 470         (void) client_context_read_label(c, label, label_size);
 471
 472         (void) audit_session_from_pid(c->pid, &c->auditid);
 473         (void) audit_loginuid_from_pid(c->pid, &c->loginuid);
 474
 475         (void) client_context_read_cgroup(s, c, unit_id);
 476         (void) client_context_read_invocation_id(s, c);
 477         (void) client_context_read_log_level_max(s, c);
 478         (void) client_context_read_extra_fields(s, c);
 479
 480         c->timestamp = timestamp;
 481
 482         if (c->in_lru) {
 483                 assert(c->n_ref == 0);
 484                 assert_se(prioq_reshuffle(s->client_contexts_lru, c, &c->lru_index) >= 0);
 485         }
 486 }
 487
 488 void client_context_maybe_refresh(
 489                 Server *s,
 490                 ClientContext *c,
 491                 const struct ucred *ucred,
 492                 const char *label, size_t label_size,
 493                 const char *unit_id,
 494                 usec_t timestamp) {
 495
 496         assert(s);
 497         assert(c);
 498
 499         if (timestamp == USEC_INFINITY)
 500                 timestamp = now(CLOCK_MONOTONIC);
 501
 502         /* No cached data so far? Let's fill it up */
 503         if (c->timestamp == USEC_INFINITY)
 504                 goto refresh;
 505
 506         /* If the data isn't pinned and if the cashed data is older than the upper limit, we flush it out
 507          * entirely. This follows the logic that as long as an entry is pinned the PID reuse is unlikely. */
 508         if (c->n_ref == 0 && c->timestamp + MAX_USEC < timestamp) {
 509                 client_context_reset(c);
 510                 goto refresh;
 511         }
 512
 513         /* If the data is older than the lower limit, we refresh, but keep the old data for all we can't update */
 514         if (c->timestamp + REFRESH_USEC < timestamp)
 515                 goto refresh;
 516
 517         /* If the data passed along doesn't match the cached data we also do a refresh */
 518         if (ucred && uid_is_valid(ucred->uid) && c->uid != ucred->uid)
 519                 goto refresh;
 520
 521         if (ucred && gid_is_valid(ucred->gid) && c->gid != ucred->gid)
 522                 goto refresh;
 523
 524         if (label_size > 0 && (label_size != c->label_size || memcmp(label, c->label, label_size) != 0))
 525                 goto refresh;
 526
 527         return;
 528
 529 refresh:
 530         client_context_really_refresh(s, c, ucred, label, label_size, unit_id, timestamp);
 531 }
 532
 533 static void client_context_try_shrink_to(Server *s, size_t limit) {
 534         assert(s);
 535
 536         /* Bring the number of cache entries below the indicated limit, so that we can create a new entry without
 537          * breaching the limit. Note that we only flush out entries that aren't pinned here. This means the number of
 538          * cache entries may very well grow beyond the limit, if all entries stored remain pinned. */
 539
 540         while (hashmap_size(s->client_contexts) > limit) {
 541                 ClientContext *c;
 542
 543                 c = prioq_pop(s->client_contexts_lru);
 544                 if (!c)
 545                         break; /* All remaining entries are pinned, give up */
 546
 547                 assert(c->in_lru);
 548                 assert(c->n_ref == 0);
 549
 550                 c->in_lru = false;
 551
 552                 client_context_free(s, c);
 553         }
 554 }
 555
 556 void client_context_flush_all(Server *s) {
 557         assert(s);
 558
 559         /* Flush out all remaining entries. This assumes all references are already dropped. */
 560
 561         s->my_context = client_context_release(s, s->my_context);
 562         s->pid1_context = client_context_release(s, s->pid1_context);
 563
 564         client_context_try_shrink_to(s, 0);
 565
 566         assert(prioq_size(s->client_contexts_lru) == 0);
 567         assert(hashmap_size(s->client_contexts) == 0);
 568
 569         s->client_contexts_lru = prioq_free(s->client_contexts_lru);
 570         s->client_contexts = hashmap_free(s->client_contexts);
 571 }
 572
 573 static int client_context_get_internal(
 574                 Server *s,
 575                 pid_t pid,
 576                 const struct ucred *ucred,
 577                 const char *label, size_t label_len,
 578                 const char *unit_id,
 579                 bool add_ref,
 580                 ClientContext **ret) {
 581
 582         ClientContext *c;
 583         int r;
 584
 585         assert(s);
 586         assert(ret);
 587
 588         if (!pid_is_valid(pid))
 589                 return -EINVAL;
 590
 591         c = hashmap_get(s->client_contexts, PID_TO_PTR(pid));
 592         if (c) {
 593
 594                 if (add_ref) {
 595                         if (c->in_lru) {
 596                                 /* The entry wasn't pinned so far, let's remove it from the LRU list then */
 597                                 assert(c->n_ref == 0);
 598                                 assert_se(prioq_remove(s->client_contexts_lru, c, &c->lru_index) >= 0);
 599                                 c->in_lru = false;
 600                         }
 601
 602                         c->n_ref++;
 603                 }
 604
 605                 client_context_maybe_refresh(s, c, ucred, label, label_len, unit_id, USEC_INFINITY);
 606
 607                 *ret = c;
 608                 return 0;
 609         }
 610
 611         client_context_try_shrink_to(s, CACHE_MAX-1);
 612
 613         r = client_context_new(s, pid, &c);
 614         if (r < 0)
 615                 return r;
 616
 617         if (add_ref)
 618                 c->n_ref++;
 619         else {
 620                 r = prioq_put(s->client_contexts_lru, c, &c->lru_index);
 621                 if (r < 0) {
 622                         client_context_free(s, c);
 623                         return r;
 624                 }
 625
 626                 c->in_lru = true;
 627         }
 628
 629         client_context_really_refresh(s, c, ucred, label, label_len, unit_id, USEC_INFINITY);
 630
 631         *ret = c;
 632         return 0;
 633 }
 634
 635 int client_context_get(
 636                 Server *s,
 637                 pid_t pid,
 638                 const struct ucred *ucred,
 639                 const char *label, size_t label_len,
 640                 const char *unit_id,
 641                 ClientContext **ret) {
 642
 643         return client_context_get_internal(s, pid, ucred, label, label_len, unit_id, false, ret);
 644 }
 645
 646 int client_context_acquire(
 647                 Server *s,
 648                 pid_t pid,
 649                 const struct ucred *ucred,
 650                 const char *label, size_t label_len,
 651                 const char *unit_id,
 652                 ClientContext **ret) {
 653
 654         return client_context_get_internal(s, pid, ucred, label, label_len, unit_id, true, ret);
 655 };
 656
 657 ClientContext *client_context_release(Server *s, ClientContext *c) {
 658         assert(s);
 659
 660         if (!c)
 661                 return NULL;
 662
 663         assert(c->n_ref > 0);
 664         assert(!c->in_lru);
 665
 666         c->n_ref--;
 667         if (c->n_ref > 0)
 668                 return NULL;
 669
 670         /* The entry is not pinned anymore, let's add it to the LRU prioq if we can. If we can't we'll drop it
 671          * right-away */
 672
 673         if (prioq_put(s->client_contexts_lru, c, &c->lru_index) < 0)
 674                 client_context_free(s, c);
 675         else
 676                 c->in_lru = true;
 677
 678         return NULL;
 679 }
 680
 681 void client_context_acquire_default(Server *s) {
 682         int r;
 683
 684         assert(s);
 685
 686         /* Ensure that our own and PID1's contexts are always pinned. Our own context is particularly useful to
 687          * generate driver messages. */
 688
 689         if (!s->my_context) {
 690                 struct ucred ucred = {
 691                         .pid = getpid_cached(),
 692                         .uid = getuid(),
 693                         .gid = getgid(),
 694                 };
 695
 696                 r = client_context_acquire(s, ucred.pid, &ucred, NULL, 0, NULL, &s->my_context);
 697                 if (r < 0)
 698                         log_warning_errno(r, "Failed to acquire our own context, ignoring: %m");
 699         }
 700
 701         if (!s->pid1_context) {
 702
 703                 r = client_context_acquire(s, 1, NULL, NULL, 0, NULL, &s->pid1_context);
 704                 if (r < 0)
 705                         log_warning_errno(r, "Failed to acquire PID1's context, ignoring: %m");
 706
 707         }
 708 }