/***
  This file is part of systemd.

  Copyright 2017 Lennart Poettering

  systemd is free software; you can redistribute it and/or modify it
  under the terms of the GNU Lesser General Public License as published by
  the Free Software Foundation; either version 2.1 of the License, or
  (at your option) any later version.

  systemd is distributed in the hope that it will be useful, but
  WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  Lesser General Public License for more details.

  You should have received a copy of the GNU Lesser General Public License
  along with systemd; If not, see <http://www.gnu.org/licenses/>.
***/
21 #include <selinux/selinux.h>
24 #include "alloc-util.h"
25 #include "audit-util.h"
26 #include "cgroup-util.h"
27 #include "journald-context.h"
28 #include "process-util.h"
29 #include "string-util.h"
30 #include "user-util.h"
32 /* This implements a metadata cache for clients, which are identified by their PID. Requesting metadata through /proc
33 * is expensive, hence let's cache the data if we can. Note that this means the metadata might be out-of-date when we
34 * store it, but it might already be anyway, as we request the data asynchronously from /proc at a different time the
35 * log entry was originally created. We hence just increase the "window of inaccuracy" a bit.
37 * The cache is indexed by the PID. Entries may be "pinned" in the cache, in which case the entries are not removed
38 * until they are unpinned. Unpinned entries are kept around until cache pressure is seen. Cache entries older than 5s
39 * are never used (a sad attempt to deal with the UNIX weakness of PIDs reuse), cache entries older than 1s are
40 * refreshed in an incremental way (meaning: data is reread from /proc, but any old data we can't refresh is not
41 * flushed out). Data newer than 1s is used immediately without refresh.
43 * Log stream clients (i.e. all clients using the AF_UNIX/SOCK_STREAM stdout/stderr transport) will pin a cache entry
44 * as long as their socket is connected. Note that cache entries are shared between different transports. That means a
45 * cache entry pinned for the stream connection logic may be reused for the syslog or native protocols.
47 * Caching metadata like this has two major benefits:
49 * 1. Reading metadata is expensive, and we can thus substantially speed up log processing under flood.
51 * 2. Because metadata caching is shared between stream and datagram transports and stream connections pin a cache
52 * entry there's a good chance we can properly map a substantial set of datagram log messages to their originating
53 * service, as all services (unless explicitly configured otherwise) will have their stdout/stderr connected to a
54 * stream connection. This should improve cases where a service process logs immediately before exiting and we
55 * previously had trouble associating the log message with the service.
 * NB: With and without the metadata cache: the implicitly added entry metadata in the journal (with the exception of
 * UID/PID/GID and SELinux label) must be understood as possibly slightly out of sync (i.e. sometimes slightly older
 * and sometimes slightly newer than what was current at the log event).
 */
62 /* We refresh every 1s */
63 #define REFRESH_USEC (1*USEC_PER_SEC)
65 /* Data older than 5s we flush out */
66 #define MAX_USEC (5*USEC_PER_SEC)
68 /* Keep at most 16K entries in the cache. (Note though that this limit may be violated if enough streams pin entries in
69 * the cache, in which case we *do* permit this limit to be breached. That's safe however, as the number of stream
70 * clients itself is limited.) */
71 #define CACHE_MAX (16*1024)
73 static int client_context_compare(const void *a
, const void *b
) {
74 const ClientContext
*x
= a
, *y
= b
;
76 if (x
->timestamp
< y
->timestamp
)
78 if (x
->timestamp
> y
->timestamp
)
89 static int client_context_new(Server
*s
, pid_t pid
, ClientContext
**ret
) {
94 assert(pid_is_valid(pid
));
97 r
= hashmap_ensure_allocated(&s
->client_contexts
, NULL
);
101 r
= prioq_ensure_allocated(&s
->client_contexts_lru
, client_context_compare
);
105 c
= new0(ClientContext
, 1);
111 c
->uid
= UID_INVALID
;
112 c
->gid
= GID_INVALID
;
113 c
->auditid
= AUDIT_SESSION_INVALID
;
114 c
->loginuid
= UID_INVALID
;
115 c
->owner_uid
= UID_INVALID
;
116 c
->lru_index
= PRIOQ_IDX_NULL
;
117 c
->timestamp
= USEC_INFINITY
;
119 r
= hashmap_put(s
->client_contexts
, PID_TO_PTR(pid
), c
);
129 static void client_context_reset(ClientContext
*c
) {
132 c
->timestamp
= USEC_INFINITY
;
134 c
->uid
= UID_INVALID
;
135 c
->gid
= GID_INVALID
;
137 c
->comm
= mfree(c
->comm
);
138 c
->exe
= mfree(c
->exe
);
139 c
->cmdline
= mfree(c
->cmdline
);
140 c
->capeff
= mfree(c
->capeff
);
142 c
->auditid
= AUDIT_SESSION_INVALID
;
143 c
->loginuid
= UID_INVALID
;
145 c
->cgroup
= mfree(c
->cgroup
);
146 c
->session
= mfree(c
->session
);
147 c
->owner_uid
= UID_INVALID
;
148 c
->unit
= mfree(c
->unit
);
149 c
->user_unit
= mfree(c
->user_unit
);
150 c
->slice
= mfree(c
->slice
);
151 c
->user_slice
= mfree(c
->user_slice
);
153 c
->invocation_id
= SD_ID128_NULL
;
155 c
->label
= mfree(c
->label
);
159 static ClientContext
* client_context_free(Server
*s
, ClientContext
*c
) {
165 assert_se(hashmap_remove(s
->client_contexts
, PID_TO_PTR(c
->pid
)) == c
);
168 assert_se(prioq_remove(s
->client_contexts_lru
, c
, &c
->lru_index
) >= 0);
170 client_context_reset(c
);
175 static void client_context_read_uid_gid(ClientContext
*c
, const struct ucred
*ucred
) {
177 assert(pid_is_valid(c
->pid
));
179 /* The ucred data passed in is always the most current and accurate, if we have any. Use it. */
180 if (ucred
&& uid_is_valid(ucred
->uid
))
183 (void) get_process_uid(c
->pid
, &c
->uid
);
185 if (ucred
&& gid_is_valid(ucred
->gid
))
188 (void) get_process_gid(c
->pid
, &c
->gid
);
191 static void client_context_read_basic(ClientContext
*c
) {
195 assert(pid_is_valid(c
->pid
));
197 if (get_process_comm(c
->pid
, &t
) >= 0)
198 free_and_replace(c
->comm
, t
);
200 if (get_process_exe(c
->pid
, &t
) >= 0)
201 free_and_replace(c
->exe
, t
);
203 if (get_process_cmdline(c
->pid
, 0, false, &t
) >= 0)
204 free_and_replace(c
->cmdline
, t
);
206 if (get_process_capeff(c
->pid
, &t
) >= 0)
207 free_and_replace(c
->capeff
, t
);
210 static int client_context_read_label(
212 const char *label
, size_t label_size
) {
215 assert(pid_is_valid(c
->pid
));
216 assert(label_size
== 0 || label
);
218 if (label_size
> 0) {
221 /* If we got an SELinux label passed in it counts. */
223 l
= newdup_suffix0(char, label
, label_size
);
227 free_and_replace(c
->label
, l
);
228 c
->label_size
= label_size
;
234 /* If we got no SELinux label passed in, let's try to acquire one */
236 if (getpidcon(c
->pid
, &con
) >= 0) {
237 free_and_replace(c
->label
, con
);
238 c
->label_size
= strlen(c
->label
);
246 static int client_context_read_cgroup(Server
*s
, ClientContext
*c
, const char *unit_id
) {
252 /* Try to acquire the current cgroup path */
253 r
= cg_pid_get_path_shifted(c
->pid
, s
->cgroup_root
, &t
);
256 /* If that didn't work, we use the unit ID passed in as fallback, if we have nothing cached yet */
257 if (unit_id
&& !c
->unit
) {
258 c
->unit
= strdup(unit_id
);
266 /* Let's shortcut this if the cgroup path didn't change */
267 if (streq_ptr(c
->cgroup
, t
)) {
272 free_and_replace(c
->cgroup
, t
);
274 (void) cg_path_get_session(c
->cgroup
, &t
);
275 free_and_replace(c
->session
, t
);
277 if (cg_path_get_owner_uid(c
->cgroup
, &c
->owner_uid
) < 0)
278 c
->owner_uid
= UID_INVALID
;
280 (void) cg_path_get_unit(c
->cgroup
, &t
);
281 free_and_replace(c
->unit
, t
);
283 (void) cg_path_get_user_unit(c
->cgroup
, &t
);
284 free_and_replace(c
->user_unit
, t
);
286 (void) cg_path_get_slice(c
->cgroup
, &t
);
287 free_and_replace(c
->slice
, t
);
289 (void) cg_path_get_user_slice(c
->cgroup
, &t
);
290 free_and_replace(c
->user_slice
, t
);
295 static int client_context_read_invocation_id(
299 _cleanup_free_
char *escaped
= NULL
, *slice_path
= NULL
;
300 char ids
[SD_ID128_STRING_MAX
];
307 /* Read the invocation ID of a unit off a unit. It's stored in the "trusted.invocation_id" extended attribute
308 * on the cgroup path. */
310 if (!c
->unit
|| !c
->slice
)
313 r
= cg_slice_to_path(c
->slice
, &slice_path
);
317 escaped
= cg_escape(c
->unit
);
321 p
= strjoina(s
->cgroup_root
, "/", slice_path
, "/", escaped
);
325 r
= cg_get_xattr(SYSTEMD_CGROUP_CONTROLLER
, p
, "trusted.invocation_id", ids
, 32);
332 return sd_id128_from_string(ids
, &c
->invocation_id
);
335 static void client_context_really_refresh(
338 const struct ucred
*ucred
,
339 const char *label
, size_t label_size
,
345 assert(pid_is_valid(c
->pid
));
347 if (timestamp
== USEC_INFINITY
)
348 timestamp
= now(CLOCK_MONOTONIC
);
350 client_context_read_uid_gid(c
, ucred
);
351 client_context_read_basic(c
);
352 (void) client_context_read_label(c
, label
, label_size
);
354 (void) audit_session_from_pid(c
->pid
, &c
->auditid
);
355 (void) audit_loginuid_from_pid(c
->pid
, &c
->loginuid
);
357 (void) client_context_read_cgroup(s
, c
, unit_id
);
358 (void) client_context_read_invocation_id(s
, c
);
360 c
->timestamp
= timestamp
;
363 assert(c
->n_ref
== 0);
364 assert_se(prioq_reshuffle(s
->client_contexts_lru
, c
, &c
->lru_index
) >= 0);
368 void client_context_maybe_refresh(
371 const struct ucred
*ucred
,
372 const char *label
, size_t label_size
,
379 if (timestamp
== USEC_INFINITY
)
380 timestamp
= now(CLOCK_MONOTONIC
);
382 /* No cached data so far? Let's fill it up */
383 if (c
->timestamp
== USEC_INFINITY
)
386 /* If the data isn't pinned and if the cashed data is older than the upper limit, we flush it out
387 * entirely. This follows the logic that as long as an entry is pinned the PID reuse is unlikely. */
388 if (c
->n_ref
== 0 && c
->timestamp
+ MAX_USEC
< timestamp
) {
389 client_context_reset(c
);
393 /* If the data is older than the lower limit, we refresh, but keep the old data for all we can't update */
394 if (c
->timestamp
+ REFRESH_USEC
< timestamp
)
397 /* If the data passed along doesn't match the cached data we also do a refresh */
398 if (ucred
&& uid_is_valid(ucred
->uid
) && c
->uid
!= ucred
->uid
)
401 if (ucred
&& gid_is_valid(ucred
->gid
) && c
->gid
!= ucred
->gid
)
404 if (label_size
> 0 && (label_size
!= c
->label_size
|| memcmp(label
, c
->label
, label_size
) != 0))
410 client_context_really_refresh(s
, c
, ucred
, label
, label_size
, unit_id
, timestamp
);
413 static void client_context_try_shrink_to(Server
*s
, size_t limit
) {
416 /* Bring the number of cache entries below the indicated limit, so that we can create a new entry without
417 * breaching the limit. Note that we only flush out entries that aren't pinned here. This means the number of
418 * cache entries may very well grow beyond the limit, if all entries stored remain pinned. */
420 while (hashmap_size(s
->client_contexts
) > limit
) {
423 c
= prioq_pop(s
->client_contexts_lru
);
425 break; /* All remaining entries are pinned, give up */
428 assert(c
->n_ref
== 0);
432 client_context_free(s
, c
);
436 void client_context_flush_all(Server
*s
) {
439 /* Flush out all remaining entries. This assumes all references are already dropped. */
441 s
->my_context
= client_context_release(s
, s
->my_context
);
442 s
->pid1_context
= client_context_release(s
, s
->pid1_context
);
444 client_context_try_shrink_to(s
, 0);
446 assert(prioq_size(s
->client_contexts_lru
) == 0);
447 assert(hashmap_size(s
->client_contexts
) == 0);
449 s
->client_contexts_lru
= prioq_free(s
->client_contexts_lru
);
450 s
->client_contexts
= hashmap_free(s
->client_contexts
);
453 static int client_context_get_internal(
456 const struct ucred
*ucred
,
457 const char *label
, size_t label_len
,
460 ClientContext
**ret
) {
468 if (!pid_is_valid(pid
))
471 c
= hashmap_get(s
->client_contexts
, PID_TO_PTR(pid
));
476 /* The entry wasn't pinned so far, let's remove it from the LRU list then */
477 assert(c
->n_ref
== 0);
478 assert_se(prioq_remove(s
->client_contexts_lru
, c
, &c
->lru_index
) >= 0);
485 client_context_maybe_refresh(s
, c
, ucred
, label
, label_len
, unit_id
, USEC_INFINITY
);
491 client_context_try_shrink_to(s
, CACHE_MAX
-1);
493 r
= client_context_new(s
, pid
, &c
);
500 r
= prioq_put(s
->client_contexts_lru
, c
, &c
->lru_index
);
502 client_context_free(s
, c
);
509 client_context_really_refresh(s
, c
, ucred
, label
, label_len
, unit_id
, USEC_INFINITY
);
515 int client_context_get(
518 const struct ucred
*ucred
,
519 const char *label
, size_t label_len
,
521 ClientContext
**ret
) {
523 return client_context_get_internal(s
, pid
, ucred
, label
, label_len
, unit_id
, false, ret
);
526 int client_context_acquire(
529 const struct ucred
*ucred
,
530 const char *label
, size_t label_len
,
532 ClientContext
**ret
) {
534 return client_context_get_internal(s
, pid
, ucred
, label
, label_len
, unit_id
, true, ret
);
537 ClientContext
*client_context_release(Server
*s
, ClientContext
*c
) {
543 assert(c
->n_ref
> 0);
550 /* The entry is not pinned anymore, let's add it to the LRU prioq if we can. If we can't we'll drop it
553 if (prioq_put(s
->client_contexts_lru
, c
, &c
->lru_index
) < 0)
554 client_context_free(s
, c
);
561 void client_context_acquire_default(Server
*s
) {
566 /* Ensure that our own and PID1's contexts are always pinned. Our own context is particularly useful to
567 * generate driver messages. */
569 if (!s
->my_context
) {
570 struct ucred ucred
= {
571 .pid
= getpid_cached(),
576 r
= client_context_acquire(s
, ucred
.pid
, &ucred
, NULL
, 0, NULL
, &s
->my_context
);
578 log_warning_errno(r
, "Failed to acquire our own context, ignoring: %m");
581 if (!s
->pid1_context
) {
583 r
= client_context_acquire(s
, 1, NULL
, NULL
, 0, NULL
, &s
->pid1_context
);
585 log_warning_errno(r
, "Failed to acquire PID1's context, ignoring: %m");