]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/journal/journald-context.c
Add SPDX license identifiers to source files under the LGPL
[thirdparty/systemd.git] / src / journal / journald-context.c
CommitLineData
53e1b683 1/* SPDX-License-Identifier: LGPL-2.1+ */
22e3a02b
LP
2/***
3 This file is part of systemd.
4
5 Copyright 2017 Lennart Poettering
6
7 systemd is free software; you can redistribute it and/or modify it
8 under the terms of the GNU Lesser General Public License as published by
9 the Free Software Foundation; either version 2.1 of the License, or
10 (at your option) any later version.
11
12 systemd is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 Lesser General Public License for more details.
16
17 You should have received a copy of the GNU Lesser General Public License
18 along with systemd; If not, see <http://www.gnu.org/licenses/>.
19***/
20
349cc4a5 21#if HAVE_SELINUX
22e3a02b
LP
22#include <selinux/selinux.h>
23#endif
24
25#include "alloc-util.h"
26#include "audit-util.h"
27#include "cgroup-util.h"
d3070fbd
LP
28#include "fd-util.h"
29#include "fileio.h"
30#include "fs-util.h"
31#include "io-util.h"
32#include "journal-util.h"
22e3a02b
LP
33#include "journald-context.h"
34#include "process-util.h"
35#include "string-util.h"
d3070fbd
LP
36#include "syslog-util.h"
37#include "unaligned.h"
22e3a02b
LP
38#include "user-util.h"
39
40/* This implements a metadata cache for clients, which are identified by their PID. Requesting metadata through /proc
41 * is expensive, hence let's cache the data if we can. Note that this means the metadata might be out-of-date when we
42 * store it, but it might already be anyway, as we request the data asynchronously from /proc at a different time the
43 * log entry was originally created. We hence just increase the "window of inaccuracy" a bit.
44 *
45 * The cache is indexed by the PID. Entries may be "pinned" in the cache, in which case the entries are not removed
46 * until they are unpinned. Unpinned entries are kept around until cache pressure is seen. Cache entries older than 5s
47 * are never used (a sad attempt to deal with the UNIX weakness of PIDs reuse), cache entries older than 1s are
48 * refreshed in an incremental way (meaning: data is reread from /proc, but any old data we can't refresh is not
49 * flushed out). Data newer than 1s is used immediately without refresh.
50 *
51 * Log stream clients (i.e. all clients using the AF_UNIX/SOCK_STREAM stdout/stderr transport) will pin a cache entry
52 * as long as their socket is connected. Note that cache entries are shared between different transports. That means a
53 * cache entry pinned for the stream connection logic may be reused for the syslog or native protocols.
54 *
55 * Caching metadata like this has two major benefits:
56 *
57 * 1. Reading metadata is expensive, and we can thus substantially speed up log processing under flood.
58 *
59 * 2. Because metadata caching is shared between stream and datagram transports and stream connections pin a cache
60 * entry there's a good chance we can properly map a substantial set of datagram log messages to their originating
61 * service, as all services (unless explicitly configured otherwise) will have their stdout/stderr connected to a
62 * stream connection. This should improve cases where a service process logs immediately before exiting and we
63 * previously had trouble associating the log message with the service.
64 *
65 * NB: With and without the metadata cache: the implicitly added entry metadata in the journal (with the exception of
 * UID/PID/GID and SELinux label) must be understood as possibly slightly out of sync (i.e. sometimes slightly older
67 * and sometimes slightly newer than what was current at the log event).
68 */
69
/* Entries older than this (1s) are refreshed before use: data is reread from /proc, but stale
 * fields we cannot reacquire are kept. */
#define REFRESH_USEC (1*USEC_PER_SEC)

/* Unpinned entries older than this (5s) are flushed out entirely, as a (weak) defense against
 * PID reuse. */
#define MAX_USEC (5*USEC_PER_SEC)

/* Keep at most 16K entries in the cache. (Note though that this limit may be violated if enough streams pin entries in
 * the cache, in which case we *do* permit this limit to be breached. That's safe however, as the number of stream
 * clients itself is limited.) */
#define CACHE_MAX (16*1024)
80
81static int client_context_compare(const void *a, const void *b) {
82 const ClientContext *x = a, *y = b;
83
84 if (x->timestamp < y->timestamp)
85 return -1;
86 if (x->timestamp > y->timestamp)
87 return 1;
88
89 if (x->pid < y->pid)
90 return -1;
91 if (x->pid > y->pid)
92 return 1;
93
94 return 0;
95}
96
97static int client_context_new(Server *s, pid_t pid, ClientContext **ret) {
98 ClientContext *c;
99 int r;
100
101 assert(s);
102 assert(pid_is_valid(pid));
103 assert(ret);
104
105 r = hashmap_ensure_allocated(&s->client_contexts, NULL);
106 if (r < 0)
107 return r;
108
109 r = prioq_ensure_allocated(&s->client_contexts_lru, client_context_compare);
110 if (r < 0)
111 return r;
112
113 c = new0(ClientContext, 1);
114 if (!c)
115 return -ENOMEM;
116
117 c->pid = pid;
118
119 c->uid = UID_INVALID;
120 c->gid = GID_INVALID;
121 c->auditid = AUDIT_SESSION_INVALID;
122 c->loginuid = UID_INVALID;
123 c->owner_uid = UID_INVALID;
124 c->lru_index = PRIOQ_IDX_NULL;
125 c->timestamp = USEC_INFINITY;
d3070fbd
LP
126 c->extra_fields_mtime = NSEC_INFINITY;
127 c->log_level_max = -1;
22e3a02b
LP
128
129 r = hashmap_put(s->client_contexts, PID_TO_PTR(pid), c);
130 if (r < 0) {
131 free(c);
132 return r;
133 }
134
135 *ret = c;
136 return 0;
137}
138
/* Returns the entry to its pristine "nothing known" state, releasing all dynamically allocated
 * data it owns. The entry itself — and its registration in the lookup structures — survives. */
static void client_context_reset(ClientContext *c) {
        assert(c);

        c->timestamp = USEC_INFINITY;

        /* Credentials */
        c->uid = UID_INVALID;
        c->gid = GID_INVALID;

        /* /proc basics */
        c->comm = mfree(c->comm);
        c->exe = mfree(c->exe);
        c->cmdline = mfree(c->cmdline);
        c->capeff = mfree(c->capeff);

        /* Audit data */
        c->auditid = AUDIT_SESSION_INVALID;
        c->loginuid = UID_INVALID;

        /* Cgroup-derived data */
        c->cgroup = mfree(c->cgroup);
        c->session = mfree(c->session);
        c->owner_uid = UID_INVALID;
        c->unit = mfree(c->unit);
        c->user_unit = mfree(c->user_unit);
        c->slice = mfree(c->slice);
        c->user_slice = mfree(c->user_slice);

        c->invocation_id = SD_ID128_NULL;

        /* SELinux label */
        c->label = mfree(c->label);
        c->label_size = 0;

        /* Per-unit journal settings read from /run/systemd/units/ */
        c->extra_fields_iovec = mfree(c->extra_fields_iovec);
        c->extra_fields_n_iovec = 0;
        c->extra_fields_data = mfree(c->extra_fields_data);
        c->extra_fields_mtime = NSEC_INFINITY;

        c->log_level_max = -1;
}
175
/* Unregisters and destroys a cache entry. Accepts NULL; always returns NULL so callers can
 * write "c = client_context_free(s, c);". */
static ClientContext* client_context_free(Server *s, ClientContext *c) {
        assert(s);

        if (!c)
                return NULL;

        /* Every live context must be registered in the hashmap, hence removal cannot fail. */
        assert_se(hashmap_remove(s->client_contexts, PID_TO_PTR(c->pid)) == c);

        /* Unpinned entries also sit in the LRU prioq; drop them from there first. */
        if (c->in_lru)
                assert_se(prioq_remove(s->client_contexts_lru, c, &c->lru_index) >= 0);

        /* Release all owned allocations before freeing the entry itself. */
        client_context_reset(c);

        return mfree(c);
}
191
/* Fills in c->uid/c->gid. Kernel-provided credentials (ucred) are authoritative when valid;
 * otherwise we fall back to reading /proc, best-effort (failures keep the previous value). */
static void client_context_read_uid_gid(ClientContext *c, const struct ucred *ucred) {
        assert(c);
        assert(pid_is_valid(c->pid));

        /* The ucred data passed in is always the most current and accurate, if we have any. Use it. */
        if (ucred && uid_is_valid(ucred->uid))
                c->uid = ucred->uid;
        else
                (void) get_process_uid(c->pid, &c->uid);

        if (ucred && gid_is_valid(ucred->gid))
                c->gid = ucred->gid;
        else
                (void) get_process_gid(c->pid, &c->gid);
}
207
208static void client_context_read_basic(ClientContext *c) {
209 char *t;
210
211 assert(c);
212 assert(pid_is_valid(c->pid));
213
214 if (get_process_comm(c->pid, &t) >= 0)
215 free_and_replace(c->comm, t);
216
217 if (get_process_exe(c->pid, &t) >= 0)
218 free_and_replace(c->exe, t);
219
220 if (get_process_cmdline(c->pid, 0, false, &t) >= 0)
221 free_and_replace(c->cmdline, t);
222
223 if (get_process_capeff(c->pid, &t) >= 0)
224 free_and_replace(c->capeff, t);
225}
226
227static int client_context_read_label(
228 ClientContext *c,
229 const char *label, size_t label_size) {
230
231 assert(c);
232 assert(pid_is_valid(c->pid));
233 assert(label_size == 0 || label);
234
235 if (label_size > 0) {
236 char *l;
237
238 /* If we got an SELinux label passed in it counts. */
239
240 l = newdup_suffix0(char, label, label_size);
241 if (!l)
242 return -ENOMEM;
243
244 free_and_replace(c->label, l);
245 c->label_size = label_size;
246 }
349cc4a5 247#if HAVE_SELINUX
22e3a02b
LP
248 else {
249 char *con;
250
251 /* If we got no SELinux label passed in, let's try to acquire one */
252
253 if (getpidcon(c->pid, &con) >= 0) {
254 free_and_replace(c->label, con);
255 c->label_size = strlen(c->label);
256 }
257 }
258#endif
259
260 return 0;
261}
262
263static int client_context_read_cgroup(Server *s, ClientContext *c, const char *unit_id) {
264 char *t = NULL;
265 int r;
266
267 assert(c);
268
269 /* Try to acquire the current cgroup path */
270 r = cg_pid_get_path_shifted(c->pid, s->cgroup_root, &t);
271 if (r < 0) {
272
273 /* If that didn't work, we use the unit ID passed in as fallback, if we have nothing cached yet */
274 if (unit_id && !c->unit) {
275 c->unit = strdup(unit_id);
276 if (c->unit)
277 return 0;
278 }
279
280 return r;
281 }
282
283 /* Let's shortcut this if the cgroup path didn't change */
284 if (streq_ptr(c->cgroup, t)) {
285 free(t);
286 return 0;
287 }
288
289 free_and_replace(c->cgroup, t);
290
291 (void) cg_path_get_session(c->cgroup, &t);
292 free_and_replace(c->session, t);
293
294 if (cg_path_get_owner_uid(c->cgroup, &c->owner_uid) < 0)
295 c->owner_uid = UID_INVALID;
296
297 (void) cg_path_get_unit(c->cgroup, &t);
298 free_and_replace(c->unit, t);
299
300 (void) cg_path_get_user_unit(c->cgroup, &t);
301 free_and_replace(c->user_unit, t);
302
303 (void) cg_path_get_slice(c->cgroup, &t);
304 free_and_replace(c->slice, t);
305
306 (void) cg_path_get_user_slice(c->cgroup, &t);
307 free_and_replace(c->user_slice, t);
308
309 return 0;
310}
311
312static int client_context_read_invocation_id(
313 Server *s,
314 ClientContext *c) {
315
d3070fbd 316 _cleanup_free_ char *value = NULL;
22e3a02b
LP
317 const char *p;
318 int r;
319
320 assert(s);
321 assert(c);
322
d3070fbd 323 /* Read the invocation ID of a unit off a unit. PID 1 stores it in a per-unit symlink in /run/systemd/units/ */
22e3a02b 324
d3070fbd 325 if (!c->unit)
22e3a02b
LP
326 return 0;
327
d3070fbd
LP
328 p = strjoina("/run/systemd/units/invocation:", c->unit);
329 r = readlink_malloc(p, &value);
22e3a02b
LP
330 if (r < 0)
331 return r;
332
d3070fbd
LP
333 return sd_id128_from_string(value, &c->invocation_id);
334}
22e3a02b 335
d3070fbd
LP
336static int client_context_read_log_level_max(
337 Server *s,
338 ClientContext *c) {
22e3a02b 339
d3070fbd
LP
340 _cleanup_free_ char *value = NULL;
341 const char *p;
342 int r, ll;
343
344 if (!c->unit)
345 return 0;
346
347 p = strjoina("/run/systemd/units/log-level-max:", c->unit);
348 r = readlink_malloc(p, &value);
22e3a02b
LP
349 if (r < 0)
350 return r;
d3070fbd
LP
351
352 ll = log_level_from_string(value);
353 if (ll < 0)
22e3a02b 354 return -EINVAL;
22e3a02b 355
d3070fbd
LP
356 c->log_level_max = ll;
357 return 0;
358}
359
/* Reads the unit's extra journal fields file (written by PID 1 for LogExtraFields=) and caches
 * the parsed iovec array in the context. The file format is a sequence of
 * (le64 length, "FIELD=value" payload) records. Only rereads when the file's mtime changed
 * since the last successful parse. Returns 0 on success (including "no file"), negative
 * errno-style error otherwise. */
static int client_context_read_extra_fields(
                Server *s,
                ClientContext *c) {

        size_t size = 0, n_iovec = 0, n_allocated = 0, left;
        _cleanup_free_ struct iovec *iovec = NULL;
        _cleanup_free_ void *data = NULL;
        _cleanup_fclose_ FILE *f = NULL;
        struct stat st;
        const char *p;
        uint8_t *q;
        int r;

        if (!c->unit)
                return 0;

        p = strjoina("/run/systemd/units/log-extra-fields:", c->unit);

        /* If we parsed this file before, skip the work when its mtime is unchanged. */
        if (c->extra_fields_mtime != NSEC_INFINITY) {
                if (stat(p, &st) < 0) {
                        if (errno == ENOENT)
                                return 0;

                        return -errno;
                }

                if (timespec_load_nsec(&st.st_mtim) == c->extra_fields_mtime)
                        return 0;
        }

        f = fopen(p, "re");
        if (!f) {
                if (errno == ENOENT)
                        return 0;

                return -errno;
        }

        if (fstat(fileno(f), &st) < 0) /* The file might have been replaced since the stat() above, let's get a new
                                        * one, that matches the stuff we are reading */
                return -errno;

        r = read_full_stream(f, (char**) &data, &size);
        if (r < 0)
                return r;

        /* Walk the buffer record by record, validating each field and collecting iovecs that
         * point into 'data'. */
        q = data, left = size;
        while (left > 0) {
                uint8_t *field, *eq;
                uint64_t v, n;

                if (left < sizeof(uint64_t))
                        return -EBADMSG;

                v = unaligned_read_le64(q);
                if (v < 2) /* shortest valid payload is "X=" */
                        return -EBADMSG;

                /* NOTE(review): if v is within 8 of UINT64_MAX this addition wraps and the
                 * 'left < n' check below won't catch it — benign only because the file is
                 * written by PID 1, i.e. trusted. TODO confirm this is acceptable. */
                n = sizeof(uint64_t) + v;
                if (left < n)
                        return -EBADMSG;

                field = q + sizeof(uint64_t);

                /* Every record must look like "FIELD=value" with a valid journal field name. */
                eq = memchr(field, '=', v);
                if (!eq)
                        return -EBADMSG;

                if (!journal_field_valid((const char *) field, eq - field, false))
                        return -EBADMSG;

                if (!GREEDY_REALLOC(iovec, n_allocated, n_iovec+1))
                        return -ENOMEM;

                iovec[n_iovec++] = IOVEC_MAKE(field, v);

                left -= n, q += n;
        }

        /* Parsing succeeded in full: replace the cached copy. The iovecs point into 'data', so
         * both must be swapped together. */
        free(c->extra_fields_iovec);
        free(c->extra_fields_data);

        c->extra_fields_iovec = iovec;
        c->extra_fields_n_iovec = n_iovec;
        c->extra_fields_data = data;
        c->extra_fields_mtime = timespec_load_nsec(&st.st_mtim);

        /* Ownership transferred to the context; disarm the cleanup handlers. */
        iovec = NULL;
        data = NULL;

        return 0;
}
452
/* Unconditionally reacquires all metadata for this client: credentials, /proc basics, label,
 * audit data, cgroup-derived fields and the per-unit journal settings. Each acquisition is
 * best-effort. Finally stamps the entry with 'timestamp' (or the current monotonic time if
 * USEC_INFINITY) and reshuffles the LRU queue if the entry is unpinned. */
static void client_context_really_refresh(
                Server *s,
                ClientContext *c,
                const struct ucred *ucred,
                const char *label, size_t label_size,
                const char *unit_id,
                usec_t timestamp) {

        assert(s);
        assert(c);
        assert(pid_is_valid(c->pid));

        if (timestamp == USEC_INFINITY)
                timestamp = now(CLOCK_MONOTONIC);

        client_context_read_uid_gid(c, ucred);
        client_context_read_basic(c);
        (void) client_context_read_label(c, label, label_size);

        (void) audit_session_from_pid(c->pid, &c->auditid);
        (void) audit_loginuid_from_pid(c->pid, &c->loginuid);

        /* The cgroup read must precede the unit-keyed lookups below, since they use c->unit. */
        (void) client_context_read_cgroup(s, c, unit_id);
        (void) client_context_read_invocation_id(s, c);
        (void) client_context_read_log_level_max(s, c);
        (void) client_context_read_extra_fields(s, c);

        c->timestamp = timestamp;

        /* The timestamp is the prioq's primary sort key, hence reposition the entry. */
        if (c->in_lru) {
                assert(c->n_ref == 0);
                assert_se(prioq_reshuffle(s->client_contexts_lru, c, &c->lru_index) >= 0);
        }
}
487
/* Refreshes the cache entry if (and only if) it looks out of date: never filled in, expired
 * past MAX_USEC (unpinned entries are reset entirely first), older than REFRESH_USEC, or
 * contradicted by the credentials/label passed along with the current message. */
void client_context_maybe_refresh(
                Server *s,
                ClientContext *c,
                const struct ucred *ucred,
                const char *label, size_t label_size,
                const char *unit_id,
                usec_t timestamp) {

        assert(s);
        assert(c);

        if (timestamp == USEC_INFINITY)
                timestamp = now(CLOCK_MONOTONIC);

        /* No cached data so far? Let's fill it up */
        if (c->timestamp == USEC_INFINITY)
                goto refresh;

        /* If the data isn't pinned and if the cached data is older than the upper limit, we flush it out
         * entirely. This follows the logic that as long as an entry is pinned the PID reuse is unlikely. */
        if (c->n_ref == 0 && c->timestamp + MAX_USEC < timestamp) {
                client_context_reset(c);
                goto refresh;
        }

        /* If the data is older than the lower limit, we refresh, but keep the old data for all we can't update */
        if (c->timestamp + REFRESH_USEC < timestamp)
                goto refresh;

        /* If the data passed along doesn't match the cached data we also do a refresh */
        if (ucred && uid_is_valid(ucred->uid) && c->uid != ucred->uid)
                goto refresh;

        if (ucred && gid_is_valid(ucred->gid) && c->gid != ucred->gid)
                goto refresh;

        if (label_size > 0 && (label_size != c->label_size || memcmp(label, c->label, label_size) != 0))
                goto refresh;

        return;

refresh:
        client_context_really_refresh(s, c, ucred, label, label_size, unit_id, timestamp);
}
532
533static void client_context_try_shrink_to(Server *s, size_t limit) {
534 assert(s);
535
536 /* Bring the number of cache entries below the indicated limit, so that we can create a new entry without
537 * breaching the limit. Note that we only flush out entries that aren't pinned here. This means the number of
538 * cache entries may very well grow beyond the limit, if all entries stored remain pinned. */
539
540 while (hashmap_size(s->client_contexts) > limit) {
541 ClientContext *c;
542
543 c = prioq_pop(s->client_contexts_lru);
544 if (!c)
545 break; /* All remaining entries are pinned, give up */
546
547 assert(c->in_lru);
548 assert(c->n_ref == 0);
549
550 c->in_lru = false;
551
552 client_context_free(s, c);
553 }
554}
555
/* Tears down the whole cache, including the lookup structures. All external references must
 * have been dropped already; only the two internally pinned contexts are released here. */
void client_context_flush_all(Server *s) {
        assert(s);

        /* Flush out all remaining entries. This assumes all references are already dropped. */

        s->my_context = client_context_release(s, s->my_context);
        s->pid1_context = client_context_release(s, s->pid1_context);

        /* With the pins gone, shrinking to zero must empty the cache completely. */
        client_context_try_shrink_to(s, 0);

        assert(prioq_size(s->client_contexts_lru) == 0);
        assert(hashmap_size(s->client_contexts) == 0);

        s->client_contexts_lru = prioq_free(s->client_contexts_lru);
        s->client_contexts = hashmap_free(s->client_contexts);
}
572
/* Looks up (or creates) the cache entry for 'pid' and stores it in *ret. With add_ref=true the
 * entry is additionally pinned (taken off the LRU queue, refcount bumped); otherwise it stays
 * eligible for LRU eviction. Refreshes the entry's metadata as needed either way. Returns 0 on
 * success, negative errno-style error otherwise. */
static int client_context_get_internal(
                Server *s,
                pid_t pid,
                const struct ucred *ucred,
                const char *label, size_t label_len,
                const char *unit_id,
                bool add_ref,
                ClientContext **ret) {

        ClientContext *c;
        int r;

        assert(s);
        assert(ret);

        if (!pid_is_valid(pid))
                return -EINVAL;

        c = hashmap_get(s->client_contexts, PID_TO_PTR(pid));
        if (c) {

                if (add_ref) {
                        if (c->in_lru) {
                                /* The entry wasn't pinned so far, let's remove it from the LRU list then */
                                assert(c->n_ref == 0);
                                assert_se(prioq_remove(s->client_contexts_lru, c, &c->lru_index) >= 0);
                                c->in_lru = false;
                        }

                        c->n_ref++;
                }

                /* Existing entries are refreshed only when stale or contradicted by the message. */
                client_context_maybe_refresh(s, c, ucred, label, label_len, unit_id, USEC_INFINITY);

                *ret = c;
                return 0;
        }

        /* Make room before creating a new entry, so the cache stays within CACHE_MAX. */
        client_context_try_shrink_to(s, CACHE_MAX-1);

        r = client_context_new(s, pid, &c);
        if (r < 0)
                return r;

        if (add_ref)
                c->n_ref++;
        else {
                /* Unpinned entries go onto the LRU queue; if that fails, drop the entry again. */
                r = prioq_put(s->client_contexts_lru, c, &c->lru_index);
                if (r < 0) {
                        client_context_free(s, c);
                        return r;
                }

                c->in_lru = true;
        }

        /* Fresh entries always get a full, unconditional refresh. */
        client_context_really_refresh(s, c, ucred, label, label_len, unit_id, USEC_INFINITY);

        *ret = c;
        return 0;
}
634
/* Looks up/creates the cache entry for 'pid' without pinning it; the entry remains subject to
 * LRU eviction. See client_context_get_internal() for details. */
int client_context_get(
                Server *s,
                pid_t pid,
                const struct ucred *ucred,
                const char *label, size_t label_len,
                const char *unit_id,
                ClientContext **ret) {

        return client_context_get_internal(s, pid, ucred, label, label_len, unit_id, false, ret);
}
645
646int client_context_acquire(
647 Server *s,
648 pid_t pid,
649 const struct ucred *ucred,
650 const char *label, size_t label_len,
651 const char *unit_id,
652 ClientContext **ret) {
653
654 return client_context_get_internal(s, pid, ucred, label, label_len, unit_id, true, ret);
655};
656
657ClientContext *client_context_release(Server *s, ClientContext *c) {
658 assert(s);
659
660 if (!c)
661 return NULL;
662
663 assert(c->n_ref > 0);
664 assert(!c->in_lru);
665
666 c->n_ref--;
667 if (c->n_ref > 0)
668 return NULL;
669
670 /* The entry is not pinned anymore, let's add it to the LRU prioq if we can. If we can't we'll drop it
671 * right-away */
672
673 if (prioq_put(s->client_contexts_lru, c, &c->lru_index) < 0)
674 client_context_free(s, c);
675 else
676 c->in_lru = true;
677
678 return NULL;
679}
680
/* Pins the cache entries for our own process and for PID 1, so they are always available —
 * our own context in particular is needed for generating driver messages. Both acquisitions
 * are best-effort: failures are logged and ignored. */
void client_context_acquire_default(Server *s) {
        int r;

        assert(s);

        /* Ensure that our own and PID1's contexts are always pinned. Our own context is particularly useful to
         * generate driver messages. */

        if (!s->my_context) {
                /* We know our own credentials, so pass them in directly instead of reading /proc. */
                struct ucred ucred = {
                        .pid = getpid_cached(),
                        .uid = getuid(),
                        .gid = getgid(),
                };

                r = client_context_acquire(s, ucred.pid, &ucred, NULL, 0, NULL, &s->my_context);
                if (r < 0)
                        log_warning_errno(r, "Failed to acquire our own context, ignoring: %m");
        }

        if (!s->pid1_context) {

                r = client_context_acquire(s, 1, NULL, NULL, 0, NULL, &s->pid1_context);
                if (r < 0)
                        log_warning_errno(r, "Failed to acquire PID1's context, ignoring: %m");

        }
}