]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/journal/journald-context.c
Merge pull request #7388 from keszybz/doc-tweak
[thirdparty/systemd.git] / src / journal / journald-context.c
CommitLineData
22e3a02b
LP
1/***
2 This file is part of systemd.
3
4 Copyright 2017 Lennart Poettering
5
6 systemd is free software; you can redistribute it and/or modify it
7 under the terms of the GNU Lesser General Public License as published by
8 the Free Software Foundation; either version 2.1 of the License, or
9 (at your option) any later version.
10
11 systemd is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public License
17 along with systemd; If not, see <http://www.gnu.org/licenses/>.
18***/
19
349cc4a5 20#if HAVE_SELINUX
22e3a02b
LP
21#include <selinux/selinux.h>
22#endif
23
24#include "alloc-util.h"
25#include "audit-util.h"
26#include "cgroup-util.h"
d3070fbd
LP
27#include "fd-util.h"
28#include "fileio.h"
29#include "fs-util.h"
30#include "io-util.h"
31#include "journal-util.h"
22e3a02b
LP
32#include "journald-context.h"
33#include "process-util.h"
34#include "string-util.h"
d3070fbd
LP
35#include "syslog-util.h"
36#include "unaligned.h"
22e3a02b
LP
37#include "user-util.h"
38
39/* This implements a metadata cache for clients, which are identified by their PID. Requesting metadata through /proc
40 * is expensive, hence let's cache the data if we can. Note that this means the metadata might be out-of-date when we
41 * store it, but it might already be anyway, as we request the data asynchronously from /proc at a different time the
42 * log entry was originally created. We hence just increase the "window of inaccuracy" a bit.
43 *
44 * The cache is indexed by the PID. Entries may be "pinned" in the cache, in which case the entries are not removed
45 * until they are unpinned. Unpinned entries are kept around until cache pressure is seen. Cache entries older than 5s
46 * are never used (a sad attempt to deal with the UNIX weakness of PIDs reuse), cache entries older than 1s are
47 * refreshed in an incremental way (meaning: data is reread from /proc, but any old data we can't refresh is not
48 * flushed out). Data newer than 1s is used immediately without refresh.
49 *
50 * Log stream clients (i.e. all clients using the AF_UNIX/SOCK_STREAM stdout/stderr transport) will pin a cache entry
51 * as long as their socket is connected. Note that cache entries are shared between different transports. That means a
52 * cache entry pinned for the stream connection logic may be reused for the syslog or native protocols.
53 *
54 * Caching metadata like this has two major benefits:
55 *
56 * 1. Reading metadata is expensive, and we can thus substantially speed up log processing under flood.
57 *
58 * 2. Because metadata caching is shared between stream and datagram transports and stream connections pin a cache
59 * entry there's a good chance we can properly map a substantial set of datagram log messages to their originating
60 * service, as all services (unless explicitly configured otherwise) will have their stdout/stderr connected to a
61 * stream connection. This should improve cases where a service process logs immediately before exiting and we
62 * previously had trouble associating the log message with the service.
63 *
64 * NB: With and without the metadata cache: the implicitly added entry metadata in the journal (with the exception of
 65 * UID/PID/GID and SELinux label) must be understood as possibly slightly out of sync (i.e. sometimes slightly older
66 * and sometimes slightly newer than what was current at the log event).
67 */
68
69/* We refresh every 1s */
70#define REFRESH_USEC (1*USEC_PER_SEC)
71
72/* Data older than 5s we flush out */
73#define MAX_USEC (5*USEC_PER_SEC)
74
75/* Keep at most 16K entries in the cache. (Note though that this limit may be violated if enough streams pin entries in
76 * the cache, in which case we *do* permit this limit to be breached. That's safe however, as the number of stream
77 * clients itself is limited.) */
78#define CACHE_MAX (16*1024)
79
80static int client_context_compare(const void *a, const void *b) {
81 const ClientContext *x = a, *y = b;
82
83 if (x->timestamp < y->timestamp)
84 return -1;
85 if (x->timestamp > y->timestamp)
86 return 1;
87
88 if (x->pid < y->pid)
89 return -1;
90 if (x->pid > y->pid)
91 return 1;
92
93 return 0;
94}
95
96static int client_context_new(Server *s, pid_t pid, ClientContext **ret) {
97 ClientContext *c;
98 int r;
99
100 assert(s);
101 assert(pid_is_valid(pid));
102 assert(ret);
103
104 r = hashmap_ensure_allocated(&s->client_contexts, NULL);
105 if (r < 0)
106 return r;
107
108 r = prioq_ensure_allocated(&s->client_contexts_lru, client_context_compare);
109 if (r < 0)
110 return r;
111
112 c = new0(ClientContext, 1);
113 if (!c)
114 return -ENOMEM;
115
116 c->pid = pid;
117
118 c->uid = UID_INVALID;
119 c->gid = GID_INVALID;
120 c->auditid = AUDIT_SESSION_INVALID;
121 c->loginuid = UID_INVALID;
122 c->owner_uid = UID_INVALID;
123 c->lru_index = PRIOQ_IDX_NULL;
124 c->timestamp = USEC_INFINITY;
d3070fbd
LP
125 c->extra_fields_mtime = NSEC_INFINITY;
126 c->log_level_max = -1;
22e3a02b
LP
127
128 r = hashmap_put(s->client_contexts, PID_TO_PTR(pid), c);
129 if (r < 0) {
130 free(c);
131 return r;
132 }
133
134 *ret = c;
135 return 0;
136}
137
138static void client_context_reset(ClientContext *c) {
139 assert(c);
140
141 c->timestamp = USEC_INFINITY;
142
143 c->uid = UID_INVALID;
144 c->gid = GID_INVALID;
145
146 c->comm = mfree(c->comm);
147 c->exe = mfree(c->exe);
148 c->cmdline = mfree(c->cmdline);
149 c->capeff = mfree(c->capeff);
150
151 c->auditid = AUDIT_SESSION_INVALID;
152 c->loginuid = UID_INVALID;
153
154 c->cgroup = mfree(c->cgroup);
155 c->session = mfree(c->session);
156 c->owner_uid = UID_INVALID;
157 c->unit = mfree(c->unit);
158 c->user_unit = mfree(c->user_unit);
159 c->slice = mfree(c->slice);
160 c->user_slice = mfree(c->user_slice);
161
162 c->invocation_id = SD_ID128_NULL;
163
164 c->label = mfree(c->label);
165 c->label_size = 0;
d3070fbd
LP
166
167 c->extra_fields_iovec = mfree(c->extra_fields_iovec);
168 c->extra_fields_n_iovec = 0;
169 c->extra_fields_data = mfree(c->extra_fields_data);
170 c->extra_fields_mtime = NSEC_INFINITY;
171
172 c->log_level_max = -1;
22e3a02b
LP
173}
174
175static ClientContext* client_context_free(Server *s, ClientContext *c) {
176 assert(s);
177
178 if (!c)
179 return NULL;
180
181 assert_se(hashmap_remove(s->client_contexts, PID_TO_PTR(c->pid)) == c);
182
183 if (c->in_lru)
184 assert_se(prioq_remove(s->client_contexts_lru, c, &c->lru_index) >= 0);
185
186 client_context_reset(c);
187
188 return mfree(c);
189}
190
191static void client_context_read_uid_gid(ClientContext *c, const struct ucred *ucred) {
192 assert(c);
193 assert(pid_is_valid(c->pid));
194
195 /* The ucred data passed in is always the most current and accurate, if we have any. Use it. */
196 if (ucred && uid_is_valid(ucred->uid))
197 c->uid = ucred->uid;
198 else
199 (void) get_process_uid(c->pid, &c->uid);
200
201 if (ucred && gid_is_valid(ucred->gid))
202 c->gid = ucred->gid;
203 else
204 (void) get_process_gid(c->pid, &c->gid);
205}
206
207static void client_context_read_basic(ClientContext *c) {
208 char *t;
209
210 assert(c);
211 assert(pid_is_valid(c->pid));
212
213 if (get_process_comm(c->pid, &t) >= 0)
214 free_and_replace(c->comm, t);
215
216 if (get_process_exe(c->pid, &t) >= 0)
217 free_and_replace(c->exe, t);
218
219 if (get_process_cmdline(c->pid, 0, false, &t) >= 0)
220 free_and_replace(c->cmdline, t);
221
222 if (get_process_capeff(c->pid, &t) >= 0)
223 free_and_replace(c->capeff, t);
224}
225
226static int client_context_read_label(
227 ClientContext *c,
228 const char *label, size_t label_size) {
229
230 assert(c);
231 assert(pid_is_valid(c->pid));
232 assert(label_size == 0 || label);
233
234 if (label_size > 0) {
235 char *l;
236
237 /* If we got an SELinux label passed in it counts. */
238
239 l = newdup_suffix0(char, label, label_size);
240 if (!l)
241 return -ENOMEM;
242
243 free_and_replace(c->label, l);
244 c->label_size = label_size;
245 }
349cc4a5 246#if HAVE_SELINUX
22e3a02b
LP
247 else {
248 char *con;
249
250 /* If we got no SELinux label passed in, let's try to acquire one */
251
252 if (getpidcon(c->pid, &con) >= 0) {
253 free_and_replace(c->label, con);
254 c->label_size = strlen(c->label);
255 }
256 }
257#endif
258
259 return 0;
260}
261
262static int client_context_read_cgroup(Server *s, ClientContext *c, const char *unit_id) {
263 char *t = NULL;
264 int r;
265
266 assert(c);
267
268 /* Try to acquire the current cgroup path */
269 r = cg_pid_get_path_shifted(c->pid, s->cgroup_root, &t);
270 if (r < 0) {
271
272 /* If that didn't work, we use the unit ID passed in as fallback, if we have nothing cached yet */
273 if (unit_id && !c->unit) {
274 c->unit = strdup(unit_id);
275 if (c->unit)
276 return 0;
277 }
278
279 return r;
280 }
281
282 /* Let's shortcut this if the cgroup path didn't change */
283 if (streq_ptr(c->cgroup, t)) {
284 free(t);
285 return 0;
286 }
287
288 free_and_replace(c->cgroup, t);
289
290 (void) cg_path_get_session(c->cgroup, &t);
291 free_and_replace(c->session, t);
292
293 if (cg_path_get_owner_uid(c->cgroup, &c->owner_uid) < 0)
294 c->owner_uid = UID_INVALID;
295
296 (void) cg_path_get_unit(c->cgroup, &t);
297 free_and_replace(c->unit, t);
298
299 (void) cg_path_get_user_unit(c->cgroup, &t);
300 free_and_replace(c->user_unit, t);
301
302 (void) cg_path_get_slice(c->cgroup, &t);
303 free_and_replace(c->slice, t);
304
305 (void) cg_path_get_user_slice(c->cgroup, &t);
306 free_and_replace(c->user_slice, t);
307
308 return 0;
309}
310
311static int client_context_read_invocation_id(
312 Server *s,
313 ClientContext *c) {
314
d3070fbd 315 _cleanup_free_ char *value = NULL;
22e3a02b
LP
316 const char *p;
317 int r;
318
319 assert(s);
320 assert(c);
321
d3070fbd 322 /* Read the invocation ID of a unit off a unit. PID 1 stores it in a per-unit symlink in /run/systemd/units/ */
22e3a02b 323
d3070fbd 324 if (!c->unit)
22e3a02b
LP
325 return 0;
326
d3070fbd
LP
327 p = strjoina("/run/systemd/units/invocation:", c->unit);
328 r = readlink_malloc(p, &value);
22e3a02b
LP
329 if (r < 0)
330 return r;
331
d3070fbd
LP
332 return sd_id128_from_string(value, &c->invocation_id);
333}
22e3a02b 334
d3070fbd
LP
335static int client_context_read_log_level_max(
336 Server *s,
337 ClientContext *c) {
22e3a02b 338
d3070fbd
LP
339 _cleanup_free_ char *value = NULL;
340 const char *p;
341 int r, ll;
342
343 if (!c->unit)
344 return 0;
345
346 p = strjoina("/run/systemd/units/log-level-max:", c->unit);
347 r = readlink_malloc(p, &value);
22e3a02b
LP
348 if (r < 0)
349 return r;
d3070fbd
LP
350
351 ll = log_level_from_string(value);
352 if (ll < 0)
22e3a02b 353 return -EINVAL;
22e3a02b 354
d3070fbd
LP
355 c->log_level_max = ll;
356 return 0;
357}
358
359static int client_context_read_extra_fields(
360 Server *s,
361 ClientContext *c) {
362
363 size_t size = 0, n_iovec = 0, n_allocated = 0, left;
364 _cleanup_free_ struct iovec *iovec = NULL;
365 _cleanup_free_ void *data = NULL;
366 _cleanup_fclose_ FILE *f = NULL;
367 struct stat st;
368 const char *p;
369 uint8_t *q;
370 int r;
371
372 if (!c->unit)
373 return 0;
374
375 p = strjoina("/run/systemd/units/log-extra-fields:", c->unit);
376
377 if (c->extra_fields_mtime != NSEC_INFINITY) {
378 if (stat(p, &st) < 0) {
379 if (errno == ENOENT)
380 return 0;
381
382 return -errno;
383 }
384
385 if (timespec_load_nsec(&st.st_mtim) == c->extra_fields_mtime)
386 return 0;
387 }
388
389 f = fopen(p, "re");
390 if (!f) {
391 if (errno == ENOENT)
392 return 0;
393
394 return -errno;
395 }
396
397 if (fstat(fileno(f), &st) < 0) /* The file might have been replaced since the stat() above, let's get a new
398 * one, that matches the stuff we are reading */
399 return -errno;
400
401 r = read_full_stream(f, (char**) &data, &size);
402 if (r < 0)
403 return r;
404
405 q = data, left = size;
406 while (left > 0) {
407 uint8_t *field, *eq;
408 uint64_t v, n;
409
410 if (left < sizeof(uint64_t))
411 return -EBADMSG;
412
413 v = unaligned_read_le64(q);
414 if (v < 2)
415 return -EBADMSG;
416
417 n = sizeof(uint64_t) + v;
418 if (left < n)
419 return -EBADMSG;
420
421 field = q + sizeof(uint64_t);
422
423 eq = memchr(field, '=', v);
424 if (!eq)
425 return -EBADMSG;
426
427 if (!journal_field_valid((const char *) field, eq - field, false))
428 return -EBADMSG;
429
430 if (!GREEDY_REALLOC(iovec, n_allocated, n_iovec+1))
431 return -ENOMEM;
432
433 iovec[n_iovec++] = IOVEC_MAKE(field, v);
434
435 left -= n, q += n;
436 }
437
438 free(c->extra_fields_iovec);
439 free(c->extra_fields_data);
440
441 c->extra_fields_iovec = iovec;
442 c->extra_fields_n_iovec = n_iovec;
443 c->extra_fields_data = data;
444 c->extra_fields_mtime = timespec_load_nsec(&st.st_mtim);
445
446 iovec = NULL;
447 data = NULL;
448
449 return 0;
22e3a02b
LP
450}
451
452static void client_context_really_refresh(
453 Server *s,
454 ClientContext *c,
455 const struct ucred *ucred,
456 const char *label, size_t label_size,
457 const char *unit_id,
458 usec_t timestamp) {
459
460 assert(s);
461 assert(c);
462 assert(pid_is_valid(c->pid));
463
464 if (timestamp == USEC_INFINITY)
465 timestamp = now(CLOCK_MONOTONIC);
466
467 client_context_read_uid_gid(c, ucred);
468 client_context_read_basic(c);
469 (void) client_context_read_label(c, label, label_size);
470
471 (void) audit_session_from_pid(c->pid, &c->auditid);
472 (void) audit_loginuid_from_pid(c->pid, &c->loginuid);
473
474 (void) client_context_read_cgroup(s, c, unit_id);
475 (void) client_context_read_invocation_id(s, c);
d3070fbd
LP
476 (void) client_context_read_log_level_max(s, c);
477 (void) client_context_read_extra_fields(s, c);
22e3a02b
LP
478
479 c->timestamp = timestamp;
480
481 if (c->in_lru) {
482 assert(c->n_ref == 0);
483 assert_se(prioq_reshuffle(s->client_contexts_lru, c, &c->lru_index) >= 0);
484 }
485}
486
487void client_context_maybe_refresh(
488 Server *s,
489 ClientContext *c,
490 const struct ucred *ucred,
491 const char *label, size_t label_size,
492 const char *unit_id,
493 usec_t timestamp) {
494
495 assert(s);
496 assert(c);
497
498 if (timestamp == USEC_INFINITY)
499 timestamp = now(CLOCK_MONOTONIC);
500
501 /* No cached data so far? Let's fill it up */
502 if (c->timestamp == USEC_INFINITY)
503 goto refresh;
504
505 /* If the data isn't pinned and if the cashed data is older than the upper limit, we flush it out
506 * entirely. This follows the logic that as long as an entry is pinned the PID reuse is unlikely. */
507 if (c->n_ref == 0 && c->timestamp + MAX_USEC < timestamp) {
508 client_context_reset(c);
509 goto refresh;
510 }
511
512 /* If the data is older than the lower limit, we refresh, but keep the old data for all we can't update */
513 if (c->timestamp + REFRESH_USEC < timestamp)
514 goto refresh;
515
516 /* If the data passed along doesn't match the cached data we also do a refresh */
517 if (ucred && uid_is_valid(ucred->uid) && c->uid != ucred->uid)
518 goto refresh;
519
520 if (ucred && gid_is_valid(ucred->gid) && c->gid != ucred->gid)
521 goto refresh;
522
523 if (label_size > 0 && (label_size != c->label_size || memcmp(label, c->label, label_size) != 0))
524 goto refresh;
525
526 return;
527
528refresh:
529 client_context_really_refresh(s, c, ucred, label, label_size, unit_id, timestamp);
530}
531
532static void client_context_try_shrink_to(Server *s, size_t limit) {
533 assert(s);
534
535 /* Bring the number of cache entries below the indicated limit, so that we can create a new entry without
536 * breaching the limit. Note that we only flush out entries that aren't pinned here. This means the number of
537 * cache entries may very well grow beyond the limit, if all entries stored remain pinned. */
538
539 while (hashmap_size(s->client_contexts) > limit) {
540 ClientContext *c;
541
542 c = prioq_pop(s->client_contexts_lru);
543 if (!c)
544 break; /* All remaining entries are pinned, give up */
545
546 assert(c->in_lru);
547 assert(c->n_ref == 0);
548
549 c->in_lru = false;
550
551 client_context_free(s, c);
552 }
553}
554
555void client_context_flush_all(Server *s) {
556 assert(s);
557
558 /* Flush out all remaining entries. This assumes all references are already dropped. */
559
560 s->my_context = client_context_release(s, s->my_context);
561 s->pid1_context = client_context_release(s, s->pid1_context);
562
563 client_context_try_shrink_to(s, 0);
564
565 assert(prioq_size(s->client_contexts_lru) == 0);
566 assert(hashmap_size(s->client_contexts) == 0);
567
568 s->client_contexts_lru = prioq_free(s->client_contexts_lru);
569 s->client_contexts = hashmap_free(s->client_contexts);
570}
571
572static int client_context_get_internal(
573 Server *s,
574 pid_t pid,
575 const struct ucred *ucred,
576 const char *label, size_t label_len,
577 const char *unit_id,
578 bool add_ref,
579 ClientContext **ret) {
580
581 ClientContext *c;
582 int r;
583
584 assert(s);
585 assert(ret);
586
587 if (!pid_is_valid(pid))
588 return -EINVAL;
589
590 c = hashmap_get(s->client_contexts, PID_TO_PTR(pid));
591 if (c) {
592
593 if (add_ref) {
594 if (c->in_lru) {
595 /* The entry wasn't pinned so far, let's remove it from the LRU list then */
596 assert(c->n_ref == 0);
597 assert_se(prioq_remove(s->client_contexts_lru, c, &c->lru_index) >= 0);
598 c->in_lru = false;
599 }
600
601 c->n_ref++;
602 }
603
604 client_context_maybe_refresh(s, c, ucred, label, label_len, unit_id, USEC_INFINITY);
605
606 *ret = c;
607 return 0;
608 }
609
610 client_context_try_shrink_to(s, CACHE_MAX-1);
611
612 r = client_context_new(s, pid, &c);
613 if (r < 0)
614 return r;
615
616 if (add_ref)
617 c->n_ref++;
618 else {
619 r = prioq_put(s->client_contexts_lru, c, &c->lru_index);
620 if (r < 0) {
621 client_context_free(s, c);
622 return r;
623 }
624
625 c->in_lru = true;
626 }
627
628 client_context_really_refresh(s, c, ucred, label, label_len, unit_id, USEC_INFINITY);
629
630 *ret = c;
631 return 0;
632}
633
634int client_context_get(
635 Server *s,
636 pid_t pid,
637 const struct ucred *ucred,
638 const char *label, size_t label_len,
639 const char *unit_id,
640 ClientContext **ret) {
641
642 return client_context_get_internal(s, pid, ucred, label, label_len, unit_id, false, ret);
643}
644
645int client_context_acquire(
646 Server *s,
647 pid_t pid,
648 const struct ucred *ucred,
649 const char *label, size_t label_len,
650 const char *unit_id,
651 ClientContext **ret) {
652
653 return client_context_get_internal(s, pid, ucred, label, label_len, unit_id, true, ret);
654};
655
656ClientContext *client_context_release(Server *s, ClientContext *c) {
657 assert(s);
658
659 if (!c)
660 return NULL;
661
662 assert(c->n_ref > 0);
663 assert(!c->in_lru);
664
665 c->n_ref--;
666 if (c->n_ref > 0)
667 return NULL;
668
669 /* The entry is not pinned anymore, let's add it to the LRU prioq if we can. If we can't we'll drop it
670 * right-away */
671
672 if (prioq_put(s->client_contexts_lru, c, &c->lru_index) < 0)
673 client_context_free(s, c);
674 else
675 c->in_lru = true;
676
677 return NULL;
678}
679
680void client_context_acquire_default(Server *s) {
681 int r;
682
683 assert(s);
684
685 /* Ensure that our own and PID1's contexts are always pinned. Our own context is particularly useful to
686 * generate driver messages. */
687
688 if (!s->my_context) {
689 struct ucred ucred = {
690 .pid = getpid_cached(),
691 .uid = getuid(),
692 .gid = getgid(),
693 };
694
695 r = client_context_acquire(s, ucred.pid, &ucred, NULL, 0, NULL, &s->my_context);
696 if (r < 0)
697 log_warning_errno(r, "Failed to acquire our own context, ignoring: %m");
698 }
699
700 if (!s->pid1_context) {
701
702 r = client_context_acquire(s, 1, NULL, NULL, 0, NULL, &s->pid1_context);
703 if (r < 0)
704 log_warning_errno(r, "Failed to acquire PID1's context, ignoring: %m");
705
706 }
707}