fresh otherwise
Currently, exec_setup_credential() always rewrite all credentials
upon exec_invoke(), i.e. invocation of each ExecCommand, and within
a single tmpfs instance. This is problematic though:
* When writing each tmp cred file, we essentially double the size
of the credential. Therefore, if one cred is bigger than half
of CREDENTIALS_TOTAL_SIZE_MAX, confusing ENOSPC occurs (see also
https://github.com/systemd/systemd/pull/24734#issuecomment-
1925440546)
* Credential is a unit-wide thing and thus should not change
during the whole lifetime of main process. However, if e.g.
a on-disk credential or SetCredential= in unit file
changes between ExecStart= and ExecStartPost=,
the credentials are overwritten when the latter gets to run,
and the already-running main process is suddenly seeing
completely different creds.
So, let's try to reuse final cred dir if the main process has started
and the tmpfs has been populated, so that the creds used is stable
across all ExecStart= and ExecStartPost=-s. We still want to retain
the ability of updating creds through ExecStartPre= though, therefore
we forcibly use a fresh cred dir for those. 'Fresh' means to actually
unmount the old tmpfs first, so the first problem goes away, too.
bool exec_params_need_credentials(const ExecParameters *p) {
assert(p);
- return FLAGS_SET(p->flags, EXEC_SETUP_CREDENTIALS);
+ return p->flags & (EXEC_SETUP_CREDENTIALS|EXEC_SETUP_CREDENTIALS_FRESH);
}
bool exec_context_has_credentials(const ExecContext *c) {
uid_t uid,
gid_t gid) {
+ bool final_mounted;
int r, workspace_mounted; /* negative if we don't know yet whether we have/can mount something; true
* if we mounted something; false if we definitely can't mount anything */
- bool final_mounted;
- const char *where;
assert(context);
assert(params);
assert(final);
assert(workspace);
+ r = path_is_mount_point(final);
+ if (r < 0)
+ return r;
+ final_mounted = r > 0;
+
+ if (final_mounted) {
+ if (FLAGS_SET(params->flags, EXEC_SETUP_CREDENTIALS_FRESH)) {
+ r = umount_verbose(LOG_DEBUG, final, MNT_DETACH|UMOUNT_NOFOLLOW);
+ if (r < 0)
+ return r;
+
+ final_mounted = false;
+ } else {
+ /* We can reuse the previous credential dir */
+ r = dir_is_empty(final, /* ignore_hidden_or_backup = */ false);
+ if (r < 0)
+ return r;
+ if (r == 0) {
+ log_debug("Credential dir for unit '%s' already set up, skipping.", unit);
+ return 0;
+ }
+ }
+ }
+
if (reuse_workspace) {
r = path_is_mount_point(workspace);
if (r < 0)
} else
workspace_mounted = -1; /* ditto */
- r = path_is_mount_point(final);
- if (r < 0)
- return r;
- if (r > 0) {
- /* If the final place already has something mounted, we use that. If the workspace also has
- * something mounted we assume it's actually the same mount (but with MS_RDONLY
- * different). */
- final_mounted = true;
-
- if (workspace_mounted < 0) {
- /* If the final place is mounted, but the workspace isn't, then let's bind mount
- * the final version to the workspace, and make it writable, so that we can make
- * changes */
-
- r = mount_nofollow_verbose(LOG_DEBUG, final, workspace, NULL, MS_BIND|MS_REC, NULL);
- if (r < 0)
- return r;
-
- r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|credentials_fs_mount_flags(/* ro= */ false), NULL);
- if (r < 0)
- return r;
-
- workspace_mounted = true;
- }
- } else
- final_mounted = false;
+ /* If both the final place and the workspace are mounted, we have no mounts to set up, based on
+ * the assumption that they're actually the same tmpfs (but the latter with MS_RDONLY different).
+ * If the workspace is not mounted, we just bind the final place over and make it writable. */
+ must_mount = must_mount || final_mounted;
if (workspace_mounted < 0) {
- /* Nothing is mounted on the workspace yet, let's try to mount something now */
-
- r = mount_credentials_fs(workspace, CREDENTIALS_TOTAL_SIZE_MAX, /* ro= */ false);
- if (r < 0) {
- /* If that didn't work, try to make a bind mount from the final to the workspace, so
- * that we can make it writable there. */
+ if (!final_mounted)
+ /* Nothing is mounted on the workspace yet, let's try to mount a new tmpfs if
+ * not using the final place. */
+ r = mount_credentials_fs(workspace, CREDENTIALS_TOTAL_SIZE_MAX, /* ro= */ false);
+ if (final_mounted || r < 0) {
+ /* If using final place or failed to mount new tmpfs, make a bind mount from
+ * the final to the workspace, so that we can make it writable there. */
r = mount_nofollow_verbose(LOG_DEBUG, final, workspace, NULL, MS_BIND|MS_REC, NULL);
if (r < 0) {
if (!ERRNO_IS_PRIVILEGE(r))
return r;
/* If we lack privileges to bind mount stuff, then let's gracefully proceed
- * for compat with container envs, and just use the final dir as is. */
+ * for compat with container envs, and just use the final dir as is.
+ * Final place must not be mounted in this case (refused by must_mount
+ * above) */
workspace_mounted = false;
} else {
/* Make the new bind mount writable (i.e. drop MS_RDONLY) */
- r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|credentials_fs_mount_flags(/* ro= */ false), NULL);
+ r = mount_nofollow_verbose(LOG_DEBUG,
+ NULL,
+ workspace,
+ NULL,
+ MS_BIND|MS_REMOUNT|credentials_fs_mount_flags(/* ro= */ false),
+ NULL);
if (r < 0)
return r;
workspace_mounted = true;
}
- assert(!must_mount || workspace_mounted > 0);
- where = workspace_mounted ? workspace : final;
+ assert(workspace_mounted >= 0);
+ assert(!must_mount || workspace_mounted);
+
+ const char *where = workspace_mounted ? workspace : final;
(void) label_fix_full(AT_FDCWD, where, final, 0);
r = acquire_credentials(context, params, unit, where, uid, gid, workspace_mounted);
- if (r < 0)
- return r;
-
- if (workspace_mounted) {
- bool install;
-
- /* Determine if we should actually install the prepared mount in the final location by bind
- * mounting it there. We do so only if the mount is not established there already, and if the
- * mount is actually non-empty (i.e. carries at least one credential). Not that in the best
- * case we are doing all this in a mount namespace, thus no one else will see that we
- * allocated a file system we are getting rid of again here. */
+ if (r < 0) {
+ /* If we're using final place as workspace, and failed to acquire credentials, we might
+ * have left half-written creds there. Let's get rid of the whole mount, so future
+ * calls won't reuse it. */
if (final_mounted)
- install = false; /* already installed */
- else {
- r = dir_is_empty(where, /* ignore_hidden_or_backup= */ false);
- if (r < 0)
- return r;
+ (void) umount_verbose(LOG_DEBUG, final, MNT_DETACH|UMOUNT_NOFOLLOW);
- install = r == 0; /* install only if non-empty */
- }
+ return r;
+ }
- if (install) {
+ if (workspace_mounted) {
+ if (!final_mounted) {
/* Make workspace read-only now, so that any bind mount we make from it defaults to
* read-only too */
r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|credentials_fs_mount_flags(/* ro= */ true), NULL);
/* And mount it to the final place, read-only */
r = mount_nofollow_verbose(LOG_DEBUG, workspace, final, NULL, MS_MOVE, NULL);
} else
- /* Otherwise get rid of it */
+ /* Otherwise we just get rid of the bind mount of final place */
r = umount_verbose(LOG_DEBUG, workspace, MNT_DETACH|UMOUNT_NOFOLLOW);
if (r < 0)
return r;
}
typedef enum ExecFlags {
- EXEC_APPLY_SANDBOXING = 1 << 0,
- EXEC_APPLY_CHROOT = 1 << 1,
- EXEC_APPLY_TTY_STDIN = 1 << 2,
- EXEC_PASS_LOG_UNIT = 1 << 3, /* Whether to pass the unit name to the service's journal stream connection */
- EXEC_CHOWN_DIRECTORIES = 1 << 4, /* chown() the runtime/state/cache/log directories to the user we run as, under all conditions */
- EXEC_NSS_DYNAMIC_BYPASS = 1 << 5, /* Set the SYSTEMD_NSS_DYNAMIC_BYPASS environment variable, to disable nss-systemd blocking on PID 1, for use by dbus-daemon */
- EXEC_CGROUP_DELEGATE = 1 << 6,
- EXEC_IS_CONTROL = 1 << 7,
- EXEC_CONTROL_CGROUP = 1 << 8, /* Place the process not in the indicated cgroup but in a subcgroup '/.control', but only EXEC_CGROUP_DELEGATE and EXEC_IS_CONTROL is set, too */
- EXEC_SETUP_CREDENTIALS = 1 << 9, /* Set up the credential store logic */
+ EXEC_APPLY_SANDBOXING = 1 << 0,
+ EXEC_APPLY_CHROOT = 1 << 1,
+ EXEC_APPLY_TTY_STDIN = 1 << 2,
+ EXEC_PASS_LOG_UNIT = 1 << 3, /* Whether to pass the unit name to the service's journal stream connection */
+ EXEC_CHOWN_DIRECTORIES = 1 << 4, /* chown() the runtime/state/cache/log directories to the user we run as, under all conditions */
+ EXEC_NSS_DYNAMIC_BYPASS = 1 << 5, /* Set the SYSTEMD_NSS_DYNAMIC_BYPASS environment variable, to disable nss-systemd blocking on PID 1, for use by dbus-daemon */
+ EXEC_CGROUP_DELEGATE = 1 << 6,
+ EXEC_IS_CONTROL = 1 << 7,
+ EXEC_CONTROL_CGROUP = 1 << 8, /* Place the process not in the indicated cgroup but in a subcgroup '/.control', but only EXEC_CGROUP_DELEGATE and EXEC_IS_CONTROL is set, too */
+ EXEC_SETUP_CREDENTIALS = 1 << 9, /* Set up the credential store logic */
+ EXEC_SETUP_CREDENTIALS_FRESH = 1 << 10, /* Set up a new credential store (disable reuse) */
/* The following are not used by execute.c, but by consumers internally */
- EXEC_PASS_FDS = 1 << 10,
- EXEC_SETENV_RESULT = 1 << 11,
- EXEC_SET_WATCHDOG = 1 << 12,
- EXEC_SETENV_MONITOR_RESULT = 1 << 13, /* Pass exit status to OnFailure= and OnSuccess= dependencies. */
+ EXEC_PASS_FDS = 1 << 11,
+ EXEC_SETENV_RESULT = 1 << 12,
+ EXEC_SET_WATCHDOG = 1 << 13,
+ EXEC_SETENV_MONITOR_RESULT = 1 << 14, /* Pass exit status to OnFailure= and OnSuccess= dependencies. */
} ExecFlags;
/* Parameters for a specific invocation of a command. This structure is put together right before a command is
return NULL;
}
-static ExecFlags service_exec_flags(ServiceExecCommand command_id) {
+static ExecFlags service_exec_flags(ServiceExecCommand command_id, ExecFlags cred_flag) {
/* All service main/control processes honor sandboxing and namespacing options (except those
explicitly excluded in service_spawn()) */
ExecFlags flags = EXEC_APPLY_SANDBOXING|EXEC_APPLY_CHROOT;
assert(command_id >= 0);
assert(command_id < _SERVICE_EXEC_COMMAND_MAX);
+ assert((cred_flag & ~(EXEC_SETUP_CREDENTIALS_FRESH|EXEC_SETUP_CREDENTIALS)) == 0);
+ assert((cred_flag != 0) == (command_id == SERVICE_EXEC_START));
/* Control processes spawned before main process also get tty access */
if (IN_SET(command_id, SERVICE_EXEC_CONDITION, SERVICE_EXEC_START_PRE, SERVICE_EXEC_START))
flags |= EXEC_APPLY_TTY_STDIN;
- /* All start phases get access to credentials */
- if (IN_SET(command_id, SERVICE_EXEC_START_PRE, SERVICE_EXEC_START, SERVICE_EXEC_START_POST))
+ /* All start phases get access to credentials. ExecStartPre= gets a new credential store upon
+ * every invocation, so that updating credential files through it works. When the first main process
+ * starts, passed creds become stable. Also see 'cred_flag'. */
+ if (command_id == SERVICE_EXEC_START_PRE)
+ flags |= EXEC_SETUP_CREDENTIALS_FRESH;
+ if (command_id == SERVICE_EXEC_START_POST)
flags |= EXEC_SETUP_CREDENTIALS;
if (IN_SET(command_id, SERVICE_EXEC_START_PRE, SERVICE_EXEC_START))
flags |= EXEC_SETENV_MONITOR_RESULT;
if (command_id == SERVICE_EXEC_START)
- return flags|EXEC_PASS_FDS|EXEC_SET_WATCHDOG;
+ return flags|cred_flag|EXEC_PASS_FDS|EXEC_SET_WATCHDOG;
flags |= EXEC_IS_CONTROL;
static int service_spawn_internal(
const char *caller,
Service *s,
- ServiceExecCommand command_id,
ExecCommand *c,
+ ExecFlags flags,
usec_t timeout,
PidRef *ret_pid) {
- _cleanup_(exec_params_shallow_clear) ExecParameters exec_params =
- EXEC_PARAMETERS_INIT(service_exec_flags(command_id));
+ _cleanup_(exec_params_shallow_clear) ExecParameters exec_params = EXEC_PARAMETERS_INIT(flags);
_cleanup_(sd_event_source_unrefp) sd_event_source *exec_fd_source = NULL;
_cleanup_strv_free_ char **final_env = NULL, **our_env = NULL;
_cleanup_(pidref_done) PidRef pidref = PIDREF_NULL;
pidref_done(&s->control_pid);
r = service_spawn(s,
- s->control_command_id,
s->control_command,
+ service_exec_flags(s->control_command_id, /* cred_flag = */ 0),
s->timeout_stop_usec,
&s->control_pid);
if (r < 0) {
pidref_done(&s->control_pid);
r = service_spawn(s,
- s->control_command_id,
s->control_command,
+ service_exec_flags(s->control_command_id, /* cred_flag = */ 0),
s->timeout_stop_usec,
&s->control_pid);
if (r < 0) {
pidref_done(&s->control_pid);
r = service_spawn(s,
- s->control_command_id,
s->control_command,
+ service_exec_flags(s->control_command_id, /* cred_flag = */ 0),
s->timeout_start_usec,
&s->control_pid);
if (r < 0) {
timeout = s->timeout_start_usec;
r = service_spawn(s,
- SERVICE_EXEC_START,
c,
+ service_exec_flags(SERVICE_EXEC_START, EXEC_SETUP_CREDENTIALS_FRESH),
timeout,
&pidref);
if (r < 0) {
s->control_command_id = SERVICE_EXEC_START_PRE;
r = service_spawn(s,
- s->control_command_id,
s->control_command,
+ service_exec_flags(s->control_command_id, /* cred_flag = */ 0),
s->timeout_start_usec,
&s->control_pid);
if (r < 0) {
pidref_done(&s->control_pid);
r = service_spawn(s,
- s->control_command_id,
s->control_command,
+ service_exec_flags(s->control_command_id, /* cred_flag = */ 0),
s->timeout_start_usec,
&s->control_pid);
pidref_done(&s->control_pid);
r = service_spawn(s,
- s->control_command_id,
s->control_command,
+ service_exec_flags(s->control_command_id, /* cred_flag = */ 0),
s->timeout_start_usec,
&s->control_pid);
if (r < 0) {
pidref_done(&s->control_pid);
r = service_spawn(s,
- s->control_command_id,
s->control_command,
+ service_exec_flags(s->control_command_id, /* cred_flag = */ 0),
timeout,
&s->control_pid);
if (r < 0) {
service_unwatch_main_pid(s);
r = service_spawn(s,
- SERVICE_EXEC_START,
s->main_command,
+ service_exec_flags(SERVICE_EXEC_START, EXEC_SETUP_CREDENTIALS),
s->timeout_start_usec,
&pidref);
if (r < 0) {
# shellcheck disable=SC2016
set -eux
+# shellcheck source=test/units/util.sh
+. "$(dirname "$0")"/util.sh
+
systemd-analyze log-level debug
run_with_cred_compare() {
--service-type=oneshot --wait --pipe \
true | cmp /etc/os-release
+# https://github.com/systemd/systemd/pull/24734#issuecomment-1925440546
+# Also ExecStartPre= should be able to update creds
+dd if=/dev/urandom of=/tmp/cred-huge bs=600K count=1
+chmod 777 /tmp/cred-huge
+systemd-run -p ProtectSystem=full \
+ -p 'LoadCredential=huge:/tmp/cred-huge' \
+ -p 'ExecStartPre=true' \
+ -p 'ExecStartPre=bash -c "echo fresh >/tmp/cred-huge"' \
+ --unit=test-54-huge-cred.service \
+ --wait --pipe \
+ systemd-creds cat huge | cmp - <(echo "fresh")
+rm /tmp/cred-huge
+
+echo stable >/tmp/cred-stable
+systemd-run -p 'LoadCredential=stable:/tmp/cred-stable' \
+ -p 'ExecStartPost=systemd-creds cat stable' \
+ --unit=test-54-stable.service \
+ --service-type=oneshot --wait --pipe \
+ bash -c "echo bogus >/tmp/cred-stable" | cmp - <(echo "stable")
+assert_eq "$(cat /tmp/cred-stable)" "bogus"
+rm /tmp/cred-stable
+
if ! systemd-detect-virt -q -c ; then
# Validate that the credential we inserted via the initrd logic arrived
test "$(systemd-creds cat --system myinitrdcred)" = "guatemala"