LGPL-2.1-or-later for all code, exceptions noted in LICENSES/README.md
REQUIREMENTS:
- Linux kernel ≥ 3.15
- ≥ 4.3 for ambient capabilities
+ Linux kernel ≥ 4.3
≥ 4.5 for pids controller in cgroup v2
≥ 4.6 for cgroup namespaces
≥ 4.9 for RENAME_NOREPLACE support in vfat
≥ 5.4 for pidfd and signed Verity images
≥ 5.7 for CLONE_INTO_CGROUP, BPF links and the BPF LSM hook
- ⛔ Kernel versions below 3.15 ("minimum baseline") are not supported at
+ ⛔ Kernel versions below 4.3 ("minimum baseline") are not supported at
all, and are missing required functionality (e.g. CLOCK_BOOTTIME
- support for timerfd_create()).
+ support for timerfd_create() or ambient capabilities).
⚠️ Kernel versions below 5.4 ("recommended baseline") have significant
gaps in functionality and are not recommended for use with this version
and then rework cgroupsv2 support around fds, i.e. keep one fd per active
unit around, and always operate on that, instead of cgroup fs paths.
-* drop support for kernels that lack ambient capabilities support (i.e. make
- 4.3 new baseline). Then drop support for "!!" modifier for ExecStart= which
- is only supported for such old kernels.
-
* drop support for kernels lacking memfd_create() (i.e. make 3.17 new
baseline), then drop all pipe() based fallbacks.
assigned to this option, the ambient capability set is reset to the empty capability set, and all prior
settings have no effect. If set to <literal>~</literal> (without any further argument), the ambient capability
set is reset to the full set of available capabilities, also undoing any previous settings. Note that adding
- capabilities to the ambient capability set adds them to the process's inherited capability set. </para><para>
- Ambient capability sets are useful if you want to execute a process as a non-privileged user but still want to
- give it some capabilities. Note that in this case option <constant>keep-caps</constant> is automatically added
- to <varname>SecureBits=</varname> to retain the capabilities over the user
+ capabilities to the ambient capability set adds them to the process's inheritable capability set.</para>
+
+ <para>Ambient capability sets are useful if you want to execute a process as a non-privileged user but
+ still want to give it some capabilities. Note that in this case option <constant>keep-caps</constant>
+ is automatically added to <varname>SecureBits=</varname> to retain the capabilities over the user
change. <varname>AmbientCapabilities=</varname> does not affect commands prefixed with
<literal>+</literal>.</para>
<entry>Similar to the <literal>+</literal> character discussed above this permits invoking command lines with elevated privileges. However, unlike <literal>+</literal> the <literal>!</literal> character exclusively alters the effect of <varname>User=</varname>, <varname>Group=</varname> and <varname>SupplementaryGroups=</varname>, i.e. only the stanzas that affect user and group credentials. Note that this setting may be combined with <varname>DynamicUser=</varname>, in which case a dynamic user/group pair is allocated before the command is invoked, but credential changing is left to the executed process itself.</entry>
</row>
-
- <row>
- <entry><literal>!!</literal></entry>
-
- <entry>This prefix is very similar to <literal>!</literal>, however it only has an effect on systems lacking support for ambient process capabilities, i.e. without support for <varname>AmbientCapabilities=</varname>. It's intended to be used for unit files that take benefit of ambient capabilities to run processes with minimal privileges wherever possible while remaining compatible with systems that lack ambient capabilities support. Note that when <literal>!!</literal> is used, and a system lacking ambient capability support is detected any configured <varname>SystemCallFilter=</varname> and <varname>CapabilityBoundingSet=</varname> stanzas are implicitly modified, in order to permit spawned processes to drop credentials and capabilities themselves, even if this is configured to not be allowed. Moreover, if this prefix is used and a system lacking ambient capability support is detected <varname>AmbientCapabilities=</varname> will be skipped and not be applied. On systems supporting ambient capabilities, <literal>!!</literal> has no effect and is redundant.</entry>
- </row>
</tbody>
</tgroup>
</table>
/* Add the capabilities to the ambient set (and possibly also the inheritable set) */
- /* Check that we can use PR_CAP_AMBIENT or quit early. */
- if (!ambient_capabilities_supported())
- return (set & all_capabilities()) == 0 ?
- 0 : -EOPNOTSUPP; /* if actually no ambient caps are to be set, be silent,
- * otherwise fail recognizably */
-
if (also_inherit) {
caps = cap_get_proc();
if (!caps)
return change_capability(cv, CAP_SET);
}
-bool ambient_capabilities_supported(void) {
- static int cache = -1;
-
- if (cache >= 0)
- return cache;
-
- /* If PR_CAP_AMBIENT returns something valid, or an unexpected error code we assume that ambient caps are
- * available. */
-
- cache = prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_IS_SET, CAP_KILL, 0, 0) >= 0 ||
- !IN_SET(errno, EINVAL, EOPNOTSUPP, ENOSYS);
-
- return cache;
-}
-
bool capability_quintet_mangle(CapabilityQuintet *q) {
uint64_t combined, drop = 0;
- bool ambient_supported;
assert(q);
- combined = q->effective | q->bounding | q->inheritable | q->permitted;
-
- ambient_supported = q->ambient != CAP_MASK_UNSET;
- if (ambient_supported)
- combined |= q->ambient;
+ combined = q->effective | q->bounding | q->inheritable | q->permitted | q->ambient;
for (unsigned i = 0; i <= cap_last_cap(); i++) {
unsigned long bit = UINT64_C(1) << i;
q->bounding &= ~drop;
q->inheritable &= ~drop;
q->permitted &= ~drop;
-
- if (ambient_supported)
- q->ambient &= ~drop;
+ q->ambient &= ~drop;
return drop != 0; /* Let the caller know we changed something */
}
assert(ret);
- if (!ambient_capabilities_supported()) {
- *ret = 0;
- return 0;
- }
-
for (unsigned i = 0; i <= cap_last_cap(); i++) {
r = prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_IS_SET, i, 0, 0);
if (r < 0)
return FLAGS_SET(caps, all_capabilities());
}
-bool ambient_capabilities_supported(void);
-
/* Identical to linux/capability.h's CAP_TO_MASK(), but uses an unsigned 1U instead of a signed 1 for shifting left, in
* order to avoid complaints about shifting a signed int left by 31 bits, which would make it negative. */
#define CAP_TO_MASK_CORRECTED(x) (1U << ((x) & 31U))
return strjoin(FLAGS_SET(flags, EXEC_COMMAND_IGNORE_FAILURE) ? "-" : "",
FLAGS_SET(flags, EXEC_COMMAND_NO_ENV_EXPAND) ? ":" : "",
FLAGS_SET(flags, EXEC_COMMAND_FULLY_PRIVILEGED) ? "+" : "",
- FLAGS_SET(flags, EXEC_COMMAND_NO_SETUID) ? "!" : "",
- FLAGS_SET(flags, EXEC_COMMAND_AMBIENT_MAGIC) ? "!!" : "");
+ FLAGS_SET(flags, EXEC_COMMAND_NO_SETUID) ? "!" : "");
}
int bus_set_transient_exec_command(
return true;
}
-static int apply_syscall_filter(const ExecContext *c, const ExecParameters *p, bool needs_ambient_hack) {
+static int apply_syscall_filter(const ExecContext *c, const ExecParameters *p) {
uint32_t negative_action, default_action, action;
int r;
action = negative_action;
}
- if (needs_ambient_hack) {
- r = seccomp_filter_set_add(c->syscall_filter, c->syscall_allow_list, syscall_filter_sets + SYSCALL_FILTER_SET_SETUID);
- if (r < 0)
- return r;
- }
-
/* Sending over exec_fd or handoff_timestamp_fd requires write() syscall. */
if (p->exec_fd >= 0 || p->handoff_timestamp_fd >= 0) {
r = seccomp_filter_set_add_by_name(c->syscall_filter, c->syscall_allow_list, "write");
bool userns_set_up = false;
bool needs_sandboxing, /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
needs_setuid, /* Do we need to do the actual setresuid()/setresgid() calls? */
- needs_mount_namespace, /* Do we need to set up a mount namespace for this kernel? */
- needs_ambient_hack; /* Do we need to apply the ambient capabilities hack? */
+ needs_mount_namespace; /* Do we need to set up a mount namespace for this kernel? */
bool keep_seccomp_privileges = false;
bool has_cap_sys_admin = false;
#if HAVE_SELINUX
return log_exec_error_errno(context, params, r, "Failed to set up kernel keyring: %m");
}
- /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked
- * for it, and the kernel doesn't actually support ambient caps. */
- needs_ambient_hack = (params->flags & EXEC_APPLY_SANDBOXING) && (command->flags & EXEC_COMMAND_AMBIENT_MAGIC) && !ambient_capabilities_supported();
-
/* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly
- * excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not
- * desired. */
- if (needs_ambient_hack)
- needs_setuid = false;
- else
- needs_setuid = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID));
+ * excepted from either whole sandboxing or just setresuid() itself. */
+ needs_setuid = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID));
uint64_t capability_ambient_set = context->capability_ambient_set;
return log_exec_error_errno(context, params, r, "Failed to set up PAM session: %m");
}
- if (ambient_capabilities_supported()) {
- uint64_t ambient_after_pam;
-
- /* PAM modules might have set some ambient caps. Query them here and merge them into
- * the caps we want to set in the end, so that we don't end up unsetting them. */
- r = capability_get_ambient(&ambient_after_pam);
- if (r < 0) {
- *exit_status = EXIT_CAPABILITIES;
- return log_exec_error_errno(context, params, r, "Failed to query ambient caps: %m");
- }
-
- capability_ambient_set |= ambient_after_pam;
+ /* PAM modules might have set some ambient caps. Query them here and merge them into
+ * the caps we want to set in the end, so that we don't end up unsetting them. */
+ uint64_t ambient_after_pam;
+ r = capability_get_ambient(&ambient_after_pam);
+ if (r < 0) {
+ *exit_status = EXIT_CAPABILITIES;
+ return log_exec_error_errno(context, params, r, "Failed to query ambient caps: %m");
}
+ capability_ambient_set |= ambient_after_pam;
+
ngids_after_pam = getgroups_alloc(&gids_after_pam);
if (ngids_after_pam < 0) {
*exit_status = EXIT_GROUP;
#endif
bset = context->capability_bounding_set;
- /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for
- * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own,
- * instead of us doing that */
- if (needs_ambient_hack)
- bset |= (UINT64_C(1) << CAP_SETPCAP) |
- (UINT64_C(1) << CAP_SETUID) |
- (UINT64_C(1) << CAP_SETGID);
#if HAVE_SECCOMP
/* If the service has any form of a seccomp filter and it allows dropping privileges, we'll
*
* The requested ambient capabilities are raised in the inheritable set if the second
* argument is true. */
- if (!needs_ambient_hack && capability_ambient_set != 0) {
+ if (capability_ambient_set != 0) {
r = capability_ambient_set_apply(capability_ambient_set, /* also_inherit= */ true);
if (r < 0) {
*exit_status = EXIT_CAPABILITIES;
}
}
- if (!needs_ambient_hack && capability_ambient_set != 0) {
+ if (capability_ambient_set != 0) {
/* Raise the ambient capabilities after user change. */
r = capability_ambient_set_apply(capability_ambient_set, /* also_inherit= */ false);
#if HAVE_SECCOMP
/* This really should remain as close to the execve() as possible, to make sure our own code is affected
* by the filter as little as possible. */
- r = apply_syscall_filter(context, params, needs_ambient_hack);
+ r = apply_syscall_filter(context, params);
if (r < 0) {
*exit_status = EXIT_SECCOMP;
return log_exec_error_errno(context, params, r, "Failed to apply system call filters: %m");
}
const char *f = firstword;
- bool ignore, separate_argv0 = false;
+ bool ignore, separate_argv0 = false, ambient_hack = false;
ExecCommandFlags flags = 0;
for (;; f++) {
* ":": Disable environment variable substitution
* "+": Run with full privileges and no sandboxing
* "!": Apply sandboxing except for user/group credentials
- * "!!": Apply user/group credentials if the kernel supports ambient capabilities -
- * if it doesn't we don't apply the credentials themselves, but do apply
- * most other sandboxing, with some special exceptions for changing UID.
- *
- * The idea is that '!!' may be used to write services that can take benefit of
- * systemd's UID/GID dropping if the kernel supports ambient creds, but provide
- * an automatic fallback to privilege dropping within the daemon if the kernel
- * does not offer that. */
+ */
- if (*f == '-' && !(flags & EXEC_COMMAND_IGNORE_FAILURE))
+ if (*f == '-' && !FLAGS_SET(flags, EXEC_COMMAND_IGNORE_FAILURE))
flags |= EXEC_COMMAND_IGNORE_FAILURE;
else if (*f == '@' && !separate_argv0)
separate_argv0 = true;
- else if (*f == ':' && !(flags & EXEC_COMMAND_NO_ENV_EXPAND))
+ else if (*f == ':' && !FLAGS_SET(flags, EXEC_COMMAND_NO_ENV_EXPAND))
flags |= EXEC_COMMAND_NO_ENV_EXPAND;
- else if (*f == '+' && !(flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID|EXEC_COMMAND_AMBIENT_MAGIC)))
+ else if (*f == '+' && !(flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID)) && !ambient_hack)
flags |= EXEC_COMMAND_FULLY_PRIVILEGED;
- else if (*f == '!' && !(flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID|EXEC_COMMAND_AMBIENT_MAGIC)))
+ else if (*f == '!' && !(flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID)) && !ambient_hack)
flags |= EXEC_COMMAND_NO_SETUID;
- else if (*f == '!' && !(flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_AMBIENT_MAGIC))) {
+ else if (*f == '!' && !FLAGS_SET(flags, EXEC_COMMAND_FULLY_PRIVILEGED) && !ambient_hack) {
+ /* Compatibility with the old !! ambient caps hack (removed in v258). Since
+ * all supported kernels now provide ambient capabilities, and !! was a noop
+ * on such systems, we'll just turn off the EXEC_COMMAND_NO_SETUID flag again
+ * and be done with it. */
flags &= ~EXEC_COMMAND_NO_SETUID;
- flags |= EXEC_COMMAND_AMBIENT_MAGIC;
+ ambient_hack = true;
+
+ log_syntax(unit, LOG_NOTICE, filename, line, 0,
+ "The !! modifier for %s= lines is no longer supported and is now ignored. "
+ "Please update your unit files and remove the modifier.", lvalue);
} else
break;
}
if (q.permitted == UINT64_MAX)
q.permitted = uid == 0 ? q.bounding : arg_caps_ambient;
- if (q.ambient == UINT64_MAX && ambient_capabilities_supported())
+ if (q.ambient == UINT64_MAX)
q.ambient = arg_caps_ambient;
if (capability_quintet_mangle(&q))
.effective = uid == 0 ? arg_caps_retain : 0,
.inheritable = uid == 0 ? arg_caps_retain : arg_caps_ambient,
.permitted = uid == 0 ? arg_caps_retain : arg_caps_ambient,
- .ambient = ambient_capabilities_supported() ? arg_caps_ambient : UINT64_MAX,
+ .ambient = arg_caps_ambient,
};
/* If we're not using OCI, proceed with mangled capabilities (so we don't error out)
}
static int bus_append_exec_command(sd_bus_message *m, const char *field, const char *eq) {
- bool explicit_path = false, done = false;
+ bool explicit_path = false, done = false, ambient_hack = false;
_cleanup_strv_free_ char **l = NULL, **ex_opts = NULL;
_cleanup_free_ char *path = NULL, *upgraded_name = NULL;
ExecCommandFlags flags = 0;
break;
case '+':
- if (flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID|EXEC_COMMAND_AMBIENT_MAGIC))
+ if ((flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID)) != 0 || ambient_hack)
done = true;
else {
flags |= EXEC_COMMAND_FULLY_PRIVILEGED;
break;
case '!':
- if (flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_AMBIENT_MAGIC))
+ if (FLAGS_SET(flags, EXEC_COMMAND_FULLY_PRIVILEGED) || ambient_hack)
done = true;
else if (FLAGS_SET(flags, EXEC_COMMAND_NO_SETUID)) {
+ /* Compatibility with the old !! ambient caps hack (removed in v258). Since
+ * all supported kernels now provide ambient capabilities, and !! was a noop
+ * on such systems, we'll just turn off the EXEC_COMMAND_NO_SETUID flag again
+ * and be done with it. */
flags &= ~EXEC_COMMAND_NO_SETUID;
- flags |= EXEC_COMMAND_AMBIENT_MAGIC;
eq++;
+ ambient_hack = true;
+
+ log_notice("!! modifier for %s= fields is no longer supported and is now ignored.", field);
} else {
flags |= EXEC_COMMAND_NO_SETUID;
eq++;
}
} while (!done);
- if (!is_ex_prop && (flags & (EXEC_COMMAND_NO_ENV_EXPAND|EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID|EXEC_COMMAND_AMBIENT_MAGIC))) {
+ if (!is_ex_prop && (flags & (EXEC_COMMAND_NO_ENV_EXPAND|EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID))) {
/* Upgrade the ExecXYZ= property to ExecXYZEx= for convenience */
is_ex_prop = true;
upgraded_name = strjoin(field, "Ex");
"ignore-failure", /* EXEC_COMMAND_IGNORE_FAILURE */
"privileged", /* EXEC_COMMAND_FULLY_PRIVILEGED */
"no-setuid", /* EXEC_COMMAND_NO_SETUID */
- "ambient", /* EXEC_COMMAND_AMBIENT_MAGIC */
"no-env-expand", /* EXEC_COMMAND_NO_ENV_EXPAND */
};
+assert_cc((1 << ELEMENTSOF(exec_command_strings)) - 1 == _EXEC_COMMAND_FLAGS_ALL);
+
const char* exec_command_flags_to_string(ExecCommandFlags i) {
for (size_t idx = 0; idx < ELEMENTSOF(exec_command_strings); idx++)
if (i == (1 << idx))
ExecCommandFlags exec_command_flags_from_string(const char *s) {
ssize_t idx;
- idx = string_table_lookup(exec_command_strings, ELEMENTSOF(exec_command_strings), s);
+ if (streq(s, "ambient")) /* Compatibility with ambient hack, removed in v258, map to no bits set */
+ return 0;
+ idx = string_table_lookup(exec_command_strings, ELEMENTSOF(exec_command_strings), s);
if (idx < 0)
return _EXEC_COMMAND_FLAGS_INVALID;
- else
- return 1 << idx;
+
+ return 1 << idx;
}
int fexecve_or_execve(int executable_fd, const char *executable, char *const argv[], char *const envp[]) {
EXEC_COMMAND_IGNORE_FAILURE = 1 << 0,
EXEC_COMMAND_FULLY_PRIVILEGED = 1 << 1,
EXEC_COMMAND_NO_SETUID = 1 << 2,
- EXEC_COMMAND_AMBIENT_MAGIC = 1 << 3,
- EXEC_COMMAND_NO_ENV_EXPAND = 1 << 4,
+ EXEC_COMMAND_NO_ENV_EXPAND = 1 << 3,
_EXEC_COMMAND_FLAGS_INVALID = -EINVAL,
+ _EXEC_COMMAND_FLAGS_ALL = (1 << 4) -1,
} ExecCommandFlags;
int exec_command_flags_from_strv(char * const *ex_opts, ExecCommandFlags *ret);
test_last_cap_file();
test_last_cap_probe();
- log_info("have ambient caps: %s", yes_no(ambient_capabilities_supported()));
-
if (getuid() != 0)
return log_tests_skipped("not running as root");
assert_se(FLAGS_SET(flags, EXEC_COMMAND_NO_ENV_EXPAND));
assert_se(FLAGS_SET(flags, EXEC_COMMAND_NO_SETUID));
assert_se(FLAGS_SET(flags, EXEC_COMMAND_IGNORE_FAILURE));
- assert_se(!FLAGS_SET(flags, EXEC_COMMAND_AMBIENT_MAGIC));
assert_se(!FLAGS_SET(flags, EXEC_COMMAND_FULLY_PRIVILEGED));
r = exec_command_flags_from_strv(invalid_strv, &flags);
TEST(exec_command_flags_to_strv) {
_cleanup_strv_free_ char **opts = NULL;
- ASSERT_OK(exec_command_flags_to_strv(EXEC_COMMAND_AMBIENT_MAGIC|EXEC_COMMAND_NO_ENV_EXPAND|EXEC_COMMAND_IGNORE_FAILURE, &opts));
- assert_se(strv_equal(opts, STRV_MAKE("ignore-failure", "ambient", "no-env-expand")));
+ ASSERT_OK(exec_command_flags_to_strv(EXEC_COMMAND_NO_ENV_EXPAND|EXEC_COMMAND_IGNORE_FAILURE, &opts));
+ assert_se(strv_equal(opts, STRV_MAKE("ignore-failure", "no-env-expand")));
opts = strv_free(opts);
BusName=org.freedesktop.network1
CapabilityBoundingSet=CAP_NET_ADMIN CAP_NET_BIND_SERVICE CAP_NET_BROADCAST CAP_NET_RAW CAP_BPF CAP_SYS_ADMIN
DeviceAllow=char-* rw
-ExecStart=!!{{LIBEXECDIR}}/systemd-networkd
+ExecStart={{LIBEXECDIR}}/systemd-networkd
FileDescriptorStoreMax=512
ImportCredential=network.wireguard.*
LockPersonality=yes
AmbientCapabilities=CAP_SETPCAP CAP_NET_RAW CAP_NET_BIND_SERVICE
BusName=org.freedesktop.resolve1
CapabilityBoundingSet=CAP_SETPCAP CAP_NET_RAW CAP_NET_BIND_SERVICE
-ExecStart=!!{{LIBEXECDIR}}/systemd-resolved
+ExecStart={{LIBEXECDIR}}/systemd-resolved
LockPersonality=yes
MemoryDenyWriteExecute=yes
NoNewPrivileges=yes
# correct time to work, but we likely won't acquire that without NTP. Let's
# break this chicken-and-egg cycle here.
Environment=SYSTEMD_NSS_RESOLVE_VALIDATE=0
-ExecStart=!!{{LIBEXECDIR}}/systemd-timesyncd
+ExecStart={{LIBEXECDIR}}/systemd-timesyncd
LockPersonality=yes
MemoryDenyWriteExecute=yes
NoNewPrivileges=yes