#include "chown-recursive.h"
#include "cpu-set-util.h"
#include "def.h"
+#include "env-file.h"
#include "env-util.h"
#include "errno-list.h"
#include "execute.h"
#include "exit-status.h"
#include "fd-util.h"
-#include "fileio.h"
#include "format-util.h"
#include "fs-util.h"
#include "glob-util.h"
#if HAVE_SECCOMP
#include "seccomp-util.h"
#endif
-#include "securebits.h"
#include "securebits-util.h"
#include "selinux-util.h"
#include "signal-util.h"
}
}
+static bool can_inherit_stderr_from_stdout(
+ const ExecContext *context,
+ ExecOutput o,
+ ExecOutput e) {
+
+ assert(context);
+
+ /* Returns true, if given the specified STDERR and STDOUT output we can directly dup() the stdout fd to the
+ * stderr fd */
+
+ if (e == EXEC_OUTPUT_INHERIT)
+ return true;
+ if (e != o)
+ return false;
+
+ if (e == EXEC_OUTPUT_NAMED_FD)
+ return streq_ptr(context->stdio_fdname[STDOUT_FILENO], context->stdio_fdname[STDERR_FILENO]);
+
+ if (IN_SET(e, EXEC_OUTPUT_FILE, EXEC_OUTPUT_FILE_APPEND))
+ return streq_ptr(context->stdio_file[STDOUT_FILENO], context->stdio_file[STDERR_FILENO]);
+
+ return true;
+}
+
static int setup_output(
const Unit *unit,
const ExecContext *context,
return fileno;
/* Duplicate from stdout if possible */
- if ((e == o && e != EXEC_OUTPUT_NAMED_FD) || e == EXEC_OUTPUT_INHERIT)
+ if (can_inherit_stderr_from_stdout(context, o, e))
return dup2(STDOUT_FILENO, fileno) < 0 ? -errno : fileno;
o = e;
flags |= O_APPEND;
fd = acquire_path(context->stdio_file[fileno], flags, 0666 & ~context->umask);
-
if (fd < 0)
return fd;
c->private_devices ||
context_has_syscall_filters(c) ||
!set_isempty(c->syscall_archs) ||
- c->lock_personality;
+ c->lock_personality ||
+ c->protect_hostname;
}
#if HAVE_SECCOMP
#endif
-static void do_idle_pipe_dance(int idle_pipe[4]) {
+static void do_idle_pipe_dance(int idle_pipe[static 4]) {
assert(idle_pipe);
idle_pipe[1] = safe_close(idle_pipe[1]);
if (context->n_temporary_filesystems > 0)
return true;
- if (context->mount_flags != 0)
+ if (!IN_SET(context->mount_flags, 0, MS_SHARED))
return true;
if (context->private_tmp && runtime && (runtime->tmp_dir || runtime->var_tmp_dir))
r = mkdir_label(p, context->directories[type].mode);
if (r < 0 && r != -EEXIST)
goto fail;
- if (r == -EEXIST && !context->dynamic_user)
- continue;
+ if (r == -EEXIST) {
+ struct stat st;
+
+ if (stat(p, &st) < 0) {
+ r = -errno;
+ goto fail;
+ }
+ if (((st.st_mode ^ context->directories[type].mode) & 07777) != 0)
+ log_warning("%s \'%s\' already exists but the mode is different. "
+ "(filesystem: %o %sMode: %o)",
+ exec_directory_type_to_string(type), *rt,
+ st.st_mode & 07777, exec_directory_type_to_string(type), context->directories[type].mode & 07777);
+ if (!context->dynamic_user)
+ continue;
+ }
}
/* Don't change the owner of the configuration directory, as in the common case it is not written to by
.protect_control_groups = context->protect_control_groups,
.protect_kernel_tunables = context->protect_kernel_tunables,
.protect_kernel_modules = context->protect_kernel_modules,
+ .protect_hostname = context->protect_hostname,
.mount_apivfs = context->mount_apivfs,
.private_mounts = context->private_mounts,
};
else
ns_info = (NamespaceInfo) {};
+ if (context->mount_flags == MS_SHARED)
+ log_unit_debug(u, "shared mount propagation hidden by other fs namespacing unit settings: ignoring");
+
r = setup_namespace(root_dir, root_image,
&ns_info, context->read_write_paths,
needs_sandboxing ? context->read_only_paths : NULL,
* on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
* UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */
- if (!(p->flags & EXEC_NEW_KEYRING))
- return 0;
-
if (context->keyring_mode == EXEC_KEYRING_INHERIT)
return 0;
return r;
}
-static void append_socket_pair(int *array, size_t *n, const int pair[2]) {
+static void append_socket_pair(int *array, size_t *n, const int pair[static 2]) {
assert(array);
assert(n);
static char *exec_command_line(char **argv);
+static int exec_parameters_get_cgroup_path(const ExecParameters *params, char **ret) {
+ bool using_subcgroup;
+ char *p;
+
+ assert(params);
+ assert(ret);
+
+ if (!params->cgroup_path)
+ return -EINVAL;
+
+ /* If we are called for a unit where cgroup delegation is on, and the payload created its own populated
+ * subcgroup (which we expect it to do, after all it asked for delegation), then we cannot place the control
+ * processes started after the main unit's process in the unit's main cgroup because it is now an inner one,
+ * and inner cgroups may not contain processes. Hence, if delegation is on, and this is a control process,
+ * let's use ".control" as subcgroup instead. Note that we do so only for ExecStartPost=, ExecReload=,
+ * ExecStop=, ExecStopPost=, i.e. for the commands where the main process is already forked. For ExecStartPre=
+ * this is not necessary, the cgroup is still empty. We distinguish these cases with the EXEC_CONTROL_CGROUP
+ * flag, which is only passed for the former statements, not for the latter. */
+
+ using_subcgroup = FLAGS_SET(params->flags, EXEC_CONTROL_CGROUP|EXEC_CGROUP_DELEGATE|EXEC_IS_CONTROL);
+ if (using_subcgroup)
+ p = strjoin(params->cgroup_path, "/.control");
+ else
+ p = strdup(params->cgroup_path);
+ if (!p)
+ return -ENOMEM;
+
+ *ret = p;
+ return using_subcgroup;
+}
+
static int exec_child(
Unit *unit,
const ExecCommand *command,
int user_lookup_fd,
int *exit_status) {
- _cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **accum_env = NULL, **final_argv = NULL;
+ _cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **accum_env = NULL, **replaced_argv = NULL;
int *fds_with_exec_fd, n_fds_with_exec_fd, r, ngids = 0, exec_fd = -1;
_cleanup_free_ gid_t *supplementary_gids = NULL;
const char *username = NULL, *groupname = NULL;
_cleanup_free_ char *home_buffer = NULL;
const char *home = NULL, *shell = NULL;
+ char **final_argv = NULL;
dev_t journal_stream_dev = 0;
ino_t journal_stream_ino = 0;
bool needs_sandboxing, /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
if (socket_fd >= 0)
(void) fd_nonblock(socket_fd, false);
+ /* Journald will try to look-up our cgroup in order to populate _SYSTEMD_CGROUP and _SYSTEMD_UNIT fields.
+ * Hence we need to migrate to the target cgroup from init.scope before connecting to journald */
+ if (params->cgroup_path) {
+ _cleanup_free_ char *p = NULL;
+
+ r = exec_parameters_get_cgroup_path(params, &p);
+ if (r < 0) {
+ *exit_status = EXIT_CGROUP;
+ return log_unit_error_errno(unit, r, "Failed to acquire cgroup path: %m");
+ }
+
+ r = cg_attach_everywhere(params->cgroup_supported, p, 0, NULL, NULL);
+ if (r < 0) {
+ *exit_status = EXIT_CGROUP;
+ return log_unit_error_errno(unit, r, "Failed to attach to cgroup %s: %m", p);
+ }
+ }
+
+ if (context->network_namespace_path && runtime && runtime->netns_storage_socket[0] >= 0) {
+ r = open_netns_path(runtime->netns_storage_socket, context->network_namespace_path);
+ if (r < 0) {
+ *exit_status = EXIT_NETWORK;
+ return log_unit_error_errno(unit, r, "Failed to open network namespace path %s: %m", context->network_namespace_path);
+ }
+ }
+
r = setup_input(context, params, socket_fd, named_iofds);
if (r < 0) {
*exit_status = EXIT_STDIN;
return log_unit_error_errno(unit, r, "Failed to set up standard error output: %m");
}
- if (params->cgroup_path) {
- r = cg_attach_everywhere(params->cgroup_supported, params->cgroup_path, 0, NULL, NULL);
- if (r < 0) {
- *exit_status = EXIT_CGROUP;
- return log_unit_error_errno(unit, r, "Failed to attach to cgroup %s: %m", params->cgroup_path);
- }
- }
-
if (context->oom_score_adjust_set) {
/* When we can't make this change due to EPERM, then let's silently skip over it. User namespaces
* prohibit write access to this file, and we shouldn't trip up over that. */
}
}
- /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroupsv1
+ /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroup v1
* this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
- * safe. On cgroupsv2 there's only one hierarchy anyway, and delegation is safe there, hence in that case only
+ * safe. On cgroup v2 there's only one hierarchy anyway, and delegation is safe there, hence in that case only
* touch a single hierarchy too. */
if (params->cgroup_path && context->user && (params->flags & EXEC_CGROUP_DELEGATE)) {
r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, uid, gid);
#endif
}
+ if (needs_sandboxing) {
+ int which_failed;
+
+ /* Let's set the resource limits before we call into PAM, so that pam_limits wins over what
+ * is set here. (See below.) */
+
+ r = setrlimit_closest_all((const struct rlimit* const *) context->rlimit, &which_failed);
+ if (r < 0) {
+ *exit_status = EXIT_LIMITS;
+ return log_unit_error_errno(unit, r, "Failed to adjust resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
+ }
+ }
+
if (needs_setuid) {
+
+ /* Let's call into PAM after we set up our own idea of resource limits to that pam_limits
+ * wins here. (See above.) */
+
if (context->pam_name && username) {
r = setup_pam(context->pam_name, username, uid, gid, context->tty_path, &accum_env, fds, n_fds);
if (r < 0) {
}
}
- if (context->private_network && runtime && runtime->netns_storage_socket[0] >= 0) {
+ if ((context->private_network || context->network_namespace_path) && runtime && runtime->netns_storage_socket[0] >= 0) {
+
if (ns_type_supported(NAMESPACE_NET)) {
r = setup_netns(runtime->netns_storage_socket);
if (r < 0) {
*exit_status = EXIT_NETWORK;
return log_unit_error_errno(unit, r, "Failed to set up network namespacing: %m");
}
+ } else if (context->network_namespace_path) {
+ *exit_status = EXIT_NETWORK;
+ return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP), "NetworkNamespacePath= is not supported, refusing.");
} else
log_unit_warning(unit, "PrivateNetwork=yes is configured, but the kernel does not support network namespaces, ignoring.");
}
}
}
+ if (context->protect_hostname) {
+ if (ns_type_supported(NAMESPACE_UTS)) {
+ if (unshare(CLONE_NEWUTS) < 0) {
+ *exit_status = EXIT_NAMESPACE;
+ return log_unit_error_errno(unit, errno, "Failed to set up UTS namespacing: %m");
+ }
+ } else
+ log_unit_warning(unit, "ProtectHostname=yes is configured, but the kernel does not support UTS namespaces, ignoring namespace setup.");
+#if HAVE_SECCOMP
+ r = seccomp_protect_hostname();
+ if (r < 0) {
+ *exit_status = EXIT_SECCOMP;
+ return log_unit_error_errno(unit, r, "Failed to apply hostname restrictions: %m");
+ }
+#endif
+ }
+
/* Drop groups as early as possbile */
if (needs_setuid) {
r = enforce_groups(gid, supplementary_gids, ngids);
if (needs_sandboxing) {
uint64_t bset;
- int which_failed;
- r = setrlimit_closest_all((const struct rlimit* const *) context->rlimit, &which_failed);
- if (r < 0) {
- *exit_status = EXIT_LIMITS;
- return log_unit_error_errno(unit, r, "Failed to adjust resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
- }
-
- /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly requested. */
+ /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly
+ * requested. (Note this is placed after the general resource limit initialization, see
+ * above, in order to take precedence.) */
if (context->restrict_realtime && !context->rlimit[RLIMIT_RTPRIO]) {
if (setrlimit(RLIMIT_RTPRIO, &RLIMIT_MAKE_CONST(0)) < 0) {
*exit_status = EXIT_LIMITS;
strv_free_and_replace(accum_env, ee);
}
- final_argv = replace_env_argv(command->argv, accum_env);
- if (!final_argv) {
- *exit_status = EXIT_MEMORY;
- return log_oom();
- }
+ if (!FLAGS_SET(command->flags, EXEC_COMMAND_NO_ENV_EXPAND)) {
+ replaced_argv = replace_env_argv(command->argv, accum_env);
+ if (!replaced_argv) {
+ *exit_status = EXIT_MEMORY;
+ return log_oom();
+ }
+ final_argv = replaced_argv;
+ } else
+ final_argv = command->argv;
if (DEBUG_LOGGING) {
_cleanup_free_ char *line;
pid_t *ret) {
int socket_fd, r, named_iofds[3] = { -1, -1, -1 }, *fds = NULL;
+ _cleanup_free_ char *subcgroup_path = NULL;
_cleanup_strv_free_ char **files_env = NULL;
size_t n_storage_fds = 0, n_socket_fds = 0;
_cleanup_free_ char *line = NULL;
LOG_UNIT_ID(unit),
LOG_UNIT_INVOCATION_ID(unit));
+ if (params->cgroup_path) {
+ r = exec_parameters_get_cgroup_path(params, &subcgroup_path);
+ if (r < 0)
+ return log_unit_error_errno(unit, r, "Failed to acquire subcgroup path: %m");
+ if (r > 0) { /* We are using a child cgroup */
+ r = cg_create(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path);
+ if (r < 0)
+ return log_unit_error_errno(unit, r, "Failed to create control group '%s': %m", subcgroup_path);
+ }
+ }
+
pid = fork();
if (pid < 0)
return log_unit_error_errno(unit, errno, "Failed to fork: %m");
log_unit_debug(unit, "Forked %s as "PID_FMT, command->path, pid);
- /* We add the new process to the cgroup both in the child (so
- * that we can be sure that no user code is ever executed
- * outside of the cgroup) and in the parent (so that we can be
- * sure that when we kill the cgroup the process will be
- * killed too). */
- if (params->cgroup_path)
- (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, pid);
+ /* We add the new process to the cgroup both in the child (so that we can be sure that no user code is ever
+ * executed outside of the cgroup) and in the parent (so that we can be sure that when we kill the cgroup the
+ * process will be killed too). */
+ if (subcgroup_path)
+ (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path, pid);
exec_status_start(&command->exec_status, pid);
c->stdin_data = mfree(c->stdin_data);
c->stdin_data_size = 0;
+
+ c->network_namespace_path = mfree(c->network_namespace_path);
}
int exec_context_destroy_runtime_directory(const ExecContext *c, const char *runtime_prefix) {
}
}
-static int exec_context_named_iofds(const ExecContext *c, const ExecParameters *p, int named_iofds[3]) {
+static int exec_context_named_iofds(const ExecContext *c, const ExecParameters *p, int named_iofds[static 3]) {
size_t i, targets;
const char* stdio_fdname[3];
size_t n_fds;
"%sIgnoreSIGPIPE: %s\n"
"%sMemoryDenyWriteExecute: %s\n"
"%sRestrictRealtime: %s\n"
- "%sKeyringMode: %s\n",
+ "%sKeyringMode: %s\n"
+ "%sProtectHostname: %s\n",
prefix, c->umask,
prefix, c->working_directory ? c->working_directory : "/",
prefix, c->root_directory ? c->root_directory : "/",
prefix, yes_no(c->ignore_sigpipe),
prefix, yes_no(c->memory_deny_write_execute),
prefix, yes_no(c->restrict_realtime),
- prefix, exec_keyring_mode_to_string(c->keyring_mode));
+ prefix, exec_keyring_mode_to_string(c->keyring_mode),
+ prefix, yes_no(c->protect_hostname));
if (c->root_image)
fprintf(f, "%sRootImage: %s\n", prefix, c->root_image);
prefix, s);
}
+ if (c->network_namespace_path)
+ fprintf(f,
+ "%sNetworkNamespacePath: %s\n",
+ prefix, c->network_namespace_path);
+
if (c->syscall_errno > 0) {
const char *errno_name;
else
fprintf(f, "%d\n", c->syscall_errno);
}
-
- if (c->apparmor_profile)
- fprintf(f,
- "%sAppArmorProfile: %s%s\n",
- prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile);
}
bool exec_context_maintains_privileges(const ExecContext *c) {
}
static void exec_runtime_freep(ExecRuntime **rt) {
- if (*rt)
- (void) exec_runtime_free(*rt, false);
+ (void) exec_runtime_free(*rt, false);
}
-static int exec_runtime_allocate(ExecRuntime **rt) {
- assert(rt);
+static int exec_runtime_allocate(ExecRuntime **ret) {
+ ExecRuntime *n;
+
+ assert(ret);
- *rt = new0(ExecRuntime, 1);
- if (!*rt)
+ n = new(ExecRuntime, 1);
+ if (!n)
return -ENOMEM;
- (*rt)->netns_storage_socket[0] = (*rt)->netns_storage_socket[1] = -1;
+ *n = (ExecRuntime) {
+ .netns_storage_socket = { -1, -1 },
+ };
+
+ *ret = n;
return 0;
}
static int exec_runtime_make(Manager *m, const ExecContext *c, const char *id, ExecRuntime **ret) {
_cleanup_free_ char *tmp_dir = NULL, *var_tmp_dir = NULL;
- _cleanup_close_pair_ int netns_storage_socket[2] = {-1, -1};
+ _cleanup_close_pair_ int netns_storage_socket[2] = { -1, -1 };
int r;
assert(m);
assert(id);
/* It is not necessary to create ExecRuntime object. */
- if (!c->private_network && !c->private_tmp)
+ if (!c->private_network && !c->private_tmp && !c->network_namespace_path)
return 0;
if (c->private_tmp) {
return r;
}
- if (c->private_network) {
+ if (c->private_network || c->network_namespace_path) {
if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, netns_storage_socket) < 0)
return -errno;
}
return r;
/* Avoid cleanup */
- netns_storage_socket[0] = -1;
- netns_storage_socket[1] = -1;
+ netns_storage_socket[0] = netns_storage_socket[1] = -1;
return 1;
}