]> git.ipfire.org Git - thirdparty/systemd.git/blobdiff - src/core/execute.c
execute: Rename ExecRuntime to ExecSharedRuntime
[thirdparty/systemd.git] / src / core / execute.c
index e19f38211849d7812c7650ef07aa53f011430a49..b5cf140f72dfe9e6d107a3c4ada48f861666c167 100644 (file)
@@ -46,7 +46,7 @@
 #include "cap-list.h"
 #include "capability-util.h"
 #include "cgroup-setup.h"
-#include "chase-symlinks.h"
+#include "chase.h"
 #include "chown-recursive.h"
 #include "constants.h"
 #include "cpu-set-util.h"
 #include "memory-util.h"
 #include "missing_fs.h"
 #include "missing_ioprio.h"
+#include "missing_prctl.h"
 #include "mkdir-label.h"
 #include "mount-util.h"
 #include "mountpoint-util.h"
 #include "namespace.h"
 #include "parse-util.h"
 #include "path-util.h"
+#include "proc-cmdline.h"
 #include "process-util.h"
+#include "psi-util.h"
 #include "random-util.h"
 #include "recurse-dir.h"
 #include "rlimit-util.h"
@@ -199,6 +202,66 @@ static const char *exec_context_tty_path(const ExecContext *context) {
         return "/dev/console";
 }
 
+static int exec_context_tty_size(const ExecContext *context, unsigned *ret_rows, unsigned *ret_cols) {
+        _cleanup_free_ char *rowskey = NULL, *rowsvalue = NULL, *colskey = NULL, *colsvalue = NULL;
+        unsigned rows, cols;
+        const char *tty;
+        int r;
+
+        assert(context);
+        assert(ret_rows);
+        assert(ret_cols);
+
+        rows = context->tty_rows;
+        cols = context->tty_cols;
+
+        tty = exec_context_tty_path(context);
+        if (!tty || (rows != UINT_MAX && cols != UINT_MAX)) {
+                *ret_rows = rows;
+                *ret_cols = cols;
+                return 0;
+        }
+
+        tty = skip_dev_prefix(tty);
+        if (!in_charset(tty, ALPHANUMERICAL)) {
+                log_debug("%s contains non-alphanumeric characters, ignoring", tty);
+                *ret_rows = rows;
+                *ret_cols = cols;
+                return 0;
+        }
+
+        rowskey = strjoin("systemd.tty.rows.", tty);
+        if (!rowskey)
+                return -ENOMEM;
+
+        colskey = strjoin("systemd.tty.columns.", tty);
+        if (!colskey)
+                return -ENOMEM;
+
+        r = proc_cmdline_get_key_many(/* flags = */ 0,
+                                      rowskey, &rowsvalue,
+                                      colskey, &colsvalue);
+        if (r < 0)
+                log_debug_errno(r, "Failed to read TTY size of %s from kernel cmdline, ignoring: %m", tty);
+
+        if (rows == UINT_MAX && rowsvalue) {
+                r = safe_atou(rowsvalue, &rows);
+                if (r < 0)
+                        log_debug_errno(r, "Failed to parse %s=%s, ignoring: %m", rowskey, rowsvalue);
+        }
+
+        if (cols == UINT_MAX && colsvalue) {
+                r = safe_atou(colsvalue, &cols);
+                if (r < 0)
+                        log_debug_errno(r, "Failed to parse %s=%s, ignoring: %m", colskey, colsvalue);
+        }
+
+        *ret_rows = rows;
+        *ret_cols = cols;
+
+        return 0;
+}
+
 static void exec_context_tty_reset(const ExecContext *context, const ExecParameters *p) {
         const char *path;
 
@@ -220,8 +283,12 @@ static void exec_context_tty_reset(const ExecContext *context, const ExecParamet
                         (void) reset_terminal(path);
         }
 
-        if (p && p->stdin_fd >= 0)
-                (void) terminal_set_size_fd(p->stdin_fd, path, context->tty_rows, context->tty_cols);
+        if (p && p->stdin_fd >= 0) {
+                unsigned rows = context->tty_rows, cols = context->tty_cols;
+
+                (void) exec_context_tty_size(context, &rows, &cols);
+                (void) terminal_set_size_fd(p->stdin_fd, path, rows, cols);
+        }
 
         if (context->tty_vt_disallocate && path)
                 (void) vt_disallocate(path);
@@ -479,9 +546,12 @@ static int setup_input(
 
                 /* Try to make this the controlling tty, if it is a tty, and reset it */
                 if (isatty(STDIN_FILENO)) {
+                        unsigned rows = context->tty_rows, cols = context->tty_cols;
+
+                        (void) exec_context_tty_size(context, &rows, &cols);
                         (void) ioctl(STDIN_FILENO, TIOCSCTTY, context->std_input == EXEC_INPUT_TTY_FORCE);
                         (void) reset_terminal_fd(STDIN_FILENO, true);
-                        (void) terminal_set_size_fd(STDIN_FILENO, NULL, context->tty_rows, context->tty_cols);
+                        (void) terminal_set_size_fd(STDIN_FILENO, NULL, rows, cols);
                 }
 
                 return STDIN_FILENO;
@@ -497,6 +567,7 @@ static int setup_input(
         case EXEC_INPUT_TTY:
         case EXEC_INPUT_TTY_FORCE:
         case EXEC_INPUT_TTY_FAIL: {
+                unsigned rows, cols;
                 int fd;
 
                 fd = acquire_terminal(exec_context_tty_path(context),
@@ -507,7 +578,11 @@ static int setup_input(
                 if (fd < 0)
                         return fd;
 
-                r = terminal_set_size_fd(fd, exec_context_tty_path(context), context->tty_rows, context->tty_cols);
+                r = exec_context_tty_size(context, &rows, &cols);
+                if (r < 0)
+                        return r;
+
+                r = terminal_set_size_fd(fd, exec_context_tty_path(context), rows, cols);
                 if (r < 0)
                         return r;
 
@@ -770,6 +845,7 @@ static int setup_confirm_stdio(
                 int *ret_saved_stdout) {
 
         _cleanup_close_ int fd = -EBADF, saved_stdin = -EBADF, saved_stdout = -EBADF;
+        unsigned rows, cols;
         int r;
 
         assert(ret_saved_stdin);
@@ -795,7 +871,11 @@ static int setup_confirm_stdio(
         if (r < 0)
                 return r;
 
-        r = terminal_set_size_fd(fd, vc, context->tty_rows, context->tty_cols);
+        r = exec_context_tty_size(context, &rows, &cols);
+        if (r < 0)
+                return r;
+
+        r = terminal_set_size_fd(fd, vc, rows, cols);
         if (r < 0)
                 return r;
 
@@ -1114,7 +1194,10 @@ static int set_securebits(unsigned bits, unsigned mask) {
         return 1;
 }
 
-static int enforce_user(const ExecContext *context, uid_t uid) {
+static int enforce_user(
+                const ExecContext *context,
+                uid_t uid,
+                uint64_t capability_ambient_set) {
         assert(context);
         int r;
 
@@ -1125,7 +1208,7 @@ static int enforce_user(const ExecContext *context, uid_t uid) {
          * setting secure bits the capability CAP_SETPCAP is required, so we also need keep-caps in this
          * case. */
 
-        if ((context->capability_ambient_set != 0 || context->secure_bits != 0) && uid != 0) {
+        if ((capability_ambient_set != 0 || context->secure_bits != 0) && uid != 0) {
 
                 /* First step: If we need to keep capabilities but drop privileges we need to make sure we
                  * keep our caps, while we drop privileges. Add KEEP_CAPS to the securebits */
@@ -1567,12 +1650,25 @@ static int apply_address_families(const Unit* u, const ExecContext *c) {
 }
 
 static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c) {
+        int r;
+
         assert(u);
         assert(c);
 
         if (!c->memory_deny_write_execute)
                 return 0;
 
+        /* use prctl() if kernel supports it (6.3) */
+        r = prctl(PR_SET_MDWE, PR_MDWE_REFUSE_EXEC_GAIN, 0, 0, 0);
+        if (r == 0) {
+                log_unit_debug(u, "Enabled MemoryDenyWriteExecute= with PR_SET_MDWE");
+                return 0;
+        }
+        if (r < 0 && errno != EINVAL)
+                return log_unit_debug_errno(u, errno, "Failed to enable MemoryDenyWriteExecute= with PR_SET_MDWE: %m");
+        /* else use seccomp */
+        log_unit_debug(u, "Kernel doesn't support PR_SET_MDWE: falling back to seccomp");
+
         if (skip_seccomp_unavailable(u, "MemoryDenyWriteExecute="))
                 return 0;
 
@@ -1805,6 +1901,7 @@ static int build_environment(
                 const Unit *u,
                 const ExecContext *c,
                 const ExecParameters *p,
+                const CGroupContext *cgroup_context,
                 size_t n_fds,
                 char **fdnames,
                 const char *home,
@@ -1812,18 +1909,20 @@ static int build_environment(
                 const char *shell,
                 dev_t journal_stream_dev,
                 ino_t journal_stream_ino,
+                const char *memory_pressure_path,
                 char ***ret) {
 
         _cleanup_strv_free_ char **our_env = NULL;
         size_t n_env = 0;
         char *x;
+        int r;
 
         assert(u);
         assert(c);
         assert(p);
         assert(ret);
 
-#define N_ENV_VARS 17
+#define N_ENV_VARS 19
         our_env = new0(char*, N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
         if (!our_env)
                 return -ENOMEM;
@@ -1907,6 +2006,7 @@ static int build_environment(
         }
 
         if (exec_context_needs_term(c)) {
+                _cleanup_free_ char *cmdline = NULL;
                 const char *tty_path, *term = NULL;
 
                 tty_path = exec_context_tty_path(c);
@@ -1917,6 +2017,19 @@ static int build_environment(
 
                 if (path_equal_ptr(tty_path, "/dev/console") && getppid() == 1)
                         term = getenv("TERM");
+                else if (tty_path && in_charset(skip_dev_prefix(tty_path), ALPHANUMERICAL)) {
+                        _cleanup_free_ char *key = NULL;
+
+                        key = strjoin("systemd.tty.term.", skip_dev_prefix(tty_path));
+                        if (!key)
+                                return -ENOMEM;
+
+                        r = proc_cmdline_get_key(key, 0, &cmdline);
+                        if (r < 0)
+                                log_debug_errno(r, "Failed to read %s from kernel cmdline, ignoring: %m", key);
+                        else if (r > 0)
+                                term = cmdline;
+                }
 
                 if (!term)
                         term = default_term_for_tty(tty_path);
@@ -1987,8 +2100,35 @@ static int build_environment(
 
         our_env[n_env++] = x;
 
-        our_env[n_env++] = NULL;
-        assert(n_env <= N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
+        if (memory_pressure_path) {
+                x = strjoin("MEMORY_PRESSURE_WATCH=", memory_pressure_path);
+                if (!x)
+                        return -ENOMEM;
+
+                our_env[n_env++] = x;
+
+                if (cgroup_context && !path_equal(memory_pressure_path, "/dev/null")) {
+                        _cleanup_free_ char *b = NULL, *e = NULL;
+
+                        if (asprintf(&b, "%s " USEC_FMT " " USEC_FMT,
+                                     MEMORY_PRESSURE_DEFAULT_TYPE,
+                                     cgroup_context->memory_pressure_threshold_usec == USEC_INFINITY ? MEMORY_PRESSURE_DEFAULT_THRESHOLD_USEC :
+                                     CLAMP(cgroup_context->memory_pressure_threshold_usec, 1U, MEMORY_PRESSURE_DEFAULT_WINDOW_USEC),
+                                     MEMORY_PRESSURE_DEFAULT_WINDOW_USEC) < 0)
+                                return -ENOMEM;
+
+                        if (base64mem(b, strlen(b) + 1, &e) < 0)
+                                return -ENOMEM;
+
+                        x = strjoin("MEMORY_PRESSURE_WRITE=", e);
+                        if (!x)
+                                return -ENOMEM;
+
+                        our_env[n_env++] = x;
+                }
+        }
+
+        assert(n_env < N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
 #undef N_ENV_VARS
 
         *ret = TAKE_PTR(our_env);
@@ -2038,7 +2178,7 @@ static bool exec_needs_ipc_namespace(const ExecContext *context) {
 bool exec_needs_mount_namespace(
                 const ExecContext *context,
                 const ExecParameters *params,
-                const ExecRuntime *runtime) {
+                const ExecSharedRuntime *runtime) {
 
         assert(context);
 
@@ -2067,14 +2207,15 @@ bool exec_needs_mount_namespace(
         if (!strv_isempty(context->extension_directories))
                 return true;
 
-        if (!IN_SET(context->mount_flags, 0, MS_SHARED))
+        if (!IN_SET(context->mount_propagation_flag, 0, MS_SHARED))
                 return true;
 
         if (context->private_tmp && runtime && (runtime->tmp_dir || runtime->var_tmp_dir))
                 return true;
 
         if (context->private_devices ||
-            context->private_mounts ||
+            context->private_mounts > 0 ||
+            (context->private_mounts < 0 && exec_needs_network_namespace(context)) ||
             context->protect_system != PROTECT_SYSTEM_NO ||
             context->protect_home != PROTECT_HOME_NO ||
             context->protect_kernel_tunables ||
@@ -2461,7 +2602,7 @@ static int setup_exec_directory(
                                  * since they all support the private/ symlink logic at least in some
                                  * configurations, see above. */
 
-                                r = chase_symlinks(target, NULL, 0, &target_resolved, NULL);
+                                r = chase(target, NULL, 0, &target_resolved, NULL);
                                 if (r < 0)
                                         goto fail;
 
@@ -2472,7 +2613,7 @@ static int setup_exec_directory(
                                 }
 
                                 /* /var/lib or friends may be symlinks. So, let's chase them also. */
-                                r = chase_symlinks(q, NULL, CHASE_NONEXISTENT, &q_resolved, NULL);
+                                r = chase(q, NULL, CHASE_NONEXISTENT, &q_resolved, NULL);
                                 if (r < 0)
                                         goto fail;
 
@@ -3286,7 +3427,7 @@ static int setup_smack(
                 if (r < 0 && !ERRNO_IS_XATTR_ABSENT(r))
                         return r;
 
-                r = mac_smack_apply_pid(0, exec_label ? : manager->default_smack_process_label);
+                r = mac_smack_apply_pid(0, exec_label ?: manager->default_smack_process_label);
                 if (r < 0)
                         return r;
         }
@@ -3542,14 +3683,17 @@ static int apply_mount_namespace(
                 ExecCommandFlags command_flags,
                 const ExecContext *context,
                 const ExecParameters *params,
-                const ExecRuntime *runtime,
+                const ExecSharedRuntime *runtime,
+                const char *memory_pressure_path,
                 char **error_path) {
 
-        _cleanup_strv_free_ char **empty_directories = NULL, **symlinks = NULL;
+        _cleanup_strv_free_ char **empty_directories = NULL, **symlinks = NULL,
+                        **read_write_paths_cleanup = NULL;
         const char *tmp_dir = NULL, *var_tmp_dir = NULL;
         const char *root_dir = NULL, *root_image = NULL;
         _cleanup_free_ char *creds_path = NULL, *incoming_dir = NULL, *propagate_dir = NULL,
                         *extension_dir = NULL;
+        char **read_write_paths;
         NamespaceInfo ns_info;
         bool needs_sandboxing;
         BindMount *bind_mounts = NULL;
@@ -3574,6 +3718,23 @@ static int apply_mount_namespace(
         if (r < 0)
                 goto finalize;
 
+        /* We need to make the pressure path writable even if /sys/fs/cgroups is made read-only, as the
+         * service will need to write to it in order to start the notifications. */
+        if (context->protect_control_groups && memory_pressure_path && !streq(memory_pressure_path, "/dev/null")) {
+                read_write_paths_cleanup = strv_copy(context->read_write_paths);
+                if (!read_write_paths_cleanup) {
+                        r = -ENOMEM;
+                        goto finalize;
+                }
+
+                r = strv_extend(&read_write_paths_cleanup, memory_pressure_path);
+                if (r < 0)
+                        goto finalize;
+
+                read_write_paths = read_write_paths_cleanup;
+        } else
+                read_write_paths = context->read_write_paths;
+
         needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command_flags & EXEC_COMMAND_FULLY_PRIVILEGED);
         if (needs_sandboxing) {
                 /* The runtime struct only contains the parent of the private /tmp,
@@ -3606,6 +3767,7 @@ static int apply_mount_namespace(
                         .protect_system = context->protect_system,
                         .protect_proc = context->protect_proc,
                         .proc_subset = context->proc_subset,
+                        .private_network = exec_needs_network_namespace(context),
                         .private_ipc = exec_needs_ipc_namespace(context),
                         /* If NNP is on, we can turn on MS_NOSUID, since it won't have any effect anymore. */
                         .mount_nosuid = context->no_new_privileges && !mac_selinux_use(),
@@ -3622,7 +3784,7 @@ static int apply_mount_namespace(
         else
                 ns_info = (NamespaceInfo) {};
 
-        if (context->mount_flags == MS_SHARED)
+        if (context->mount_propagation_flag == MS_SHARED)
                 log_unit_debug(u, "shared mount propagation hidden by other fs namespacing unit settings: ignoring");
 
         if (exec_context_has_credentials(context) &&
@@ -3660,7 +3822,7 @@ static int apply_mount_namespace(
                 }
 
         r = setup_namespace(root_dir, root_image, context->root_image_options,
-                            &ns_info, context->read_write_paths,
+                            &ns_info, read_write_paths,
                             needs_sandboxing ? context->read_only_paths : NULL,
                             needs_sandboxing ? context->inaccessible_paths : NULL,
                             needs_sandboxing ? context->exec_paths : NULL,
@@ -3677,7 +3839,7 @@ static int apply_mount_namespace(
                             var_tmp_dir,
                             creds_path,
                             context->log_namespace,
-                            context->mount_flags,
+                            context->mount_propagation_flag,
                             context->root_hash, context->root_hash_size, context->root_hash_path,
                             context->root_hash_sig, context->root_hash_sig_size, context->root_hash_sig_path,
                             context->root_verity,
@@ -3894,7 +4056,7 @@ static void append_socket_pair(int *array, size_t *n, const int pair[static 2])
 
 static int close_remaining_fds(
                 const ExecParameters *params,
-                const ExecRuntime *runtime,
+                const ExecSharedRuntime *runtime,
                 const DynamicCreds *dcreds,
                 int user_lookup_fd,
                 int socket_fd,
@@ -3957,9 +4119,9 @@ static int send_user_lookup(
 
         if (writev(user_lookup_fd,
                (struct iovec[]) {
-                           IOVEC_INIT(&uid, sizeof(uid)),
-                           IOVEC_INIT(&gid, sizeof(gid)),
-                           IOVEC_INIT_STRING(unit->id) }, 3) < 0)
+                           IOVEC_MAKE(&uid, sizeof(uid)),
+                           IOVEC_MAKE(&gid, sizeof(gid)),
+                           IOVEC_MAKE_STRING(unit->id) }, 3) < 0)
                 return -errno;
 
         return 0;
@@ -4163,9 +4325,10 @@ static int get_open_file_fd(Unit *u, const OpenFile *of) {
 
         ofd = open(of->path, O_PATH | O_CLOEXEC);
         if (ofd < 0)
-                return log_error_errno(errno, "Could not open \"%s\": %m", of->path);
+                return log_unit_error_errno(u, errno, "Could not open \"%s\": %m", of->path);
+
         if (fstat(ofd, &st) < 0)
-                return log_error_errno(errno, "Failed to stat %s: %m", of->path);
+                return log_unit_error_errno(u, errno, "Failed to stat %s: %m", of->path);
 
         if (S_ISSOCK(st.st_mode)) {
                 fd = connect_unix_harder(u, of, ofd);
@@ -4173,7 +4336,8 @@ static int get_open_file_fd(Unit *u, const OpenFile *of) {
                         return fd;
 
                 if (FLAGS_SET(of->flags, OPENFILE_READ_ONLY) && shutdown(fd, SHUT_WR) < 0)
-                        return log_error_errno(errno, "Failed to shutdown send for socket %s: %m", of->path);
+                        return log_unit_error_errno(u, errno, "Failed to shutdown send for socket %s: %m",
+                                                    of->path);
 
                 log_unit_debug(u, "socket %s opened (fd=%d)", of->path, fd);
         } else {
@@ -4239,8 +4403,9 @@ static int exec_child(
                 const ExecCommand *command,
                 const ExecContext *context,
                 const ExecParameters *params,
-                ExecRuntime *runtime,
+                ExecSharedRuntime *runtime,
                 DynamicCreds *dcreds,
+                const CGroupContext *cgroup_context,
                 int socket_fd,
                 const int named_iofds[static 3],
                 int *params_fds,
@@ -4254,7 +4419,7 @@ static int exec_child(
         int r, ngids = 0, exec_fd;
         _cleanup_free_ gid_t *supplementary_gids = NULL;
         const char *username = NULL, *groupname = NULL;
-        _cleanup_free_ char *home_buffer = NULL;
+        _cleanup_free_ char *home_buffer = NULL, *memory_pressure_path = NULL;
         const char *home = NULL, *shell = NULL;
         char **final_argv = NULL;
         dev_t journal_stream_dev = 0;
@@ -4323,6 +4488,7 @@ static int exec_child(
 
         log_forget_fds();
         log_set_open_when_needed(true);
+        log_settle_target();
 
         /* In case anything used libc syslog(), close this here, too */
         closelog();
@@ -4412,7 +4578,7 @@ static int exec_child(
          * invocations themselves. Also note that while we'll only invoke NSS modules involved in user management they
          * might internally call into other NSS modules that are involved in hostname resolution, we never know. */
         if (setenv("SYSTEMD_ACTIVATION_UNIT", unit->id, true) != 0 ||
-            setenv("SYSTEMD_ACTIVATION_SCOPE", MANAGER_IS_SYSTEM(unit->manager) ? "system" : "user", true) != 0) {
+            setenv("SYSTEMD_ACTIVATION_SCOPE", runtime_scope_to_string(unit->manager->runtime_scope), true) != 0) {
                 *exit_status = EXIT_MEMORY;
                 return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
         }
@@ -4619,11 +4785,13 @@ static int exec_child(
 
         if (mpol_is_valid(numa_policy_get_type(&context->numa_policy))) {
                 r = apply_numa_policy(&context->numa_policy);
-                if (r == -EOPNOTSUPP)
-                        log_unit_debug_errno(unit, r, "NUMA support not available, ignoring.");
-                else if (r < 0) {
-                        *exit_status = EXIT_NUMA_POLICY;
-                        return log_unit_error_errno(unit, r, "Failed to set NUMA memory policy: %m");
+                if (r < 0) {
+                        if (ERRNO_IS_NOT_SUPPORTED(r))
+                                log_unit_debug_errno(unit, r, "NUMA support not available, ignoring.");
+                        else {
+                                *exit_status = EXIT_NUMA_POLICY;
+                                return log_unit_error_errno(unit, r, "Failed to set NUMA memory policy: %m");
+                        }
                 }
         }
 
@@ -4667,15 +4835,41 @@ static int exec_child(
                 }
         }
 
-        /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroup v1
-         * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
-         * safe. On cgroup v2 there's only one hierarchy anyway, and delegation is safe there, hence in that case only
-         * touch a single hierarchy too. */
-        if (params->cgroup_path && context->user && (params->flags & EXEC_CGROUP_DELEGATE)) {
-                r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, uid, gid);
-                if (r < 0) {
-                        *exit_status = EXIT_CGROUP;
-                        return log_unit_error_errno(unit, r, "Failed to adjust control group access: %m");
+        if (params->cgroup_path) {
+                /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroup v1
+                 * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
+                 * safe. On cgroup v2 there's only one hierarchy anyway, and delegation is safe there, hence in that case only
+                 * touch a single hierarchy too. */
+
+                if (params->flags & EXEC_CGROUP_DELEGATE) {
+                        r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, uid, gid);
+                        if (r < 0) {
+                                *exit_status = EXIT_CGROUP;
+                                return log_unit_error_errno(unit, r, "Failed to adjust control group access: %m");
+                        }
+                }
+
+                if (cgroup_context && cg_unified() > 0 && is_pressure_supported() > 0) {
+                        if (cgroup_context_want_memory_pressure(cgroup_context)) {
+                                r = cg_get_path("memory", params->cgroup_path, "memory.pressure", &memory_pressure_path);
+                                if (r < 0) {
+                                        *exit_status = EXIT_MEMORY;
+                                        return log_oom();
+                                }
+
+                                r = chmod_and_chown(memory_pressure_path, 0644, uid, gid);
+                                if (r < 0) {
+                                        log_unit_full_errno(unit, r == -ENOENT || ERRNO_IS_PRIVILEGE(r) ? LOG_DEBUG : LOG_WARNING, r,
+                                                            "Failed to adjust ownership of '%s', ignoring: %m", memory_pressure_path);
+                                        memory_pressure_path = mfree(memory_pressure_path);
+                                }
+                        } else if (cgroup_context->memory_pressure_watch == CGROUP_PRESSURE_WATCH_OFF) {
+                                memory_pressure_path = strdup("/dev/null"); /* /dev/null is explicit indicator for turning of memory pressure watch */
+                                if (!memory_pressure_path) {
+                                        *exit_status = EXIT_MEMORY;
+                                        return log_oom();
+                                }
+                        }
                 }
         }
 
@@ -4699,6 +4893,7 @@ static int exec_child(
                         unit,
                         context,
                         params,
+                        cgroup_context,
                         n_fds,
                         fdnames,
                         home,
@@ -4706,6 +4901,7 @@ static int exec_child(
                         shell,
                         journal_stream_dev,
                         journal_stream_ino,
+                        memory_pressure_path,
                         &our_env);
         if (r < 0) {
                 *exit_status = EXIT_MEMORY;
@@ -4773,6 +4969,8 @@ static int exec_child(
         else
                 needs_setuid = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID));
 
+        uint64_t capability_ambient_set = context->capability_ambient_set;
+
         if (needs_sandboxing) {
                 /* MAC enablement checks need to be done before a new mount ns is created, as they rely on
                  * /sys being present. The actual MAC context application will happen later, as late as
@@ -4813,6 +5011,20 @@ static int exec_child(
                         return log_unit_error_errno(unit, r, "Failed to set up PAM session: %m");
                 }
 
+                if (ambient_capabilities_supported()) {
+                        uint64_t ambient_after_pam;
+
+                        /* PAM modules might have set some ambient caps. Query them here and merge them into
+                         * the caps we want to set in the end, so that we don't end up unsetting them. */
+                        r = capability_get_ambient(&ambient_after_pam);
+                        if (r < 0) {
+                                *exit_status = EXIT_CAPABILITIES;
+                                return log_unit_error_errno(unit, r, "Failed to query ambient caps: %m");
+                        }
+
+                        capability_ambient_set |= ambient_after_pam;
+                }
+
                 ngids_after_pam = getgroups_alloc(&gids_after_pam);
                 if (ngids_after_pam < 0) {
                         *exit_status = EXIT_MEMORY;
@@ -4837,12 +5049,14 @@ static int exec_child(
 
                 if (ns_type_supported(NAMESPACE_NET)) {
                         r = setup_shareable_ns(runtime->netns_storage_socket, CLONE_NEWNET);
-                        if (r == -EPERM)
-                                log_unit_warning_errno(unit, r,
-                                                       "PrivateNetwork=yes is configured, but network namespace setup failed, ignoring: %m");
-                        else if (r < 0) {
-                                *exit_status = EXIT_NETWORK;
-                                return log_unit_error_errno(unit, r, "Failed to set up network namespacing: %m");
+                        if (r < 0) {
+                                if (ERRNO_IS_PRIVILEGE(r))
+                                        log_unit_warning_errno(unit, r,
+                                                               "PrivateNetwork=yes is configured, but network namespace setup failed, ignoring: %m");
+                                else {
+                                        *exit_status = EXIT_NETWORK;
+                                        return log_unit_error_errno(unit, r, "Failed to set up network namespacing: %m");
+                                }
                         }
                 } else if (context->network_namespace_path) {
                         *exit_status = EXIT_NETWORK;
@@ -4874,7 +5088,7 @@ static int exec_child(
         if (needs_mount_namespace) {
                 _cleanup_free_ char *error_path = NULL;
 
-                r = apply_mount_namespace(unit, command->flags, context, params, runtime, &error_path);
+                r = apply_mount_namespace(unit, command->flags, context, params, runtime, memory_pressure_path, &error_path);
                 if (r < 0) {
                         *exit_status = EXIT_NAMESPACE;
                         return log_unit_error_errno(unit, r, "Failed to set up mount namespacing%s%s: %m",
@@ -5041,7 +5255,7 @@ static int exec_child(
                                 (UINT64_C(1) << CAP_SETGID);
 
                 if (!cap_test_all(bset)) {
-                        r = capability_bounding_set_drop(bset, false);
+                        r = capability_bounding_set_drop(bset, /* right_now= */ false);
                         if (r < 0) {
                                 *exit_status = EXIT_CAPABILITIES;
                                 return log_unit_error_errno(unit, r, "Failed to drop capabilities: %m");
@@ -5060,7 +5274,7 @@ static int exec_child(
                  * The requested ambient capabilities are raised in the inheritable set if the second
                  * argument is true. */
                 if (!needs_ambient_hack) {
-                        r = capability_ambient_set_apply(context->capability_ambient_set, true);
+                        r = capability_ambient_set_apply(capability_ambient_set, /* also_inherit= */ true);
                         if (r < 0) {
                                 *exit_status = EXIT_CAPABILITIES;
                                 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (before UID change): %m");
@@ -5075,17 +5289,16 @@ static int exec_child(
 
         if (needs_setuid) {
                 if (uid_is_valid(uid)) {
-                        r = enforce_user(context, uid);
+                        r = enforce_user(context, uid, capability_ambient_set);
                         if (r < 0) {
                                 *exit_status = EXIT_USER;
                                 return log_unit_error_errno(unit, r, "Failed to change UID to " UID_FMT ": %m", uid);
                         }
 
-                        if (!needs_ambient_hack &&
-                            context->capability_ambient_set != 0) {
+                        if (!needs_ambient_hack && capability_ambient_set != 0) {
 
                                 /* Raise the ambient capabilities after user change. */
-                                r = capability_ambient_set_apply(context->capability_ambient_set, false);
+                                r = capability_ambient_set_apply(capability_ambient_set, /* also_inherit= */ false);
                                 if (r < 0) {
                                         *exit_status = EXIT_CAPABILITIES;
                                         return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (after UID change): %m");
@@ -5336,8 +5549,9 @@ int exec_spawn(Unit *unit,
                ExecCommand *command,
                const ExecContext *context,
                const ExecParameters *params,
-               ExecRuntime *runtime,
+               ExecSharedRuntime *runtime,
                DynamicCreds *dcreds,
+               const CGroupContext *cgroup_context,
                pid_t *ret) {
 
         int socket_fd, r, named_iofds[3] = { -1, -1, -1 }, *fds = NULL;
@@ -5354,6 +5568,8 @@ int exec_spawn(Unit *unit,
         assert(params);
         assert(params->fds || (params->n_socket_fds + params->n_storage_fds <= 0));
 
+        LOG_CONTEXT_PUSH_UNIT(unit);
+
         if (context->std_input == EXEC_INPUT_SOCKET ||
             context->std_output == EXEC_OUTPUT_SOCKET ||
             context->std_error == EXEC_OUTPUT_SOCKET) {
@@ -5425,6 +5641,7 @@ int exec_spawn(Unit *unit,
                                params,
                                runtime,
                                dcreds,
+                               cgroup_context,
                                socket_fd,
                                named_iofds,
                                fds,
@@ -5488,6 +5705,7 @@ void exec_context_init(ExecContext *c) {
         c->tty_rows = UINT_MAX;
         c->tty_cols = UINT_MAX;
         numa_policy_reset(&c->numa_policy);
+        c->private_mounts = -1;
 }
 
 void exec_context_done(ExecContext *c) {
@@ -6564,6 +6782,23 @@ int exec_context_get_clean_mask(ExecContext *c, ExecCleanMask *ret) {
         return 0;
 }
 
+bool exec_context_has_encrypted_credentials(ExecContext *c) {
+        ExecLoadCredential *load_cred;
+        ExecSetCredential *set_cred;
+
+        assert(c);
+
+        HASHMAP_FOREACH(load_cred, c->load_credentials)
+                if (load_cred->encrypted)
+                        return true;
+
+        HASHMAP_FOREACH(set_cred, c->set_credentials)
+                if (set_cred->encrypted)
+                        return true;
+
+        return false;
+}
+
 void exec_status_start(ExecStatus *s, pid_t pid) {
         assert(s);
 
@@ -6721,14 +6956,14 @@ static void *remove_tmpdir_thread(void *p) {
         return NULL;
 }
 
-static ExecRuntime* exec_runtime_free(ExecRuntime *rt, bool destroy) {
+static ExecSharedRuntime* exec_shared_runtime_free(ExecSharedRuntime *rt, bool destroy) {
         int r;
 
         if (!rt)
                 return NULL;
 
         if (rt->manager)
-                (void) hashmap_remove(rt->manager->exec_runtime_by_id, rt->id);
+                (void) hashmap_remove(rt->manager->exec_shared_runtime_by_id, rt->id);
 
         /* When destroy is true, then rm_rf tmp_dir and var_tmp_dir. */
 
@@ -6760,13 +6995,13 @@ static ExecRuntime* exec_runtime_free(ExecRuntime *rt, bool destroy) {
         return mfree(rt);
 }
 
-static void exec_runtime_freep(ExecRuntime **rt) {
-        (void) exec_runtime_free(*rt, false);
+static void exec_shared_runtime_freep(ExecSharedRuntime **rt) {
+        (void) exec_shared_runtime_free(*rt, false);
 }
 
-static int exec_runtime_allocate(ExecRuntime **ret, const char *id) {
+static int exec_shared_runtime_allocate(ExecSharedRuntime **ret, const char *id) {
         _cleanup_free_ char *id_copy = NULL;
-        ExecRuntime *n;
+        ExecSharedRuntime *n;
 
         assert(ret);
 
@@ -6774,11 +7009,11 @@ static int exec_runtime_allocate(ExecRuntime **ret, const char *id) {
         if (!id_copy)
                 return -ENOMEM;
 
-        n = new(ExecRuntime, 1);
+        n = new(ExecSharedRuntime, 1);
         if (!n)
                 return -ENOMEM;
 
-        *n = (ExecRuntime) {
+        *n = (ExecSharedRuntime) {
                 .id = TAKE_PTR(id_copy),
                 .netns_storage_socket = PIPE_EBADF,
                 .ipcns_storage_socket = PIPE_EBADF,
@@ -6788,16 +7023,16 @@ static int exec_runtime_allocate(ExecRuntime **ret, const char *id) {
         return 0;
 }
 
-static int exec_runtime_add(
+static int exec_shared_runtime_add(
                 Manager *m,
                 const char *id,
                 char **tmp_dir,
                 char **var_tmp_dir,
                 int netns_storage_socket[2],
                 int ipcns_storage_socket[2],
-                ExecRuntime **ret) {
+                ExecSharedRuntime **ret) {
 
-        _cleanup_(exec_runtime_freep) ExecRuntime *rt = NULL;
+        _cleanup_(exec_shared_runtime_freep) ExecSharedRuntime *rt = NULL;
         int r;
 
         assert(m);
@@ -6805,11 +7040,11 @@ static int exec_runtime_add(
 
         /* tmp_dir, var_tmp_dir, {net,ipc}ns_storage_socket fds are donated on success */
 
-        r = exec_runtime_allocate(&rt, id);
+        r = exec_shared_runtime_allocate(&rt, id);
         if (r < 0)
                 return r;
 
-        r = hashmap_ensure_put(&m->exec_runtime_by_id, &string_hash_ops, rt->id, rt);
+        r = hashmap_ensure_put(&m->exec_shared_runtime_by_id, &string_hash_ops, rt->id, rt);
         if (r < 0)
                 return r;
 
@@ -6831,16 +7066,16 @@ static int exec_runtime_add(
 
         if (ret)
                 *ret = rt;
-        /* do not remove created ExecRuntime object when the operation succeeds. */
+        /* do not remove created ExecSharedRuntime object when the operation succeeds. */
         TAKE_PTR(rt);
         return 0;
 }
 
-static int exec_runtime_make(
+static int exec_shared_runtime_make(
                 Manager *m,
                 const ExecContext *c,
                 const char *id,
-                ExecRuntime **ret) {
+                ExecSharedRuntime **ret) {
 
         _cleanup_(namespace_cleanup_tmpdirp) char *tmp_dir = NULL, *var_tmp_dir = NULL;
         _cleanup_close_pair_ int netns_storage_socket[2] = PIPE_EBADF, ipcns_storage_socket[2] = PIPE_EBADF;
@@ -6850,7 +7085,7 @@ static int exec_runtime_make(
         assert(c);
         assert(id);
 
-        /* It is not necessary to create ExecRuntime object. */
+        /* It is not necessary to create ExecSharedRuntime object. */
         if (!exec_needs_network_namespace(c) && !exec_needs_ipc_namespace(c) && !c->private_tmp) {
                 *ret = NULL;
                 return 0;
@@ -6875,24 +7110,24 @@ static int exec_runtime_make(
                         return -errno;
         }
 
-        r = exec_runtime_add(m, id, &tmp_dir, &var_tmp_dir, netns_storage_socket, ipcns_storage_socket, ret);
+        r = exec_shared_runtime_add(m, id, &tmp_dir, &var_tmp_dir, netns_storage_socket, ipcns_storage_socket, ret);
         if (r < 0)
                 return r;
 
         return 1;
 }
 
-int exec_runtime_acquire(Manager *m, const ExecContext *c, const char *id, bool create, ExecRuntime **ret) {
-        ExecRuntime *rt;
+int exec_shared_runtime_acquire(Manager *m, const ExecContext *c, const char *id, bool create, ExecSharedRuntime **ret) {
+        ExecSharedRuntime *rt;
         int r;
 
         assert(m);
         assert(id);
         assert(ret);
 
-        rt = hashmap_get(m->exec_runtime_by_id, id);
+        rt = hashmap_get(m->exec_shared_runtime_by_id, id);
         if (rt)
-                /* We already have an ExecRuntime object, let's increase the ref count and reuse it */
+                /* We already have an ExecSharedRuntime object, let's increase the ref count and reuse it */
                 goto ref;
 
         if (!create) {
@@ -6901,11 +7136,11 @@ int exec_runtime_acquire(Manager *m, const ExecContext *c, const char *id, bool
         }
 
         /* If not found, then create a new object. */
-        r = exec_runtime_make(m, c, id, &rt);
+        r = exec_shared_runtime_make(m, c, id, &rt);
         if (r < 0)
                 return r;
         if (r == 0) {
-                /* When r == 0, it is not necessary to create ExecRuntime object. */
+                /* When r == 0, it is not necessary to create ExecSharedRuntime object. */
                 *ret = NULL;
                 return 0;
         }
@@ -6917,7 +7152,7 @@ ref:
         return 1;
 }
 
-ExecRuntime *exec_runtime_unref(ExecRuntime *rt, bool destroy) {
+ExecSharedRuntime *exec_shared_runtime_unref(ExecSharedRuntime *rt, bool destroy) {
         if (!rt)
                 return NULL;
 
@@ -6927,17 +7162,17 @@ ExecRuntime *exec_runtime_unref(ExecRuntime *rt, bool destroy) {
         if (rt->n_ref > 0)
                 return NULL;
 
-        return exec_runtime_free(rt, destroy);
+        return exec_shared_runtime_free(rt, destroy);
 }
 
-int exec_runtime_serialize(const Manager *m, FILE *f, FDSet *fds) {
-        ExecRuntime *rt;
+int exec_shared_runtime_serialize(const Manager *m, FILE *f, FDSet *fds) {
+        ExecSharedRuntime *rt;
 
         assert(m);
         assert(f);
         assert(fds);
 
-        HASHMAP_FOREACH(rt, m->exec_runtime_by_id) {
+        HASHMAP_FOREACH(rt, m->exec_shared_runtime_by_id) {
                 fprintf(f, "exec-runtime=%s", rt->id);
 
                 if (rt->tmp_dir)
@@ -6992,33 +7227,33 @@ int exec_runtime_serialize(const Manager *m, FILE *f, FDSet *fds) {
         return 0;
 }
 
-int exec_runtime_deserialize_compat(Unit *u, const char *key, const char *value, FDSet *fds) {
-        _cleanup_(exec_runtime_freep) ExecRuntime *rt_create = NULL;
-        ExecRuntime *rt;
+int exec_shared_runtime_deserialize_compat(Unit *u, const char *key, const char *value, FDSet *fds) {
+        _cleanup_(exec_shared_runtime_freep) ExecSharedRuntime *rt_create = NULL;
+        ExecSharedRuntime *rt;
         int r;
 
         /* This is for the migration from old (v237 or earlier) deserialization text.
          * Due to the bug #7790, this may not work with the units that use JoinsNamespaceOf=.
-         * Even if the ExecRuntime object originally created by the other unit, we cannot judge
+         * Even if the ExecSharedRuntime object originally created by the other unit, we cannot judge
          * so or not from the serialized text, then we always creates a new object owned by this. */
 
         assert(u);
         assert(key);
         assert(value);
 
-        /* Manager manages ExecRuntime objects by the unit id.
+        /* Manager manages ExecSharedRuntime objects by the unit id.
          * So, we omit the serialized text when the unit does not have id (yet?)... */
         if (isempty(u->id)) {
                 log_unit_debug(u, "Invocation ID not found. Dropping runtime parameter.");
                 return 0;
         }
 
-        if (hashmap_ensure_allocated(&u->manager->exec_runtime_by_id, &string_hash_ops) < 0)
+        if (hashmap_ensure_allocated(&u->manager->exec_shared_runtime_by_id, &string_hash_ops) < 0)
                 return log_oom();
 
-        rt = hashmap_get(u->manager->exec_runtime_by_id, u->id);
+        rt = hashmap_get(u->manager->exec_shared_runtime_by_id, u->id);
         if (!rt) {
-                if (exec_runtime_allocate(&rt_create, u->id) < 0)
+                if (exec_shared_runtime_allocate(&rt_create, u->id) < 0)
                         return log_oom();
 
                 rt = rt_create;
@@ -7057,9 +7292,9 @@ int exec_runtime_deserialize_compat(Unit *u, const char *key, const char *value,
         } else
                 return 0;
 
-        /* If the object is newly created, then put it to the hashmap which manages ExecRuntime objects. */
+        /* If the object is newly created, then put it to the hashmap which manages ExecSharedRuntime objects. */
         if (rt_create) {
-                r = hashmap_put(u->manager->exec_runtime_by_id, rt_create->id, rt_create);
+                r = hashmap_put(u->manager->exec_shared_runtime_by_id, rt_create->id, rt_create);
                 if (r < 0) {
                         log_unit_debug_errno(u, r, "Failed to put runtime parameter to manager's storage: %m");
                         return 0;
@@ -7074,7 +7309,7 @@ int exec_runtime_deserialize_compat(Unit *u, const char *key, const char *value,
         return 1;
 }
 
-int exec_runtime_deserialize_one(Manager *m, const char *value, FDSet *fds) {
+int exec_shared_runtime_deserialize_one(Manager *m, const char *value, FDSet *fds) {
         _cleanup_free_ char *tmp_dir = NULL, *var_tmp_dir = NULL;
         char *id = NULL;
         int r, netns_fdpair[] = {-1, -1}, ipcns_fdpair[] = {-1, -1};
@@ -7186,24 +7421,24 @@ int exec_runtime_deserialize_one(Manager *m, const char *value, FDSet *fds) {
         }
 
 finalize:
-        r = exec_runtime_add(m, id, &tmp_dir, &var_tmp_dir, netns_fdpair, ipcns_fdpair, NULL);
+        r = exec_shared_runtime_add(m, id, &tmp_dir, &var_tmp_dir, netns_fdpair, ipcns_fdpair, NULL);
         if (r < 0)
                 return log_debug_errno(r, "Failed to add exec-runtime: %m");
         return 0;
 }
 
-void exec_runtime_vacuum(Manager *m) {
-        ExecRuntime *rt;
+void exec_shared_runtime_vacuum(Manager *m) {
+        ExecSharedRuntime *rt;
 
         assert(m);
 
-        /* Free unreferenced ExecRuntime objects. This is used after manager deserialization process. */
+        /* Free unreferenced ExecSharedRuntime objects. This is used after manager deserialization process. */
 
-        HASHMAP_FOREACH(rt, m->exec_runtime_by_id) {
+        HASHMAP_FOREACH(rt, m->exec_shared_runtime_by_id) {
                 if (rt->n_ref > 0)
                         continue;
 
-                (void) exec_runtime_free(rt, false);
+                (void) exec_shared_runtime_free(rt, false);
         }
 }