execute: Rename ExecRuntime to ExecSharedRuntime

[thirdparty/systemd.git] / src / core / execute.c
diff --git a/src/core/execute.c b/src/core/execute.c

index e19f38211849d7812c7650ef07aa53f011430a49..b5cf140f72dfe9e6d107a3c4ada48f861666c167 100644 (file)
--- a/src/core/execute.c
+++ b/src/core/execute.c
@@ -46,7 +46,7 @@
  #include "cap-list.h"
  #include "capability-util.h"
  #include "cgroup-setup.h"
-#include "chase-symlinks.h"
+#include "chase.h"
  #include "chown-recursive.h"
  #include "constants.h"
  #include "cpu-set-util.h"
@@ -73,13 +73,16 @@
  #include "memory-util.h"
  #include "missing_fs.h"
  #include "missing_ioprio.h"
+#include "missing_prctl.h"
  #include "mkdir-label.h"
  #include "mount-util.h"
  #include "mountpoint-util.h"
  #include "namespace.h"
  #include "parse-util.h"
  #include "path-util.h"
+#include "proc-cmdline.h"
  #include "process-util.h"
+#include "psi-util.h"
  #include "random-util.h"
  #include "recurse-dir.h"
  #include "rlimit-util.h"
@@ -199,6 +202,66 @@ static const char *exec_context_tty_path(const ExecContext *context) {
          return "/dev/console";
  }
  
+static int exec_context_tty_size(const ExecContext *context, unsigned *ret_rows, unsigned *ret_cols) {
+        _cleanup_free_ char *rowskey = NULL, *rowsvalue = NULL, *colskey = NULL, *colsvalue = NULL;
+        unsigned rows, cols;
+        const char *tty;
+        int r;
+        /* Determines the rows/columns to use for the context's TTY: starts from context->tty_rows/tty_cols and looks up any dimension that is still UINT_MAX on the kernel command line. Returns 0 with both outputs written, or -ENOMEM (outputs untouched) if a key string cannot be allocated. */
+        assert(context);
+        assert(ret_rows);
+        assert(ret_cols);
+
+        rows = context->tty_rows;
+        cols = context->tty_cols;
+        /* No lookup is needed if there is no TTY path or if neither dimension is UINT_MAX. */
+        tty = exec_context_tty_path(context);
+        if (!tty || (rows != UINT_MAX && cols != UINT_MAX)) {
+                *ret_rows = rows;
+                *ret_cols = cols;
+                return 0;
+        }
+        /* The TTY name becomes part of the key names built below; names containing anything but alphanumeric characters are skipped. */
+        tty = skip_dev_prefix(tty); /* presumably strips a leading "/dev/" — confirm against the helper */
+        if (!in_charset(tty, ALPHANUMERICAL)) {
+                log_debug("%s contains non-alphanumeric characters, ignoring", tty);
+                *ret_rows = rows;
+                *ret_cols = cols;
+                return 0;
+        }
+        /* The keys queried are "systemd.tty.rows.<tty>" and "systemd.tty.columns.<tty>". */
+        rowskey = strjoin("systemd.tty.rows.", tty);
+        if (!rowskey)
+                return -ENOMEM;
+
+        colskey = strjoin("systemd.tty.columns.", tty);
+        if (!colskey)
+                return -ENOMEM;
+        /* A failed command line read is only logged, not propagated; presumably both values then stay NULL and the sizes from the context are returned as they are — confirm. */
+        r = proc_cmdline_get_key_many(/* flags = */ 0,
+                                      rowskey, &rowsvalue,
+                                      colskey, &colsvalue);
+        if (r < 0)
+                log_debug_errno(r, "Failed to read TTY size of %s from kernel cmdline, ignoring: %m", tty);
+        /* Only a dimension that is still UINT_MAX is taken from the command line; parse failures are logged and not returned to the caller. */
+        if (rows == UINT_MAX && rowsvalue) {
+                r = safe_atou(rowsvalue, &rows);
+                if (r < 0)
+                        log_debug_errno(r, "Failed to parse %s=%s, ignoring: %m", rowskey, rowsvalue);
+        }
+
+        if (cols == UINT_MAX && colsvalue) {
+                r = safe_atou(colsvalue, &cols);
+                if (r < 0)
+                        log_debug_errno(r, "Failed to parse %s=%s, ignoring: %m", colskey, colsvalue);
+        }
+
+        *ret_rows = rows;
+        *ret_cols = cols;
+
+        return 0;
+}
+
  static void exec_context_tty_reset(const ExecContext *context, const ExecParameters *p) {
          const char *path;
  
@@ -220,8 +283,12 @@ static void exec_context_tty_reset(const ExecContext *context, const ExecParamet
                          (void) reset_terminal(path);
          }
  
-        if (p && p->stdin_fd >= 0)
-                (void) terminal_set_size_fd(p->stdin_fd, path, context->tty_rows, context->tty_cols);
+        if (p && p->stdin_fd >= 0) {
+                unsigned rows = context->tty_rows, cols = context->tty_cols;
+
+                (void) exec_context_tty_size(context, &rows, &cols);
+                (void) terminal_set_size_fd(p->stdin_fd, path, rows, cols);
+        }
  
          if (context->tty_vt_disallocate && path)
                  (void) vt_disallocate(path);
@@ -479,9 +546,12 @@ static int setup_input(
  
                  /* Try to make this the controlling tty, if it is a tty, and reset it */
                  if (isatty(STDIN_FILENO)) {
+                        unsigned rows = context->tty_rows, cols = context->tty_cols;
+
+                        (void) exec_context_tty_size(context, &rows, &cols);
                          (void) ioctl(STDIN_FILENO, TIOCSCTTY, context->std_input == EXEC_INPUT_TTY_FORCE);
                          (void) reset_terminal_fd(STDIN_FILENO, true);
-                        (void) terminal_set_size_fd(STDIN_FILENO, NULL, context->tty_rows, context->tty_cols);
+                        (void) terminal_set_size_fd(STDIN_FILENO, NULL, rows, cols);
                  }
  
                  return STDIN_FILENO;
@@ -497,6 +567,7 @@ static int setup_input(
          case EXEC_INPUT_TTY:
          case EXEC_INPUT_TTY_FORCE:
          case EXEC_INPUT_TTY_FAIL: {
+                unsigned rows, cols;
                  int fd;
  
                  fd = acquire_terminal(exec_context_tty_path(context),
@@ -507,7 +578,11 @@ static int setup_input(
                  if (fd < 0)
                          return fd;
  
-                r = terminal_set_size_fd(fd, exec_context_tty_path(context), context->tty_rows, context->tty_cols);
+                r = exec_context_tty_size(context, &rows, &cols);
+                if (r < 0)
+                        return r;
+
+                r = terminal_set_size_fd(fd, exec_context_tty_path(context), rows, cols);
                  if (r < 0)
                          return r;
  
@@ -770,6 +845,7 @@ static int setup_confirm_stdio(
                  int *ret_saved_stdout) {
  
          _cleanup_close_ int fd = -EBADF, saved_stdin = -EBADF, saved_stdout = -EBADF;
+        unsigned rows, cols;
          int r;
  
          assert(ret_saved_stdin);
@@ -795,7 +871,11 @@ static int setup_confirm_stdio(
          if (r < 0)
                  return r;
  
-        r = terminal_set_size_fd(fd, vc, context->tty_rows, context->tty_cols);
+        r = exec_context_tty_size(context, &rows, &cols);
+        if (r < 0)
+                return r;
+
+        r = terminal_set_size_fd(fd, vc, rows, cols);
          if (r < 0)
                  return r;
  
@@ -1114,7 +1194,10 @@ static int set_securebits(unsigned bits, unsigned mask) {
          return 1;
  }
  
-static int enforce_user(const ExecContext *context, uid_t uid) {
+static int enforce_user(
+                const ExecContext *context,
+                uid_t uid,
+                uint64_t capability_ambient_set) {
          assert(context);
          int r;
  
@@ -1125,7 +1208,7 @@ static int enforce_user(const ExecContext *context, uid_t uid) {
           * setting secure bits the capability CAP_SETPCAP is required, so we also need keep-caps in this
           * case. */
  
-        if ((context->capability_ambient_set != 0 || context->secure_bits != 0) && uid != 0) {
+        if ((capability_ambient_set != 0 || context->secure_bits != 0) && uid != 0) {
  
                  /* First step: If we need to keep capabilities but drop privileges we need to make sure we
                   * keep our caps, while we drop privileges. Add KEEP_CAPS to the securebits */
@@ -1567,12 +1650,25 @@ static int apply_address_families(const Unit* u, const ExecContext *c) {
  }
  
  static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c) {
+        int r;
+
          assert(u);
          assert(c);
  
          if (!c->memory_deny_write_execute)
                  return 0;
  
+        /* use prctl() if kernel supports it (6.3) */
+        r = prctl(PR_SET_MDWE, PR_MDWE_REFUSE_EXEC_GAIN, 0, 0, 0);
+        if (r == 0) {
+                log_unit_debug(u, "Enabled MemoryDenyWriteExecute= with PR_SET_MDWE");
+                return 0;
+        }
+        if (r < 0 && errno != EINVAL)
+                return log_unit_debug_errno(u, errno, "Failed to enable MemoryDenyWriteExecute= with PR_SET_MDWE: %m");
+        /* else use seccomp */
+        log_unit_debug(u, "Kernel doesn't support PR_SET_MDWE: falling back to seccomp");
+
          if (skip_seccomp_unavailable(u, "MemoryDenyWriteExecute="))
                  return 0;
  
@@ -1805,6 +1901,7 @@ static int build_environment(
                  const Unit *u,
                  const ExecContext *c,
                  const ExecParameters *p,
+                const CGroupContext *cgroup_context,
                  size_t n_fds,
                  char **fdnames,
                  const char *home,
@@ -1812,18 +1909,20 @@ static int build_environment(
                  const char *shell,
                  dev_t journal_stream_dev,
                  ino_t journal_stream_ino,
+                const char *memory_pressure_path,
                  char ***ret) {
  
          _cleanup_strv_free_ char **our_env = NULL;
          size_t n_env = 0;
          char *x;
+        int r;
  
          assert(u);
          assert(c);
          assert(p);
          assert(ret);
  
-#define N_ENV_VARS 17
+#define N_ENV_VARS 19
          our_env = new0(char*, N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
          if (!our_env)
                  return -ENOMEM;
@@ -1907,6 +2006,7 @@ static int build_environment(
          }
  
          if (exec_context_needs_term(c)) {
+                _cleanup_free_ char *cmdline = NULL;
                  const char *tty_path, *term = NULL;
  
                  tty_path = exec_context_tty_path(c);
@@ -1917,6 +2017,19 @@ static int build_environment(
  
                  if (path_equal_ptr(tty_path, "/dev/console") && getppid() == 1)
                          term = getenv("TERM");
+                else if (tty_path && in_charset(skip_dev_prefix(tty_path), ALPHANUMERICAL)) {
+                        _cleanup_free_ char *key = NULL;
+
+                        key = strjoin("systemd.tty.term.", skip_dev_prefix(tty_path));
+                        if (!key)
+                                return -ENOMEM;
+
+                        r = proc_cmdline_get_key(key, 0, &cmdline);
+                        if (r < 0)
+                                log_debug_errno(r, "Failed to read %s from kernel cmdline, ignoring: %m", key);
+                        else if (r > 0)
+                                term = cmdline;
+                }
  
                  if (!term)
                          term = default_term_for_tty(tty_path);
@@ -1987,8 +2100,35 @@ static int build_environment(
  
          our_env[n_env++] = x;
  
-        our_env[n_env++] = NULL;
-        assert(n_env <= N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
+        if (memory_pressure_path) {
+                x = strjoin("MEMORY_PRESSURE_WATCH=", memory_pressure_path);
+                if (!x)
+                        return -ENOMEM;
+
+                our_env[n_env++] = x;
+
+                if (cgroup_context && !path_equal(memory_pressure_path, "/dev/null")) {
+                        _cleanup_free_ char *b = NULL, *e = NULL;
+
+                        if (asprintf(&b, "%s " USEC_FMT " " USEC_FMT,
+                                     MEMORY_PRESSURE_DEFAULT_TYPE,
+                                     cgroup_context->memory_pressure_threshold_usec == USEC_INFINITY ? MEMORY_PRESSURE_DEFAULT_THRESHOLD_USEC :
+                                     CLAMP(cgroup_context->memory_pressure_threshold_usec, 1U, MEMORY_PRESSURE_DEFAULT_WINDOW_USEC),
+                                     MEMORY_PRESSURE_DEFAULT_WINDOW_USEC) < 0)
+                                return -ENOMEM;
+
+                        if (base64mem(b, strlen(b) + 1, &e) < 0)
+                                return -ENOMEM;
+
+                        x = strjoin("MEMORY_PRESSURE_WRITE=", e);
+                        if (!x)
+                                return -ENOMEM;
+
+                        our_env[n_env++] = x;
+                }
+        }
+
+        assert(n_env < N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
  #undef N_ENV_VARS
  
          *ret = TAKE_PTR(our_env);
@@ -2038,7 +2178,7 @@ static bool exec_needs_ipc_namespace(const ExecContext *context) {
  bool exec_needs_mount_namespace(
                  const ExecContext *context,
                  const ExecParameters *params,
-                const ExecRuntime *runtime) {
+                const ExecSharedRuntime *runtime) {
  
          assert(context);
  
@@ -2067,14 +2207,15 @@ bool exec_needs_mount_namespace(
          if (!strv_isempty(context->extension_directories))
                  return true;
  
-        if (!IN_SET(context->mount_flags, 0, MS_SHARED))
+        if (!IN_SET(context->mount_propagation_flag, 0, MS_SHARED))
                  return true;
  
          if (context->private_tmp && runtime && (runtime->tmp_dir || runtime->var_tmp_dir))
                  return true;
  
          if (context->private_devices ||
-            context->private_mounts ||
+            context->private_mounts > 0 ||
+            (context->private_mounts < 0 && exec_needs_network_namespace(context)) ||
              context->protect_system != PROTECT_SYSTEM_NO ||
              context->protect_home != PROTECT_HOME_NO ||
              context->protect_kernel_tunables ||
@@ -2461,7 +2602,7 @@ static int setup_exec_directory(
                                   * since they all support the private/ symlink logic at least in some
                                   * configurations, see above. */
  
-                                r = chase_symlinks(target, NULL, 0, &target_resolved, NULL);
+                                r = chase(target, NULL, 0, &target_resolved, NULL);
                                  if (r < 0)
                                          goto fail;
  
@@ -2472,7 +2613,7 @@ static int setup_exec_directory(
                                  }
  
                                  /* /var/lib or friends may be symlinks. So, let's chase them also. */
-                                r = chase_symlinks(q, NULL, CHASE_NONEXISTENT, &q_resolved, NULL);
+                                r = chase(q, NULL, CHASE_NONEXISTENT, &q_resolved, NULL);
                                  if (r < 0)
                                          goto fail;
  
@@ -3286,7 +3427,7 @@ static int setup_smack(
                  if (r < 0 && !ERRNO_IS_XATTR_ABSENT(r))
                          return r;
  
-                r = mac_smack_apply_pid(0, exec_label ? : manager->default_smack_process_label);
+                r = mac_smack_apply_pid(0, exec_label ?: manager->default_smack_process_label);
                  if (r < 0)
                          return r;
          }
@@ -3542,14 +3683,17 @@ static int apply_mount_namespace(
                  ExecCommandFlags command_flags,
                  const ExecContext *context,
                  const ExecParameters *params,
-                const ExecRuntime *runtime,
+                const ExecSharedRuntime *runtime,
+                const char *memory_pressure_path,
                  char **error_path) {
  
-        _cleanup_strv_free_ char **empty_directories = NULL, **symlinks = NULL;
+        _cleanup_strv_free_ char **empty_directories = NULL, **symlinks = NULL,
+                        **read_write_paths_cleanup = NULL;
          const char *tmp_dir = NULL, *var_tmp_dir = NULL;
          const char *root_dir = NULL, *root_image = NULL;
          _cleanup_free_ char *creds_path = NULL, *incoming_dir = NULL, *propagate_dir = NULL,
                          *extension_dir = NULL;
+        char **read_write_paths;
          NamespaceInfo ns_info;
          bool needs_sandboxing;
          BindMount *bind_mounts = NULL;
@@ -3574,6 +3718,23 @@ static int apply_mount_namespace(
          if (r < 0)
                  goto finalize;
  
+        /* We need to make the pressure path writable even if /sys/fs/cgroup is made read-only, as the
+         * service will need to write to it in order to start the notifications. */
+        if (context->protect_control_groups && memory_pressure_path && !streq(memory_pressure_path, "/dev/null")) {
+                read_write_paths_cleanup = strv_copy(context->read_write_paths);
+                if (!read_write_paths_cleanup) {
+                        r = -ENOMEM;
+                        goto finalize;
+                }
+
+                r = strv_extend(&read_write_paths_cleanup, memory_pressure_path);
+                if (r < 0)
+                        goto finalize;
+
+                read_write_paths = read_write_paths_cleanup;
+        } else
+                read_write_paths = context->read_write_paths;
+
          needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command_flags & EXEC_COMMAND_FULLY_PRIVILEGED);
          if (needs_sandboxing) {
                  /* The runtime struct only contains the parent of the private /tmp,
@@ -3606,6 +3767,7 @@ static int apply_mount_namespace(
                          .protect_system = context->protect_system,
                          .protect_proc = context->protect_proc,
                          .proc_subset = context->proc_subset,
+                        .private_network = exec_needs_network_namespace(context),
                          .private_ipc = exec_needs_ipc_namespace(context),
                          /* If NNP is on, we can turn on MS_NOSUID, since it won't have any effect anymore. */
                          .mount_nosuid = context->no_new_privileges && !mac_selinux_use(),
@@ -3622,7 +3784,7 @@ static int apply_mount_namespace(
          else
                  ns_info = (NamespaceInfo) {};
  
-        if (context->mount_flags == MS_SHARED)
+        if (context->mount_propagation_flag == MS_SHARED)
                  log_unit_debug(u, "shared mount propagation hidden by other fs namespacing unit settings: ignoring");
  
          if (exec_context_has_credentials(context) &&
@@ -3660,7 +3822,7 @@ static int apply_mount_namespace(
                  }
  
          r = setup_namespace(root_dir, root_image, context->root_image_options,
-                            &ns_info, context->read_write_paths,
+                            &ns_info, read_write_paths,
                              needs_sandboxing ? context->read_only_paths : NULL,
                              needs_sandboxing ? context->inaccessible_paths : NULL,
                              needs_sandboxing ? context->exec_paths : NULL,
@@ -3677,7 +3839,7 @@ static int apply_mount_namespace(
                              var_tmp_dir,
                              creds_path,
                              context->log_namespace,
-                            context->mount_flags,
+                            context->mount_propagation_flag,
                              context->root_hash, context->root_hash_size, context->root_hash_path,
                              context->root_hash_sig, context->root_hash_sig_size, context->root_hash_sig_path,
                              context->root_verity,
@@ -3894,7 +4056,7 @@ static void append_socket_pair(int *array, size_t *n, const int pair[static 2])
  
  static int close_remaining_fds(
                  const ExecParameters *params,
-                const ExecRuntime *runtime,
+                const ExecSharedRuntime *runtime,
                  const DynamicCreds *dcreds,
                  int user_lookup_fd,
                  int socket_fd,
@@ -3957,9 +4119,9 @@ static int send_user_lookup(
  
          if (writev(user_lookup_fd,
                 (struct iovec[]) {
-                           IOVEC_INIT(&uid, sizeof(uid)),
-                           IOVEC_INIT(&gid, sizeof(gid)),
-                           IOVEC_INIT_STRING(unit->id) }, 3) < 0)
+                           IOVEC_MAKE(&uid, sizeof(uid)),
+                           IOVEC_MAKE(&gid, sizeof(gid)),
+                           IOVEC_MAKE_STRING(unit->id) }, 3) < 0)
                  return -errno;
  
          return 0;
@@ -4163,9 +4325,10 @@ static int get_open_file_fd(Unit *u, const OpenFile *of) {
  
          ofd = open(of->path, O_PATH | O_CLOEXEC);
          if (ofd < 0)
-                return log_error_errno(errno, "Could not open \"%s\": %m", of->path);
+                return log_unit_error_errno(u, errno, "Could not open \"%s\": %m", of->path);
+
          if (fstat(ofd, &st) < 0)
-                return log_error_errno(errno, "Failed to stat %s: %m", of->path);
+                return log_unit_error_errno(u, errno, "Failed to stat %s: %m", of->path);
  
          if (S_ISSOCK(st.st_mode)) {
                  fd = connect_unix_harder(u, of, ofd);
@@ -4173,7 +4336,8 @@ static int get_open_file_fd(Unit *u, const OpenFile *of) {
                          return fd;
  
                  if (FLAGS_SET(of->flags, OPENFILE_READ_ONLY) && shutdown(fd, SHUT_WR) < 0)
-                        return log_error_errno(errno, "Failed to shutdown send for socket %s: %m", of->path);
+                        return log_unit_error_errno(u, errno, "Failed to shutdown send for socket %s: %m",
+                                                    of->path);
  
                  log_unit_debug(u, "socket %s opened (fd=%d)", of->path, fd);
          } else {
@@ -4239,8 +4403,9 @@ static int exec_child(
                  const ExecCommand *command,
                  const ExecContext *context,
                  const ExecParameters *params,
-                ExecRuntime *runtime,
+                ExecSharedRuntime *runtime,
                  DynamicCreds *dcreds,
+                const CGroupContext *cgroup_context,
                  int socket_fd,
                  const int named_iofds[static 3],
                  int *params_fds,
@@ -4254,7 +4419,7 @@ static int exec_child(
          int r, ngids = 0, exec_fd;
          _cleanup_free_ gid_t *supplementary_gids = NULL;
          const char *username = NULL, *groupname = NULL;
-        _cleanup_free_ char *home_buffer = NULL;
+        _cleanup_free_ char *home_buffer = NULL, *memory_pressure_path = NULL;
          const char *home = NULL, *shell = NULL;
          char **final_argv = NULL;
          dev_t journal_stream_dev = 0;
@@ -4323,6 +4488,7 @@ static int exec_child(
  
          log_forget_fds();
          log_set_open_when_needed(true);
+        log_settle_target();
  
          /* In case anything used libc syslog(), close this here, too */
          closelog();
@@ -4412,7 +4578,7 @@ static int exec_child(
           * invocations themselves. Also note that while we'll only invoke NSS modules involved in user management they
           * might internally call into other NSS modules that are involved in hostname resolution, we never know. */
          if (setenv("SYSTEMD_ACTIVATION_UNIT", unit->id, true) != 0 ||
-            setenv("SYSTEMD_ACTIVATION_SCOPE", MANAGER_IS_SYSTEM(unit->manager) ? "system" : "user", true) != 0) {
+            setenv("SYSTEMD_ACTIVATION_SCOPE", runtime_scope_to_string(unit->manager->runtime_scope), true) != 0) {
                  *exit_status = EXIT_MEMORY;
                  return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
          }
@@ -4619,11 +4785,13 @@ static int exec_child(
  
          if (mpol_is_valid(numa_policy_get_type(&context->numa_policy))) {
                  r = apply_numa_policy(&context->numa_policy);
-                if (r == -EOPNOTSUPP)
-                        log_unit_debug_errno(unit, r, "NUMA support not available, ignoring.");
-                else if (r < 0) {
-                        *exit_status = EXIT_NUMA_POLICY;
-                        return log_unit_error_errno(unit, r, "Failed to set NUMA memory policy: %m");
+                if (r < 0) {
+                        if (ERRNO_IS_NOT_SUPPORTED(r))
+                                log_unit_debug_errno(unit, r, "NUMA support not available, ignoring.");
+                        else {
+                                *exit_status = EXIT_NUMA_POLICY;
+                                return log_unit_error_errno(unit, r, "Failed to set NUMA memory policy: %m");
+                        }
                  }
          }
  
@@ -4667,15 +4835,41 @@ static int exec_child(
                  }
          }
  
-        /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroup v1
-         * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
-         * safe. On cgroup v2 there's only one hierarchy anyway, and delegation is safe there, hence in that case only
-         * touch a single hierarchy too. */
-        if (params->cgroup_path && context->user && (params->flags & EXEC_CGROUP_DELEGATE)) {
-                r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, uid, gid);
-                if (r < 0) {
-                        *exit_status = EXIT_CGROUP;
-                        return log_unit_error_errno(unit, r, "Failed to adjust control group access: %m");
+        if (params->cgroup_path) {
+                /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroup v1
+                 * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
+                 * safe. On cgroup v2 there's only one hierarchy anyway, and delegation is safe there, hence in that case only
+                 * touch a single hierarchy too. */
+
+                if (params->flags & EXEC_CGROUP_DELEGATE) {
+                        r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, uid, gid);
+                        if (r < 0) {
+                                *exit_status = EXIT_CGROUP;
+                                return log_unit_error_errno(unit, r, "Failed to adjust control group access: %m");
+                        }
+                }
+
+                if (cgroup_context && cg_unified() > 0 && is_pressure_supported() > 0) {
+                        if (cgroup_context_want_memory_pressure(cgroup_context)) {
+                                r = cg_get_path("memory", params->cgroup_path, "memory.pressure", &memory_pressure_path);
+                                if (r < 0) {
+                                        *exit_status = EXIT_MEMORY;
+                                        return log_oom();
+                                }
+
+                                r = chmod_and_chown(memory_pressure_path, 0644, uid, gid);
+                                if (r < 0) {
+                                        log_unit_full_errno(unit, r == -ENOENT || ERRNO_IS_PRIVILEGE(r) ? LOG_DEBUG : LOG_WARNING, r,
+                                                            "Failed to adjust ownership of '%s', ignoring: %m", memory_pressure_path);
+                                        memory_pressure_path = mfree(memory_pressure_path);
+                                }
+                        } else if (cgroup_context->memory_pressure_watch == CGROUP_PRESSURE_WATCH_OFF) {
+                                memory_pressure_path = strdup("/dev/null"); /* /dev/null is explicit indicator for turning off memory pressure watch */
+                                if (!memory_pressure_path) {
+                                        *exit_status = EXIT_MEMORY;
+                                        return log_oom();
+                                }
+                        }
                  }
          }
  
@@ -4699,6 +4893,7 @@ static int exec_child(
                          unit,
                          context,
                          params,
+                        cgroup_context,
                          n_fds,
                          fdnames,
                          home,
@@ -4706,6 +4901,7 @@ static int exec_child(
                          shell,
                          journal_stream_dev,
                          journal_stream_ino,
+                        memory_pressure_path,
                          &our_env);
          if (r < 0) {
                  *exit_status = EXIT_MEMORY;
@@ -4773,6 +4969,8 @@ static int exec_child(
          else
                  needs_setuid = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID));
  
+        uint64_t capability_ambient_set = context->capability_ambient_set;
+
          if (needs_sandboxing) {
                  /* MAC enablement checks need to be done before a new mount ns is created, as they rely on
                   * /sys being present. The actual MAC context application will happen later, as late as
@@ -4813,6 +5011,20 @@ static int exec_child(
                          return log_unit_error_errno(unit, r, "Failed to set up PAM session: %m");
                  }
  
+                if (ambient_capabilities_supported()) {
+                        uint64_t ambient_after_pam;
+
+                        /* PAM modules might have set some ambient caps. Query them here and merge them into
+                         * the caps we want to set in the end, so that we don't end up unsetting them. */
+                        r = capability_get_ambient(&ambient_after_pam);
+                        if (r < 0) {
+                                *exit_status = EXIT_CAPABILITIES;
+                                return log_unit_error_errno(unit, r, "Failed to query ambient caps: %m");
+                        }
+
+                        capability_ambient_set |= ambient_after_pam;
+                }
+
                  ngids_after_pam = getgroups_alloc(&gids_after_pam);
                  if (ngids_after_pam < 0) {
                          *exit_status = EXIT_MEMORY;
@@ -4837,12 +5049,14 @@ static int exec_child(
  
                  if (ns_type_supported(NAMESPACE_NET)) {
                          r = setup_shareable_ns(runtime->netns_storage_socket, CLONE_NEWNET);
-                        if (r == -EPERM)
-                                log_unit_warning_errno(unit, r,
-                                                       "PrivateNetwork=yes is configured, but network namespace setup failed, ignoring: %m");
-                        else if (r < 0) {
-                                *exit_status = EXIT_NETWORK;
-                                return log_unit_error_errno(unit, r, "Failed to set up network namespacing: %m");
+                        if (r < 0) {
+                                if (ERRNO_IS_PRIVILEGE(r))
+                                        log_unit_warning_errno(unit, r,
+                                                               "PrivateNetwork=yes is configured, but network namespace setup failed, ignoring: %m");
+                                else {
+                                        *exit_status = EXIT_NETWORK;
+                                        return log_unit_error_errno(unit, r, "Failed to set up network namespacing: %m");
+                                }
                          }
                  } else if (context->network_namespace_path) {
                          *exit_status = EXIT_NETWORK;
@@ -4874,7 +5088,7 @@ static int exec_child(
          if (needs_mount_namespace) {
                  _cleanup_free_ char *error_path = NULL;
  
-                r = apply_mount_namespace(unit, command->flags, context, params, runtime, &error_path);
+                r = apply_mount_namespace(unit, command->flags, context, params, runtime, memory_pressure_path, &error_path);
                  if (r < 0) {
                          *exit_status = EXIT_NAMESPACE;
                          return log_unit_error_errno(unit, r, "Failed to set up mount namespacing%s%s: %m",
@@ -5041,7 +5255,7 @@ static int exec_child(
                                  (UINT64_C(1) << CAP_SETGID);
  
                  if (!cap_test_all(bset)) {
-                        r = capability_bounding_set_drop(bset, false);
+                        r = capability_bounding_set_drop(bset, /* right_now= */ false);
                          if (r < 0) {
                                  *exit_status = EXIT_CAPABILITIES;
                                  return log_unit_error_errno(unit, r, "Failed to drop capabilities: %m");
@@ -5060,7 +5274,7 @@ static int exec_child(
                   * The requested ambient capabilities are raised in the inheritable set if the second
                   * argument is true. */
                  if (!needs_ambient_hack) {
-                        r = capability_ambient_set_apply(context->capability_ambient_set, true);
+                        r = capability_ambient_set_apply(capability_ambient_set, /* also_inherit= */ true);
                          if (r < 0) {
                                  *exit_status = EXIT_CAPABILITIES;
                                  return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (before UID change): %m");
@@ -5075,17 +5289,16 @@ static int exec_child(
  
          if (needs_setuid) {
                  if (uid_is_valid(uid)) {
-                        r = enforce_user(context, uid);
+                        r = enforce_user(context, uid, capability_ambient_set);
                          if (r < 0) {
                                  *exit_status = EXIT_USER;
                                  return log_unit_error_errno(unit, r, "Failed to change UID to " UID_FMT ": %m", uid);
                          }
  
-                        if (!needs_ambient_hack &&
-                            context->capability_ambient_set != 0) {
+                        if (!needs_ambient_hack && capability_ambient_set != 0) {
  
                                  /* Raise the ambient capabilities after user change. */
-                                r = capability_ambient_set_apply(context->capability_ambient_set, false);
+                                r = capability_ambient_set_apply(capability_ambient_set, /* also_inherit= */ false);
                                  if (r < 0) {
                                          *exit_status = EXIT_CAPABILITIES;
                                          return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (after UID change): %m");
@@ -5336,8 +5549,9 @@ int exec_spawn(Unit *unit,
                 ExecCommand *command,
                 const ExecContext *context,
                 const ExecParameters *params,
-               ExecRuntime *runtime,
+               ExecSharedRuntime *runtime,
                 DynamicCreds *dcreds,
+               const CGroupContext *cgroup_context,
                 pid_t *ret) {
  
          int socket_fd, r, named_iofds[3] = { -1, -1, -1 }, *fds = NULL;
@@ -5354,6 +5568,8 @@ int exec_spawn(Unit *unit,
          assert(params);
          assert(params->fds || (params->n_socket_fds + params->n_storage_fds <= 0));
  
+        LOG_CONTEXT_PUSH_UNIT(unit);
+
          if (context->std_input == EXEC_INPUT_SOCKET ||
              context->std_output == EXEC_OUTPUT_SOCKET ||
              context->std_error == EXEC_OUTPUT_SOCKET) {
@@ -5425,6 +5641,7 @@ int exec_spawn(Unit *unit,
                                 params,
                                 runtime,
                                 dcreds,
+                               cgroup_context,
                                 socket_fd,
                                 named_iofds,
                                 fds,
@@ -5488,6 +5705,7 @@ void exec_context_init(ExecContext *c) {
          c->tty_rows = UINT_MAX;
          c->tty_cols = UINT_MAX;
          numa_policy_reset(&c->numa_policy);
+        c->private_mounts = -1;
  }
  
  void exec_context_done(ExecContext *c) {
@@ -6564,6 +6782,23 @@ int exec_context_get_clean_mask(ExecContext *c, ExecCleanMask *ret) {
          return 0;
  }
  
+bool exec_context_has_encrypted_credentials(ExecContext *c) {
+        ExecLoadCredential *load_cred;
+        ExecSetCredential *set_cred;
+
+        assert(c);
+
+        HASHMAP_FOREACH(load_cred, c->load_credentials)
+                if (load_cred->encrypted)
+                        return true;
+
+        HASHMAP_FOREACH(set_cred, c->set_credentials)
+                if (set_cred->encrypted)
+                        return true;
+
+        return false;
+}
+
  void exec_status_start(ExecStatus *s, pid_t pid) {
          assert(s);
  
@@ -6721,14 +6956,14 @@ static void *remove_tmpdir_thread(void *p) {
          return NULL;
  }
  
-static ExecRuntime* exec_runtime_free(ExecRuntime *rt, bool destroy) {
+static ExecSharedRuntime* exec_shared_runtime_free(ExecSharedRuntime *rt, bool destroy) {
          int r;
  
          if (!rt)
                  return NULL;
  
          if (rt->manager)
-                (void) hashmap_remove(rt->manager->exec_runtime_by_id, rt->id);
+                (void) hashmap_remove(rt->manager->exec_shared_runtime_by_id, rt->id);
  
          /* When destroy is true, then rm_rf tmp_dir and var_tmp_dir. */
  
@@ -6760,13 +6995,13 @@ static ExecRuntime* exec_runtime_free(ExecRuntime *rt, bool destroy) {
          return mfree(rt);
  }
  
-static void exec_runtime_freep(ExecRuntime **rt) {
-        (void) exec_runtime_free(*rt, false);
+static void exec_shared_runtime_freep(ExecSharedRuntime **rt) {
+        (void) exec_shared_runtime_free(*rt, false);
  }
  
-static int exec_runtime_allocate(ExecRuntime **ret, const char *id) {
+static int exec_shared_runtime_allocate(ExecSharedRuntime **ret, const char *id) {
          _cleanup_free_ char *id_copy = NULL;
-        ExecRuntime *n;
+        ExecSharedRuntime *n;
  
          assert(ret);
  
@@ -6774,11 +7009,11 @@ static int exec_runtime_allocate(ExecRuntime **ret, const char *id) {
          if (!id_copy)
                  return -ENOMEM;
  
-        n = new(ExecRuntime, 1);
+        n = new(ExecSharedRuntime, 1);
          if (!n)
                  return -ENOMEM;
  
-        *n = (ExecRuntime) {
+        *n = (ExecSharedRuntime) {
                  .id = TAKE_PTR(id_copy),
                  .netns_storage_socket = PIPE_EBADF,
                  .ipcns_storage_socket = PIPE_EBADF,
@@ -6788,16 +7023,16 @@ static int exec_runtime_allocate(ExecRuntime **ret, const char *id) {
          return 0;
  }
  
-static int exec_runtime_add(
+static int exec_shared_runtime_add(
                  Manager *m,
                  const char *id,
                  char **tmp_dir,
                  char **var_tmp_dir,
                  int netns_storage_socket[2],
                  int ipcns_storage_socket[2],
-                ExecRuntime **ret) {
+                ExecSharedRuntime **ret) {
  
-        _cleanup_(exec_runtime_freep) ExecRuntime *rt = NULL;
+        _cleanup_(exec_shared_runtime_freep) ExecSharedRuntime *rt = NULL;
          int r;
  
          assert(m);
@@ -6805,11 +7040,11 @@ static int exec_runtime_add(
  
          /* tmp_dir, var_tmp_dir, {net,ipc}ns_storage_socket fds are donated on success */
  
-        r = exec_runtime_allocate(&rt, id);
+        r = exec_shared_runtime_allocate(&rt, id);
          if (r < 0)
                  return r;
  
-        r = hashmap_ensure_put(&m->exec_runtime_by_id, &string_hash_ops, rt->id, rt);
+        r = hashmap_ensure_put(&m->exec_shared_runtime_by_id, &string_hash_ops, rt->id, rt);
          if (r < 0)
                  return r;
  
@@ -6831,16 +7066,16 @@ static int exec_runtime_add(
  
          if (ret)
                  *ret = rt;
-        /* do not remove created ExecRuntime object when the operation succeeds. */
+        /* do not remove created ExecSharedRuntime object when the operation succeeds. */
          TAKE_PTR(rt);
          return 0;
  }
  
-static int exec_runtime_make(
+static int exec_shared_runtime_make(
                  Manager *m,
                  const ExecContext *c,
                  const char *id,
-                ExecRuntime **ret) {
+                ExecSharedRuntime **ret) {
  
          _cleanup_(namespace_cleanup_tmpdirp) char *tmp_dir = NULL, *var_tmp_dir = NULL;
          _cleanup_close_pair_ int netns_storage_socket[2] = PIPE_EBADF, ipcns_storage_socket[2] = PIPE_EBADF;
@@ -6850,7 +7085,7 @@ static int exec_runtime_make(
          assert(c);
          assert(id);
  
-        /* It is not necessary to create ExecRuntime object. */
+        /* It is not necessary to create ExecSharedRuntime object. */
          if (!exec_needs_network_namespace(c) && !exec_needs_ipc_namespace(c) && !c->private_tmp) {
                  *ret = NULL;
                  return 0;
@@ -6875,24 +7110,24 @@ static int exec_runtime_make(
                          return -errno;
          }
  
-        r = exec_runtime_add(m, id, &tmp_dir, &var_tmp_dir, netns_storage_socket, ipcns_storage_socket, ret);
+        r = exec_shared_runtime_add(m, id, &tmp_dir, &var_tmp_dir, netns_storage_socket, ipcns_storage_socket, ret);
          if (r < 0)
                  return r;
  
          return 1;
  }
  
-int exec_runtime_acquire(Manager *m, const ExecContext *c, const char *id, bool create, ExecRuntime **ret) {
-        ExecRuntime *rt;
+int exec_shared_runtime_acquire(Manager *m, const ExecContext *c, const char *id, bool create, ExecSharedRuntime **ret) {
+        ExecSharedRuntime *rt;
          int r;
  
          assert(m);
          assert(id);
          assert(ret);
  
-        rt = hashmap_get(m->exec_runtime_by_id, id);
+        rt = hashmap_get(m->exec_shared_runtime_by_id, id);
          if (rt)
-                /* We already have an ExecRuntime object, let's increase the ref count and reuse it */
+                /* We already have an ExecSharedRuntime object, let's increase the ref count and reuse it */
                  goto ref;
  
          if (!create) {
@@ -6901,11 +7136,11 @@ int exec_runtime_acquire(Manager *m, const ExecContext *c, const char *id, bool
          }
  
          /* If not found, then create a new object. */
-        r = exec_runtime_make(m, c, id, &rt);
+        r = exec_shared_runtime_make(m, c, id, &rt);
          if (r < 0)
                  return r;
          if (r == 0) {
-                /* When r == 0, it is not necessary to create ExecRuntime object. */
+                /* When r == 0, it is not necessary to create ExecSharedRuntime object. */
                  *ret = NULL;
                  return 0;
          }
@@ -6917,7 +7152,7 @@ ref:
          return 1;
  }
  
-ExecRuntime *exec_runtime_unref(ExecRuntime *rt, bool destroy) {
+ExecSharedRuntime *exec_shared_runtime_unref(ExecSharedRuntime *rt, bool destroy) {
          if (!rt)
                  return NULL;
  
@@ -6927,17 +7162,17 @@ ExecRuntime *exec_runtime_unref(ExecRuntime *rt, bool destroy) {
          if (rt->n_ref > 0)
                  return NULL;
  
-        return exec_runtime_free(rt, destroy);
+        return exec_shared_runtime_free(rt, destroy);
  }
  
-int exec_runtime_serialize(const Manager *m, FILE *f, FDSet *fds) {
-        ExecRuntime *rt;
+int exec_shared_runtime_serialize(const Manager *m, FILE *f, FDSet *fds) {
+        ExecSharedRuntime *rt;
  
          assert(m);
          assert(f);
          assert(fds);
  
-        HASHMAP_FOREACH(rt, m->exec_runtime_by_id) {
+        HASHMAP_FOREACH(rt, m->exec_shared_runtime_by_id) {
                  fprintf(f, "exec-runtime=%s", rt->id);
  
                  if (rt->tmp_dir)
@@ -6992,33 +7227,33 @@ int exec_runtime_serialize(const Manager *m, FILE *f, FDSet *fds) {
          return 0;
  }
  
-int exec_runtime_deserialize_compat(Unit *u, const char *key, const char *value, FDSet *fds) {
-        _cleanup_(exec_runtime_freep) ExecRuntime *rt_create = NULL;
-        ExecRuntime *rt;
+int exec_shared_runtime_deserialize_compat(Unit *u, const char *key, const char *value, FDSet *fds) {
+        _cleanup_(exec_shared_runtime_freep) ExecSharedRuntime *rt_create = NULL;
+        ExecSharedRuntime *rt;
          int r;
  
          /* This is for the migration from old (v237 or earlier) deserialization text.
           * Due to the bug #7790, this may not work with the units that use JoinsNamespaceOf=.
-         * Even if the ExecRuntime object originally created by the other unit, we cannot judge
+         * Even if the ExecSharedRuntime object originally created by the other unit, we cannot judge
           * so or not from the serialized text, then we always creates a new object owned by this. */
  
          assert(u);
          assert(key);
          assert(value);
  
-        /* Manager manages ExecRuntime objects by the unit id.
+        /* Manager manages ExecSharedRuntime objects by the unit id.
           * So, we omit the serialized text when the unit does not have id (yet?)... */
          if (isempty(u->id)) {
                  log_unit_debug(u, "Invocation ID not found. Dropping runtime parameter.");
                  return 0;
          }
  
-        if (hashmap_ensure_allocated(&u->manager->exec_runtime_by_id, &string_hash_ops) < 0)
+        if (hashmap_ensure_allocated(&u->manager->exec_shared_runtime_by_id, &string_hash_ops) < 0)
                  return log_oom();
  
-        rt = hashmap_get(u->manager->exec_runtime_by_id, u->id);
+        rt = hashmap_get(u->manager->exec_shared_runtime_by_id, u->id);
          if (!rt) {
-                if (exec_runtime_allocate(&rt_create, u->id) < 0)
+                if (exec_shared_runtime_allocate(&rt_create, u->id) < 0)
                          return log_oom();
  
                  rt = rt_create;
@@ -7057,9 +7292,9 @@ int exec_runtime_deserialize_compat(Unit *u, const char *key, const char *value,
          } else
                  return 0;
  
-        /* If the object is newly created, then put it to the hashmap which manages ExecRuntime objects. */
+        /* If the object is newly created, then put it to the hashmap which manages ExecSharedRuntime objects. */
          if (rt_create) {
-                r = hashmap_put(u->manager->exec_runtime_by_id, rt_create->id, rt_create);
+                r = hashmap_put(u->manager->exec_shared_runtime_by_id, rt_create->id, rt_create);
                  if (r < 0) {
                          log_unit_debug_errno(u, r, "Failed to put runtime parameter to manager's storage: %m");
                          return 0;
@@ -7074,7 +7309,7 @@ int exec_runtime_deserialize_compat(Unit *u, const char *key, const char *value,
          return 1;
  }
  
-int exec_runtime_deserialize_one(Manager *m, const char *value, FDSet *fds) {
+int exec_shared_runtime_deserialize_one(Manager *m, const char *value, FDSet *fds) {
          _cleanup_free_ char *tmp_dir = NULL, *var_tmp_dir = NULL;
          char *id = NULL;
          int r, netns_fdpair[] = {-1, -1}, ipcns_fdpair[] = {-1, -1};
@@ -7186,24 +7421,24 @@ int exec_runtime_deserialize_one(Manager *m, const char *value, FDSet *fds) {
          }
  
  finalize:
-        r = exec_runtime_add(m, id, &tmp_dir, &var_tmp_dir, netns_fdpair, ipcns_fdpair, NULL);
+        r = exec_shared_runtime_add(m, id, &tmp_dir, &var_tmp_dir, netns_fdpair, ipcns_fdpair, NULL);
          if (r < 0)
                  return log_debug_errno(r, "Failed to add exec-runtime: %m");
          return 0;
  }
  
-void exec_runtime_vacuum(Manager *m) {
-        ExecRuntime *rt;
+void exec_shared_runtime_vacuum(Manager *m) {
+        ExecSharedRuntime *rt;
  
          assert(m);
  
-        /* Free unreferenced ExecRuntime objects. This is used after manager deserialization process. */
+        /* Free unreferenced ExecSharedRuntime objects. This is used after manager deserialization process. */
  
-        HASHMAP_FOREACH(rt, m->exec_runtime_by_id) {
+        HASHMAP_FOREACH(rt, m->exec_shared_runtime_by_id) {
                  if (rt->n_ref > 0)
                          continue;
  
-                (void) exec_runtime_free(rt, false);
+                (void) exec_shared_runtime_free(rt, false);
          }
  }