]> git.ipfire.org Git - thirdparty/systemd.git/commitdiff
core: do not imply PrivateTmp with DynamicUser, create a private tmpfs instead
authorLuca Boccassi <bluca@debian.org>
Wed, 8 May 2024 19:12:57 +0000 (20:12 +0100)
committerLuca Boccassi <bluca@debian.org>
Mon, 17 Jun 2024 16:05:55 +0000 (17:05 +0100)
DynamicUser= enables PrivateTmp= implicitly to avoid files owned by reusable uids
leaking into the host. Change it to instead create a fully private tmpfs instance
instead, which also ensures the same result, since it has less impactful semantics
with respect to PrivateTmp=yes, which links the mount namespace to the host's /tmp
instead. If a user specifies PrivateTmp manually, let the existing behaviour
unchanged to ensure backward compatibility is not broken.

15 files changed:
man/systemd.exec.xml
src/core/dbus-execute.c
src/core/dbus-util.c
src/core/dbus-util.h
src/core/exec-invoke.c
src/core/execute-serialize.c
src/core/execute.c
src/core/execute.h
src/core/load-fragment-gperf.gperf.in
src/core/load-fragment.c
src/core/load-fragment.h
src/core/namespace.c
src/core/namespace.h
src/core/unit.c
test/units/TEST-07-PID1.exec-context.sh

index 49099b2ee9773b8b272c498cf01ec0078986ae04..2fd69f6912f03c178442e53fb3a641c9e8bc2b8a 100644 (file)
         part of a unit for which dynamic users/groups are enabled do not leave files or directories owned by
         these users/groups around, as a different unit might get the same UID/GID assigned later on, and thus
         gain access to these files or directories. If <varname>DynamicUser=</varname> is enabled,
-        <varname>RemoveIPC=</varname> and <varname>PrivateTmp=</varname> are implied (and cannot be turned
-        off). This ensures that the lifetime of IPC objects and temporary files created by the executed
-        processes is bound to the runtime of the service, and hence the lifetime of the dynamic
-        user/group. Since <filename>/tmp/</filename> and <filename>/var/tmp/</filename> are usually the only
-        world-writable directories on a system this ensures that a unit making use of dynamic user/group
-        allocation cannot leave files around after unit termination. Furthermore
+        <varname>RemoveIPC=</varname> is implied (and cannot be turned off). This ensures that the lifetime
+        of IPC objects and temporary files created by the executed processes is bound to the runtime of the
+        service, and hence the lifetime of the dynamic user/group. Since <filename>/tmp/</filename> and
+        <filename>/var/tmp/</filename> are usually the only world-writable directories on a system, unless
+        <varname>PrivateTmp=</varname> is manually enabled, those directories will be placed on a private
+        tmpfs filesystem, as this ensures that a unit making use of dynamic user/group allocation cannot
+        leave files around after unit termination. Furthermore
         <varname>NoNewPrivileges=</varname> and <varname>RestrictSUIDSGID=</varname> are implicitly enabled
         (and cannot be disabled), to ensure that processes invoked cannot take benefit or create SUID/SGID
         files or directories. Moreover <varname>ProtectSystem=strict</varname> and
index 21c260b26b820247509e802ed7b58061a43959d8..530a0f7a0be0cf70baa7239005011ef64025fb8a 100644 (file)
@@ -1055,7 +1055,7 @@ const sd_bus_vtable bus_exec_vtable[] = {
         SD_BUS_PROPERTY("NoExecPaths", "as", NULL, offsetof(ExecContext, no_exec_paths), SD_BUS_VTABLE_PROPERTY_CONST),
         SD_BUS_PROPERTY("ExecSearchPath", "as", NULL, offsetof(ExecContext, exec_search_path), SD_BUS_VTABLE_PROPERTY_CONST),
         SD_BUS_PROPERTY("MountFlags", "t", bus_property_get_ulong, offsetof(ExecContext, mount_propagation_flag), SD_BUS_VTABLE_PROPERTY_CONST),
-        SD_BUS_PROPERTY("PrivateTmp", "b", bus_property_get_bool, offsetof(ExecContext, private_tmp), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("PrivateTmp", "b", bus_property_get_private_tmp, offsetof(ExecContext, private_tmp), SD_BUS_VTABLE_PROPERTY_CONST),
         SD_BUS_PROPERTY("PrivateDevices", "b", bus_property_get_bool, offsetof(ExecContext, private_devices), SD_BUS_VTABLE_PROPERTY_CONST),
         SD_BUS_PROPERTY("ProtectClock", "b", bus_property_get_bool, offsetof(ExecContext, protect_clock), SD_BUS_VTABLE_PROPERTY_CONST),
         SD_BUS_PROPERTY("ProtectKernelTunables", "b", bus_property_get_bool, offsetof(ExecContext, protect_kernel_tunables), SD_BUS_VTABLE_PROPERTY_CONST),
@@ -1736,7 +1736,7 @@ int bus_exec_context_set_transient_property(
                 return bus_set_transient_unsigned(u, name, &c->tty_cols, message, flags, error);
 
         if (streq(name, "PrivateTmp"))
-                return bus_set_transient_bool(u, name, &c->private_tmp, message, flags, error);
+                return bus_set_transient_private_tmp(u, name, &c->private_tmp, message, flags, error);
 
         if (streq(name, "PrivateDevices"))
                 return bus_set_transient_bool(u, name, &c->private_devices, message, flags, error);
index b871d89368227c29585acc1514df2d7bb5e0cf45..46676d7a210a471483f984720a06733f28857ac3 100644 (file)
@@ -150,6 +150,45 @@ int bus_set_transient_usec_internal(
         return 1;
 }
 
+int bus_set_transient_private_tmp(
+                Unit *u,
+                const char *name,
+                PrivateTmp *p,
+                sd_bus_message *message,
+                UnitWriteFlags flags,
+                sd_bus_error *error) {
+
+        int v, r;
+
+        assert(p);
+
+        r = sd_bus_message_read(message, "b", &v);
+        if (r < 0)
+                return r;
+
+        if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+                *p = v ? PRIVATE_TMP_CONNECTED : PRIVATE_TMP_OFF;
+                unit_write_settingf(u, flags, name, "%s=%s", name, yes_no(v));
+        }
+
+        return 1;
+}
+
+int bus_property_get_private_tmp(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *reply,
+                void *userdata,
+                sd_bus_error *error) {
+
+        PrivateTmp *p = ASSERT_PTR(userdata);
+        int b = *p != PRIVATE_TMP_OFF;
+
+        return sd_bus_message_append_basic(reply, 'b', &b);
+}
+
 int bus_verify_manage_units_async_full(
                 Unit *u,
                 const char *verb,
index 0fc3a949610a734a2c70b2d67bd0fbec4ede0ec0..29796eb249ffbf040e3e00c1272ad05df46719a2 100644 (file)
@@ -4,6 +4,7 @@
 #include "sd-bus.h"
 
 #include "dissect-image.h"
+#include "execute.h"
 #include "unit.h"
 
 int bus_property_get_triggered_unit(sd_bus *bus, const char *path, const char *interface, const char *property, sd_bus_message *reply, void *userdata, sd_bus_error *error);
@@ -244,6 +245,7 @@ int bus_set_transient_string(Unit *u, const char *name, char **p, sd_bus_message
 int bus_set_transient_bool(Unit *u, const char *name, bool *p, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error);
 int bus_set_transient_tristate(Unit *u, const char *name, int *p, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error);
 int bus_set_transient_usec_internal(Unit *u, const char *name, usec_t *p, bool fix_0, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error);
+int bus_set_transient_private_tmp(Unit *u, const char *name, PrivateTmp *p, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error);
 static inline int bus_set_transient_usec(Unit *u, const char *name, usec_t *p, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error) {
         return bus_set_transient_usec_internal(u, name, p, false, message, flags, error);
 }
@@ -255,3 +257,4 @@ int bus_verify_manage_units_async_full(Unit *u, const char *verb, const char *po
 int bus_read_mount_options(sd_bus_message *message, sd_bus_error *error, MountOptions **ret_options, char **ret_format_str, const char *separator);
 
 int bus_property_get_activation_details(sd_bus *bus, const char *path, const char *interface, const char *property, sd_bus_message *reply, void *userdata, sd_bus_error *error);
+int bus_property_get_private_tmp(sd_bus *bus, const char *path, const char *interface, const char *property, sd_bus_message *reply, void *userdata, sd_bus_error *error);
index ee8db04e7631cb7d9d731c8547957d3531a69e51..8b88ccb1e98dc72e01524b75ef0abfd9213becfd 100644 (file)
@@ -3054,7 +3054,7 @@ static int apply_mount_namespace(
         _cleanup_strv_free_ char **empty_directories = NULL, **symlinks = NULL,
                         **read_write_paths_cleanup = NULL;
         _cleanup_free_ char *creds_path = NULL, *incoming_dir = NULL, *propagate_dir = NULL,
-                *extension_dir = NULL, *host_os_release_stage = NULL, *root_image = NULL, *root_dir = NULL;
+                *private_namespace_dir = NULL, *host_os_release_stage = NULL, *root_image = NULL, *root_dir = NULL;
         const char *tmp_dir = NULL, *var_tmp_dir = NULL;
         char **read_write_paths;
         bool setup_os_release_symlink;
@@ -3108,7 +3108,7 @@ static int apply_mount_namespace(
                  * to world users. Inside of it there's a /tmp that is sticky, and that's the one we want to
                  * use here.  This does not apply when we are using /run/systemd/empty as fallback. */
 
-                if (context->private_tmp && runtime && runtime->shared) {
+                if (context->private_tmp == PRIVATE_TMP_CONNECTED && runtime && runtime->shared) {
                         if (streq_ptr(runtime->shared->tmp_dir, RUN_SYSTEMD_EMPTY))
                                 tmp_dir = runtime->shared->tmp_dir;
                         else if (runtime->shared->tmp_dir)
@@ -3145,8 +3145,8 @@ static int apply_mount_namespace(
                 if (!incoming_dir)
                         return -ENOMEM;
 
-                extension_dir = strdup("/run/systemd/unit-extensions");
-                if (!extension_dir)
+                private_namespace_dir = strdup("/run/systemd");
+                if (!private_namespace_dir)
                         return -ENOMEM;
 
                 /* If running under a different root filesystem, propagate the host's os-release. We make a
@@ -3159,7 +3159,7 @@ static int apply_mount_namespace(
         } else {
                 assert(params->runtime_scope == RUNTIME_SCOPE_USER);
 
-                if (asprintf(&extension_dir, "/run/user/" UID_FMT "/systemd/unit-extensions", geteuid()) < 0)
+                if (asprintf(&private_namespace_dir, "/run/user/" UID_FMT "/systemd", geteuid()) < 0)
                         return -ENOMEM;
 
                 if (setup_os_release_symlink) {
@@ -3205,6 +3205,8 @@ static int apply_mount_namespace(
                 .temporary_filesystems = context->temporary_filesystems,
                 .n_temporary_filesystems = context->n_temporary_filesystems,
 
+                .private_tmp = context->private_tmp,
+
                 .mount_images = context->mount_images,
                 .n_mount_images = context->n_mount_images,
                 .mount_image_policy = context->mount_image_policy ?: &image_policy_service,
@@ -3225,7 +3227,7 @@ static int apply_mount_namespace(
 
                 .propagate_dir = propagate_dir,
                 .incoming_dir = incoming_dir,
-                .extension_dir = extension_dir,
+                .private_namespace_dir = private_namespace_dir,
                 .notify_socket = root_dir || root_image ? params->notify_socket : NULL,
                 .host_os_release_stage = host_os_release_stage,
 
@@ -3857,7 +3859,7 @@ static bool exec_context_need_unprivileged_private_users(
                 return false;
 
         return context->private_users ||
-               context->private_tmp ||
+               context->private_tmp != PRIVATE_TMP_OFF ||
                context->private_devices ||
                context->private_network ||
                context->network_namespace_path ||
index ecd1e70db67fe93a14b43eedd919c301684d72c1..d6c0162a7c2724902157c65cb6e17bec9cc0207e 100644 (file)
@@ -1840,7 +1840,7 @@ static int exec_context_serialize(const ExecContext *c, FILE *f) {
         if (r < 0)
                 return r;
 
-        r = serialize_bool_elide(f, "exec-context-private-tmp", c->private_tmp);
+        r = serialize_item(f, "exec-context-private-tmp", private_tmp_to_string(c->private_tmp));
         if (r < 0)
                 return r;
 
@@ -2720,10 +2720,9 @@ static int exec_context_deserialize(ExecContext *c, FILE *f) {
                         if (r < 0)
                                 return r;
                 } else if ((val = startswith(l, "exec-context-private-tmp="))) {
-                        r = parse_boolean(val);
-                        if (r < 0)
-                                return r;
-                        c->private_tmp = r;
+                        c->private_tmp = private_tmp_from_string(val);
+                        if (c->private_tmp < 0)
+                                return c->private_tmp;
                 } else if ((val = startswith(l, "exec-context-private-devices="))) {
                         r = parse_boolean(val);
                         if (r < 0)
index 513e95e09d7195c7569564b8880689942fec1511..9ba5a77975a730e9e6cf0ca1dc947921af31f93f 100644 (file)
@@ -231,7 +231,10 @@ bool exec_needs_mount_namespace(
         if (!IN_SET(context->mount_propagation_flag, 0, MS_SHARED))
                 return true;
 
-        if (context->private_tmp && runtime && runtime->shared && (runtime->shared->tmp_dir || runtime->shared->var_tmp_dir))
+        if (context->private_tmp == PRIVATE_TMP_DISCONNECTED)
+                return true;
+
+        if (context->private_tmp == PRIVATE_TMP_CONNECTED && runtime && runtime->shared && (runtime->shared->tmp_dir || runtime->shared->var_tmp_dir))
                 return true;
 
         if (context->private_devices ||
@@ -964,7 +967,7 @@ void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) {
                 prefix, empty_to_root(c->root_directory),
                 prefix, yes_no(c->root_ephemeral),
                 prefix, yes_no(c->non_blocking),
-                prefix, yes_no(c->private_tmp),
+                prefix, private_tmp_to_string(c->private_tmp),
                 prefix, yes_no(c->private_devices),
                 prefix, yes_no(c->protect_kernel_tunables),
                 prefix, yes_no(c->protect_kernel_modules),
@@ -2155,12 +2158,12 @@ static int exec_shared_runtime_make(
         assert(id);
 
         /* It is not necessary to create ExecSharedRuntime object. */
-        if (!exec_needs_network_namespace(c) && !exec_needs_ipc_namespace(c) && !c->private_tmp) {
+        if (!exec_needs_network_namespace(c) && !exec_needs_ipc_namespace(c) && c->private_tmp != PRIVATE_TMP_CONNECTED) {
                 *ret = NULL;
                 return 0;
         }
 
-        if (c->private_tmp &&
+        if (c->private_tmp == PRIVATE_TMP_CONNECTED &&
             !(prefixed_path_strv_contains(c->inaccessible_paths, "/tmp") &&
               (prefixed_path_strv_contains(c->inaccessible_paths, "/var/tmp") ||
                prefixed_path_strv_contains(c->inaccessible_paths, "/var")))) {
index 107ae2524387cbc68b3b91dbf80c5dd2cee4f20b..ef8dadfe20270199c088b67180b11bcdb77369f7 100644 (file)
@@ -314,7 +314,7 @@ struct ExecContext {
         int private_mounts;
         int mount_apivfs;
         int memory_ksm;
-        bool private_tmp;
+        PrivateTmp private_tmp;
         bool private_network;
         bool private_devices;
         bool private_users;
index df219d86dfeaa38749d078229f343be74b628b87..46c2198ac71f49f0d898d0a96257851988f95b82 100644 (file)
 {{type}}.BindPaths,                        config_parse_bind_paths,                     0,                                  offsetof({{type}}, exec_context)
 {{type}}.BindReadOnlyPaths,                config_parse_bind_paths,                     0,                                  offsetof({{type}}, exec_context)
 {{type}}.TemporaryFileSystem,              config_parse_temporary_filesystems,          0,                                  offsetof({{type}}, exec_context)
-{{type}}.PrivateTmp,                       config_parse_bool,                           0,                                  offsetof({{type}}, exec_context.private_tmp)
+{{type}}.PrivateTmp,                       config_parse_private_tmp,                    0,                                  offsetof({{type}}, exec_context)
 {{type}}.PrivateDevices,                   config_parse_bool,                           0,                                  offsetof({{type}}, exec_context.private_devices)
 {{type}}.ProtectKernelTunables,            config_parse_bool,                           0,                                  offsetof({{type}}, exec_context.protect_kernel_tunables)
 {{type}}.ProtectKernelModules,             config_parse_bool,                           0,                                  offsetof({{type}}, exec_context.protect_kernel_modules)
index 4ff1e66d7ba83b6fa9a64cdc51ac5a3dec868f13..3701270ab53e1f4947f91439e4c25bfabdc9cb4d 100644 (file)
@@ -5199,6 +5199,34 @@ int config_parse_temporary_filesystems(
         }
 }
 
+int config_parse_private_tmp(
+                const char* unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        ExecContext *c = ASSERT_PTR(data);
+        int r;
+
+        assert(filename);
+        assert(rvalue);
+
+        r = parse_boolean(rvalue);
+        if (r < 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse boolean value: %s ignoring", rvalue);
+                return 0;
+        }
+
+        c->private_tmp = r ? PRIVATE_TMP_CONNECTED : PRIVATE_TMP_OFF;
+        return 0;
+}
+
 int config_parse_bind_paths(
                 const char *unit,
                 const char *filename,
index 005b91510608f779e30aa3f04b7e4d5525238559..94b13a4bb315057ff0a8456f462fc380a26bf317 100644 (file)
@@ -113,6 +113,7 @@ CONFIG_PARSER_PROTOTYPE(config_parse_import_credential);
 CONFIG_PARSER_PROTOTYPE(config_parse_set_status);
 CONFIG_PARSER_PROTOTYPE(config_parse_namespace_path_strv);
 CONFIG_PARSER_PROTOTYPE(config_parse_temporary_filesystems);
+CONFIG_PARSER_PROTOTYPE(config_parse_private_tmp);
 CONFIG_PARSER_PROTOTYPE(config_parse_cpu_quota);
 CONFIG_PARSER_PROTOTYPE(config_parse_allowed_cpuset);
 CONFIG_PARSER_PROTOTYPE(config_parse_protect_home);
index e521dc964c4c944b1c6a590dc3155722eb5fdd68..da96fc2f33c92887dbdfd493170a982a21a73d52 100644 (file)
@@ -72,6 +72,7 @@ typedef enum MountMode {
         MOUNT_EXEC,
         MOUNT_TMPFS,
         MOUNT_RUN,
+        MOUNT_PRIVATE_TMPFS,       /* Mounted outside the root directory, and used by subsequent mounts */
         MOUNT_EXTENSION_DIRECTORY, /* Bind-mounted outside the root directory, and used by subsequent mounts */
         MOUNT_EXTENSION_IMAGE,     /* Mounted outside the root directory, and used by subsequent mounts */
         MOUNT_MQUEUEFS,
@@ -97,6 +98,8 @@ typedef struct MountEntry {
         bool nosuid:1;            /* Shall set MS_NOSUID on the mount itself */
         bool noexec:1;            /* Shall set MS_NOEXEC on the mount itself */
         bool exec:1;              /* Shall clear MS_NOEXEC on the mount itself */
+        bool create_source_dir:1; /* Create the source directory if it doesn't exist - for implicit bind mounts */
+        mode_t source_dir_mode;   /* Mode for the source directory, if it is to be created */
         MountEntryState state;    /* Whether it was already processed or skipped */
         char *path_malloc;        /* Use this instead of 'path_const' if we had to allocate memory */
         const char *unprefixed_path_const; /* If the path was amended with a prefix, these will save the original */
@@ -246,6 +249,7 @@ static const char * const mount_mode_table[_MOUNT_MODE_MAX] = {
         [MOUNT_EXEC]                  = "exec",
         [MOUNT_TMPFS]                 = "tmpfs",
         [MOUNT_RUN]                   = "run",
+        [MOUNT_PRIVATE_TMPFS]         = "private-tmpfs",
         [MOUNT_EXTENSION_DIRECTORY]   = "extension-directory",
         [MOUNT_EXTENSION_IMAGE]       = "extension-image",
         [MOUNT_MQUEUEFS]              = "mqueuefs",
@@ -467,7 +471,7 @@ static int append_mount_images(MountList *ml, const MountImage *mount_images, si
 static int append_extensions(
                 MountList *ml,
                 const char *root,
-                const char *extension_dir,
+                const char *private_namespace_dir,
                 char **hierarchies,
                 const MountImage *mount_images,
                 size_t n_mount_images,
@@ -482,7 +486,7 @@ static int append_extensions(
         if (n_mount_images == 0 && strv_isempty(extension_directories))
                 return 0;
 
-        assert(extension_dir);
+        assert(private_namespace_dir);
 
         n_overlays = strv_length(hierarchies);
         if (n_overlays == 0)
@@ -519,7 +523,7 @@ static int append_extensions(
                                         "No matching entry in .v/ directory %s found.",
                                         m->source);
 
-                if (asprintf(&mount_point, "%s/%zu", extension_dir, i) < 0)
+                if (asprintf(&mount_point, "%s/unit-extensions/%zu", private_namespace_dir, i) < 0)
                         return -ENOMEM;
 
                 for (size_t j = 0; hierarchies && hierarchies[j]; ++j) {
@@ -556,7 +560,7 @@ static int append_extensions(
                 bool ignore_enoent = false;
 
                 /* Pick up the counter where the ExtensionImages left it. */
-                if (asprintf(&mount_point, "%s/%zu", extension_dir, n_mount_images++) < 0)
+                if (asprintf(&mount_point, "%s/unit-extensions/%zu", private_namespace_dir, n_mount_images++) < 0)
                         return -ENOMEM;
 
                 /* Look for any prefixes */
@@ -888,8 +892,12 @@ static void drop_outside_root(MountList *ml, const char *root_directory) {
 
         for (f = ml->mounts, t = ml->mounts; f < ml->mounts + ml->n_mounts; f++) {
 
-                /* ExtensionImages/Directories bases are opened in /run/systemd/unit-extensions on the host */
-                if (!IN_SET(f->mode, MOUNT_EXTENSION_IMAGE, MOUNT_EXTENSION_DIRECTORY) && !path_startswith(mount_entry_path(f), root_directory)) {
+                /* ExtensionImages/Directories bases are opened in /run/[user/xyz/]systemd/unit-extensions
+                 * on the host, and a private (invisible to the guest) tmpfs instance is mounted on
+                 * /run/[user/xyz/]systemd/unit-private-tmp as the storage backend of private /tmp and
+                 * /var/tmp. */
+                if (!IN_SET(f->mode, MOUNT_EXTENSION_IMAGE, MOUNT_EXTENSION_DIRECTORY, MOUNT_PRIVATE_TMPFS) &&
+                    !path_startswith(mount_entry_path(f), root_directory)) {
                         log_debug("%s is outside of root directory.", mount_entry_path(f));
                         mount_entry_done(f);
                         continue;
@@ -1618,6 +1626,14 @@ static int apply_one_mount(
                  * that bind mount source paths are always relative to the host root, hence we pass NULL as
                  * root directory to chase() here. */
 
+                /* When we create implicit mounts, we might need to create the path ourselves as it is on a
+                 * just-created tmpfs, for example. */
+                if (m->create_source_dir) {
+                        r = mkdir_p(mount_entry_source(m), m->source_dir_mode);
+                        if (r < 0)
+                                return log_debug_errno(r, "Failed to create source directory %s: %m", mount_entry_source(m));
+                }
+
                 r = chase(mount_entry_source(m), NULL, CHASE_TRAIL_SLASH, &chased, NULL);
                 if (r == -ENOENT && m->ignore) {
                         log_debug_errno(r, "Path %s does not exist, ignoring.", mount_entry_source(m));
@@ -1637,6 +1653,7 @@ static int apply_one_mount(
         }
 
         case MOUNT_EMPTY_DIR:
+        case MOUNT_PRIVATE_TMPFS:
         case MOUNT_TMPFS:
                 return mount_tmpfs(m);
 
@@ -1743,7 +1760,7 @@ static int make_read_only(const MountEntry *m, char **deny_list, FILE *proc_self
          * and running Linux <= 4.17. */
         submounts =
                 mount_entry_read_only(m) &&
-                !IN_SET(m->mode, MOUNT_EMPTY_DIR, MOUNT_TMPFS);
+                !IN_SET(m->mode, MOUNT_EMPTY_DIR, MOUNT_TMPFS, MOUNT_PRIVATE_TMPFS);
         if (submounts)
                 r = bind_remount_recursive_with_mountinfo(mount_entry_path(m), new_flags, flags_mask, deny_list, proc_self_mountinfo);
         else
@@ -1783,7 +1800,7 @@ static int make_noexec(const MountEntry *m, char **deny_list, FILE *proc_self_mo
         if (flags_mask == 0) /* No Change? */
                 return 0;
 
-        submounts = !IN_SET(m->mode, MOUNT_EMPTY_DIR, MOUNT_TMPFS);
+        submounts = !IN_SET(m->mode, MOUNT_EMPTY_DIR, MOUNT_TMPFS, MOUNT_PRIVATE_TMPFS);
 
         if (submounts)
                 r = bind_remount_recursive_with_mountinfo(mount_entry_path(m), new_flags, flags_mask, deny_list, proc_self_mountinfo);
@@ -1808,7 +1825,7 @@ static int make_nosuid(const MountEntry *m, FILE *proc_self_mountinfo) {
         if (m->state != MOUNT_APPLIED)
                 return 0;
 
-        submounts = !IN_SET(m->mode, MOUNT_EMPTY_DIR, MOUNT_TMPFS);
+        submounts = !IN_SET(m->mode, MOUNT_EMPTY_DIR, MOUNT_TMPFS, MOUNT_PRIVATE_TMPFS);
         if (submounts)
                 r = bind_remount_recursive_with_mountinfo(mount_entry_path(m), MS_NOSUID, MS_NOSUID, NULL, proc_self_mountinfo);
         else
@@ -1959,8 +1976,11 @@ static int apply_mounts(
                         if (m->state != MOUNT_PENDING)
                                 continue;
 
-                        /* ExtensionImages/Directories are first opened in the propagate directory, not in the root_directory */
-                        r = follow_symlink(!IN_SET(m->mode, MOUNT_EXTENSION_IMAGE, MOUNT_EXTENSION_DIRECTORY) ? root : NULL, m);
+                        /* ExtensionImages/Directories are first opened in the propagate directory, not in
+                         * the root_directory. A private (invisible to the guest) tmpfs instance is mounted
+                         * on /run/[user/xyz/]systemd/unit-private-tmp as the storage backend of private
+                         * /tmp and /var/tmp. */
+                        r = follow_symlink(!IN_SET(m->mode, MOUNT_EXTENSION_IMAGE, MOUNT_EXTENSION_DIRECTORY, MOUNT_PRIVATE_TMPFS) ? root : NULL, m);
                         if (r < 0) {
                                 mount_entry_path_debug_string(root, m, error_path);
                                 return r;
@@ -2245,39 +2265,88 @@ int setup_namespace(const NamespaceParameters *p, char **error_path) {
         if (r < 0)
                 return r;
 
-        if (p->tmp_dir) {
-                bool ro = streq(p->tmp_dir, RUN_SYSTEMD_EMPTY);
+        /* When DynamicUser=yes enforce that /tmp/ and /var/tmp/ are disconnected from the host's
+         * directories, as they are world writable and ephemeral uid/gid will be used. */
+        if (p->private_tmp == PRIVATE_TMP_DISCONNECTED) {
+                _cleanup_free_ char *tmpfs_dir = NULL, *tmp_dir = NULL, *var_tmp_dir = NULL;
+                MountEntry *tmpfs_entry, *tmp_entry, *var_tmp_entry;
 
-                MountEntry *me = mount_list_extend(&ml);
-                if (!me)
+                tmpfs_dir = path_join(p->private_namespace_dir, "unit-private-tmp");
+                tmp_dir = path_join(tmpfs_dir, "tmp");
+                var_tmp_dir = path_join(tmpfs_dir, "var-tmp");
+                if (!tmpfs_dir || !tmp_dir || !var_tmp_dir)
                         return log_oom_debug();
 
-                *me = (MountEntry) {
-                        .path_const = "/tmp",
-                        .mode = ro ? MOUNT_PRIVATE_TMP_READ_ONLY : MOUNT_PRIVATE_TMP,
-                        .source_const = p->tmp_dir,
+                tmpfs_entry = mount_list_extend(&ml);
+                if (!tmpfs_entry)
+                        return log_oom_debug();
+                *tmpfs_entry = (MountEntry) {
+                        .path_malloc = TAKE_PTR(tmpfs_dir),
+                        .mode = MOUNT_PRIVATE_TMPFS,
+                        .options_const = "mode=0700" NESTED_TMPFS_LIMITS,
+                        .flags = MS_NODEV|MS_STRICTATIME,
+                        .has_prefix = true,
                 };
-        }
 
-        if (p->var_tmp_dir) {
-                bool ro = streq(p->var_tmp_dir, RUN_SYSTEMD_EMPTY);
-
-                MountEntry *me = mount_list_extend(&ml);
-                if (!me)
+                tmp_entry = mount_list_extend(&ml);
+                if (!tmp_entry)
                         return log_oom_debug();
+                *tmp_entry = (MountEntry) {
+                        .source_malloc = TAKE_PTR(tmp_dir),
+                        .path_const = "/tmp",
+                        .mode = MOUNT_BIND,
+                        .source_dir_mode = 01777,
+                        .create_source_dir = true,
+                };
 
-                *me = (MountEntry) {
+                var_tmp_entry = mount_list_extend(&ml);
+                if (!var_tmp_entry)
+                        return log_oom_debug();
+                *var_tmp_entry = (MountEntry) {
+                        .source_malloc = TAKE_PTR(var_tmp_dir),
                         .path_const = "/var/tmp",
-                        .mode = ro ? MOUNT_PRIVATE_TMP_READ_ONLY : MOUNT_PRIVATE_TMP,
-                        .source_const = p->var_tmp_dir,
+                        .mode = MOUNT_BIND,
+                        .source_dir_mode = 01777,
+                        .create_source_dir = true,
                 };
+
+                /* Ensure that the tmpfs is mounted first, and bind mounts are added later. */
+                assert_cc(MOUNT_BIND < MOUNT_PRIVATE_TMPFS);
+        } else {
+                if (p->tmp_dir) {
+                        bool ro = streq(p->tmp_dir, RUN_SYSTEMD_EMPTY);
+
+                        MountEntry *me = mount_list_extend(&ml);
+                        if (!me)
+                                return log_oom_debug();
+
+                        *me = (MountEntry) {
+                                .path_const = "/tmp",
+                                .mode = ro ? MOUNT_PRIVATE_TMP_READ_ONLY : MOUNT_PRIVATE_TMP,
+                                .source_const = p->tmp_dir,
+                        };
+                }
+
+                if (p->var_tmp_dir) {
+                        bool ro = streq(p->var_tmp_dir, RUN_SYSTEMD_EMPTY);
+
+                        MountEntry *me = mount_list_extend(&ml);
+                        if (!me)
+                                return log_oom_debug();
+
+                        *me = (MountEntry) {
+                                .path_const = "/var/tmp",
+                                .mode = ro ? MOUNT_PRIVATE_TMP_READ_ONLY : MOUNT_PRIVATE_TMP,
+                                .source_const = p->var_tmp_dir,
+                        };
+                }
         }
 
         r = append_mount_images(&ml, p->mount_images, p->n_mount_images);
         if (r < 0)
                 return r;
 
-        r = append_extensions(&ml, root, p->extension_dir, hierarchies, p->extension_images, p->n_extension_images, p->extension_directories);
+        r = append_extensions(&ml, root, p->private_namespace_dir, hierarchies, p->extension_images, p->n_extension_images, p->extension_directories);
         if (r < 0)
                 return r;
 
@@ -2530,10 +2599,12 @@ int setup_namespace(const NamespaceParameters *p, char **error_path) {
         if (setup_propagate)
                 (void) mkdir_p(p->propagate_dir, 0600);
 
-        if (p->n_extension_images > 0 || !strv_isempty(p->extension_directories))
+        if (p->n_extension_images > 0 || !strv_isempty(p->extension_directories)) {
                 /* ExtensionImages/Directories mountpoint directories will be created while parsing the
                  * mounts to create, so have the parent ready */
-                (void) mkdir_p(p->extension_dir, 0600);
+                char *extension_dir = strjoina(p->private_namespace_dir, "/unit-extensions");
+                (void) mkdir_p(extension_dir, 0600);
+        }
 
         /* Remount / as SLAVE so that nothing now mounted in the namespace
          * shows up in the parent */
@@ -3074,3 +3145,11 @@ static const char* const proc_subset_table[_PROC_SUBSET_MAX] = {
 };
 
 DEFINE_STRING_TABLE_LOOKUP(proc_subset, ProcSubset);
+
+static const char* const private_tmp_table[_PRIVATE_TMP_MAX] = {
+        [PRIVATE_TMP_OFF]          = "off",
+        [PRIVATE_TMP_CONNECTED]    = "connected",
+        [PRIVATE_TMP_DISCONNECTED] = "disconnected",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(private_tmp, PrivateTmp);
index 921716bf3ec9337f6387187c0ccd2e3eb411cd0d..bff99b9daa87e4d24a050f7173f96fab30b4b672 100644 (file)
@@ -53,6 +53,14 @@ typedef enum ProcSubset {
         _PROC_SUBSET_INVALID = -EINVAL,
 } ProcSubset;
 
+typedef enum PrivateTmp {
+        PRIVATE_TMP_OFF,
+        PRIVATE_TMP_CONNECTED, /* Bind mounted from the host's filesystem */
+        PRIVATE_TMP_DISCONNECTED, /* A completely private tmpfs, invisible from the host */
+        _PRIVATE_TMP_MAX,
+        _PRIVATE_TMP_INVALID = -EINVAL,
+} PrivateTmp;
+
 struct BindMount {
         char *source;
         char *destination;
@@ -127,7 +135,7 @@ struct NamespaceParameters {
         const char *propagate_dir;
         const char *incoming_dir;
 
-        const char *extension_dir;
+        const char *private_namespace_dir;
         const char *notify_socket;
         const char *host_os_release_stage;
 
@@ -150,6 +158,7 @@ struct NamespaceParameters {
         ProtectSystem protect_system;
         ProtectProc protect_proc;
         ProcSubset proc_subset;
+        PrivateTmp private_tmp;
 };
 
 int setup_namespace(const NamespaceParameters *p, char **error_path);
@@ -184,6 +193,9 @@ ProtectProc protect_proc_from_string(const char *s) _pure_;
 const char* proc_subset_to_string(ProcSubset i) _const_;
 ProcSubset proc_subset_from_string(const char *s) _pure_;
 
+const char* private_tmp_to_string(PrivateTmp i) _const_;
+PrivateTmp private_tmp_from_string(const char *s) _pure_;
+
 void bind_mount_free_many(BindMount *b, size_t n);
 int bind_mount_add(BindMount **b, size_t *n, const BindMount *item);
 
index 2f82902d4f23f66e105dd3314c274c6bfe85b6ac..ac91af187ac89f5408ffc8940fbb8efbb45d3846 100644 (file)
@@ -1275,7 +1275,7 @@ int unit_add_exec_dependencies(Unit *u, ExecContext *c) {
                         return r;
         }
 
-        if (c->private_tmp) {
+        if (c->private_tmp == PRIVATE_TMP_CONNECTED) {
                 r = unit_add_mounts_for(u, "/tmp", UNIT_DEPENDENCY_FILE, UNIT_MOUNT_WANTS);
                 if (r < 0)
                         return r;
@@ -4287,7 +4287,11 @@ int unit_patch_contexts(Unit *u) {
                          * UID/GID around in the file system or on IPC objects. Hence enforce a strict
                          * sandbox. */
 
-                        ec->private_tmp = true;
+                        /* With DynamicUser= we want private directories, so if the user hasn't manually
+                         * selected PrivateTmp=, enable it, but to a fully private (disconnected) tmpfs
+                         * instance. */
+                        if (ec->private_tmp == PRIVATE_TMP_OFF)
+                                ec->private_tmp = PRIVATE_TMP_DISCONNECTED;
                         ec->remove_ipc = true;
                         ec->protect_system = PROTECT_SYSTEM_STRICT;
                         if (ec->protect_home == PROTECT_HOME_NO)
index a3379ef402064ce5cad0d761a181dfc843689efa..69274a57439a85c47b3656805b6d191d2d152872 100755 (executable)
@@ -340,6 +340,19 @@ if [[ ! -v ASAN_OPTIONS ]] && systemctl --version | grep "+BPF_FRAMEWORK" && ker
     (! systemd-run --wait --pipe -p RestrictFileSystems="~proc devtmpfs sysfs" ls /sys)
 fi
 
+if [[ ! -v ASAN_OPTIONS ]]; then
+    # Ensure DynamicUser=yes does not imply PrivateTmp=yes if TemporaryFileSystem=/tmp /var/tmp is set
+    systemd-run --unit test-07-dynamic-user-tmp.service \
+                --service-type=notify \
+                -p DynamicUser=yes \
+                -p NotifyAccess=all \
+                sh -c 'touch /tmp/a && touch /var/tmp/b && ! test -f /tmp/b && ! test -f /var/tmp/a && systemd-notify --ready && sleep infinity'
+    (! ls /tmp/systemd-private-"$(tr -d '-' < /proc/sys/kernel/random/boot_id)"-test-07-dynamic-user-tmp.service-* &>/dev/null)
+    (! ls /var/tmp/systemd-private-"$(tr -d '-' < /proc/sys/kernel/random/boot_id)"-test-07-dynamic-user-tmp.service-* &>/dev/null)
+    systemctl is-active test-07-dynamic-user-tmp.service
+    systemctl stop test-07-dynamic-user-tmp.service
+fi
+
 # Make sure we properly (de)serialize various string arrays, including whitespaces
 # See: https://github.com/systemd/systemd/issues/31214
 systemd-run --wait --pipe -p Environment="FOO='bar4    '" \