From: Lennart Poettering Date: Tue, 10 Oct 2023 19:36:50 +0000 (+0200) Subject: namespace: make setup_namespace() less crazy X-Git-Tag: v255-rc1~279 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=79d956d;p=thirdparty%2Fsystemd.git namespace: make setup_namespace() less crazy Let's replace the ridiculous number of arguments with a structure, to make this function less weird. No change in behaviour, just some refactoring. --- diff --git a/src/core/execute.c b/src/core/execute.c index e25552daa67..7cf6601ee7e 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -3138,7 +3138,6 @@ static int apply_mount_namespace( *extension_dir = NULL, *host_os_release_stage = NULL; const char *root_dir = NULL, *root_image = NULL, *tmp_dir = NULL, *var_tmp_dir = NULL; char **read_write_paths; - NamespaceInfo ns_info; bool needs_sandboxing, setup_os_release_symlink; BindMount *bind_mounts = NULL; size_t n_bind_mounts = 0; @@ -3180,10 +3179,9 @@ static int apply_mount_namespace( needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command_flags & EXEC_COMMAND_FULLY_PRIVILEGED); if (needs_sandboxing) { - /* The runtime struct only contains the parent of the private /tmp, - * which is non-accessible to world users. Inside of it there's a /tmp - * that is sticky, and that's the one we want to use here. - * This does not apply when we are using /run/systemd/empty as fallback. */ + /* The runtime struct only contains the parent of the private /tmp, which is non-accessible + * to world users. Inside of it there's a /tmp that is sticky, and that's the one we want to + * use here. This does not apply when we are using /run/systemd/empty as fallback. 
*/ if (context->private_tmp && runtime && runtime->shared) { if (streq_ptr(runtime->shared->tmp_dir, RUN_SYSTEMD_EMPTY)) @@ -3196,39 +3194,10 @@ static int apply_mount_namespace( else if (runtime->shared->var_tmp_dir) var_tmp_dir = strjoina(runtime->shared->var_tmp_dir, "/tmp"); } - - ns_info = (NamespaceInfo) { - .ignore_protect_paths = false, - .private_dev = context->private_devices, - .protect_control_groups = context->protect_control_groups, - .protect_kernel_tunables = context->protect_kernel_tunables, - .protect_kernel_modules = context->protect_kernel_modules, - .protect_kernel_logs = context->protect_kernel_logs, - .protect_hostname = context->protect_hostname, - .mount_apivfs = exec_context_get_effective_mount_apivfs(context), - .protect_home = context->protect_home, - .protect_system = context->protect_system, - .protect_proc = context->protect_proc, - .proc_subset = context->proc_subset, - .private_network = exec_needs_network_namespace(context), - .private_ipc = exec_needs_ipc_namespace(context), - /* If NNP is on, we can turn on MS_NOSUID, since it won't have any effect anymore. */ - .mount_nosuid = context->no_new_privileges && !mac_selinux_use(), - }; - } else if (!context->dynamic_user && root_dir) - /* - * If DynamicUser=no and RootDirectory= is set then lets pass a relaxed - * sandbox info, otherwise enforce it, don't ignore protected paths and - * fail if we are enable to apply the sandbox inside the mount namespace. - */ - ns_info = (NamespaceInfo) { - .ignore_protect_paths = true, - }; - else - ns_info = (NamespaceInfo) {}; + } /* Symlinks (exec dirs, os-release) are set up after other mounts, before they are made read-only. 
*/ - setup_os_release_symlink = ns_info.mount_apivfs && (root_dir || root_image); + setup_os_release_symlink = needs_sandboxing && exec_context_get_effective_mount_apivfs(context) && (root_dir || root_image); r = compile_symlinks(context, params, setup_os_release_symlink, &symlinks); if (r < 0) return r; @@ -3287,44 +3256,81 @@ static int apply_mount_namespace( return r; } - r = setup_namespace( - root_dir, - root_image, - context->root_image_options, - context->root_image_policy ?: &image_policy_service, - &ns_info, - read_write_paths, - needs_sandboxing ? context->read_only_paths : NULL, - needs_sandboxing ? context->inaccessible_paths : NULL, - needs_sandboxing ? context->exec_paths : NULL, - needs_sandboxing ? context->no_exec_paths : NULL, - empty_directories, - symlinks, - bind_mounts, - n_bind_mounts, - context->temporary_filesystems, - context->n_temporary_filesystems, - context->mount_images, - context->n_mount_images, - context->mount_image_policy ?: &image_policy_service, - tmp_dir, - var_tmp_dir, - creds_path, - context->log_namespace, - context->mount_propagation_flag, - &verity, - context->extension_images, - context->n_extension_images, - context->extension_image_policy ?: &image_policy_sysext, - context->extension_directories, - propagate_dir, - incoming_dir, - extension_dir, - root_dir || root_image ? params->notify_socket : NULL, - host_os_release_stage, - params->runtime_scope, - error_path); + NamespaceParameters parameters = { + .runtime_scope = params->runtime_scope, + + .root_directory = root_dir, + .root_image = root_image, + .root_image_options = context->root_image_options, + .root_image_policy = context->root_image_policy ?: &image_policy_service, + + .read_write_paths = read_write_paths, + .read_only_paths = needs_sandboxing ? context->read_only_paths : NULL, + .inaccessible_paths = needs_sandboxing ? context->inaccessible_paths : NULL, + + .exec_paths = needs_sandboxing ? context->exec_paths : NULL, + .no_exec_paths = needs_sandboxing ? 
context->no_exec_paths : NULL, + + .empty_directories = empty_directories, + .symlinks = symlinks, + + .bind_mounts = bind_mounts, + .n_bind_mounts = n_bind_mounts, + + .temporary_filesystems = context->temporary_filesystems, + .n_temporary_filesystems = context->n_temporary_filesystems, + + .mount_images = context->mount_images, + .n_mount_images = context->n_mount_images, + .mount_image_policy = context->mount_image_policy ?: &image_policy_service, + + .tmp_dir = tmp_dir, + .var_tmp_dir = var_tmp_dir, + + .creds_path = creds_path, + .log_namespace = context->log_namespace, + .mount_propagation_flag = context->mount_propagation_flag, + + .verity = &verity, + + .extension_images = context->extension_images, + .n_extension_images = context->n_extension_images, + .extension_image_policy = context->extension_image_policy ?: &image_policy_sysext, + .extension_directories = context->extension_directories, + + .propagate_dir = propagate_dir, + .incoming_dir = incoming_dir, + .extension_dir = extension_dir, + .notify_socket = root_dir || root_image ? params->notify_socket : NULL, + .host_os_release_stage = host_os_release_stage, + + /* If DynamicUser=no and RootDirectory= is set then let's pass a relaxed sandbox info, + * otherwise enforce it, don't ignore protected paths and fail if we are unable to apply the + * sandbox inside the mount namespace. 
*/ + .ignore_protect_paths = !needs_sandboxing && !context->dynamic_user && root_dir, + + .protect_control_groups = needs_sandboxing && context->protect_control_groups, + .protect_kernel_tunables = needs_sandboxing && context->protect_kernel_tunables, + .protect_kernel_modules = needs_sandboxing && context->protect_kernel_modules, + .protect_kernel_logs = needs_sandboxing && context->protect_kernel_logs, + .protect_hostname = needs_sandboxing && context->protect_hostname, + + .private_dev = needs_sandboxing && context->private_devices, + .private_network = needs_sandboxing && exec_needs_network_namespace(context), + .private_ipc = needs_sandboxing && exec_needs_ipc_namespace(context), + + .mount_apivfs = needs_sandboxing && exec_context_get_effective_mount_apivfs(context), + + /* If NNP is on, we can turn on MS_NOSUID, since it won't have any effect anymore. */ + .mount_nosuid = needs_sandboxing && context->no_new_privileges && !mac_selinux_use(), + + .protect_home = needs_sandboxing && context->protect_home, + .protect_system = needs_sandboxing && context->protect_system, + .protect_proc = needs_sandboxing && context->protect_proc, + .proc_subset = needs_sandboxing && context->proc_subset, + }; + r = setup_namespace(&parameters, error_path); /* If we couldn't set up the namespace this is probably due to a missing capability. setup_namespace() reports * that with a special, recognizable error ENOANO. In this case, silently proceed, but only if exclusively * sandboxing options were used, i.e. 
nothing such as RootDirectory= or BindMount= that would result in a diff --git a/src/core/namespace.c b/src/core/namespace.c index ffb1bb18874..00b1a17b4f3 100644 --- a/src/core/namespace.c +++ b/src/core/namespace.c @@ -1121,16 +1121,16 @@ static int mount_private_sysfs(const MountEntry *m) { return 0; } -static int mount_procfs(const MountEntry *m, const NamespaceInfo *ns_info) { +static int mount_procfs(const MountEntry *m, const NamespaceParameters *p) { _cleanup_free_ char *opts = NULL; const char *entry_path; int r, n; assert(m); - assert(ns_info); + assert(p); - if (ns_info->protect_proc != PROTECT_PROC_DEFAULT || - ns_info->proc_subset != PROC_SUBSET_ALL) { + if (p->protect_proc != PROTECT_PROC_DEFAULT || + p->proc_subset != PROC_SUBSET_ALL) { /* Starting with kernel 5.8 procfs' hidepid= logic is truly per-instance (previously it * pretended to be per-instance but actually was per-namespace), hence let's make use of it @@ -1138,9 +1138,9 @@ static int mount_procfs(const MountEntry *m, const NamespaceInfo *ns_info) { * per-instance, we'll exclusively use the textual value for hidepid=, since support was * added in the same commit: if it's supported it is thus also per-instance. */ - const char *hpv = ns_info->protect_proc == PROTECT_PROC_DEFAULT ? + const char *hpv = p->protect_proc == PROTECT_PROC_DEFAULT ? "off" : - protect_proc_to_string(ns_info->protect_proc); + protect_proc_to_string(p->protect_proc); /* hidepid= support was added in 5.8, so we can use fsconfig()/fsopen() (which were added in * 5.2) to check if hidepid= is supported. 
This avoids a noisy dmesg log by the kernel when @@ -1154,7 +1154,7 @@ static int mount_procfs(const MountEntry *m, const NamespaceInfo *ns_info) { return -ENOMEM; } - if (ns_info->proc_subset == PROC_SUBSET_PID && + if (p->proc_subset == PROC_SUBSET_PID && mount_option_supported("proc", "subset", "pid") != 0) if (!strextend_with_separator(&opts, ",", "subset=pid")) return -ENOMEM; @@ -1383,10 +1383,7 @@ static int follow_symlink( static int apply_one_mount( const char *root_directory, MountEntry *m, - const ImagePolicy *mount_image_policy, - const ImagePolicy *extension_image_policy, - const NamespaceInfo *ns_info, - RuntimeScope scope) { + const NamespaceParameters *p) { _cleanup_free_ char *inaccessible = NULL; bool rbind = true, make = false; @@ -1394,7 +1391,7 @@ static int apply_one_mount( int r; assert(m); - assert(ns_info); + assert(p); log_debug("Applying namespace mount on %s", mount_entry_path(m)); @@ -1544,7 +1541,7 @@ static int apply_one_mount( break; case PRIVATE_DEV: - return mount_private_dev(m, scope); + return mount_private_dev(m, p->runtime_scope); case BIND_DEV: return mount_bind_dev(m); @@ -1556,7 +1553,7 @@ static int apply_one_mount( return mount_bind_sysfs(m); case PROCFS: - return mount_procfs(m, ns_info); + return mount_procfs(m, p); case RUN: return mount_run(m); @@ -1565,10 +1562,10 @@ static int apply_one_mount( return mount_mqueuefs(m); case MOUNT_IMAGES: - return mount_image(m, NULL, mount_image_policy); + return mount_image(m, NULL, p->mount_image_policy); case EXTENSION_IMAGES: - return mount_image(m, root_directory, extension_image_policy); + return mount_image(m, root_directory, p->extension_image_policy); case OVERLAY_MOUNT: return mount_overlay(m); @@ -1709,8 +1706,8 @@ static int make_nosuid(const MountEntry *m, FILE *proc_self_mountinfo) { return 0; } -static bool namespace_info_mount_apivfs(const NamespaceInfo *ns_info) { - assert(ns_info); +static bool namespace_parameters_mount_apivfs(const NamespaceParameters *p) { + 
assert(p); /* * ProtectControlGroups= and ProtectKernelTunables= imply MountAPIVFS=, @@ -1718,81 +1715,64 @@ static bool namespace_info_mount_apivfs(const NamespaceInfo *ns_info) { * first place... */ - return ns_info->mount_apivfs || - ns_info->protect_control_groups || - ns_info->protect_kernel_tunables || - ns_info->protect_proc != PROTECT_PROC_DEFAULT || - ns_info->proc_subset != PROC_SUBSET_ALL; + return p->mount_apivfs || + p->protect_control_groups || + p->protect_kernel_tunables || + p->protect_proc != PROTECT_PROC_DEFAULT || + p->proc_subset != PROC_SUBSET_ALL; } static size_t namespace_calculate_mounts( - const NamespaceInfo *ns_info, - char** read_write_paths, - char** read_only_paths, - char** inaccessible_paths, - char** exec_paths, - char** no_exec_paths, - char** empty_directories, - size_t n_bind_mounts, - size_t n_temporary_filesystems, - size_t n_mount_images, - size_t n_extension_images, - size_t n_extension_directories, - size_t n_hierarchies, - const char* tmp_dir, - const char* var_tmp_dir, - const char *creds_path, - const char* log_namespace, - bool setup_propagate, - const char* notify_socket, - const char* host_os_release) { + const NamespaceParameters *p, + char **hierarchies, + bool setup_propagate) { size_t protect_home_cnt; size_t protect_system_cnt = - (ns_info->protect_system == PROTECT_SYSTEM_STRICT ? + (p->protect_system == PROTECT_SYSTEM_STRICT ? ELEMENTSOF(protect_system_strict_table) : - ((ns_info->protect_system == PROTECT_SYSTEM_FULL) ? + ((p->protect_system == PROTECT_SYSTEM_FULL) ? ELEMENTSOF(protect_system_full_table) : - ((ns_info->protect_system == PROTECT_SYSTEM_YES) ? + ((p->protect_system == PROTECT_SYSTEM_YES) ? ELEMENTSOF(protect_system_yes_table) : 0))); protect_home_cnt = - (ns_info->protect_home == PROTECT_HOME_YES ? + (p->protect_home == PROTECT_HOME_YES ? ELEMENTSOF(protect_home_yes_table) : - ((ns_info->protect_home == PROTECT_HOME_READ_ONLY) ? + ((p->protect_home == PROTECT_HOME_READ_ONLY) ? 
ELEMENTSOF(protect_home_read_only_table) : - ((ns_info->protect_home == PROTECT_HOME_TMPFS) ? + ((p->protect_home == PROTECT_HOME_TMPFS) ? ELEMENTSOF(protect_home_tmpfs_table) : 0))); - return !!tmp_dir + !!var_tmp_dir + - strv_length(read_write_paths) + - strv_length(read_only_paths) + - strv_length(inaccessible_paths) + - strv_length(exec_paths) + - strv_length(no_exec_paths) + - strv_length(empty_directories) + - n_bind_mounts + - n_mount_images + - (n_extension_images > 0 || n_extension_directories > 0 ? /* Mount each image and directory plus an overlay per hierarchy */ - n_hierarchies + n_extension_images + n_extension_directories: 0) + - n_temporary_filesystems + - ns_info->private_dev + - (ns_info->protect_kernel_tunables ? + return !!p->tmp_dir + !!p->var_tmp_dir + + strv_length(p->read_write_paths) + + strv_length(p->read_only_paths) + + strv_length(p->inaccessible_paths) + + strv_length(p->exec_paths) + + strv_length(p->no_exec_paths) + + strv_length(p->empty_directories) + + p->n_bind_mounts + + p->n_mount_images + + (p->n_extension_images > 0 || !strv_isempty(p->extension_directories) ? /* Mount each image and directory plus an overlay per hierarchy */ + strv_length(hierarchies) + p->n_extension_images + strv_length(p->extension_directories) : 0) + + p->n_temporary_filesystems + + p->private_dev + + (p->protect_kernel_tunables ? ELEMENTSOF(protect_kernel_tunables_proc_table) + ELEMENTSOF(protect_kernel_tunables_sys_table) : 0) + - (ns_info->protect_kernel_modules ? ELEMENTSOF(protect_kernel_modules_table) : 0) + - (ns_info->protect_kernel_logs ? + (p->protect_kernel_modules ? ELEMENTSOF(protect_kernel_modules_table) : 0) + + (p->protect_kernel_logs ? ELEMENTSOF(protect_kernel_logs_proc_table) + ELEMENTSOF(protect_kernel_logs_dev_table) : 0) + - (ns_info->protect_control_groups ? 1 : 0) + + (p->protect_control_groups ? 1 : 0) + protect_home_cnt + protect_system_cnt + - (ns_info->protect_hostname ? 2 : 0) + - (namespace_info_mount_apivfs(ns_info) ? 
ELEMENTSOF(apivfs_table) : 0) + - (creds_path ? 2 : 1) + - !!log_namespace + + (p->protect_hostname ? 2 : 0) + + (namespace_parameters_mount_apivfs(p) ? ELEMENTSOF(apivfs_table) : 0) + + (p->creds_path ? 2 : 1) + + !!p->log_namespace + setup_propagate + /* /run/systemd/incoming */ - !!notify_socket + - !!host_os_release + - ns_info->private_network + /* /sys */ - ns_info->private_ipc; /* /dev/mqueue */ + !!p->notify_socket + + !!p->host_os_release_stage + + p->private_network + /* /sys */ + p->private_ipc; /* /dev/mqueue */ } /* Walk all mount entries and dropping any unused mounts. This affects all @@ -1880,13 +1860,9 @@ static void mount_entry_path_debug_string(const char *root, MountEntry *m, char static int apply_mounts( const char *root, - const ImagePolicy *mount_image_policy, - const ImagePolicy *extension_image_policy, - const NamespaceInfo *ns_info, + const NamespaceParameters *p, MountEntry *mounts, size_t *n_mounts, - RuntimeScope scope, - char **symlinks, char **error_path) { _cleanup_fclose_ FILE *proc_self_mountinfo = NULL; @@ -1936,7 +1912,7 @@ static int apply_mounts( break; } - r = apply_one_mount(root, m, mount_image_policy, extension_image_policy, ns_info, scope); + r = apply_one_mount(root, m, p); if (r < 0) { mount_entry_path_debug_string(root, m, error_path); return r; @@ -1955,7 +1931,7 @@ static int apply_mounts( * read-only switches are flipped, create the exec dirs and other symlinks. * Note that when /var/lib is not empty/tmpfs, these symlinks will already * exist, which means this will be a no-op. */ - r = create_symlinks_from_tuples(root, symlinks); + r = create_symlinks_from_tuples(root, p->symlinks); if (r < 0) return log_debug_errno(r, "Failed to set up symlinks inside mount namespace: %m"); @@ -1991,7 +1967,7 @@ static int apply_mounts( } /* Fourth round, flip the nosuid bits without a deny list. 
*/ - if (ns_info->mount_nosuid) + if (p->mount_nosuid) for (MountEntry *m = mounts; m < mounts + *n_mounts; ++m) { r = make_nosuid(m, proc_self_mountinfo); if (r < 0) { @@ -2052,49 +2028,13 @@ static bool home_read_only( return false; } -int setup_namespace( - const char* root_directory, - const char* root_image, - const MountOptions *root_image_mount_options, - const ImagePolicy *root_image_policy, - const NamespaceInfo *ns_info, - char** read_write_paths, - char** read_only_paths, - char** inaccessible_paths, - char** exec_paths, - char** no_exec_paths, - char** empty_directories, - char** symlinks, - const BindMount *bind_mounts, - size_t n_bind_mounts, - const TemporaryFileSystem *temporary_filesystems, - size_t n_temporary_filesystems, - const MountImage *mount_images, - size_t n_mount_images, - const ImagePolicy *mount_image_policy, - const char* tmp_dir, - const char* var_tmp_dir, - const char *creds_path, - const char *log_namespace, - unsigned long mount_propagation_flag, - VeritySettings *verity, - const MountImage *extension_images, - size_t n_extension_images, - const ImagePolicy *extension_image_policy, - char **extension_directories, - const char *propagate_dir, - const char *incoming_dir, - const char *extension_dir, - const char *notify_socket, - const char *host_os_release_stage, - RuntimeScope scope, - char **error_path) { +int setup_namespace(const NamespaceParameters *p, char **error_path) { _cleanup_(loop_device_unrefp) LoopDevice *loop_device = NULL; _cleanup_(dissected_image_unrefp) DissectedImage *dissected_image = NULL; _cleanup_strv_free_ char **hierarchies = NULL; MountEntry *m = NULL, *mounts = NULL; - bool require_prefix = false, setup_propagate = false; + bool require_prefix = false; const char *root; DissectImageFlags dissect_image_flags = DISSECT_IMAGE_GENERIC_ROOT | @@ -2109,32 +2049,29 @@ int setup_namespace( size_t n_mounts; int r; - assert(ns_info); + assert(p); /* Make sure that all mknod(), mkdir() calls we do are unaffected by 
the umask, and the access modes * we configure take effect */ BLOCK_WITH_UMASK(0000); - if (!isempty(propagate_dir) && !isempty(incoming_dir)) - setup_propagate = true; - - if (mount_propagation_flag == 0) - mount_propagation_flag = MS_SHARED; + bool setup_propagate = !isempty(p->propagate_dir) && !isempty(p->incoming_dir); + unsigned long mount_propagation_flag = p->mount_propagation_flag != 0 ? p->mount_propagation_flag : MS_SHARED; - if (root_image) { + if (p->root_image) { /* Make the whole image read-only if we can determine that we only access it in a read-only fashion. */ - if (root_read_only(read_only_paths, - ns_info->protect_system) && - home_read_only(read_only_paths, inaccessible_paths, empty_directories, - bind_mounts, n_bind_mounts, temporary_filesystems, n_temporary_filesystems, - ns_info->protect_home) && - strv_isempty(read_write_paths)) + if (root_read_only(p->read_only_paths, + p->protect_system) && + home_read_only(p->read_only_paths, p->inaccessible_paths, p->empty_directories, + p->bind_mounts, p->n_bind_mounts, p->temporary_filesystems, p->n_temporary_filesystems, + p->protect_home) && + strv_isempty(p->read_write_paths)) dissect_image_flags |= DISSECT_IMAGE_READ_ONLY; - SET_FLAG(dissect_image_flags, DISSECT_IMAGE_NO_PARTITION_TABLE, verity && verity->data_path); + SET_FLAG(dissect_image_flags, DISSECT_IMAGE_NO_PARTITION_TABLE, p->verity && p->verity->data_path); r = loop_device_make_by_path( - root_image, + p->root_image, FLAGS_SET(dissect_image_flags, DISSECT_IMAGE_DEVICE_READ_ONLY) ? O_RDONLY : -1 /* < 0 means writable if possible, read-only as fallback */, /* sector_size= */ UINT32_MAX, FLAGS_SET(dissect_image_flags, DISSECT_IMAGE_NO_PARTITION_TABLE) ? 
0 : LO_FLAGS_PARTSCAN, @@ -2145,9 +2082,9 @@ int setup_namespace( r = dissect_loop_device( loop_device, - verity, - root_image_mount_options, - root_image_policy, + p->verity, + p->root_image_options, + p->root_image_policy, dissect_image_flags, &dissected_image); if (r < 0) @@ -2156,21 +2093,21 @@ int setup_namespace( r = dissected_image_load_verity_sig_partition( dissected_image, loop_device->fd, - verity); + p->verity); if (r < 0) return r; r = dissected_image_decrypt( dissected_image, NULL, - verity, + p->verity, dissect_image_flags); if (r < 0) return log_debug_errno(r, "Failed to decrypt dissected image: %m"); } - if (root_directory) - root = root_directory; + if (p->root_directory) + root = p->root_directory; else { /* /run/systemd should have been created by PID 1 early on already, but in some cases, like * when running tests (test-execute), it might not have been created yet so let's make sure @@ -2189,7 +2126,7 @@ int setup_namespace( require_prefix = true; } - if (n_extension_images > 0 || !strv_isempty(extension_directories)) { + if (p->n_extension_images > 0 || !strv_isempty(p->extension_directories)) { /* Hierarchy population needs to be done for sysext and confext extension images */ r = parse_env_extension_hierarchies(&hierarchies, "SYSTEMD_SYSEXT_AND_CONFEXT_HIERARCHIES"); if (r < 0) @@ -2197,92 +2134,76 @@ int setup_namespace( } n_mounts = namespace_calculate_mounts( - ns_info, - read_write_paths, - read_only_paths, - inaccessible_paths, - exec_paths, - no_exec_paths, - empty_directories, - n_bind_mounts, - n_temporary_filesystems, - n_mount_images, - n_extension_images, - strv_length(extension_directories), - strv_length(hierarchies), - tmp_dir, var_tmp_dir, - creds_path, - log_namespace, - setup_propagate, - notify_socket, - host_os_release_stage); + p, + hierarchies, + setup_propagate); if (n_mounts > 0) { m = mounts = new0(MountEntry, n_mounts); if (!mounts) return -ENOMEM; - r = append_access_mounts(&m, read_write_paths, READWRITE, 
require_prefix); + r = append_access_mounts(&m, p->read_write_paths, READWRITE, require_prefix); if (r < 0) goto finish; - r = append_access_mounts(&m, read_only_paths, READONLY, require_prefix); + r = append_access_mounts(&m, p->read_only_paths, READONLY, require_prefix); if (r < 0) goto finish; - r = append_access_mounts(&m, inaccessible_paths, INACCESSIBLE, require_prefix); + r = append_access_mounts(&m, p->inaccessible_paths, INACCESSIBLE, require_prefix); if (r < 0) goto finish; - r = append_access_mounts(&m, exec_paths, EXEC, require_prefix); + r = append_access_mounts(&m, p->exec_paths, EXEC, require_prefix); if (r < 0) goto finish; - r = append_access_mounts(&m, no_exec_paths, NOEXEC, require_prefix); + r = append_access_mounts(&m, p->no_exec_paths, NOEXEC, require_prefix); if (r < 0) goto finish; - r = append_empty_dir_mounts(&m, empty_directories); + r = append_empty_dir_mounts(&m, p->empty_directories); if (r < 0) goto finish; - r = append_bind_mounts(&m, bind_mounts, n_bind_mounts); + r = append_bind_mounts(&m, p->bind_mounts, p->n_bind_mounts); if (r < 0) goto finish; - r = append_tmpfs_mounts(&m, temporary_filesystems, n_temporary_filesystems); + r = append_tmpfs_mounts(&m, p->temporary_filesystems, p->n_temporary_filesystems); if (r < 0) goto finish; - if (tmp_dir) { - bool ro = streq(tmp_dir, RUN_SYSTEMD_EMPTY); + if (p->tmp_dir) { + bool ro = streq(p->tmp_dir, RUN_SYSTEMD_EMPTY); *(m++) = (MountEntry) { .path_const = "/tmp", .mode = ro ? PRIVATE_TMP_READONLY : PRIVATE_TMP, - .source_const = tmp_dir, + .source_const = p->tmp_dir, }; } - if (var_tmp_dir) { - bool ro = streq(var_tmp_dir, RUN_SYSTEMD_EMPTY); + if (p->var_tmp_dir) { + bool ro = streq(p->var_tmp_dir, RUN_SYSTEMD_EMPTY); *(m++) = (MountEntry) { .path_const = "/var/tmp", .mode = ro ? 
PRIVATE_TMP_READONLY : PRIVATE_TMP, - .source_const = var_tmp_dir, + .source_const = p->var_tmp_dir, }; } - r = append_mount_images(&m, mount_images, n_mount_images); + r = append_mount_images(&m, p->mount_images, p->n_mount_images); if (r < 0) goto finish; - r = append_extensions(&m, root, extension_dir, hierarchies, extension_images, n_extension_images, extension_directories); + r = append_extensions(&m, root, p->extension_dir, hierarchies, p->extension_images, p->n_extension_images, p->extension_directories); if (r < 0) goto finish; - if (ns_info->private_dev) + if (p->private_dev) *(m++) = (MountEntry) { .path_const = "/dev", .mode = PRIVATE_DEV, @@ -2292,8 +2213,8 @@ int setup_namespace( /* In case /proc is successfully mounted with pid tree subset only (ProcSubset=pid), the protective mounts to non-pid /proc paths would fail. But the pid only option may have failed gracefully, so let's try the mounts but it's not fatal if they don't succeed. */ - bool ignore_protect_proc = ns_info->ignore_protect_paths || ns_info->proc_subset == PROC_SUBSET_PID; - if (ns_info->protect_kernel_tunables) { + bool ignore_protect_proc = p->ignore_protect_paths || p->proc_subset == PROC_SUBSET_PID; + if (p->protect_kernel_tunables) { r = append_static_mounts(&m, protect_kernel_tunables_proc_table, ELEMENTSOF(protect_kernel_tunables_proc_table), @@ -2304,21 +2225,21 @@ int setup_namespace( r = append_static_mounts(&m, protect_kernel_tunables_sys_table, ELEMENTSOF(protect_kernel_tunables_sys_table), - ns_info->ignore_protect_paths); + p->ignore_protect_paths); if (r < 0) goto finish; } - if (ns_info->protect_kernel_modules) { + if (p->protect_kernel_modules) { r = append_static_mounts(&m, protect_kernel_modules_table, ELEMENTSOF(protect_kernel_modules_table), - ns_info->ignore_protect_paths); + p->ignore_protect_paths); if (r < 0) goto finish; } - if (ns_info->protect_kernel_logs) { + if (p->protect_kernel_logs) { r = append_static_mounts(&m, protect_kernel_logs_proc_table, 
ELEMENTSOF(protect_kernel_logs_proc_table), @@ -2329,30 +2250,30 @@ int setup_namespace( r = append_static_mounts(&m, protect_kernel_logs_dev_table, ELEMENTSOF(protect_kernel_logs_dev_table), - ns_info->ignore_protect_paths); + p->ignore_protect_paths); if (r < 0) goto finish; } - if (ns_info->protect_control_groups) + if (p->protect_control_groups) *(m++) = (MountEntry) { .path_const = "/sys/fs/cgroup", .mode = READONLY, }; - r = append_protect_home(&m, ns_info->protect_home, ns_info->ignore_protect_paths); + r = append_protect_home(&m, p->protect_home, p->ignore_protect_paths); if (r < 0) goto finish; - r = append_protect_system(&m, ns_info->protect_system, false); + r = append_protect_system(&m, p->protect_system, false); if (r < 0) goto finish; - if (namespace_info_mount_apivfs(ns_info)) { + if (namespace_parameters_mount_apivfs(p)) { r = append_static_mounts(&m, apivfs_table, ELEMENTSOF(apivfs_table), - ns_info->ignore_protect_paths); + p->ignore_protect_paths); if (r < 0) goto finish; } @@ -2360,7 +2281,7 @@ int setup_namespace( /* Note, if proc is mounted with subset=pid then neither of the * two paths will exist, i.e. they are implicitly protected by * the mount option. */ - if (ns_info->protect_hostname) { + if (p->protect_hostname) { *(m++) = (MountEntry) { .path_const = "/proc/sys/kernel/hostname", .mode = READONLY, @@ -2373,20 +2294,20 @@ int setup_namespace( }; } - if (ns_info->private_network) + if (p->private_network) *(m++) = (MountEntry) { .path_const = "/sys", .mode = PRIVATE_SYSFS, }; - if (ns_info->private_ipc) + if (p->private_ipc) *(m++) = (MountEntry) { .path_const = "/dev/mqueue", .mode = MQUEUEFS, .flags = MS_NOSUID | MS_NODEV | MS_NOEXEC | MS_RELATIME, }; - if (creds_path) { + if (p->creds_path) { /* If our service has a credentials store configured, then bind that one in, but hide * everything else. 
*/ @@ -2399,10 +2320,10 @@ int setup_namespace( }; *(m++) = (MountEntry) { - .path_const = creds_path, + .path_const = p->creds_path, .mode = BIND_MOUNT, .read_only = true, - .source_const = creds_path, + .source_const = p->creds_path, .ignore = true, }; } else { @@ -2416,10 +2337,10 @@ int setup_namespace( }; } - if (log_namespace) { + if (p->log_namespace) { _cleanup_free_ char *q = NULL; - q = strjoin("/run/systemd/journal.", log_namespace); + q = strjoin("/run/systemd/journal.", p->log_namespace); if (!q) { r = -ENOMEM; goto finish; @@ -2436,24 +2357,24 @@ int setup_namespace( /* Will be used to add bind mounts at runtime */ if (setup_propagate) *(m++) = (MountEntry) { - .source_const = propagate_dir, - .path_const = incoming_dir, + .source_const = p->propagate_dir, + .path_const = p->incoming_dir, .mode = BIND_MOUNT, .read_only = true, }; - if (notify_socket) + if (p->notify_socket) *(m++) = (MountEntry) { - .path_const = notify_socket, - .source_const = notify_socket, + .path_const = p->notify_socket, + .source_const = p->notify_socket, .mode = BIND_MOUNT, .read_only = true, }; - if (host_os_release_stage) + if (p->host_os_release_stage) *(m++) = (MountEntry) { .path_const = "/run/host/.os-release-stage/", - .source_const = host_os_release_stage, + .source_const = p->host_os_release_stage, .mode = BIND_MOUNT, .read_only = true, .ignore = true, /* Live copy, don't hard-fail if it goes missing */ @@ -2486,12 +2407,12 @@ int setup_namespace( /* Create the source directory to allow runtime propagation of mounts */ if (setup_propagate) - (void) mkdir_p(propagate_dir, 0600); + (void) mkdir_p(p->propagate_dir, 0600); - if (n_extension_images > 0 || !strv_isempty(extension_directories)) + if (p->n_extension_images > 0 || !strv_isempty(p->extension_directories)) /* ExtensionImages/Directories mountpoint directories will be created while parsing the * mounts to create, so have the parent ready */ - (void) mkdir_p(extension_dir, 0600); + (void) mkdir_p(p->extension_dir, 
0600); /* Remount / as SLAVE so that nothing now mounted in the namespace * shows up in the parent */ @@ -2500,7 +2421,7 @@ int setup_namespace( goto finish; } - if (root_image) { + if (p->root_image) { /* A root image is specified, mount it to the right place */ r = dissected_image_mount( dissected_image, @@ -2528,7 +2449,7 @@ int setup_namespace( goto finish; } - } else if (root_directory) { + } else if (p->root_directory) { /* A root directory is specified. Turn its directory into bind mount, if it isn't one yet. */ r = path_is_mount_point(root, NULL, AT_SYMLINK_FOLLOW); @@ -2550,24 +2471,20 @@ int setup_namespace( } /* Try to set up the new root directory before mounting anything else there. */ - if (root_image || root_directory) + if (p->root_image || p->root_directory) (void) base_filesystem_create(root, UID_INVALID, GID_INVALID); /* Now make the magic happen */ r = apply_mounts(root, - mount_image_policy, - extension_image_policy, - ns_info, + p, mounts, &n_mounts, - scope, - symlinks, error_path); if (r < 0) goto finish; /* MS_MOVE does not work on MS_SHARED so the remount MS_SHARED will be done later */ r = mount_switch_root(root, /* mount_propagation_flag = */ 0); - if (r == -EINVAL && root_directory) { + if (r == -EINVAL && p->root_directory) { /* If we are using root_directory and we don't have privileges (ie: user manager in a user * namespace) and the root_directory is already a mount point in the parent namespace, * MS_MOVE will fail as we don't have permission to change it (with EINVAL rather than @@ -2593,9 +2510,9 @@ int setup_namespace( /* bind_mount_in_namespace() will MS_MOVE into that directory, and that's only * supported for non-shared mounts. This needs to happen after remounting / or it will fail. 
*/ if (setup_propagate) { - r = mount(NULL, incoming_dir, NULL, MS_SLAVE, NULL); + r = mount(NULL, p->incoming_dir, NULL, MS_SLAVE, NULL); if (r < 0) { - log_error_errno(r, "Failed to remount %s with MS_SLAVE: %m", incoming_dir); + log_error_errno(r, "Failed to remount %s with MS_SLAVE: %m", p->incoming_dir); goto finish; } } diff --git a/src/core/namespace.h b/src/core/namespace.h index 581403d8982..921716bf3ec 100644 --- a/src/core/namespace.h +++ b/src/core/namespace.h @@ -5,7 +5,7 @@ Copyright © 2016 Djalal Harouni ***/ -typedef struct NamespaceInfo NamespaceInfo; +typedef struct NamespaceParameters NamespaceParameters; typedef struct BindMount BindMount; typedef struct TemporaryFileSystem TemporaryFileSystem; typedef struct MountImage MountImage; @@ -53,24 +53,6 @@ typedef enum ProcSubset { _PROC_SUBSET_INVALID = -EINVAL, } ProcSubset; -struct NamespaceInfo { - bool ignore_protect_paths; - bool private_dev; - bool protect_control_groups; - bool protect_kernel_tunables; - bool protect_kernel_modules; - bool protect_kernel_logs; - bool mount_apivfs; - bool protect_hostname; - bool private_network; - bool private_ipc; - bool mount_nosuid; - ProtectHome protect_home; - ProtectSystem protect_system; - ProtectProc protect_proc; - ProcSubset proc_subset; -}; - struct BindMount { char *source; char *destination; @@ -100,43 +82,77 @@ struct MountImage { MountImageType type; }; -int setup_namespace( - const char *root_directory, - const char *root_image, - const MountOptions *root_image_options, - const ImagePolicy *root_image_policy, - const NamespaceInfo *ns_info, - char **read_write_paths, - char **read_only_paths, - char **inaccessible_paths, - char **exec_paths, - char **no_exec_paths, - char **empty_directories, - char **symlinks, - const BindMount *bind_mounts, - size_t n_bind_mounts, - const TemporaryFileSystem *temporary_filesystems, - size_t n_temporary_filesystems, - const MountImage *mount_images, - size_t n_mount_images, - const ImagePolicy 
*mount_image_policy, - const char *tmp_dir, - const char *var_tmp_dir, - const char *creds_path, - const char *log_namespace, - unsigned long mount_propagation_flag, - VeritySettings *verity, - const MountImage *extension_images, - size_t n_extension_images, - const ImagePolicy *extension_image_policy, - char **extension_directories, - const char *propagate_dir, - const char *incoming_dir, - const char *extension_dir, - const char *notify_socket, - const char *host_os_release_stage, - RuntimeScope scope, - char **error_path); +struct NamespaceParameters { + RuntimeScope runtime_scope; + + const char *root_directory; + const char *root_image; + const MountOptions *root_image_options; + const ImagePolicy *root_image_policy; + + char **read_write_paths; + char **read_only_paths; + char **inaccessible_paths; + + char **exec_paths; + char **no_exec_paths; + + char **empty_directories; + char **symlinks; + + const BindMount *bind_mounts; + size_t n_bind_mounts; + + const TemporaryFileSystem *temporary_filesystems; + size_t n_temporary_filesystems; + + const MountImage *mount_images; + size_t n_mount_images; + const ImagePolicy *mount_image_policy; + + const char *tmp_dir; + const char *var_tmp_dir; + + const char *creds_path; + const char *log_namespace; + + unsigned long mount_propagation_flag; + VeritySettings *verity; + + const MountImage *extension_images; + size_t n_extension_images; + const ImagePolicy *extension_image_policy; + char **extension_directories; + + const char *propagate_dir; + const char *incoming_dir; + + const char *extension_dir; + const char *notify_socket; + const char *host_os_release_stage; + + bool ignore_protect_paths; + + bool protect_control_groups; + bool protect_kernel_tunables; + bool protect_kernel_modules; + bool protect_kernel_logs; + bool protect_hostname; + + bool private_dev; + bool private_network; + bool private_ipc; + + bool mount_apivfs; + bool mount_nosuid; + + ProtectHome protect_home; + ProtectSystem protect_system; + 
ProtectProc protect_proc; + ProcSubset proc_subset; +}; + +int setup_namespace(const NamespaceParameters *p, char **error_path); #define RUN_SYSTEMD_EMPTY "/run/systemd/empty" diff --git a/src/test/test-namespace.c b/src/test/test-namespace.c index 42ac65d08c8..1c99c69d5f0 100644 --- a/src/test/test-namespace.c +++ b/src/test/test-namespace.c @@ -149,11 +149,12 @@ TEST(ipcns) { } TEST(protect_kernel_logs) { - int r; - pid_t pid; - static const NamespaceInfo ns_info = { + static const NamespaceParameters p = { + .runtime_scope = RUNTIME_SCOPE_SYSTEM, .protect_kernel_logs = true, }; + pid_t pid; + int r; if (geteuid() > 0) { (void) log_tests_skipped("not root"); @@ -175,39 +176,7 @@ TEST(protect_kernel_logs) { fd = open("/dev/kmsg", O_RDONLY | O_CLOEXEC); assert_se(fd > 0); - r = setup_namespace(NULL, - NULL, - NULL, - NULL, - &ns_info, - NULL, - NULL, - NULL, - NULL, - NULL, - NULL, - NULL, - NULL, 0, - NULL, 0, - NULL, 0, - NULL, - NULL, - NULL, - NULL, - NULL, - 0, - NULL, - NULL, - 0, - NULL, - NULL, - NULL, - NULL, - NULL, - NULL, - NULL, - RUNTIME_SCOPE_SYSTEM, - NULL); + r = setup_namespace(&p, NULL); assert_se(r == 0); assert_se(setresuid(UID_NOBODY, UID_NOBODY, UID_NOBODY) >= 0); diff --git a/src/test/test-ns.c b/src/test/test-ns.c index eb3afed9e1c..97b9fc98669 100644 --- a/src/test/test-ns.c +++ b/src/test/test-ns.c @@ -44,13 +44,15 @@ int main(int argc, char *argv[]) { NULL }; - static const NamespaceInfo ns_info = { - .private_dev = true, - .protect_control_groups = true, - .protect_kernel_tunables = true, - .protect_kernel_modules = true, - .protect_proc = PROTECT_PROC_NOACCESS, - .proc_subset = PROC_SUBSET_PID, + static const BindMount bind_mount = { + .source = (char*) "/usr/bin", + .destination = (char*) "/etc/systemd", + .read_only = true, + }; + + static const TemporaryFileSystem tmpfs = { + .path = (char*) "/var", + .options = (char*) "ro", }; char *root_directory; @@ -76,40 +78,36 @@ int main(int argc, char *argv[]) { else log_info("Not 
chrooted"); - r = setup_namespace(root_directory, - NULL, - NULL, - NULL, - &ns_info, - (char **) writable, - (char **) readonly, - (char **) inaccessible, - NULL, - (char **) exec, - (char **) no_exec, - NULL, - &(BindMount) { .source = (char*) "/usr/bin", .destination = (char*) "/etc/systemd", .read_only = true }, 1, - &(TemporaryFileSystem) { .path = (char*) "/var", .options = (char*) "ro" }, 1, - NULL, - 0, - NULL, - tmp_dir, - var_tmp_dir, - NULL, - NULL, - 0, - NULL, - NULL, - 0, - NULL, - NULL, - NULL, - NULL, - NULL, - NULL, - NULL, - RUNTIME_SCOPE_SYSTEM, - NULL); + NamespaceParameters p = { + .runtime_scope = RUNTIME_SCOPE_SYSTEM, + + .root_directory = root_directory, + + .read_write_paths = (char**) writable, + .read_only_paths = (char**) readonly, + .inaccessible_paths = (char**) inaccessible, + + .exec_paths = (char**) exec, + .no_exec_paths = (char**) no_exec, + + .tmp_dir = tmp_dir, + .var_tmp_dir = var_tmp_dir, + + .bind_mounts = &bind_mount, + .n_bind_mounts = 1, + + .temporary_filesystems = &tmpfs, + .n_temporary_filesystems = 1, + + .private_dev = true, + .protect_control_groups = true, + .protect_kernel_tunables = true, + .protect_kernel_modules = true, + .protect_proc = PROTECT_PROC_NOACCESS, + .proc_subset = PROC_SUBSET_PID, + }; + + r = setup_namespace(&p, NULL); if (r < 0) { log_error_errno(r, "Failed to set up namespace: %m");