X-Git-Url: http://git.ipfire.org/?a=blobdiff_plain;f=src%2Fcore%2Fnamespace.c;h=0d30f17b9a05e1800c2401d6d7d7eddbc3f72002;hb=db9ecf050165fd1033c6f81485917e229c4be537;hp=36d5ff67aed6db4f4add24dbb5b05bc1816179df;hpb=653ca0d913d2d14d234b26c7b5914de75b60e1c0;p=thirdparty%2Fsystemd.git diff --git a/src/core/namespace.c b/src/core/namespace.c index 36d5ff67aed..0d30f17b9a0 100644 --- a/src/core/namespace.c +++ b/src/core/namespace.c @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: LGPL-2.1+ */ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ #include #include @@ -15,6 +15,7 @@ #include "format-util.h" #include "fs-util.h" #include "label.h" +#include "list.h" #include "loop-util.h" #include "loopback-setup.h" #include "mkdir.h" @@ -40,6 +41,7 @@ typedef enum MountMode { /* This is ordered by priority! */ INACCESSIBLE, + MOUNT_IMAGES, BIND_MOUNT, BIND_MOUNT_RECURSIVE, PRIVATE_TMP, @@ -65,12 +67,13 @@ typedef struct MountEntry { bool nosuid:1; /* Shall set MS_NOSUID on the mount itself */ bool applied:1; /* Already applied */ char *path_malloc; /* Use this instead of 'path_const' if we had to allocate memory */ - const char *source_const; /* The source path, for bind mounts */ + const char *source_const; /* The source path, for bind mounts or images */ char *source_malloc; const char *options_const;/* Mount options for tmpfs */ char *options_malloc; unsigned long flags; /* Mount flags used by EMPTY_DIR and TMPFS. Do not include MS_RDONLY here, but please use read_only. */ unsigned n_followed; + LIST_HEAD(MountOptions, image_options); } MountEntry; /* If MountAPIVFS= is used, let's mount /sys and /proc into the it, but only as a fallback if the user hasn't mounted @@ -94,7 +97,7 @@ static const MountEntry protect_kernel_tunables_table[] = { { "/proc/latency_stats", READONLY, true }, { "/proc/mtrr", READONLY, true }, { "/proc/scsi", READONLY, true }, - { "/proc/sys", READONLY, false }, + { "/proc/sys", READONLY, true }, { "/proc/sysrq-trigger", READONLY, true }, { "/proc/timer_stats", READONLY, true }, { "/sys", READONLY, false }, @@ -205,6 +208,7 @@ static const char * const mount_mode_table[_MOUNT_MODE_MAX] = { [READONLY] = "read-only", [READWRITE] = "read-write", [TMPFS] = "tmpfs", + [MOUNT_IMAGES] = "mount-images", [READWRITE_IMPLICIT] = "rw-implicit", }; @@ -243,6 +247,7 @@ static void mount_entry_done(MountEntry *p) { p->path_malloc = mfree(p->path_malloc); p->source_malloc = mfree(p->source_malloc); p->options_malloc = mfree(p->options_malloc); + p->image_options = mount_options_free_all(p->image_options); } static int append_access_mounts(MountEntry **p, char **strv, MountMode mode, bool forcibly_require_prefix) { @@ -325,6 +330,24 @@ static int append_bind_mounts(MountEntry **p, const BindMount *binds, size_t n) return 0; } +static int append_mount_images(MountEntry **p, const MountImage *mount_images, size_t n) { + assert(p); + + for (size_t i = 0; i < n; i++) { + const MountImage *m = mount_images + i; + + *((*p)++) = (MountEntry) { + .path_const = m->destination, + .mode = MOUNT_IMAGES, + .source_const = m->source, + .image_options = m->mount_options, + .ignore = m->ignore_enoent, + }; + } + + return 0; +} + static int append_tmpfs_mounts(MountEntry **p, const TemporaryFileSystem *tmpfs, size_t n) { assert(p); @@ -627,20 +650,19 @@ static int clone_device_node( *make_devnode = false; } - /* We're about to fallback to bind-mounting the device + /* We're about to fall back to bind-mounting the device * node. So create a dummy bind-mount target. * Do not prepare device-node SELinux label (see issue 13762) */ r = mknod(dn, S_IFREG, 0); if (r < 0 && errno != EEXIST) return log_debug_errno(errno, "mknod() fallback failed for '%s': %m", d); - /* Fallback to bind-mounting: - * The assumption here is that all used device nodes carry standard - * properties. Specifically, the devices nodes we bind-mount should - * either be owned by root:root or root:tty (e.g. /dev/tty, /dev/ptmx) - * and should not carry ACLs. */ - if (mount(d, dn, NULL, MS_BIND, NULL) < 0) - return log_debug_errno(errno, "Bind mounting failed for '%s': %m", d); + /* Fallback to bind-mounting: The assumption here is that all used device nodes carry standard + * properties. Specifically, the devices nodes we bind-mount should either be owned by root:root or + * root:tty (e.g. /dev/tty, /dev/ptmx) and should not carry ACLs. */ + r = mount_nofollow_verbose(LOG_DEBUG, d, dn, NULL, MS_BIND, NULL); + if (r < 0) + return r; add_symlink: bn = path_startswith(d, "/dev/"); @@ -687,10 +709,10 @@ static int mount_private_dev(MountEntry *m) { dev = strjoina(temporary_mount, "/dev"); (void) mkdir(dev, 0755); - if (mount("tmpfs", dev, "tmpfs", DEV_MOUNT_OPTIONS, "mode=755" TMPFS_LIMITS_DEV) < 0) { - r = log_debug_errno(errno, "Failed to mount tmpfs on '%s': %m", dev); + r = mount_nofollow_verbose(LOG_DEBUG, "tmpfs", dev, "tmpfs", DEV_MOUNT_OPTIONS, "mode=755" TMPFS_LIMITS_DEV); + if (r < 0) goto fail; - } + r = label_fix_container(dev, "/dev", 0); if (r < 0) { log_debug_errno(errno, "Failed to fix label of '%s' as /dev: %m", dev); @@ -699,10 +721,9 @@ static int mount_private_dev(MountEntry *m) { devpts = strjoina(temporary_mount, "/dev/pts"); (void) mkdir(devpts, 0755); - if (mount("/dev/pts", devpts, NULL, MS_BIND, NULL) < 0) { - r = log_debug_errno(errno, "Failed to bind mount /dev/pts on '%s': %m", devpts); + r = mount_nofollow_verbose(LOG_DEBUG, "/dev/pts", devpts, NULL, MS_BIND, NULL); + if (r < 0) goto fail; - } /* /dev/ptmx can either be a device node or a symlink to /dev/pts/ptmx. * When /dev/ptmx a device node, /dev/pts/ptmx has 000 permissions making it inaccessible. @@ -726,21 +747,17 @@ static int mount_private_dev(MountEntry *m) { devshm = strjoina(temporary_mount, "/dev/shm"); (void) mkdir(devshm, 0755); - r = mount("/dev/shm", devshm, NULL, MS_BIND, NULL); - if (r < 0) { - r = log_debug_errno(errno, "Failed to bind mount /dev/shm on '%s': %m", devshm); + r = mount_nofollow_verbose(LOG_DEBUG, "/dev/shm", devshm, NULL, MS_BIND, NULL); + if (r < 0) goto fail; - } devmqueue = strjoina(temporary_mount, "/dev/mqueue"); (void) mkdir(devmqueue, 0755); - if (mount("/dev/mqueue", devmqueue, NULL, MS_BIND, NULL) < 0) - log_debug_errno(errno, "Failed to bind mount /dev/mqueue on '%s', ignoring: %m", devmqueue); + (void) mount_nofollow_verbose(LOG_DEBUG, "/dev/mqueue", devmqueue, NULL, MS_BIND, NULL); devhugepages = strjoina(temporary_mount, "/dev/hugepages"); (void) mkdir(devhugepages, 0755); - if (mount("/dev/hugepages", devhugepages, NULL, MS_BIND, NULL) < 0) - log_debug_errno(errno, "Failed to bind mount /dev/hugepages on '%s', ignoring: %m", devhugepages); + (void) mount_nofollow_verbose(LOG_DEBUG, "/dev/hugepages", devhugepages, NULL, MS_BIND, NULL); devlog = strjoina(temporary_mount, "/dev/log"); if (symlink("/run/systemd/journal/dev-log", devlog) < 0) @@ -768,10 +785,9 @@ static int mount_private_dev(MountEntry *m) { if (r < 0) log_debug_errno(r, "Failed to unmount directories below '%s', ignoring: %m", mount_entry_path(m)); - if (mount(dev, mount_entry_path(m), NULL, MS_MOVE, NULL) < 0) { - r = log_debug_errno(errno, "Failed to move mount point '%s' to '%s': %m", dev, mount_entry_path(m)); + r = mount_nofollow_verbose(LOG_DEBUG, dev, mount_entry_path(m), NULL, MS_MOVE, NULL); + if (r < 0) goto fail; - } (void) rmdir(dev); (void) rmdir(temporary_mount); @@ -780,18 +796,18 @@ static int mount_private_dev(MountEntry *m) { fail: if (devpts) - (void) umount(devpts); + (void) umount_verbose(LOG_DEBUG, devpts, UMOUNT_NOFOLLOW); if (devshm) - (void) umount(devshm); + (void) umount_verbose(LOG_DEBUG, devshm, UMOUNT_NOFOLLOW); if (devhugepages) - (void) umount(devhugepages); + (void) umount_verbose(LOG_DEBUG, devhugepages, UMOUNT_NOFOLLOW); if (devmqueue) - (void) umount(devmqueue); + (void) umount_verbose(LOG_DEBUG, devmqueue, UMOUNT_NOFOLLOW); - (void) umount(dev); + (void) umount_verbose(LOG_DEBUG, dev, UMOUNT_NOFOLLOW); (void) rmdir(dev); (void) rmdir(temporary_mount); @@ -814,8 +830,9 @@ static int mount_bind_dev(const MountEntry *m) { if (r > 0) /* make this a NOP if /dev is already a mount point */ return 0; - if (mount("/dev", mount_entry_path(m), NULL, MS_BIND|MS_REC, NULL) < 0) - return log_debug_errno(errno, "Failed to bind mount %s: %m", mount_entry_path(m)); + r = mount_nofollow_verbose(LOG_DEBUG, "/dev", mount_entry_path(m), NULL, MS_BIND|MS_REC, NULL); + if (r < 0) + return r; return 1; } @@ -834,50 +851,162 @@ static int mount_sysfs(const MountEntry *m) { return 0; /* Bind mount the host's version so that we get all child mounts of it, too. */ - if (mount("/sys", mount_entry_path(m), NULL, MS_BIND|MS_REC, NULL) < 0) - return log_debug_errno(errno, "Failed to mount %s: %m", mount_entry_path(m)); + r = mount_nofollow_verbose(LOG_DEBUG, "/sys", mount_entry_path(m), NULL, MS_BIND|MS_REC, NULL); + if (r < 0) + return r; return 1; } -static int mount_procfs(const MountEntry *m) { +static int mount_procfs(const MountEntry *m, const NamespaceInfo *ns_info) { + const char *entry_path; int r; assert(m); + assert(ns_info); - (void) mkdir_p_label(mount_entry_path(m), 0755); + entry_path = mount_entry_path(m); - r = path_is_mount_point(mount_entry_path(m), NULL, 0); - if (r < 0) - return log_debug_errno(r, "Unable to determine whether /proc is already mounted: %m"); - if (r > 0) /* make this a NOP if /proc is already a mount point */ - return 0; + /* Mount a new instance, so that we get the one that matches our user namespace, if we are running in + * one. i.e we don't reuse existing mounts here under any condition, we want a new instance owned by + * our user namespace and with our hidepid= settings applied. Hence, let's get rid of everything + * mounted on /proc/ first. */ + + (void) mkdir_p_label(entry_path, 0755); + (void) umount_recursive(entry_path, 0); + + if (ns_info->protect_proc != PROTECT_PROC_DEFAULT || + ns_info->proc_subset != PROC_SUBSET_ALL) { + _cleanup_free_ char *opts = NULL; + + /* Starting with kernel 5.8 procfs' hidepid= logic is truly per-instance (previously it + * pretended to be per-instance but actually was per-namespace), hence let's make use of it + * if requested. To make sure this logic succeeds only on kernels where hidepid= is + * per-instance, we'll exclusively use the textual value for hidepid=, since support was + * added in the same commit: if it's supported it is thus also per-instance. */ + + opts = strjoin("hidepid=", + ns_info->protect_proc == PROTECT_PROC_DEFAULT ? "off" : + protect_proc_to_string(ns_info->protect_proc), + ns_info->proc_subset == PROC_SUBSET_PID ? ",subset=pid" : ""); + if (!opts) + return -ENOMEM; + + r = mount_nofollow_verbose(LOG_DEBUG, "proc", entry_path, "proc", MS_NOSUID|MS_NOEXEC|MS_NODEV, opts); + if (r < 0) { + if (r != -EINVAL) + return r; + + /* If this failed with EINVAL then this likely means the textual hidepid= stuff is + * not supported by the kernel, and thus the per-instance hidepid= neither, which + * means we really don't want to use it, since it would affect our host's /proc + * mount. Hence let's gracefully fallback to a classic, unrestricted version. */ + } else + return 1; + } - /* Mount a new instance, so that we get the one that matches our user namespace, if we are running in one */ - if (mount("proc", mount_entry_path(m), "proc", MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL) < 0) - return log_debug_errno(errno, "Failed to mount %s: %m", mount_entry_path(m)); + r = mount_nofollow_verbose(LOG_DEBUG, "proc", entry_path, "proc", MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL); + if (r < 0) + return r; return 1; } static int mount_tmpfs(const MountEntry *m) { + const char *entry_path, *inner_path; int r; - const char *entry_path = mount_entry_path(m); - const char *source_path = m->path_const; assert(m); + entry_path = mount_entry_path(m); + inner_path = m->path_const; + /* First, get rid of everything that is below if there is anything. Then, overmount with our new tmpfs */ (void) mkdir_p_label(entry_path, 0755); (void) umount_recursive(entry_path, 0); - if (mount("tmpfs", entry_path, "tmpfs", m->flags, mount_entry_options(m)) < 0) - return log_debug_errno(errno, "Failed to mount %s: %m", entry_path); + r = mount_nofollow_verbose(LOG_DEBUG, "tmpfs", entry_path, "tmpfs", m->flags, mount_entry_options(m)); + if (r < 0) + return r; + + r = label_fix_container(entry_path, inner_path, 0); + if (r < 0) + return log_debug_errno(r, "Failed to fix label of '%s' as '%s': %m", entry_path, inner_path); + + return 1; +} + +static int mount_images(const MountEntry *m) { + _cleanup_(loop_device_unrefp) LoopDevice *loop_device = NULL; + _cleanup_(decrypted_image_unrefp) DecryptedImage *decrypted_image = NULL; + _cleanup_(dissected_image_unrefp) DissectedImage *dissected_image = NULL; + _cleanup_(verity_settings_done) VeritySettings verity = VERITY_SETTINGS_DEFAULT; + DissectImageFlags dissect_image_flags; + int r; + + assert(m); + + r = verity_settings_load(&verity, mount_entry_source(m), NULL, NULL); + if (r < 0) + return log_debug_errno(r, "Failed to load root hash: %m"); + + dissect_image_flags = + (m->read_only ? DISSECT_IMAGE_READ_ONLY : 0) | + (verity.data_path ? DISSECT_IMAGE_NO_PARTITION_TABLE : 0); - r = label_fix_container(entry_path, source_path, 0); + r = loop_device_make_by_path( + mount_entry_source(m), + m->read_only ? O_RDONLY : -1 /* < 0 means writable if possible, read-only as fallback */, + verity.data_path ? 0 : LO_FLAGS_PARTSCAN, + &loop_device); if (r < 0) - return log_debug_errno(r, "Failed to fix label of '%s' as '%s': %m", entry_path, source_path); + return log_debug_errno(r, "Failed to create loop device for image: %m"); + + r = dissect_image( + loop_device->fd, + &verity, + m->image_options, + dissect_image_flags, + &dissected_image); + /* No partition table? Might be a single-filesystem image, try again */ + if (!verity.data_path && r == -ENOPKG) + r = dissect_image( + loop_device->fd, + &verity, + m->image_options, + dissect_image_flags|DISSECT_IMAGE_NO_PARTITION_TABLE, + &dissected_image); + if (r < 0) + return log_debug_errno(r, "Failed to dissect image: %m"); + + r = dissected_image_decrypt( + dissected_image, + NULL, + &verity, + dissect_image_flags, + &decrypted_image); + if (r < 0) + return log_debug_errno(r, "Failed to decrypt dissected image: %m"); + + r = mkdir_p_label(mount_entry_path(m), 0755); + if (r < 0) + return log_debug_errno(r, "Failed to create destination directory %s: %m", mount_entry_path(m)); + r = umount_recursive(mount_entry_path(m), 0); + if (r < 0) + return log_debug_errno(r, "Failed to umount under destination directory %s: %m", mount_entry_path(m)); + + r = dissected_image_mount(dissected_image, mount_entry_path(m), UID_INVALID, dissect_image_flags); + if (r < 0) + return log_debug_errno(r, "Failed to mount image: %m"); + + if (decrypted_image) { + r = decrypted_image_relinquish(decrypted_image); + if (r < 0) + return log_debug_errno(r, "Failed to relinquish decrypted image: %m"); + } + + loop_device_relinquish(loop_device); return 1; } @@ -917,7 +1046,8 @@ static int follow_symlink( static int apply_mount( const char *root_directory, - MountEntry *m) { + MountEntry *m, + const NamespaceInfo *ns_info) { _cleanup_free_ char *inaccessible = NULL; bool rbind = true, make = false; @@ -925,6 +1055,7 @@ static int apply_mount( int r; assert(m); + assert(ns_info); log_debug("Applying namespace mount on %s", mount_entry_path(m)); @@ -1029,7 +1160,10 @@ static int apply_mount( return mount_sysfs(m); case PROCFS: - return mount_procfs(m); + return mount_procfs(m, ns_info); + + case MOUNT_IMAGES: + return mount_images(m); default: assert_not_reached("Unknown mode"); @@ -1037,9 +1171,9 @@ static int apply_mount( assert(what); - if (mount(what, mount_entry_path(m), NULL, MS_BIND|(rbind ? MS_REC : 0), NULL) < 0) { + r = mount_nofollow_verbose(LOG_DEBUG, what, mount_entry_path(m), NULL, MS_BIND|(rbind ? MS_REC : 0), NULL); + if (r < 0) { bool try_again = false; - r = -errno; if (r == -ENOENT && make) { struct stat st; @@ -1067,13 +1201,8 @@ static int apply_mount( } } - if (try_again) { - if (mount(what, mount_entry_path(m), NULL, MS_BIND|(rbind ? MS_REC : 0), NULL) < 0) - r = -errno; - else - r = 0; - } - + if (try_again) + r = mount_nofollow_verbose(LOG_DEBUG, what, mount_entry_path(m), NULL, MS_BIND|(rbind ? MS_REC : 0), NULL); if (r < 0) return log_error_errno(r, "Failed to mount %s to %s: %m", what, mount_entry_path(m)); } @@ -1138,7 +1267,9 @@ static bool namespace_info_mount_apivfs(const NamespaceInfo *ns_info) { return ns_info->mount_apivfs || ns_info->protect_control_groups || - ns_info->protect_kernel_tunables; + ns_info->protect_kernel_tunables || + ns_info->protect_proc != PROTECT_PROC_DEFAULT || + ns_info->proc_subset != PROC_SUBSET_ALL; } static size_t namespace_calculate_mounts( @@ -1149,27 +1280,27 @@ static size_t namespace_calculate_mounts( char** empty_directories, size_t n_bind_mounts, size_t n_temporary_filesystems, + size_t n_mount_images, const char* tmp_dir, const char* var_tmp_dir, - const char* log_namespace, - ProtectHome protect_home, - ProtectSystem protect_system) { + const char *creds_path, + const char* log_namespace) { size_t protect_home_cnt; size_t protect_system_cnt = - (protect_system == PROTECT_SYSTEM_STRICT ? + (ns_info->protect_system == PROTECT_SYSTEM_STRICT ? ELEMENTSOF(protect_system_strict_table) : - ((protect_system == PROTECT_SYSTEM_FULL) ? + ((ns_info->protect_system == PROTECT_SYSTEM_FULL) ? ELEMENTSOF(protect_system_full_table) : - ((protect_system == PROTECT_SYSTEM_YES) ? + ((ns_info->protect_system == PROTECT_SYSTEM_YES) ? ELEMENTSOF(protect_system_yes_table) : 0))); protect_home_cnt = - (protect_home == PROTECT_HOME_YES ? + (ns_info->protect_home == PROTECT_HOME_YES ? ELEMENTSOF(protect_home_yes_table) : - ((protect_home == PROTECT_HOME_READ_ONLY) ? + ((ns_info->protect_home == PROTECT_HOME_READ_ONLY) ? ELEMENTSOF(protect_home_read_only_table) : - ((protect_home == PROTECT_HOME_TMPFS) ? + ((ns_info->protect_home == PROTECT_HOME_TMPFS) ? ELEMENTSOF(protect_home_tmpfs_table) : 0))); return !!tmp_dir + !!var_tmp_dir + @@ -1178,6 +1309,7 @@ static size_t namespace_calculate_mounts( strv_length(inaccessible_paths) + strv_length(empty_directories) + n_bind_mounts + + n_mount_images + n_temporary_filesystems + ns_info->private_dev + (ns_info->protect_kernel_tunables ? ELEMENTSOF(protect_kernel_tunables_table) : 0) + @@ -1187,6 +1319,7 @@ static size_t namespace_calculate_mounts( protect_home_cnt + protect_system_cnt + (ns_info->protect_hostname ? 2 : 0) + (namespace_info_mount_apivfs(ns_info) ? ELEMENTSOF(apivfs_table) : 0) + + (creds_path ? 2 : 1) + !!log_namespace; } @@ -1254,9 +1387,66 @@ static bool home_read_only( return false; } +static int verity_settings_prepare( + VeritySettings *verity, + const char *root_image, + const void *root_hash, + size_t root_hash_size, + const char *root_hash_path, + const void *root_hash_sig, + size_t root_hash_sig_size, + const char *root_hash_sig_path, + const char *verity_data_path) { + + int r; + + assert(verity); + + if (root_hash) { + void *d; + + d = memdup(root_hash, root_hash_size); + if (!d) + return -ENOMEM; + + free_and_replace(verity->root_hash, d); + verity->root_hash_size = root_hash_size; + verity->designator = PARTITION_ROOT; + } + + if (root_hash_sig) { + void *d; + + d = memdup(root_hash_sig, root_hash_sig_size); + if (!d) + return -ENOMEM; + + free_and_replace(verity->root_hash_sig, d); + verity->root_hash_sig_size = root_hash_sig_size; + verity->designator = PARTITION_ROOT; + } + + if (verity_data_path) { + r = free_and_strdup(&verity->data_path, verity_data_path); + if (r < 0) + return r; + } + + r = verity_settings_load( + verity, + root_image, + root_hash_path, + root_hash_sig_path); + if (r < 0) + return log_debug_errno(r, "Failed to load root hash: %m"); + + return 0; +} + int setup_namespace( const char* root_directory, const char* root_image, + const MountOptions *root_image_options, const NamespaceInfo *ns_info, char** read_write_paths, char** read_only_paths, @@ -1266,11 +1456,12 @@ int setup_namespace( size_t n_bind_mounts, const TemporaryFileSystem *temporary_filesystems, size_t n_temporary_filesystems, + const MountImage *mount_images, + size_t n_mount_images, const char* tmp_dir, const char* var_tmp_dir, + const char *creds_path, const char *log_namespace, - ProtectHome protect_home, - ProtectSystem protect_system, unsigned long mount_flags, const void *root_hash, size_t root_hash_size, @@ -1278,20 +1469,19 @@ int setup_namespace( const void *root_hash_sig, size_t root_hash_sig_size, const char *root_hash_sig_path, - const char *root_verity, + const char *verity_data_path, DissectImageFlags dissect_image_flags, char **error_path) { _cleanup_(loop_device_unrefp) LoopDevice *loop_device = NULL; _cleanup_(decrypted_image_unrefp) DecryptedImage *decrypted_image = NULL; _cleanup_(dissected_image_unrefp) DissectedImage *dissected_image = NULL; - _cleanup_free_ void *root_hash_decoded = NULL; - _cleanup_free_ char *verity_data = NULL, *hash_sig_path = NULL; + _cleanup_(verity_settings_done) VeritySettings verity = VERITY_SETTINGS_DEFAULT; MountEntry *m = NULL, *mounts = NULL; - size_t n_mounts; bool require_prefix = false; const char *root; - int r = 0; + size_t n_mounts; + int r; assert(ns_info); @@ -1303,49 +1493,47 @@ int setup_namespace( /* Make the whole image read-only if we can determine that we only access it in a read-only fashion. */ if (root_read_only(read_only_paths, - protect_system) && + ns_info->protect_system) && home_read_only(read_only_paths, inaccessible_paths, empty_directories, bind_mounts, n_bind_mounts, temporary_filesystems, n_temporary_filesystems, - protect_home) && + ns_info->protect_home) && strv_isempty(read_write_paths)) dissect_image_flags |= DISSECT_IMAGE_READ_ONLY; - r = loop_device_make_by_path(root_image, - FLAGS_SET(dissect_image_flags, DISSECT_IMAGE_READ_ONLY) ? O_RDONLY : -1 /* < 0 means writable if possible, read-only as fallback */, - LO_FLAGS_PARTSCAN, - &loop_device); + r = verity_settings_prepare( + &verity, + root_image, + root_hash, root_hash_size, root_hash_path, + root_hash_sig, root_hash_sig_size, root_hash_sig_path, + verity_data_path); if (r < 0) - return log_debug_errno(r, "Failed to create loop device for root image: %m"); + return r; + + SET_FLAG(dissect_image_flags, DISSECT_IMAGE_NO_PARTITION_TABLE, verity.data_path); - r = verity_metadata_load(root_image, - root_hash_path, - root_hash ? NULL : &root_hash_decoded, - root_hash ? NULL : &root_hash_size, - root_verity ? NULL : &verity_data, - root_hash_sig || root_hash_sig_path ? NULL : &hash_sig_path); + r = loop_device_make_by_path( + root_image, + FLAGS_SET(dissect_image_flags, DISSECT_IMAGE_READ_ONLY) ? O_RDONLY : -1 /* < 0 means writable if possible, read-only as fallback */, + FLAGS_SET(dissect_image_flags, DISSECT_IMAGE_NO_PARTITION_TABLE) ? 0 : LO_FLAGS_PARTSCAN, + &loop_device); if (r < 0) - return log_debug_errno(r, "Failed to load root hash: %m"); - dissect_image_flags |= root_verity || verity_data ? DISSECT_IMAGE_NO_PARTITION_TABLE : 0; - - r = dissect_image(loop_device->fd, - root_hash ?: root_hash_decoded, - root_hash_size, - root_verity ?: verity_data, - dissect_image_flags, - &dissected_image); + return log_debug_errno(r, "Failed to create loop device for root image: %m"); + + r = dissect_image( + loop_device->fd, + &verity, + root_image_options, + dissect_image_flags, + &dissected_image); if (r < 0) return log_debug_errno(r, "Failed to dissect image: %m"); - r = dissected_image_decrypt(dissected_image, - NULL, - root_hash ?: root_hash_decoded, - root_hash_size, - root_verity ?: verity_data, - root_hash_sig_path ?: hash_sig_path, - root_hash_sig, - root_hash_sig_size, - dissect_image_flags, - &decrypted_image); + r = dissected_image_decrypt( + dissected_image, + NULL, + &verity, + dissect_image_flags, + &decrypted_image); if (r < 0) return log_debug_errno(r, "Failed to decrypt dissected image: %m"); } @@ -1372,9 +1560,10 @@ int setup_namespace( empty_directories, n_bind_mounts, n_temporary_filesystems, + n_mount_images, tmp_dir, var_tmp_dir, - log_namespace, - protect_home, protect_system); + creds_path, + log_namespace); if (n_mounts > 0) { m = mounts = new0(MountEntry, n_mounts); @@ -1425,13 +1614,16 @@ int setup_namespace( }; } - if (ns_info->private_dev) { + r = append_mount_images(&m, mount_images, n_mount_images); + if (r < 0) + goto finish; + + if (ns_info->private_dev) *(m++) = (MountEntry) { .path_const = "/dev", .mode = PRIVATE_DEV, .flags = DEV_MOUNT_OPTIONS, }; - } if (ns_info->protect_kernel_tunables) { r = append_static_mounts(&m, @@ -1460,18 +1652,17 @@ int setup_namespace( goto finish; } - if (ns_info->protect_control_groups) { + if (ns_info->protect_control_groups) *(m++) = (MountEntry) { .path_const = "/sys/fs/cgroup", .mode = READONLY, }; - } - r = append_protect_home(&m, protect_home, ns_info->ignore_protect_paths); + r = append_protect_home(&m, ns_info->protect_home, ns_info->ignore_protect_paths); if (r < 0) goto finish; - r = append_protect_system(&m, protect_system, false); + r = append_protect_system(&m, ns_info->protect_system, false); if (r < 0) goto finish; @@ -1495,6 +1686,35 @@ int setup_namespace( }; } + if (creds_path) { + /* If our service has a credentials store configured, then bind that one in, but hide + * everything else. */ + + *(m++) = (MountEntry) { + .path_const = "/run/credentials", + .mode = TMPFS, + .read_only = true, + .options_const = "mode=0755" TMPFS_LIMITS_EMPTY_OR_ALMOST, + .flags = MS_NODEV|MS_STRICTATIME|MS_NOSUID|MS_NOEXEC, + }; + + *(m++) = (MountEntry) { + .path_const = creds_path, + .mode = BIND_MOUNT, + .read_only = true, + .source_const = creds_path, + }; + } else { + /* If our service has no credentials store configured, then make the whole + * credentials tree inaccessible wholesale. */ + + *(m++) = (MountEntry) { + .path_const = "/run/credentials", + .mode = INACCESSIBLE, + .ignore = true, + }; + } + if (log_namespace) { _cleanup_free_ char *q; @@ -1570,19 +1790,16 @@ int setup_namespace( goto finish; } if (r == 0) { - if (mount(root, root, NULL, MS_BIND|MS_REC, NULL) < 0) { - r = log_debug_errno(errno, "Failed to bind mount '%s': %m", root); + r = mount_nofollow_verbose(LOG_DEBUG, root, root, NULL, MS_BIND|MS_REC, NULL); + if (r < 0) goto finish; - } } } else { - /* Let's mount the main root directory to the root directory to use */ - if (mount("/", root, NULL, MS_BIND|MS_REC, NULL) < 0) { - r = log_debug_errno(errno, "Failed to bind mount '/' on '%s': %m", root); + r = mount_nofollow_verbose(LOG_DEBUG, "/", root, NULL, MS_BIND|MS_REC, NULL); + if (r < 0) goto finish; - } } /* Try to set up the new root directory before mounting anything else there. */ @@ -1628,7 +1845,7 @@ int setup_namespace( break; } - r = apply_mount(root, m); + r = apply_mount(root, m, ns_info); if (r < 0) { if (error_path && mount_entry_path(m)) *error_path = strdup(mount_entry_path(m)); @@ -1739,6 +1956,74 @@ int bind_mount_add(BindMount **b, size_t *n, const BindMount *item) { return 0; } +MountImage* mount_image_free_many(MountImage *m, size_t *n) { + size_t i; + + assert(n); + assert(m || *n == 0); + + for (i = 0; i < *n; i++) { + free(m[i].source); + free(m[i].destination); + mount_options_free_all(m[i].mount_options); + } + + free(m); + *n = 0; + return NULL; +} + +int mount_image_add(MountImage **m, size_t *n, const MountImage *item) { + _cleanup_free_ char *s = NULL, *d = NULL; + _cleanup_(mount_options_free_allp) MountOptions *options = NULL; + MountOptions *i; + MountImage *c; + + assert(m); + assert(n); + assert(item); + + s = strdup(item->source); + if (!s) + return -ENOMEM; + + d = strdup(item->destination); + if (!d) + return -ENOMEM; + + LIST_FOREACH(mount_options, i, item->mount_options) { + _cleanup_(mount_options_free_allp) MountOptions *o; + + o = new(MountOptions, 1); + if (!o) + return -ENOMEM; + + *o = (MountOptions) { + .partition_designator = i->partition_designator, + .options = strdup(i->options), + }; + if (!o->options) + return -ENOMEM; + + LIST_APPEND(mount_options, options, TAKE_PTR(o)); + } + + c = reallocarray(*m, *n + 1, sizeof(MountImage)); + if (!c) + return -ENOMEM; + + *m = c; + + c[(*n) ++] = (MountImage) { + .source = TAKE_PTR(s), + .destination = TAKE_PTR(d), + .mount_options = TAKE_PTR(options), + .ignore_enoent = item->ignore_enoent, + }; + + return 0; +} + void temporary_filesystem_free_many(TemporaryFileSystem *t, size_t n) { size_t i; @@ -1827,25 +2112,9 @@ static int make_tmp_prefix(const char *prefix) { } -static int make_tmp_subdir(const char *parent, char **ret) { - _cleanup_free_ char *y = NULL; - - RUN_WITH_UMASK(0000) { - y = strjoin(parent, "/tmp"); - if (!y) - return -ENOMEM; - - if (mkdir(y, 0777 | S_ISVTX) < 0) - return -errno; - } - - if (ret) - *ret = TAKE_PTR(y); - return 0; -} - static int setup_one_tmp_dir(const char *id, const char *prefix, char **path, char **tmp_path) { _cleanup_free_ char *x = NULL; + _cleanup_free_ char *y = NULL; char bid[SD_ID128_STRING_MAX]; sd_id128_t boot_id; bool rw = true; @@ -1879,9 +2148,21 @@ static int setup_one_tmp_dir(const char *id, const char *prefix, char **path, ch } if (rw) { - r = make_tmp_subdir(x, tmp_path); + y = strjoin(x, "/tmp"); + if (!y) + return -ENOMEM; + + RUN_WITH_UMASK(0000) { + if (mkdir(y, 0777 | S_ISVTX) < 0) + return -errno; + } + + r = label_fix_container(y, prefix, 0); if (r < 0) return r; + + if (tmp_path) + *tmp_path = TAKE_PTR(y); } else { /* Trouble: we failed to create the directory. Instead of failing, let's simulate /tmp being * read-only. This way the service will get the EROFS result as if it was writing to the real @@ -1890,9 +2171,9 @@ static int setup_one_tmp_dir(const char *id, const char *prefix, char **path, ch if (r < 0) return r; - x = strdup(RUN_SYSTEMD_EMPTY); - if (!x) - return -ENOMEM; + r = free_and_strdup(&x, RUN_SYSTEMD_EMPTY); + if (r < 0) + return r; } *path = TAKE_PTR(x); @@ -2080,3 +2361,19 @@ static const char* const namespace_type_table[] = { }; DEFINE_STRING_TABLE_LOOKUP(namespace_type, NamespaceType); + +static const char* const protect_proc_table[_PROTECT_PROC_MAX] = { + [PROTECT_PROC_DEFAULT] = "default", + [PROTECT_PROC_NOACCESS] = "noaccess", + [PROTECT_PROC_INVISIBLE] = "invisible", + [PROTECT_PROC_PTRACEABLE] = "ptraceable", +}; + +DEFINE_STRING_TABLE_LOOKUP(protect_proc, ProtectProc); + +static const char* const proc_subset_table[_PROC_SUBSET_MAX] = { + [PROC_SUBSET_ALL] = "all", + [PROC_SUBSET_PID] = "pid", +}; + +DEFINE_STRING_TABLE_LOOKUP(proc_subset, ProcSubset);