#include "blkid-util.h"
#include "blockdev-util.h"
#include "btrfs-util.h"
-#include "chase-symlinks.h"
+#include "chase.h"
#include "conf-files.h"
#include "constants.h"
#include "copy.h"
#include "dm-util.h"
#include "env-file.h"
#include "env-util.h"
-#include "extension-release.h"
+#include "extension-util.h"
#include "fd-util.h"
#include "fileio.h"
#include "fs-util.h"
#include "id128-util.h"
#include "import-util.h"
#include "io-util.h"
+#include "missing_mount.h"
#include "mkdir-label.h"
#include "mount-util.h"
#include "mountpoint-util.h"
if (!b)
return -ENOMEM;
+ /* The Linux kernel maintains separate block device caches for main ("whole") and partition block
+ * devices, which means making a change to one might not be reflected immediately when reading via
+ * the other. That's massively confusing when mixing accesses to such devices. Let's address this in
+ * a limited way: when probing a file system that is not at the beginning of the block device we
+ * apparently probe a partition via the main block device, and in that case let's first flush the
+ * main block device cache, so that we get the data that the per-partition block device last
+ * sync'ed on.
+ *
+ * This only works under the assumption that any tools that write to the partition block devices
+ * issue an syncfs()/fsync() on the device after making changes. Typically file system formatting
+ * tools that write a superblock onto a partition block device do that, however. */
+ if (offset != 0)
+ if (ioctl(fd, BLKFLSBUF, 0) < 0)
+ log_debug_errno(errno, "Failed to flush block device cache, ignoring: %m");
+
errno = 0;
r = blkid_probe_set_device(
b,
}
#if HAVE_BLKID
-static int dissected_image_probe_filesystems(DissectedImage *m, int fd) {
+static int image_policy_may_use(
+ const ImagePolicy *policy,
+ PartitionDesignator designator) {
+
+ PartitionPolicyFlags f;
+
+ /* For each partition we find in the partition table do a first check if it may exist at all given
+ * the policy, or if it shall be ignored. */
+
+ f = image_policy_get_exhaustively(policy, designator);
+ if (f < 0)
+ return f;
+
+ if ((f & _PARTITION_POLICY_USE_MASK) == PARTITION_POLICY_ABSENT)
+ /* only flag set in policy is "absent"? then this partition may not exist at all */
+ return log_debug_errno(
+ SYNTHETIC_ERRNO(ERFKILL),
+ "Partition of designator '%s' exists, but not allowed by policy, refusing.",
+ partition_designator_to_string(designator));
+ if ((f & _PARTITION_POLICY_USE_MASK & ~PARTITION_POLICY_ABSENT) == PARTITION_POLICY_UNUSED) {
+ /* only "unused" or "unused" + "absent" are set? then don't use it */
+ log_debug("Partition of designator '%s' exists, and policy dictates to ignore it, doing so.",
+ partition_designator_to_string(designator));
+ return false; /* ignore! */
+ }
+
+ return true; /* use! */
+}
+
+static int image_policy_check_protection(
+ const ImagePolicy *policy,
+ PartitionDesignator designator,
+ PartitionPolicyFlags found_flags) {
+
+ PartitionPolicyFlags policy_flags;
+
+ /* Checks if the flags in the policy for the designated partition overlap the flags of what we found */
+
+ if (found_flags < 0)
+ return found_flags;
+
+ policy_flags = image_policy_get_exhaustively(policy, designator);
+ if (policy_flags < 0)
+ return policy_flags;
+
+ if ((found_flags & policy_flags) == 0) {
+ _cleanup_free_ char *found_flags_string = NULL, *policy_flags_string = NULL;
+
+ (void) partition_policy_flags_to_string(found_flags, /* simplify= */ true, &found_flags_string);
+ (void) partition_policy_flags_to_string(policy_flags, /* simplify= */ true, &policy_flags_string);
+
+ return log_debug_errno(SYNTHETIC_ERRNO(ERFKILL), "Partition %s discovered with policy '%s' but '%s' was required, refusing.",
+ partition_designator_to_string(designator),
+ strnull(found_flags_string), strnull(policy_flags_string));
+ }
+
+ return 0;
+}
+
+static int image_policy_check_partition_flags(
+ const ImagePolicy *policy,
+ PartitionDesignator designator,
+ uint64_t gpt_flags) {
+
+ PartitionPolicyFlags policy_flags;
+ bool b;
+
+ /* Checks if the partition flags in the policy match reality */
+
+ policy_flags = image_policy_get_exhaustively(policy, designator);
+ if (policy_flags < 0)
+ return policy_flags;
+
+ b = FLAGS_SET(gpt_flags, SD_GPT_FLAG_READ_ONLY);
+ if ((policy_flags & _PARTITION_POLICY_READ_ONLY_MASK) == (b ? PARTITION_POLICY_READ_ONLY_OFF : PARTITION_POLICY_READ_ONLY_ON))
+ return log_debug_errno(SYNTHETIC_ERRNO(ERFKILL), "Partition %s has 'read-only' flag incorrectly set (must be %s, is %s), refusing.",
+ partition_designator_to_string(designator),
+ one_zero(!b), one_zero(b));
+
+ b = FLAGS_SET(gpt_flags, SD_GPT_FLAG_GROWFS);
+ if ((policy_flags & _PARTITION_POLICY_GROWFS_MASK) == (b ? PARTITION_POLICY_GROWFS_OFF : PARTITION_POLICY_GROWFS_ON))
+ return log_debug_errno(SYNTHETIC_ERRNO(ERFKILL), "Partition %s has 'growfs' flag incorrectly set (must be %s, is %s), refusing.",
+ partition_designator_to_string(designator),
+ one_zero(!b), one_zero(b));
+
+ return 0;
+}
+
+static int dissected_image_probe_filesystems(
+ DissectedImage *m,
+ int fd,
+ const ImagePolicy *policy) {
+
int r;
assert(m);
for (PartitionDesignator i = 0; i < _PARTITION_DESIGNATOR_MAX; i++) {
DissectedPartition *p = m->partitions + i;
+ PartitionPolicyFlags found_flags;
if (!p->found)
continue;
return r;
}
- if (streq_ptr(p->fstype, "crypto_LUKS"))
+ if (streq_ptr(p->fstype, "crypto_LUKS")) {
m->encrypted = true;
+ found_flags = PARTITION_POLICY_ENCRYPTED; /* found this one, and its definitely encrypted */
+ } else
+ /* found it, but it's definitely not encrypted, hence mask the encrypted flag, but
+ * set all other ways that indicate "present". */
+ found_flags = PARTITION_POLICY_UNPROTECTED|PARTITION_POLICY_VERITY|PARTITION_POLICY_SIGNED;
if (p->fstype && fstype_is_ro(p->fstype))
p->rw = false;
if (!p->rw)
p->growfs = false;
+
+ /* We might have learnt more about the file system now (i.e. whether it is encrypted or not),
+ * hence we need to validate this against policy again, to see if the policy still matches
+ * with this new information. Note that image_policy_check_protection() will check for
+ * overlap between what's allowed in the policy and what we pass as 'found_policy' here. In
+ * the unencrypted case we thus might pass an overly unspecific mask here (i.e. unprotected
+ * OR verity OR signed), but that's fine since the earlier policy check already checked more
+ * specific which of those three cases where OK. Keep in mind that this function here only
+ * looks at specific partitions (and thus can only deduce encryption or not) but not the
+ * overall partition table (and thus cannot deduce verity or not). The earlier dissection
+ * checks already did the relevant checks that look at the whole partition table, and
+ * enforced policy there as needed. */
+ r = image_policy_check_protection(policy, i, found_flags);
+ if (r < 0)
+ return r;
}
return 0;
log_debug("Unexpected partition flag %llu set on %s!", bit, node);
}
}
-#endif
-#if HAVE_BLKID
static int dissected_image_new(const char *path, DissectedImage **ret) {
_cleanup_(dissected_image_unrefp) DissectedImage *m = NULL;
_cleanup_free_ char *name = NULL;
const char *devname,
const VeritySettings *verity,
const MountOptions *mount_options,
+ const ImagePolicy *policy,
DissectImageFlags flags) {
sd_id128_t root_uuid = SD_ID128_NULL, root_verity_uuid = SD_ID128_NULL;
* Returns -ENOPKG if no suitable partition table or file system could be found.
* Returns -EADDRNOTAVAIL if a root hash was specified but no matching root/verity partitions found.
* Returns -ENXIO if we couldn't find any partition suitable as root or /usr partition
- * Returns -ENOTUNIQ if we only found multiple generic partitions and thus don't know what to do with that */
+ * Returns -ENOTUNIQ if we only found multiple generic partitions and thus don't know what to do with that
+ * Returns -ERFKILL if image doesn't match image policy
+ * Returns -EBADR if verity data was provided externally for an image that has a GPT partition table (i.e. is not just a naked fs)
+ * Returns -EPROTONOSUPPORT if DISSECT_IMAGE_ADD_PARTITION_DEVICES is set but the block device does not have partition logic enabled
+ * Returns -ENOMSG if we didn't find a single usable partition (and DISSECT_IMAGE_REFUSE_EMPTY is set) */
uint64_t diskseq = m->loop ? m->loop->diskseq : 0;
sd_id128_t fsuuid, vuuid;
/* If a root hash is supplied, then we use the root partition that has a UUID that match the
- * first 128bit of the root hash. And we use the verity partition that has a UUID that match
- * the final 128bit. */
+ * first 128-bit of the root hash. And we use the verity partition that has a UUID that match
+ * the final 128-bit. */
if (verity->root_hash_size < sizeof(sd_id128_t))
return -EINVAL;
const char *fstype = NULL, *options = NULL, *suuid = NULL;
_cleanup_close_ int mount_node_fd = -EBADF;
sd_id128_t uuid = SD_ID128_NULL;
+ PartitionPolicyFlags found_flags;
+ bool encrypted;
+
+ /* OK, we have found a file system, that's our root partition then. */
+
+ r = image_policy_may_use(policy, PARTITION_ROOT);
+ if (r < 0)
+ return r;
+ if (r == 0) /* policy says ignore this, so we ignore it */
+ return -ENOPKG;
+
+ (void) blkid_probe_lookup_value(b, "TYPE", &fstype, NULL);
+ (void) blkid_probe_lookup_value(b, "UUID", &suuid, NULL);
+
+ encrypted = streq_ptr(fstype, "crypto_LUKS");
+
+ if (verity_settings_data_covers(verity, PARTITION_ROOT))
+ found_flags = verity->root_hash_sig ? PARTITION_POLICY_SIGNED : PARTITION_POLICY_VERITY;
+ else
+ found_flags = encrypted ? PARTITION_POLICY_ENCRYPTED : PARTITION_POLICY_UNPROTECTED;
+
+ r = image_policy_check_protection(policy, PARTITION_ROOT, found_flags);
+ if (r < 0)
+ return r;
+
+ r = image_policy_check_partition_flags(policy, PARTITION_ROOT, 0); /* we have no gpt partition flags, hence check against all bits off */
+ if (r < 0)
+ return r;
if (FLAGS_SET(flags, DISSECT_IMAGE_PIN_PARTITION_DEVICES)) {
mount_node_fd = open_partition(devname, /* is_partition = */ false, m->loop);
return mount_node_fd;
}
- /* OK, we have found a file system, that's our root partition then. */
- (void) blkid_probe_lookup_value(b, "TYPE", &fstype, NULL);
- (void) blkid_probe_lookup_value(b, "UUID", &suuid, NULL);
-
if (fstype) {
t = strdup(fstype);
if (!t)
return r;
m->single_file_system = true;
- m->encrypted = streq_ptr(fstype, "crypto_LUKS");
+ m->encrypted = encrypted;
m->has_verity = verity && verity->data_path;
m->verity_ready = verity_settings_data_covers(verity, PARTITION_ROOT);
if (pflags & SD_GPT_FLAG_NO_AUTO)
continue;
- fstype = "swap";
+ /* Note: we don't set fstype = "swap" here, because we still need to probe if
+ * it might be encrypted (i.e. fstype "crypt_LUKS") or unencrypted
+ * (i.e. fstype "swap"), and the only way to figure that out is via fstype
+ * probing. */
/* We don't have a designator for SD_GPT_LINUX_GENERIC so check the UUID instead. */
} else if (sd_id128_equal(type.uuid, SD_GPT_LINUX_GENERIC)) {
_cleanup_close_ int mount_node_fd = -EBADF;
const char *options = NULL;
+ r = image_policy_may_use(policy, type.designator);
+ if (r < 0)
+ return r;
+ if (r == 0) {
+ /* Policy says: ignore; Remember this fact, so that we later can distinguish between "found but ignored" and "not found at all" */
+
+ if (!m->partitions[type.designator].found)
+ m->partitions[type.designator].ignored = true;
+
+ continue;
+ }
+
if (m->partitions[type.designator].found) {
+ int c;
+
/* For most partition types the first one we see wins. Except for the
* rootfs and /usr, where we do a version compare of the label, and
* let the newest version win. This permits a simple A/B versioning
* scheme in OS images. */
- if (compare_arch(type.arch, m->partitions[type.designator].architecture) <= 0)
+ c = compare_arch(type.arch, m->partitions[type.designator].architecture);
+ if (c < 0) /* the arch we already found is better than the one we found now */
continue;
-
- if (!partition_designator_is_versioned(type.designator) ||
- strverscmp_improved(m->partitions[type.designator].label, label) >= 0)
+ if (c == 0 && /* same arch? then go by version in label */
+ (!partition_designator_is_versioned(type.designator) ||
+ strverscmp_improved(label, m->partitions[type.designator].label) <= 0))
continue;
dissected_partition_done(m->partitions + type.designator);
sd_id128_t id = SD_ID128_NULL;
const char *options = NULL;
+ r = image_policy_may_use(policy, PARTITION_XBOOTLDR);
+ if (r < 0)
+ return r;
+ if (r == 0) { /* policy says: ignore */
+ if (!m->partitions[PARTITION_XBOOTLDR].found)
+ m->partitions[PARTITION_XBOOTLDR].ignored = true;
+
+ continue;
+ }
+
/* First one wins */
if (m->partitions[PARTITION_XBOOTLDR].found)
continue;
/* If we didn't find a generic node, then we can't fix this up either */
if (generic_node) {
- _cleanup_close_ int mount_node_fd = -EBADF;
- _cleanup_free_ char *o = NULL, *n = NULL;
- const char *options;
-
- if (FLAGS_SET(flags, DISSECT_IMAGE_PIN_PARTITION_DEVICES)) {
- mount_node_fd = open_partition(generic_node, /* is_partition = */ true, m->loop);
- if (mount_node_fd < 0)
- return mount_node_fd;
- }
-
- r = make_partition_devname(devname, diskseq, generic_nr, flags, &n);
+ r = image_policy_may_use(policy, PARTITION_ROOT);
if (r < 0)
return r;
+ if (r == 0)
+ /* Policy says: ignore; remember that we did */
+ m->partitions[PARTITION_ROOT].ignored = true;
+ else {
+ _cleanup_close_ int mount_node_fd = -EBADF;
+ _cleanup_free_ char *o = NULL, *n = NULL;
+ const char *options;
- options = mount_options_from_designator(mount_options, PARTITION_ROOT);
- if (options) {
- o = strdup(options);
- if (!o)
- return -ENOMEM;
- }
+ if (FLAGS_SET(flags, DISSECT_IMAGE_PIN_PARTITION_DEVICES)) {
+ mount_node_fd = open_partition(generic_node, /* is_partition = */ true, m->loop);
+ if (mount_node_fd < 0)
+ return mount_node_fd;
+ }
- assert(generic_nr >= 0);
- m->partitions[PARTITION_ROOT] = (DissectedPartition) {
- .found = true,
- .rw = generic_rw,
- .growfs = generic_growfs,
- .partno = generic_nr,
- .architecture = _ARCHITECTURE_INVALID,
- .node = TAKE_PTR(n),
- .uuid = generic_uuid,
- .mount_options = TAKE_PTR(o),
- .mount_node_fd = TAKE_FD(mount_node_fd),
- .offset = UINT64_MAX,
- .size = UINT64_MAX,
- };
+ r = make_partition_devname(devname, diskseq, generic_nr, flags, &n);
+ if (r < 0)
+ return r;
+
+ options = mount_options_from_designator(mount_options, PARTITION_ROOT);
+ if (options) {
+ o = strdup(options);
+ if (!o)
+ return -ENOMEM;
+ }
+
+ assert(generic_nr >= 0);
+ m->partitions[PARTITION_ROOT] = (DissectedPartition) {
+ .found = true,
+ .rw = generic_rw,
+ .growfs = generic_growfs,
+ .partno = generic_nr,
+ .architecture = _ARCHITECTURE_INVALID,
+ .node = TAKE_PTR(n),
+ .uuid = generic_uuid,
+ .mount_options = TAKE_PTR(o),
+ .mount_node_fd = TAKE_FD(mount_node_fd),
+ .offset = UINT64_MAX,
+ .size = UINT64_MAX,
+ };
+ }
}
}
}
}
- r = dissected_image_probe_filesystems(m, fd);
+ bool any = false;
+
+ /* After we discovered all partitions let's see if the verity requirements match the policy. (Note:
+ * we don't check encryption requirements here, because we haven't probed the file system yet, hence
+ * don't know if this is encrypted or not) */
+ for (PartitionDesignator di = 0; di < _PARTITION_DESIGNATOR_MAX; di++) {
+ PartitionDesignator vi, si;
+ PartitionPolicyFlags found_flags;
+
+ any = any || m->partitions[di].found;
+
+ vi = partition_verity_of(di);
+ si = partition_verity_sig_of(di);
+
+ /* Determine the verity protection level for this partition. */
+ found_flags = m->partitions[di].found ?
+ (vi >= 0 && m->partitions[vi].found ?
+ (si >= 0 && m->partitions[si].found ? PARTITION_POLICY_SIGNED : PARTITION_POLICY_VERITY) :
+ PARTITION_POLICY_ENCRYPTED|PARTITION_POLICY_UNPROTECTED) :
+ (m->partitions[di].ignored ? PARTITION_POLICY_UNUSED : PARTITION_POLICY_ABSENT);
+
+ r = image_policy_check_protection(policy, di, found_flags);
+ if (r < 0)
+ return r;
+
+ if (m->partitions[di].found) {
+ r = image_policy_check_partition_flags(policy, di, m->partitions[di].gpt_flags);
+ if (r < 0)
+ return r;
+ }
+ }
+
+ if (!any && !FLAGS_SET(flags, DISSECT_IMAGE_ALLOW_EMPTY))
+ return -ENOMSG;
+
+ r = dissected_image_probe_filesystems(m, fd, policy);
if (r < 0)
return r;
const char *path,
const VeritySettings *verity,
const MountOptions *mount_options,
+ const ImagePolicy *image_policy,
DissectImageFlags flags,
DissectedImage **ret) {
int r;
assert(path);
- assert(ret);
fd = open(path, O_RDONLY|O_CLOEXEC|O_NONBLOCK|O_NOCTTY);
if (fd < 0)
if (r < 0)
return r;
- r = dissect_image(m, fd, path, verity, mount_options, flags);
+ r = dissect_image(m, fd, path, verity, mount_options, image_policy, flags);
if (r < 0)
return r;
- *ret = TAKE_PTR(m);
+ if (ret)
+ *ret = TAKE_PTR(m);
return 0;
#else
return -EOPNOTSUPP;
#endif
}
+int dissect_log_error(int log_level, int r, const char *name, const VeritySettings *verity) {
+ assert(log_level >= 0 && log_level <= LOG_DEBUG);
+ assert(name);
+
+ switch (r) {
+
+ case 0 ... INT_MAX: /* success! */
+ return r;
+
+ case -EOPNOTSUPP:
+ return log_full_errno(log_level, r, "Dissecting images is not supported, compiled without blkid support.");
+
+ case -ENOPKG:
+ return log_full_errno(log_level, r, "%s: Couldn't identify a suitable partition table or file system.", name);
+
+ case -ENOMEDIUM:
+ return log_full_errno(log_level, r, "%s: The image does not pass os-release/extension-release validation.", name);
+
+ case -EADDRNOTAVAIL:
+ return log_full_errno(log_level, r, "%s: No root partition for specified root hash found.", name);
+
+ case -ENOTUNIQ:
+ return log_full_errno(log_level, r, "%s: Multiple suitable root partitions found in image.", name);
+
+ case -ENXIO:
+ return log_full_errno(log_level, r, "%s: No suitable root partition found in image.", name);
+
+ case -EPROTONOSUPPORT:
+ return log_full_errno(log_level, r, "Device '%s' is a loopback block device with partition scanning turned off, please turn it on.", name);
+
+ case -ENOTBLK:
+ return log_full_errno(log_level, r, "%s: Image is not a block device.", name);
+
+ case -EBADR:
+ return log_full_errno(log_level, r,
+ "Combining partitioned images (such as '%s') with external Verity data (such as '%s') not supported. "
+ "(Consider setting $SYSTEMD_DISSECT_VERITY_SIDECAR=0 to disable automatic discovery of external Verity data.)",
+ name, strna(verity ? verity->data_path : NULL));
+
+ case -ERFKILL:
+ return log_full_errno(log_level, r, "%s: image does not match image policy.", name);
+
+ case -ENOMSG:
+ return log_full_errno(log_level, r, "%s: no suitable partitions found.", name);
+
+ default:
+ return log_full_errno(log_level, r, "%s: cannot dissect image: %m", name);
+ }
+}
+
+int dissect_image_file_and_warn(
+ const char *path,
+ const VeritySettings *verity,
+ const MountOptions *mount_options,
+ const ImagePolicy *image_policy,
+ DissectImageFlags flags,
+ DissectedImage **ret) {
+
+ return dissect_log_error(
+ LOG_ERR,
+ dissect_image_file(path, verity, mount_options, image_policy, flags, ret),
+ path,
+ verity);
+}
+
DissectedImage* dissected_image_unref(DissectedImage *m) {
if (!m)
return NULL;
r = safe_fork_full(
"(fsck)",
+ NULL,
&node_fd, 1, /* Leave the node fd open */
- FORK_RESET_SIGNALS|FORK_CLOSE_ALL_FDS|FORK_RLIMIT_NOFILE_SAFE|FORK_DEATHSIG|FORK_NULL_STDIO|FORK_CLOEXEC_OFF,
+ FORK_RESET_SIGNALS|FORK_CLOSE_ALL_FDS|FORK_RLIMIT_NOFILE_SAFE|FORK_DEATHSIG|FORK_REARRANGE_STDIO|FORK_CLOEXEC_OFF,
&pid);
if (r < 0)
return log_debug_errno(r, "Failed to fork off fsck: %m");
if (r == 0) {
/* Child */
- execl("/sbin/fsck", "/sbin/fsck", "-aT", FORMAT_PROC_FD_PATH(node_fd), NULL);
+ execlp("fsck", "fsck", "-aT", FORMAT_PROC_FD_PATH(node_fd), NULL);
log_open();
log_debug_errno(errno, "Failed to execl() fsck: %m");
_exit(FSCK_OPERATIONAL_ERROR);
exit_status = wait_for_terminate_and_check("fsck", pid, 0);
if (exit_status < 0)
- return log_debug_errno(exit_status, "Failed to fork off /sbin/fsck: %m");
+ return log_debug_errno(exit_status, "Failed to fork off fsck: %m");
if ((exit_status & ~FSCK_ERROR_CORRECTED) != FSCK_SUCCESS) {
log_debug("fsck failed with exit status %i.", exit_status);
return 0;
}
+int partition_pick_mount_options(
+ PartitionDesignator d,
+ const char *fstype,
+ bool rw,
+ bool discard,
+ char **ret_options,
+ unsigned long *ret_ms_flags) {
+
+ _cleanup_free_ char *options = NULL;
+
+ assert(ret_options);
+
+ /* Selects a baseline of bind mount flags, that should always apply.
+ *
+ * Firstly, we set MS_NODEV universally on all mounts, since we don't want to allow device nodes outside of /dev/.
+ *
+ * On /var/tmp/ we'll also set MS_NOSUID, same as we set for /tmp/ on the host.
+ *
+ * On the ESP and XBOOTLDR partitions we'll also disable symlinks, and execution. These file systems
+ * are generally untrusted (i.e. not encrypted or authenticated), and typically VFAT hence we should
+ * be as restrictive as possible, and this shouldn't hurt, since the functionality is not available
+ * there anyway. */
+
+ unsigned long flags = MS_NODEV;
+
+ if (!rw)
+ flags |= MS_RDONLY;
+
+ switch (d) {
+
+ case PARTITION_ESP:
+ case PARTITION_XBOOTLDR:
+ flags |= MS_NOSUID|MS_NOEXEC|ms_nosymfollow_supported();
+
+ /* The ESP might contain a pre-boot random seed. Let's make this unaccessible to regular
+ * userspace. ESP/XBOOTLDR is almost certainly VFAT, hence if we don't know assume it is. */
+ if (!fstype || fstype_can_umask(fstype))
+ if (!strextend_with_separator(&options, ",", "umask=0077"))
+ return -ENOMEM;
+ break;
+
+ case PARTITION_TMP:
+ flags |= MS_NOSUID;
+ break;
+
+ default:
+ break;
+ }
+
+ /* So, when you request MS_RDONLY from ext4, then this means nothing. It happily still writes to the
+ * backing storage. What's worse, the BLKRO[GS]ET flag and (in case of loopback devices)
+ * LO_FLAGS_READ_ONLY don't mean anything, they affect userspace accesses only, and write accesses
+ * from the upper file system still get propagated through to the underlying file system,
+ * unrestricted. To actually get ext4/xfs/btrfs to stop writing to the device we need to specify
+ * "norecovery" as mount option, in addition to MS_RDONLY. Yes, this sucks, since it means we need to
+ * carry a per file system table here.
+ *
+ * Note that this means that we might not be able to mount corrupted file systems as read-only
+ * anymore (since in some cases the kernel implementations will refuse mounting when corrupted,
+ * read-only and "norecovery" is specified). But I think for the case of automatically determined
+ * mount options for loopback devices this is the right choice, since otherwise using the same
+ * loopback file twice even in read-only mode, is going to fail badly sooner or later. The usecase of
+ * making reuse of the immutable images "just work" is more relevant to us than having read-only
+ * access that actually modifies stuff work on such image files. Or to say this differently: if
+ * people want their file systems to be fixed up they should just open them in writable mode, where
+ * all these problems don't exist. */
+ if (!rw && fstype && fstype_can_norecovery(fstype))
+ if (!strextend_with_separator(&options, ",", "norecovery"))
+ return -ENOMEM;
+
+ if (discard && fstype && fstype_can_discard(fstype))
+ if (!strextend_with_separator(&options, ",", "discard"))
+ return -ENOMEM;
+
+ if (!ret_ms_flags) /* Fold flags into option string if ret_flags specified as NULL */
+ if (!strextend_with_separator(&options, ",",
+ FLAGS_SET(flags, MS_RDONLY) ? "ro" : "rw",
+ FLAGS_SET(flags, MS_NODEV) ? "nodev" : "dev",
+ FLAGS_SET(flags, MS_NOSUID) ? "nosuid" : "suid",
+ FLAGS_SET(flags, MS_NOEXEC) ? "noexec" : "exec",
+ FLAGS_SET(flags, MS_NOSYMFOLLOW) ? "nosymfollow" : NULL))
+ /* NB: we suppress 'symfollow' here, since it's the default, and old /bin/mount might not know it */
+ return -ENOMEM;
+
+ if (ret_ms_flags)
+ *ret_ms_flags = flags;
+
+ *ret_options = TAKE_PTR(options);
+ return 0;
+}
+
static int mount_partition(
+ PartitionDesignator d,
DissectedPartition *m,
const char *where,
const char *directory,
DissectImageFlags flags) {
_cleanup_free_ char *chased = NULL, *options = NULL;
+ bool rw, discard, remap_uid_gid = false;
const char *p, *node, *fstype;
- bool rw, remap_uid_gid = false;
+ unsigned long ms_flags;
int r;
assert(m);
/* Use decrypted node and matching fstype if available, otherwise use the original device */
node = FORMAT_PROC_FD_PATH(m->mount_node_fd);
- fstype = m->decrypted_node ? m->decrypted_fstype: m->fstype;
+ fstype = dissected_partition_fstype(m);
if (!fstype)
return -EAFNOSUPPORT;
- r = dissect_fstype_ok(fstype);
- if (r < 0)
- return r;
- if (!r)
- return -EIDRM; /* Recognizable error */
/* We are looking at an encrypted partition? This either means stacked encryption, or the caller
* didn't call dissected_image_decrypt() beforehand. Let's return a recognizable error for this
if (streq(fstype, "crypto_LUKS"))
return -EUNATCH;
+ r = dissect_fstype_ok(fstype);
+ if (r < 0)
+ return r;
+ if (!r)
+ return -EIDRM; /* Recognizable error */
+
rw = m->rw && !(flags & DISSECT_IMAGE_MOUNT_READ_ONLY);
+ discard = ((flags & DISSECT_IMAGE_DISCARD) ||
+ ((flags & DISSECT_IMAGE_DISCARD_ON_LOOP) && is_loop_device(m->node) > 0));
+
if (FLAGS_SET(flags, DISSECT_IMAGE_FSCK) && rw) {
r = run_fsck(m->mount_node_fd, fstype);
if (r < 0)
if (r < 0 && r != -EROFS)
return r;
- r = chase_symlinks(directory, where, CHASE_PREFIX_ROOT, &chased, NULL);
+ r = chase(directory, where, CHASE_PREFIX_ROOT, &chased, NULL);
if (r < 0)
return r;
p = where;
}
- /* If requested, turn on discard support. */
- if (fstype_can_discard(fstype) &&
- ((flags & DISSECT_IMAGE_DISCARD) ||
- ((flags & DISSECT_IMAGE_DISCARD_ON_LOOP) && is_loop_device(m->node) > 0))) {
- options = strdup("discard");
- if (!options)
- return -ENOMEM;
- }
+ r = partition_pick_mount_options(d, dissected_partition_fstype(m), rw, discard, &options, &ms_flags);
+ if (r < 0)
+ return r;
if (uid_is_valid(uid_shift) && uid_shift != 0) {
if (!strextend_with_separator(&options, ",", m->mount_options))
return -ENOMEM;
- /* So, when you request MS_RDONLY from ext4, then this means nothing. It happily still writes to the
- * backing storage. What's worse, the BLKRO[GS]ET flag and (in case of loopback devices)
- * LO_FLAGS_READ_ONLY don't mean anything, they affect userspace accesses only, and write accesses
- * from the upper file system still get propagated through to the underlying file system,
- * unrestricted. To actually get ext4/xfs/btrfs to stop writing to the device we need to specify
- * "norecovery" as mount option, in addition to MS_RDONLY. Yes, this sucks, since it means we need to
- * carry a per file system table here.
- *
- * Note that this means that we might not be able to mount corrupted file systems as read-only
- * anymore (since in some cases the kernel implementations will refuse mounting when corrupted,
- * read-only and "norecovery" is specified). But I think for the case of automatically determined
- * mount options for loopback devices this is the right choice, since otherwise using the same
- * loopback file twice even in read-only mode, is going to fail badly sooner or later. The usecase of
- * making reuse of the immutable images "just work" is more relevant to us than having read-only
- * access that actually modifies stuff work on such image files. Or to say this differently: if
- * people want their file systems to be fixed up they should just open them in writable mode, where
- * all these problems don't exist. */
- if (!rw && STRPTR_IN_SET(fstype, "ext3", "ext4", "xfs", "btrfs"))
- if (!strextend_with_separator(&options, ",", "norecovery"))
- return -ENOMEM;
-
- r = mount_nofollow_verbose(LOG_DEBUG, node, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), options);
+ r = mount_nofollow_verbose(LOG_DEBUG, node, p, fstype, ms_flags, options);
if (r < 0)
return r;
return 1;
}
+static int mount_point_is_available(const char *where, const char *path, bool missing_ok) {
+ _cleanup_free_ char *p = NULL;
+ int r;
+
+ /* Check whether <path> is suitable as a mountpoint, i.e. is an empty directory
+ * or does not exist at all (when missing_ok). */
+
+ r = chase(path, where, CHASE_PREFIX_ROOT, &p, NULL);
+ if (r == -ENOENT)
+ return missing_ok;
+ if (r < 0)
+ return log_debug_errno(r, "Failed to chase \"%s\": %m", path);
+
+ r = dir_is_empty(p, /* ignore_hidden_or_backup= */ false);
+ if (r == -ENOTDIR)
+ return false;
+ if (r < 0)
+ return log_debug_errno(r, "Failed to check directory \"%s\": %m", p);
+ return true;
+}
+
int dissected_image_mount(
DissectedImage *m,
const char *where,
uid_t uid_range,
DissectImageFlags flags) {
- int r, xbootldr_mounted;
+ int r;
assert(m);
assert(where);
/* First mount the root fs. If there's none we use a tmpfs. */
if (m->partitions[PARTITION_ROOT].found)
- r = mount_partition(m->partitions + PARTITION_ROOT, where, NULL, uid_shift, uid_range, flags);
+ r = mount_partition(PARTITION_ROOT, m->partitions + PARTITION_ROOT, where, NULL, uid_shift, uid_range, flags);
else
r = mount_root_tmpfs(where, uid_shift, flags);
if (r < 0)
return r;
/* For us mounting root always means mounting /usr as well */
- r = mount_partition(m->partitions + PARTITION_USR, where, "/usr", uid_shift, uid_range, flags);
+ r = mount_partition(PARTITION_USR, m->partitions + PARTITION_USR, where, "/usr", uid_shift, uid_range, flags);
if (r < 0)
return r;
ok = true;
}
if (!ok && FLAGS_SET(flags, DISSECT_IMAGE_VALIDATE_OS_EXT)) {
- r = path_is_extension_tree(where, m->image_name, FLAGS_SET(flags, DISSECT_IMAGE_RELAX_SYSEXT_CHECK));
+ r = extension_has_forbidden_content(where);
if (r < 0)
return r;
- if (r > 0)
- ok = true;
+ if (r == 0) {
+ r = path_is_extension_tree(IMAGE_SYSEXT, where, m->image_name, FLAGS_SET(flags, DISSECT_IMAGE_RELAX_SYSEXT_CHECK));
+ if (r < 0)
+ return r;
+ if (r > 0)
+ ok = true;
+ }
}
if (!ok)
if (flags & DISSECT_IMAGE_MOUNT_ROOT_ONLY)
return 0;
- r = mount_partition(m->partitions + PARTITION_HOME, where, "/home", uid_shift, uid_range, flags);
+ r = mount_partition(PARTITION_HOME, m->partitions + PARTITION_HOME, where, "/home", uid_shift, uid_range, flags);
if (r < 0)
return r;
- r = mount_partition(m->partitions + PARTITION_SRV, where, "/srv", uid_shift, uid_range, flags);
+ r = mount_partition(PARTITION_SRV, m->partitions + PARTITION_SRV, where, "/srv", uid_shift, uid_range, flags);
if (r < 0)
return r;
- r = mount_partition(m->partitions + PARTITION_VAR, where, "/var", uid_shift, uid_range, flags);
+ r = mount_partition(PARTITION_VAR, m->partitions + PARTITION_VAR, where, "/var", uid_shift, uid_range, flags);
if (r < 0)
return r;
- r = mount_partition(m->partitions + PARTITION_TMP, where, "/var/tmp", uid_shift, uid_range, flags);
+ r = mount_partition(PARTITION_TMP, m->partitions + PARTITION_TMP, where, "/var/tmp", uid_shift, uid_range, flags);
if (r < 0)
return r;
- xbootldr_mounted = mount_partition(m->partitions + PARTITION_XBOOTLDR, where, "/boot", uid_shift, uid_range, flags);
- if (xbootldr_mounted < 0)
- return xbootldr_mounted;
+ int slash_boot_is_available;
+ r = slash_boot_is_available = mount_point_is_available(where, "/boot", /* missing_ok = */ true);
+ if (r < 0)
+ return r;
+ if (r > 0) {
+ r = mount_partition(PARTITION_XBOOTLDR, m->partitions + PARTITION_XBOOTLDR, where, "/boot", uid_shift, uid_range, flags);
+ if (r < 0)
+ return r;
+ slash_boot_is_available = !r;
+ }
if (m->partitions[PARTITION_ESP].found) {
- int esp_done = false;
+ const char *esp_path = NULL;
- /* Mount the ESP to /efi if it exists. If it doesn't exist, use /boot instead, but only if it
- * exists and is empty, and we didn't already mount the XBOOTLDR partition into it. */
+ /* Mount the ESP to /boot/ if it exists and is empty and we didn't already mount the XBOOTLDR
+ * partition into it. Otherwise, use /efi instead, but only if it exists and is empty. */
- r = chase_symlinks("/efi", where, CHASE_PREFIX_ROOT, NULL, NULL);
- if (r < 0) {
- if (r != -ENOENT)
+ if (slash_boot_is_available) {
+ r = mount_point_is_available(where, "/boot", /* missing_ok = */ false);
+ if (r < 0)
return r;
-
- /* /efi doesn't exist. Let's see if /boot is suitable then */
-
- if (!xbootldr_mounted) {
- _cleanup_free_ char *p = NULL;
-
- r = chase_symlinks("/boot", where, CHASE_PREFIX_ROOT, &p, NULL);
- if (r < 0) {
- if (r != -ENOENT)
- return r;
- } else if (dir_is_empty(p, /* ignore_hidden_or_backup= */ false) > 0) {
- /* It exists and is an empty directory. Let's mount the ESP there. */
- r = mount_partition(m->partitions + PARTITION_ESP, where, "/boot", uid_shift, uid_range, flags);
- if (r < 0)
- return r;
-
- esp_done = true;
- }
- }
+ if (r > 0)
+ esp_path = "/boot";
}
- if (!esp_done) {
- /* OK, let's mount the ESP now to /efi (possibly creating the dir if missing) */
+ if (!esp_path) {
+ r = mount_point_is_available(where, "/efi", /* missing_ok = */ true);
+ if (r < 0)
+ return r;
+ if (r > 0)
+ esp_path = "/efi";
+ }
- r = mount_partition(m->partitions + PARTITION_ESP, where, "/efi", uid_shift, uid_range, flags);
+ if (esp_path) {
+ /* OK, let's mount the ESP now (possibly creating the dir if missing) */
+ r = mount_partition(PARTITION_ESP, m->partitions + PARTITION_ESP, where, esp_path, uid_shift, uid_range, flags);
if (r < 0)
return r;
}
try_again:
/* Device is being removed by another process. Let's wait for a while. */
- (void) usleep(2 * USEC_PER_MSEC);
+ (void) usleep_safe(2 * USEC_PER_MSEC);
}
/* All trials failed or a conflicting verity device exists. Let's try to activate with a unique name. */
return -EINVAL;
if (p->size > 4*1024*1024) /* Signature data cannot possible be larger than 4M, refuse that */
- return -EFBIG;
+ return log_debug_errno(SYNTHETIC_ERRNO(EFBIG), "Verity signature partition is larger than 4M, refusing.");
buf = new(char, p->size+1);
if (!buf)
* we allow a fallback that matches on the first extension-release
* file found in the directory, if one named after the image cannot
* be found first. */
- r = open_extension_release(t, m->image_name, /* relax_extension_release_check= */ false, NULL, &fd);
+ r = open_extension_release(t, IMAGE_SYSEXT, m->image_name, /* relax_extension_release_check= */ false, NULL, &fd);
if (r < 0)
fd = r; /* Propagate the error. */
break;
"/lib/systemd/systemd", /* systemd on /usr non-merged systems */
"/sbin/init") { /* traditional path the Linux kernel invokes */
- r = chase_symlinks(init, t, CHASE_PREFIX_ROOT, NULL, NULL);
+ r = chase(init, t, CHASE_PREFIX_ROOT, NULL, NULL);
if (r < 0) {
if (r != -ENOENT)
log_debug_errno(r, "Failed to resolve %s, ignoring: %m", init);
default:
NULSTR_FOREACH(p, paths[k]) {
- fd = chase_symlinks_and_open(p, t, CHASE_PREFIX_ROOT, O_RDONLY|O_CLOEXEC|O_NOCTTY, NULL);
+ fd = chase_and_open(p, t, CHASE_PREFIX_ROOT, O_RDONLY|O_CLOEXEC|O_NOCTTY, NULL);
if (fd >= 0)
break;
}
return r;
}
+Architecture dissected_image_architecture(DissectedImage *img) {
+ assert(img);
+
+ if (img->partitions[PARTITION_ROOT].found &&
+ img->partitions[PARTITION_ROOT].architecture >= 0)
+ return img->partitions[PARTITION_ROOT].architecture;
+
+ if (img->partitions[PARTITION_USR].found &&
+ img->partitions[PARTITION_USR].architecture >= 0)
+ return img->partitions[PARTITION_USR].architecture;
+
+ return _ARCHITECTURE_INVALID;
+}
+
int dissect_loop_device(
LoopDevice *loop,
const VeritySettings *verity,
const MountOptions *mount_options,
+ const ImagePolicy *image_policy,
DissectImageFlags flags,
DissectedImage **ret) {
int r;
assert(loop);
- assert(ret);
r = dissected_image_new(loop->backing_file ?: loop->node, &m);
if (r < 0)
m->loop = loop_device_ref(loop);
m->sector_size = m->loop->sector_size;
- r = dissect_image(m, loop->fd, loop->node, verity, mount_options, flags);
+ r = dissect_image(m, loop->fd, loop->node, verity, mount_options, image_policy, flags);
if (r < 0)
return r;
- *ret = TAKE_PTR(m);
+ if (ret)
+ *ret = TAKE_PTR(m);
+
return 0;
#else
return -EOPNOTSUPP;
LoopDevice *loop,
const VeritySettings *verity,
const MountOptions *mount_options,
+ const ImagePolicy *image_policy,
DissectImageFlags flags,
DissectedImage **ret) {
- const char *name;
- int r;
-
assert(loop);
- assert(loop->fd >= 0);
-
- name = ASSERT_PTR(loop->backing_file ?: loop->node);
-
- r = dissect_loop_device(loop, verity, mount_options, flags, ret);
- switch (r) {
-
- case -EOPNOTSUPP:
- return log_error_errno(r, "Dissecting images is not supported, compiled without blkid support.");
-
- case -ENOPKG:
- return log_error_errno(r, "%s: Couldn't identify a suitable partition table or file system.", name);
-
- case -ENOMEDIUM:
- return log_error_errno(r, "%s: The image does not pass validation.", name);
-
- case -EADDRNOTAVAIL:
- return log_error_errno(r, "%s: No root partition for specified root hash found.", name);
-
- case -ENOTUNIQ:
- return log_error_errno(r, "%s: Multiple suitable root partitions found in image.", name);
-
- case -ENXIO:
- return log_error_errno(r, "%s: No suitable root partition found in image.", name);
-
- case -EPROTONOSUPPORT:
- return log_error_errno(r, "Device '%s' is loopback block device with partition scanning turned off, please turn it on.", name);
- case -ENOTBLK:
- return log_error_errno(r, "%s: Image is not a block device.", name);
-
- case -EBADR:
- return log_error_errno(r,
- "Combining partitioned images (such as '%s') with external Verity data (such as '%s') not supported. "
- "(Consider setting $SYSTEMD_DISSECT_VERITY_SIDECAR=0 to disable automatic discovery of external Verity data.)",
- name, strna(verity ? verity->data_path : NULL));
+ return dissect_log_error(
+ LOG_ERR,
+ dissect_loop_device(loop, verity, mount_options, image_policy, flags, ret),
+ loop->backing_file ?: loop->node,
+ verity);
- default:
- if (r < 0)
- return log_error_errno(r, "Failed to dissect image '%s': %m", name);
-
- return r;
- }
}
bool dissected_image_verity_candidate(const DissectedImage *image, PartitionDesignator partition_designator) {
int mount_image_privately_interactively(
const char *image,
+ const ImagePolicy *image_policy,
DissectImageFlags flags,
char **ret_directory,
int *ret_dir_fd,
_cleanup_(verity_settings_done) VeritySettings verity = VERITY_SETTINGS_DEFAULT;
_cleanup_(loop_device_unrefp) LoopDevice *d = NULL;
_cleanup_(dissected_image_unrefp) DissectedImage *dissected_image = NULL;
- _cleanup_(rmdir_and_freep) char *created_dir = NULL;
- _cleanup_free_ char *temp = NULL;
+ _cleanup_free_ char *dir = NULL;
int r;
/* Mounts an OS image at a temporary place, inside a newly created mount namespace of our own. This
* easily. */
assert(image);
- assert(ret_directory);
assert(ret_loop_device);
/* We intend to mount this right-away, hence add the partitions if needed and pin them. */
if (r < 0)
return log_error_errno(r, "Failed to load root hash data: %m");
- r = tempfn_random_child(NULL, program_invocation_short_name, &temp);
- if (r < 0)
- return log_error_errno(r, "Failed to generate temporary mount directory: %m");
-
r = loop_device_make_by_path(
image,
FLAGS_SET(flags, DISSECT_IMAGE_DEVICE_READ_ONLY) ? O_RDONLY : O_RDWR,
if (r < 0)
return log_error_errno(r, "Failed to set up loopback device for %s: %m", image);
- r = dissect_loop_device_and_warn(d, &verity, NULL, flags, &dissected_image);
+ r = dissect_loop_device_and_warn(
+ d,
+ &verity,
+ /* mount_options= */ NULL,
+ image_policy,
+ flags,
+ &dissected_image);
if (r < 0)
return r;
if (r < 0)
return log_error_errno(r, "Failed to detach mount namespace: %m");
- r = mkdir_p(temp, 0700);
+ r = mkdir_p("/run/systemd/mount-rootfs", 0555);
if (r < 0)
return log_error_errno(r, "Failed to create mount point: %m");
- created_dir = TAKE_PTR(temp);
-
- r = dissected_image_mount_and_warn(dissected_image, created_dir, UID_INVALID, UID_INVALID, flags);
+ r = dissected_image_mount_and_warn(
+ dissected_image,
+ "/run/systemd/mount-rootfs",
+ /* uid_shift= */ UID_INVALID,
+ /* uid_range= */ UID_INVALID,
+ flags);
if (r < 0)
return r;
if (r < 0)
return log_error_errno(r, "Failed to relinquish DM and loopback block devices: %m");
+ if (ret_directory) {
+ dir = strdup("/run/systemd/mount-rootfs");
+ if (!dir)
+ return log_oom();
+ }
+
if (ret_dir_fd) {
_cleanup_close_ int dir_fd = -EBADF;
- dir_fd = open(created_dir, O_CLOEXEC|O_DIRECTORY);
+ dir_fd = open("/run/systemd/mount-rootfs", O_CLOEXEC|O_DIRECTORY);
if (dir_fd < 0)
return log_error_errno(errno, "Failed to open mount point directory: %m");
*ret_dir_fd = TAKE_FD(dir_fd);
}
- *ret_directory = TAKE_PTR(created_dir);
- *ret_loop_device = TAKE_PTR(d);
+ if (ret_directory)
+ *ret_directory = TAKE_PTR(dir);
+ *ret_loop_device = TAKE_PTR(d);
return 0;
}
const char *src,
const char *dest,
const MountOptions *options,
+ const ImagePolicy *image_policy,
const char *required_host_os_release_id,
const char *required_host_os_release_version_id,
const char *required_host_os_release_sysext_level,
loop_device,
&verity,
options,
+ image_policy,
dissect_image_flags,
&dissected_image);
/* No partition table? Might be a single-filesystem image, try again */
loop_device,
&verity,
options,
+ image_policy,
dissect_image_flags | DISSECT_IMAGE_NO_PARTITION_TABLE,
&dissected_image);
if (r < 0)
assert(!isempty(required_host_os_release_id));
- r = load_extension_release_pairs(dest, dissected_image->image_name, relax_extension_release_check, &extension_release);
+ r = load_extension_release_pairs(dest, IMAGE_SYSEXT, dissected_image->image_name, relax_extension_release_check, &extension_release);
if (r < 0)
return log_debug_errno(r, "Failed to parse image %s extension-release metadata: %m", dissected_image->image_name);
required_host_os_release_version_id,
required_host_os_release_sysext_level,
required_sysext_scope,
- extension_release);
+ extension_release,
+ IMAGE_SYSEXT);
if (r == 0)
return log_debug_errno(SYNTHETIC_ERRNO(ESTALE), "Image %s extension-release metadata does not match the root's", dissected_image->image_name);
if (r < 0)