]> git.ipfire.org Git - thirdparty/systemd.git/commitdiff
storage: add systemd-storage-fs@.service provider
authorLennart Poettering <lennart@amutable.com>
Thu, 23 Apr 2026 07:00:46 +0000 (09:00 +0200)
committerLennart Poettering <lennart@amutable.com>
Wed, 29 Apr 2026 10:25:47 +0000 (12:25 +0200)
Second StorageProvider implementation, exposing regular files and
directories from a backing filesystem. In system mode the backing
directory is /var/lib/storage/, in user mode $XDG_STATE_HOME/storage/;
entries with a .volume suffix are exposed, with the inode type
determining whether the volume is reported as reg, dir or (via
symlinked/bind-mounted device node) blk.

Unlike the block provider, this one supports creating volumes
on-demand from a small set of built-in templates: sparse-file,
allocated-file, directory and subvolume.

man/rules/meson.build
man/systemd-storage-fs@.service.xml [new file with mode: 0644]
src/storage/io.systemd.storage.policy
src/storage/meson.build
src/storage/storage-fs.c [new file with mode: 0644]
units/meson.build
units/systemd-storage-fs.socket [new file with mode: 0644]
units/systemd-storage-fs@.service.in [new file with mode: 0644]
units/user/meson.build
units/user/systemd-storage-fs.socket [new file with mode: 0644]
units/user/systemd-storage-fs@.service.in [new file with mode: 0644]

index 439c33d5abdc19cef92b1d284579ec1fe403ec89..7f4fa07f7ba776e97429d13679c00a773dafd080 100644 (file)
@@ -1190,6 +1190,10 @@ manpages = [
   '8',
   ['systemd-storage-block', 'systemd-storage-block.socket'],
   ''],
+ ['systemd-storage-fs@.service',
+  '8',
+  ['systemd-storage-fs', 'systemd-storage-fs.socket'],
+  ''],
  ['systemd-storagetm.service', '8', ['systemd-storagetm'], 'ENABLE_STORAGETM'],
  ['systemd-stub',
   '7',
diff --git a/man/systemd-storage-fs@.service.xml b/man/systemd-storage-fs@.service.xml
new file mode 100644 (file)
index 0000000..4fe0734
--- /dev/null
@@ -0,0 +1,199 @@
+<?xml version='1.0'?> <!--*-nxml-*-->
+<!DOCTYPE refentry PUBLIC "-//OASIS//DTD DocBook XML V4.5//EN"
+  "http://www.oasis-open.org/docbook/xml/4.5/docbookx.dtd">
+<!-- SPDX-License-Identifier: LGPL-2.1-or-later -->
+
+<refentry id="systemd-storage-fs_.service"
+          xmlns:xi="http://www.w3.org/2001/XInclude">
+
+  <refentryinfo>
+    <title>systemd-storage-fs@.service</title>
+    <productname>systemd</productname>
+  </refentryinfo>
+
+  <refmeta>
+    <refentrytitle>systemd-storage-fs@.service</refentrytitle>
+    <manvolnum>8</manvolnum>
+  </refmeta>
+
+  <refnamediv>
+    <refname>systemd-storage-fs@.service</refname>
+    <refname>systemd-storage-fs.socket</refname>
+    <refname>systemd-storage-fs</refname>
+    <refpurpose>Storage provider exposing regular files and directories as storage volumes</refpurpose>
+  </refnamediv>
+
+  <refsynopsisdiv>
+    <para><filename>systemd-storage-fs@.service</filename></para>
+    <para><filename>systemd-storage-fs.socket</filename></para>
+  </refsynopsisdiv>
+
+  <refsect1>
+    <title>Description</title>
+
+    <para><filename>systemd-storage-fs@.service</filename> is a service that implements the
+    <constant>io.systemd.StorageProvider</constant> <ulink url="https://varlink.org/">Varlink</ulink>
+    interface, exposing regular files and directories in <filename>/var/lib/storage/*.volume</filename> (if
+    used in system mode) or <filename>$XDG_STATE_HOME/storage</filename> (when used in user mode) as storage
+    volumes. Acquired volumes are returned to the caller as file descriptors. Unlike
+    <citerefentry><refentrytitle>systemd-storage-block@.service</refentrytitle><manvolnum>8</manvolnum></citerefentry>,
+    this implementation also supports creating new volumes on demand from a small set of built-in
+    templates.</para>
+
+    <para>The service is socket-activated via <filename>systemd-storage-fs.socket</filename>. In system mode
+    it listens on the AF_UNIX socket <filename>/run/systemd/io.systemd.StorageProvider/fs</filename>, in user
+    mode on <filename>$XDG_RUNTIME_DIR/systemd/io.systemd.StorageProvider/fs</filename>. See
+    <citerefentry><refentrytitle>storagectl</refentrytitle><manvolnum>1</manvolnum></citerefentry> for an
+    enumeration tool.</para>
+
+    <para>See also
+    <citerefentry><refentrytitle>systemd-storage-block@.service</refentrytitle><manvolnum>8</manvolnum></citerefentry>
+    for a complementary implementation that exposes local block devices as storage volumes.</para>
+  </refsect1>
+
+  <refsect1>
+    <title>Volumes</title>
+
+    <para>Volumes are stored below the storage directory:</para>
+
+    <itemizedlist>
+      <listitem><para><filename>/var/lib/storage/</filename> when run in system mode.</para></listitem>
+
+      <listitem><para><filename>$XDG_STATE_HOME/storage/</filename> (typically
+      <filename>~/.local/state/storage/</filename>) when run in user mode.</para></listitem>
+    </itemizedlist>
+
+    <para>Each volume on disk is stored as a directory entry with a <filename>.volume</filename> suffix in
+    the storage directory. Entries which are regular files are exposed as volumes of type
+    <constant>reg</constant>; entries which are directories are exposed as volumes of type
+    <constant>dir</constant>. Moreover, block device nodes may be symlinked (or bind mounted) into the
+    directory, which are then exposed as volumes of type <constant>blk</constant>.</para>
+
+    <para>For directory volumes, the root of the file system passed to clients is placed in a subdirectory
+    <filename>root/</filename> of the <filename><replaceable>NAME</replaceable>.volume</filename> directory. The former (and all inodes
+    below it) must be owned by the foreign UID range, the latter by the host's root.</para>
+
+    <para>When acquiring a volume, symlinks are followed.</para>
+
+    <para>An administrator is permitted to freely manipulate the volume hierarchy directly as long as the
+    rules described above are followed. In particular, it's permitted to copy, mount or symlink arbitrary
+    external resources (regardless if directory, regular file or block) into the volume directory, so that
+    they are exposed as additional volumes.</para>
+  </refsect1>
+
+  <refsect1>
+    <title>Templates</title>
+
+    <para>The provider supports creating new volumes automatically when they are acquired. The caller may
+    select a <emphasis>template</emphasis> that determines configuration details of the volume to create. The
+    following built-in templates are available:</para>
+
+    <variablelist>
+      <varlistentry>
+        <term><constant>sparse-file</constant></term>
+
+        <listitem><para>Creates a volume backed by a sparsely populated regular file. This is the default
+        template when creating a regular file volume. (Volume type is <literal>reg</literal>.)</para>
+
+        <xi:include href="version-info.xml" xpointer="v261"/></listitem>
+      </varlistentry>
+
+      <varlistentry>
+        <term><constant>allocated-file</constant></term>
+
+        <listitem><para>Creates a volume backed by a fully allocated regular file. (Volume type is
+        <literal>reg</literal>.)</para>
+
+        <xi:include href="version-info.xml" xpointer="v261"/></listitem>
+      </varlistentry>
+
+      <varlistentry>
+        <term><constant>directory</constant></term>
+
+        <listitem><para>Creates a volume backed by a regular directory. (Volume type is
+        <literal>dir</literal>.)</para>
+
+        <xi:include href="version-info.xml" xpointer="v261"/></listitem>
+      </varlistentry>
+
+      <varlistentry>
+        <term><constant>subvolume</constant></term>
+
+        <listitem><para>Creates a btrfs subvolume as backing inode (falling back to a regular directory if
+        the storage directory is not on btrfs). This is the default template when creating a directory
+        volume. (Volume type is <literal>dir</literal>.)</para>
+
+        <xi:include href="version-info.xml" xpointer="v261"/></listitem>
+      </varlistentry>
+    </variablelist>
+  </refsect1>
+
+  <refsect1>
+    <title>Options</title>
+
+    <para>The following command-line options are understood:</para>
+
+    <variablelist>
+      <varlistentry>
+        <term><option>--system</option></term>
+
+        <listitem><para>Operate in system mode. Volumes are stored below
+        <filename>/var/lib/storage/</filename>. This is the default when invoked from
+        <filename>systemd-storage-fs@.service</filename> in the system manager.</para>
+
+        <xi:include href="version-info.xml" xpointer="v261"/></listitem>
+      </varlistentry>
+
+      <varlistentry>
+        <term><option>--user</option></term>
+
+        <listitem><para>Operate in user mode. Volumes are stored below
+        <filename>$XDG_STATE_HOME/storage/</filename>. This is the default when invoked from
+        <filename>systemd-storage-fs@.service</filename> in the user manager.</para>
+
+        <xi:include href="version-info.xml" xpointer="v261"/></listitem>
+      </varlistentry>
+
+      <xi:include href="standard-options.xml" xpointer="help" />
+      <xi:include href="standard-options.xml" xpointer="version" />
+    </variablelist>
+  </refsect1>
+
+  <refsect1>
+    <title>Files</title>
+
+    <variablelist>
+      <varlistentry>
+        <term><filename>/var/lib/storage/</filename></term>
+        <term><filename>$XDG_STATE_HOME/storage/</filename></term>
+
+        <listitem><para>The storage directory used to back the system mode and user mode service
+        instances respectively. Each volume is stored as an entry with a
+        <filename>.volume</filename> suffix below this directory.</para>
+
+        <xi:include href="version-info.xml" xpointer="v261"/></listitem>
+      </varlistentry>
+
+      <varlistentry>
+        <term><filename>/run/systemd/io.systemd.StorageProvider/fs</filename></term>
+        <term><filename>$XDG_RUNTIME_DIR/systemd/io.systemd.StorageProvider/fs</filename></term>
+
+        <listitem><para>AF_UNIX sockets the service listens on, in system and user mode
+        respectively. These are the canonical locations for the <literal>fs</literal> storage
+        provider, and are enumerated by <command>storagectl providers</command>.</para>
+
+        <xi:include href="version-info.xml" xpointer="v261"/></listitem>
+      </varlistentry>
+    </variablelist>
+  </refsect1>
+
+  <refsect1>
+    <title>See Also</title>
+    <para><simplelist type="inline">
+      <member><citerefentry><refentrytitle>systemd</refentrytitle><manvolnum>1</manvolnum></citerefentry></member>
+      <member><citerefentry><refentrytitle>storagectl</refentrytitle><manvolnum>1</manvolnum></citerefentry></member>
+      <member><citerefentry><refentrytitle>systemd-storage-block@.service</refentrytitle><manvolnum>8</manvolnum></citerefentry></member>
+    </simplelist></para>
+  </refsect1>
+
+</refentry>
index 06af278a5a42875183e97a209f4fb19f87e1ac91..7b25553501520eb73fee54f837df1f238ff03367 100644 (file)
                         <allow_active>auth_admin_keep</allow_active>
                 </defaults>
         </action>
+
+        <action id="io.systemd.storage.fs.acquire">
+                <description gettext-domain="systemd">Allow access to file system storage volumes</description>
+                <message gettext-domain="systemd">Authentication is required for an application to gain access to file system storage volume '$(name)'.</message>
+                <defaults>
+                        <allow_any>auth_admin</allow_any>
+                        <allow_inactive>auth_admin</allow_inactive>
+                        <allow_active>auth_admin_keep</allow_active>
+                </defaults>
+        </action>
 </policyconfig>
index 714e50ad9a1ca2ce03cb0272f5a0fa2195320f43..05c5e24ece4ace1b6ace89bc7a572bd3fe6bd8dd 100644 (file)
@@ -3,7 +3,13 @@
 executables += [
         libexec_template + {
                 'name' : 'systemd-storage-block',
-                'sources' : files('storage-block.c', 'storage-util.c'),
+                'sources' : files('storage-block.c'),
+                'extract' : files('storage-util.c')
+        },
+        libexec_template + {
+                'name' : 'systemd-storage-fs',
+                'sources' : files('storage-fs.c'),
+                'objects' : ['systemd-storage-block'],
         },
 ]
 
diff --git a/src/storage/storage-fs.c b/src/storage/storage-fs.c
new file mode 100644 (file)
index 0000000..47ec982
--- /dev/null
@@ -0,0 +1,807 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <fcntl.h>
+#include <fnmatch.h>
+#include <sys/mount.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#include "sd-device.h"
+#include "sd-json.h"
+
+#include "alloc-util.h"
+#include "build.h"
+#include "bus-polkit.h"
+#include "chase.h"
+#include "chattr-util.h"
+#include "device-private.h"
+#include "device-util.h"
+#include "errno-util.h"
+#include "fd-util.h"
+#include "format-table.h"
+#include "fs-util.h"
+#include "hashmap.h"
+#include "help-util.h"
+#include "log.h"
+#include "main-func.h"
+#include "mount-util.h"
+#include "options.h"
+#include "path-lookup.h"
+#include "path-util.h"
+#include "recurse-dir.h"
+#include "runtime-scope.h"
+#include "stat-util.h"
+#include "storage-util.h"
+#include "string-table.h"
+#include "tmpfile-util.h"
+#include "uid-classification.h"
+#include "varlink-io.systemd.StorageProvider.h"
+#include "varlink-util.h"
+
+static RuntimeScope arg_runtime_scope = RUNTIME_SCOPE_SYSTEM;
+
+/* For now we maintain a simple, compiled-in list of templates. One of those days we might want to move these
+ * into configurable drop-in files on disk. */
+typedef enum Template {
+        TEMPLATE_SPARSE_FILE,
+        TEMPLATE_ALLOCATED_FILE,
+        TEMPLATE_DIRECTORY,
+        TEMPLATE_SUBVOLUME,
+        _TEMPLATE_MAX,
+        _TEMPLATE_INVALID = -EINVAL,
+} Template;
+
+static const char *template_table[_TEMPLATE_MAX] = {
+        [TEMPLATE_SPARSE_FILE]    = "sparse-file",
+        [TEMPLATE_ALLOCATED_FILE] = "allocated-file",
+        [TEMPLATE_DIRECTORY]      = "directory",
+        [TEMPLATE_SUBVOLUME]      = "subvolume",
+};
+
+DEFINE_PRIVATE_STRING_TABLE_LOOKUP(template, Template);
+
+static VolumeType volume_type_from_template(Template t) {
+        switch (t) {
+
+        case TEMPLATE_SPARSE_FILE:
+        case TEMPLATE_ALLOCATED_FILE:
+                return VOLUME_REG;
+
+        case TEMPLATE_DIRECTORY:
+        case TEMPLATE_SUBVOLUME:
+                return VOLUME_DIR;
+
+        default:
+                return _VOLUME_TYPE_INVALID;
+        }
+}
+
+static int open_storage_dir(void) {
+        int r;
+
+        _cleanup_free_ char *state_dir = NULL;
+        r = state_directory_generic(arg_runtime_scope, /* suffix= */ NULL, &state_dir);
+        if (r < 0)
+                return log_error_errno(r, "Failed to get state directory path: %m");
+
+        _cleanup_close_ int state_fd = chase_and_open(state_dir, /* root= */ NULL, CHASE_TRIGGER_AUTOFS|CHASE_MKDIR_0755|CHASE_MUST_BE_DIRECTORY, O_CLOEXEC|O_CREAT|O_DIRECTORY, /* ret_path= */ NULL);
+        if (state_fd < 0)
+                return log_error_errno(state_fd, "Failed to open '%s': %m", state_dir);
+
+        /* First we try to open the storage directory. If it exists this will work and we are happy. If we
+         * get ENOENT we'll try to create it. If that works, great. If we get EEXIST we'll try to reopen it
+         * again, to deal with other instances of ourselves racing with us. We only do this exactly once
+         * though, under the assumption that the dir is never removed, only created during runtime. */
+        _cleanup_close_ int storage_fd = chase_and_openat(XAT_FDROOT, state_fd, "storage", CHASE_TRIGGER_AUTOFS|CHASE_MUST_BE_DIRECTORY, O_CLOEXEC|O_DIRECTORY, /* ret_path= */ NULL);
+        if (storage_fd == -ENOENT) {
+                storage_fd = xopenat_full(state_fd, "storage", O_EXCL|O_CREAT|O_CLOEXEC|O_DIRECTORY|O_NOFOLLOW, XO_LABEL|XO_SUBVOLUME, 0700);
+                if (storage_fd == -EEXIST)
+                        storage_fd = chase_and_openat(XAT_FDROOT, state_fd, "storage", CHASE_TRIGGER_AUTOFS|CHASE_MUST_BE_DIRECTORY, O_CLOEXEC|O_DIRECTORY, /* ret_path= */ NULL);
+        }
+        if (storage_fd < 0)
+                return log_error_errno(storage_fd, "Failed to open '%s/storage/': %m", state_dir);
+
+        return TAKE_FD(storage_fd);
+}
+
+static int vl_method_list_volumes(
+                sd_varlink *link,
+                sd_json_variant *parameters,
+                sd_varlink_method_flags_t flags,
+                void *userdata) {
+
+        int r;
+
+        assert(link);
+        assert(FLAGS_SET(flags, SD_VARLINK_METHOD_MORE));
+
+        struct {
+                const char *match_name;
+        } p = {};
+
+        static const sd_json_dispatch_field dispatch_table[] = {
+                { "matchName", SD_JSON_VARIANT_STRING, sd_json_dispatch_const_string, voffsetof(p, match_name), 0 },
+                {}
+        };
+
+        r = sd_varlink_dispatch(link, parameters, dispatch_table, &p);
+        if (r != 0)
+                return r;
+
+        _cleanup_close_ int fd = open_storage_dir();
+        if (fd < 0)
+                return fd;
+
+        _cleanup_free_ DirectoryEntries *dentries = NULL;
+        r = readdir_all(fd, RECURSE_DIR_SORT, &dentries);
+        if (r < 0)
+                return r;
+
+        r = sd_varlink_set_sentinel(link, "io.systemd.StorageProvider.NoSuchVolume");
+        if (r < 0)
+                return r;
+
+        FOREACH_ARRAY(dp, dentries->entries, dentries->n_entries) {
+                struct dirent *d = *dp;
+
+                if (!IN_SET(d->d_type, DT_REG, DT_DIR, DT_LNK, DT_BLK, DT_UNKNOWN))
+                        continue;
+
+                const char *e = endswith(d->d_name, ".volume");
+                if (!e)
+                        continue;
+
+                _cleanup_free_ char *n = strndup(d->d_name, e - d->d_name);
+                if (!n)
+                        return log_oom_debug();
+
+                if (!storage_volume_name_is_valid(n))
+                        continue;
+
+                if (p.match_name && fnmatch(p.match_name, n, FNM_NOESCAPE) != 0)
+                        continue;
+
+                _cleanup_close_ int pin_fd = -EBADF;
+                r = chaseat(XAT_FDROOT, fd, d->d_name, CHASE_TRIGGER_AUTOFS, /* ret_path= */ NULL, &pin_fd);
+                if (r < 0) {
+                        log_debug_errno(r, "Failed to stat() '%s' in storage directory, ignoring: %m", d->d_name);
+                        continue;
+                }
+
+                struct stat st;
+                if (fstat(pin_fd, &st) < 0)
+                        return log_debug_errno(errno, "Failed to stat() '%s' in storage directory: %m", d->d_name);
+
+                uint64_t size = UINT64_MAX, used = UINT64_MAX;
+                bool ro = false;
+
+                switch (st.st_mode & S_IFMT) {
+                case S_IFREG:
+                        ro = (st.st_mode & 0222) == 0;
+                        size = st.st_size;
+                        used = (uint64_t) st.st_blocks * UINT64_C(512);
+                        break;
+
+                case S_IFDIR:
+                        r = fd_is_read_only_fs(pin_fd);
+                        if (r < 0)
+                                log_debug_errno(r, "Failed to determine if '%s' is read-only, ignoring", d->d_name);
+                        else
+                                ro = r > 0;
+                        break;
+
+                case S_IFBLK: {
+                        _cleanup_(sd_device_unrefp) sd_device *dev = NULL;
+
+                        r = sd_device_new_from_stat_rdev(&dev, &st);
+                        if (r < 0)
+                                log_debug_errno(r, "Failed to acquire device for '%s', ignoring: %m", d->d_name);
+                        else {
+                                r = device_get_sysattr_bool(dev, "ro");
+                                if (r < 0)
+                                        log_device_debug_errno(dev, r, "Failed to get read/only state of '%s', ignoring: %m", d->d_name);
+                                else
+                                        ro = r > 0;
+
+                                r = device_get_sysattr_u64(dev, "size", &size);
+                                if (r < 0)
+                                        log_device_debug_errno(dev, r, "Failed to acquire size of device '%s', ignoring: %m", d->d_name);
+                                else
+                                        /* the 'size' sysattr is always in multiples of 512, even on 4K sector block devices! */
+                                        assert_se(MUL_ASSIGN_SAFE(&size, 512)); /* Overflow check for coverity */
+                        }
+
+                        break;
+                }
+
+                default:
+                        log_debug("Volume of unexpected inode type, ignoring: %s", d->d_name);
+                        continue;
+                }
+
+                r = sd_varlink_replybo(
+                                link,
+                                SD_JSON_BUILD_PAIR_STRING("name", n),
+                                SD_JSON_BUILD_PAIR_STRING("type", inode_type_to_string(st.st_mode)),
+                                SD_JSON_BUILD_PAIR_BOOLEAN("readOnly", ro),
+                                SD_JSON_BUILD_PAIR_CONDITION(size != UINT64_MAX, "sizeBytes", SD_JSON_BUILD_UNSIGNED(size)),
+                                SD_JSON_BUILD_PAIR_CONDITION(used != UINT64_MAX, "usedBytes", SD_JSON_BUILD_UNSIGNED(used)));
+                if (r < 0)
+                        return r;
+        }
+
+        return 0;
+}
+
+static int vl_method_list_templates(
+                sd_varlink *link,
+                sd_json_variant *parameters,
+                sd_varlink_method_flags_t flags,
+                void *userdata) {
+
+        int r;
+
+        assert(link);
+        assert(FLAGS_SET(flags, SD_VARLINK_METHOD_MORE));
+
+        struct {
+                const char *match_name;
+        } p = {};
+
+        static const sd_json_dispatch_field dispatch_table[] = {
+                { "matchName", SD_JSON_VARIANT_STRING, sd_json_dispatch_const_string, voffsetof(p, match_name), 0 },
+                {}
+        };
+
+        r = sd_varlink_dispatch(link, parameters, dispatch_table, &p);
+        if (r != 0)
+                return r;
+
+        r = sd_varlink_set_sentinel(link, "io.systemd.StorageProvider.NoSuchTemplate");
+        if (r < 0)
+                return r;
+
+        for (Template t = 0; t < _TEMPLATE_MAX; t++) {
+                const char *n = template_to_string(t);
+
+                if (p.match_name && fnmatch(p.match_name, n, FNM_NOESCAPE) != 0)
+                        continue;
+
+                r = sd_varlink_replybo(
+                                link,
+                                SD_JSON_BUILD_PAIR_STRING("name", n),
+                                SD_JSON_BUILD_PAIR_STRING("type", volume_type_to_string(volume_type_from_template(t))));
+                if (r < 0)
+                        return r;
+        }
+
+        return 0;
+}
+
+static int create_volume_dir(
+                int storage_fd,
+                const char *filename,
+                Template t) {
+
+        int r;
+
+        assert(storage_fd >= 0);
+        assert(filename);
+
+        XOpenFlags xopen_flags;
+        switch (t) {
+
+        case TEMPLATE_DIRECTORY:
+                xopen_flags = 0;
+                break;
+
+        case TEMPLATE_SUBVOLUME:
+                xopen_flags = XO_SUBVOLUME;
+                break;
+
+        default:
+                return -ENOMEDIUM; /* Recognizable error for: template doesn't apply here */
+        }
+
+        _cleanup_free_ char *tf = NULL;
+        r = tempfn_random(filename, /* extra= */ NULL, &tf);
+        if (r < 0)
+                return r;
+
+        _cleanup_close_ int volume_fd = xopenat_full(storage_fd, tf, O_CREAT|O_EXCL|O_RDONLY|O_DIRECTORY|O_CLOEXEC|O_NOFOLLOW, xopen_flags, 0700);
+        if (volume_fd < 0)
+                return volume_fd;
+
+        _cleanup_close_ int root_fd = xopenat_full(volume_fd, "root", O_CREAT|O_EXCL|O_RDONLY|O_DIRECTORY|O_CLOEXEC|O_NOFOLLOW, xopen_flags, 0755);
+        if (root_fd < 0) {
+                r = root_fd;
+                goto fail;
+        }
+
+        r = RET_NERRNO(fchown(root_fd, FOREIGN_UID_MIN, FOREIGN_UID_MIN));
+        if (r < 0)
+                goto fail;
+
+        r = rename_noreplace(storage_fd, tf, storage_fd, filename);
+        if (r < 0)
+                goto fail;
+
+        return TAKE_FD(root_fd);
+
+fail:
+        if (root_fd >= 0) {
+                assert(volume_fd >= 0);
+                root_fd = safe_close(root_fd);
+                (void) unlinkat(volume_fd, "root", AT_REMOVEDIR);
+        }
+
+        if (volume_fd >= 0) {
+                volume_fd = safe_close(volume_fd);
+                (void) unlinkat(storage_fd, tf, AT_REMOVEDIR);
+        }
+
+        return r;
+}
+
+static int create_volume_reg(
+                int storage_fd,
+                const char *filename,
+                Template t,
+                uint64_t create_size) {
+        int r;
+
+        assert(storage_fd >= 0);
+        assert(filename);
+
+        bool sparse;
+        switch (t) {
+
+        case TEMPLATE_SPARSE_FILE:
+                sparse = true;
+                break;
+
+        case TEMPLATE_ALLOCATED_FILE:
+                sparse = false;
+                break;
+
+        default:
+                return -ENOMEDIUM; /* Recognizable error for: template doesn't apply here */
+        }
+
+        _cleanup_free_ char *tf = NULL;
+        _cleanup_close_ int fd = open_tmpfile_linkable_at(storage_fd, filename, O_RDWR|O_CLOEXEC, &tf);
+        if (fd < 0)
+                return fd;
+
+        CLEANUP_TMPFILE_AT(storage_fd, tf);
+
+        r = chattr_fd(fd, FS_NOCOW_FL, FS_NOCOW_FL);
+        if (r < 0 && !ERRNO_IS_IOCTL_NOT_SUPPORTED(r))
+                return r;
+
+        if (create_size > 0) {
+                if (sparse)
+                        r = RET_NERRNO(ftruncate(fd, create_size));
+                else
+                        r = RET_NERRNO(fallocate(fd, /* mode= */ 0, /* offset= */ 0, create_size));
+                if (r < 0)
+                        return r;
+        }
+
+        r = RET_NERRNO(fchmod(fd, 0600));
+        if (r < 0)
+                return r;
+
+        r = link_tmpfile_at(fd, storage_fd, tf, filename, /* flags= */ 0);
+        if (r < 0)
+                return r;
+
+        tf = mfree(tf); /* disarm clean-up */
+
+        return TAKE_FD(fd);
+}
+
+static int vl_method_acquire(
+                sd_varlink *link,
+                sd_json_variant *parameters,
+                sd_varlink_method_flags_t flags,
+                void *userdata) {
+
+        Hashmap **polkit_registry = ASSERT_PTR(userdata);
+        int r;
+
+        assert(link);
+
+        struct {
+                const char *name;
+                CreateMode create_mode;
+                const char *template;
+                int read_only;
+                VolumeType request_as;
+                uint64_t create_size;
+        } p = {
+                .create_mode = CREATE_ANY,
+                .read_only = -1,
+                .request_as = _VOLUME_TYPE_INVALID,
+                .create_size = UINT64_MAX,
+        };
+
+        static const sd_json_dispatch_field dispatch_table[] = {
+                { "name",            SD_JSON_VARIANT_STRING,        sd_json_dispatch_const_string, voffsetof(p, name),        SD_JSON_MANDATORY },
+                { "createMode",      SD_JSON_VARIANT_STRING,        json_dispatch_create_mode,     voffsetof(p, create_mode), 0                 },
+                { "template",        SD_JSON_VARIANT_STRING,        sd_json_dispatch_const_string, voffsetof(p, template),    0                 },
+                { "readOnly",        SD_JSON_VARIANT_BOOLEAN,       sd_json_dispatch_tristate,     voffsetof(p, read_only),   0                 },
+                { "requestAs",       SD_JSON_VARIANT_STRING,        json_dispatch_volume_type,     voffsetof(p, request_as),  0                 },
+                { "createSizeBytes", _SD_JSON_VARIANT_TYPE_INVALID, sd_json_dispatch_uint64,       voffsetof(p, create_size), 0                 },
+                VARLINK_DISPATCH_POLKIT_FIELD,
+                {}
+        };
+
+        r = sd_varlink_dispatch(link, parameters, dispatch_table, &p);
+        if (r != 0)
+                return r;
+
+        if (!storage_volume_name_is_valid(p.name))
+                return sd_varlink_error_invalid_parameter_name(link, "name");
+
+        if (!IN_SET(p.create_mode, CREATE_ANY, CREATE_OPEN, CREATE_NEW))
+                return sd_varlink_error(link, "io.systemd.StorageProvider.CreateNotSupported", NULL);
+
+        /* off_t is signed, hence refuse overly long requests */
+        if (p.create_size != UINT64_MAX && p.create_size > INT64_MAX)
+                return sd_varlink_error_invalid_parameter_name(link, "createSizeBytes");
+
+        Template t = _TEMPLATE_INVALID;
+        if (!isempty(p.template)) {
+                if (!storage_template_name_is_valid(p.template))
+                        return sd_varlink_error_invalid_parameter_name(link, "template");
+
+                t = template_from_string(p.template);
+                if (t < 0)
+                        return sd_varlink_error(link, "io.systemd.StorageProvider.NoSuchTemplate", NULL);
+        }
+
+        if (p.read_only > 0) {
+                if (p.create_mode == CREATE_NEW)
+                        return sd_varlink_error_invalid_parameter_name(link, "readOnly");
+
+                p.create_mode = CREATE_OPEN;
+        }
+
+        /* Add a suffix so that we are never attempted to open a temporary file assuming it was a valid
+         * volume.  */
+        _cleanup_free_ char *filename = strjoin(p.name, ".volume");
+        if (!filename)
+                return log_oom_debug();
+
+        if (!filename_is_valid(filename))
+                return sd_varlink_error_invalid_parameter_name(link, "name");
+
+        if (arg_runtime_scope != RUNTIME_SCOPE_USER) {
+                const char *details[] = {
+                        "name", p.name,
+                        NULL
+                };
+
+                r = varlink_verify_polkit_async(
+                                link,
+                                /* bus= */ NULL,
+                                "io.systemd.storage.fs.acquire",
+                                details,
+                                polkit_registry);
+                if (r <= 0)
+                        return r;
+        }
+
+        _cleanup_close_ int storage_fd = open_storage_dir();
+        if (storage_fd < 0)
+                return storage_fd;
+
+        _cleanup_close_ int pin_fd = -EBADF, real_fd = -EBADF;
+        r = chaseat(XAT_FDROOT, storage_fd, filename, CHASE_TRIGGER_AUTOFS, /* ret_path= */ NULL, &pin_fd);
+        if (r < 0) {
+                if (r != -ENOENT)
+                        return r;
+                if (p.create_mode == CREATE_OPEN || p.read_only > 0)
+                        return sd_varlink_error(link, "io.systemd.StorageProvider.NoSuchVolume", NULL);
+
+                /* Doesn't exist yet: create it now */
+
+                if (p.request_as < 0) /* Make a choice: pick default type */
+                        p.request_as = t < 0 ? VOLUME_DIR : volume_type_from_template(t);
+
+                /* Try to create the volume */
+                switch (p.request_as) {
+
+                case VOLUME_DIR: {
+
+                        if (t < 0) /* Make a choice: pick default template */
+                                t = TEMPLATE_SUBVOLUME;
+
+                        real_fd = create_volume_dir(storage_fd, filename, t);
+                        break;
+                }
+
+                case VOLUME_REG: {
+                        if (p.create_size == UINT64_MAX)
+                                return sd_varlink_error(link, "io.systemd.StorageProvider.CreateSizeRequired", NULL);
+
+                        if (t < 0) /* Make a choice: pick default template */
+                                t = TEMPLATE_SPARSE_FILE;
+
+                        real_fd = create_volume_reg(storage_fd, filename, t, p.create_size);
+                        break;
+                }
+
+                case VOLUME_BLK:
+                        /* We don't support creating block devices, we only support if they are symlinked
+                         * into the storage directory. */
+                        return sd_varlink_error(link, "io.systemd.StorageProvider.CreateNotSupported", NULL);
+
+                default:
+                        assert_not_reached();
+                }
+
+                if (real_fd == -ENOMEDIUM)
+                        return sd_varlink_error(link, "io.systemd.StorageProvider.BadTemplate", NULL);
+                if (real_fd == -EEXIST) {
+                        if (p.create_mode == CREATE_NEW)
+                                return sd_varlink_error(link, "io.systemd.StorageProvider.VolumeExists", NULL);
+
+                        /* If we failed to open the volume and reached this point, then the volume already
+                         * exists by now (i.e. we ran into a race). In that case, try to pin it a second time
+                         * (but only once, let's never loop around this). */
+                        r = chaseat(XAT_FDROOT, storage_fd, filename, CHASE_TRIGGER_AUTOFS, /* ret_path= */ NULL, &pin_fd);
+                        if (r < 0)
+                                return r;
+                } else if (real_fd < 0)
+                        return real_fd;
+
+        } else if (p.create_mode == CREATE_NEW)
+                return sd_varlink_error(link, "io.systemd.StorageProvider.VolumeExists", NULL);
+
+        /* At this point, we either already opened the real fd, or we managed to pin it (but not both) */
+        assert((real_fd >= 0) != (pin_fd >= 0));
+
+        /* Let's first settle the volume type */
+        struct stat st;
+        if (fstat(real_fd >= 0 ? real_fd : pin_fd, &st) < 0)
+                return -errno;
+
+        if (p.request_as == VOLUME_REG) {
+                /* First, check for the other supported types and generate a nice error */
+                if (IN_SET(st.st_mode & S_IFMT, S_IFDIR, S_IFBLK))
+                        return sd_varlink_error(link, "io.systemd.StorageProvider.WrongType", NULL);
+
+                /* Second verify cover all other types */
+                r = stat_verify_regular(&st);
+                if (r < 0)
+                        return r;
+        } else if (p.request_as == VOLUME_DIR) {
+                if (IN_SET(st.st_mode & S_IFMT, S_IFREG, S_IFBLK))
+                        return sd_varlink_error(link, "io.systemd.StorageProvider.WrongType", NULL);
+
+                r = stat_verify_directory(&st);
+                if (r < 0)
+                        return r;
+        } else if (p.request_as == VOLUME_BLK) {
+                if (IN_SET(st.st_mode & S_IFMT, S_IFREG, S_IFDIR))
+                        return sd_varlink_error(link, "io.systemd.StorageProvider.WrongType", NULL);
+
+                r = stat_verify_block(&st);
+                if (r < 0)
+                        return r;
+
+        } else if (S_ISREG(st.st_mode))
+                p.request_as = VOLUME_REG;
+        else if (S_ISDIR(st.st_mode))
+                p.request_as = VOLUME_DIR;
+        else if (S_ISBLK(st.st_mode))
+                p.request_as = VOLUME_BLK;
+        else
+                return log_debug_errno(SYNTHETIC_ERRNO(EBADF), "Unexpected inode type, refusing.");
+
+        /* Let's now acquire a real fd for the pinned fd, if we still need to */
+        if (real_fd < 0) {
+                assert(pin_fd >= 0);
+
+                XOpenFlags xopen_flags =
+                        (p.read_only < 0 && !S_ISDIR(st.st_mode) ? XO_AUTO_RW_RO : 0);
+                int open_flags =
+                        (p.read_only < 0 ? 0 : (p.read_only > 0 || S_ISDIR(st.st_mode) ? O_RDONLY : O_RDWR));
+
+                const char *subdir = NULL;
+                if (p.request_as == VOLUME_DIR) {
+                        /* We place the root of the directory tree one level down, to separate ownership of
+                         * the inode: the upper inode is owned by the host, the lower one by the volume. This
+                         * matters so that the host one can be owned by the host's root, and the volume one
+                         * by the foreign UID range. */
+                        subdir = "root";
+                        open_flags |= O_DIRECTORY|O_NOFOLLOW;
+                }
+
+                real_fd = xopenat_full(pin_fd, subdir, open_flags|O_CLOEXEC, xopen_flags, /* mode= */ MODE_INVALID);
+                if (real_fd < 0)
+                        return log_debug_errno(real_fd, "Failed to reopen volume fd for '%s': %m", filename);
+
+                /* In directory mode we might be looking at a different inode node, refresh the stat data */
+                if (p.request_as == VOLUME_DIR && fstat(real_fd, &st) < 0)
+                        return -errno;
+        }
+
+        assert(real_fd >= 0);
+
+        bool ro;
+        switch (p.request_as) {
+
+        case VOLUME_REG:
+        case VOLUME_BLK: {
+                assert(IN_SET(st.st_mode & S_IFMT, S_IFREG, S_IFBLK));
+
+                int open_flags = fcntl(real_fd, F_GETFL, 0);
+                if (open_flags < 0)
+                        return -errno;
+
+                ro = (open_flags & O_ACCMODE_STRICT) == O_RDONLY;
+                break;
+        }
+
+        case VOLUME_DIR: {
+                assert(S_ISDIR(st.st_mode));
+
+                if (!uid_is_foreign(st.st_uid) ||
+                    !gid_is_foreign(st.st_gid))
+                        return log_debug_errno(SYNTHETIC_ERRNO(EPERM), "Storage directory not owned by foreign UID/GID range.");
+
+                /* Let's now generate a new mount for the directory tree, where propagation is disabled, and the
+                 * flags are all set to good defaults */
+                _cleanup_close_ int mount_fd = open_tree_attr_with_fallback(
+                                real_fd,
+                                /* path= */ NULL,
+                                OPEN_TREE_CLONE|OPEN_TREE_CLOEXEC|AT_SYMLINK_NOFOLLOW,
+                                &(struct mount_attr) {
+                                        .attr_set = (p.read_only > 0 ? MOUNT_ATTR_RDONLY : 0),
+                                        .attr_clr = MOUNT_ATTR_NOSUID|MOUNT_ATTR_NOEXEC|MOUNT_ATTR_NODEV,
+                                        .propagation = MS_PRIVATE,
+                                });
+                if (mount_fd < 0)
+                        return log_debug_errno(mount_fd, "Failed to generate per-volume mount: %m");
+
+                /* Let's turn on propagation again now that it is disconnected, simply because MS_SHARED is
+                 * generally the default for everything we return. */
+
+                if (mount_setattr(mount_fd, "", AT_EMPTY_PATH|AT_SYMLINK_NOFOLLOW,
+                                  &(struct mount_attr) {
+                                          .propagation = MS_SHARED,
+                                  }, MOUNT_ATTR_SIZE_VER0) < 0)
+                        return log_debug_errno(errno, "Failed to enable propagation on per-volume mount: %m");
+
+                close_and_replace(real_fd, mount_fd);
+
+                r = fd_is_read_only_fs(real_fd);
+                if (r < 0)
+                        return r;
+
+                ro = r > 0;
+                break;
+        }
+
+        default:
+                assert_not_reached();
+        }
+
+        if (p.read_only == 0 && ro)
+                return sd_varlink_error(link, "io.systemd.StorageProvider.ReadOnlyVolume", NULL);
+
+        int idx = sd_varlink_push_fd(link, real_fd);
+        if (idx < 0)
+                return idx;
+
+        TAKE_FD(real_fd);
+
+        return sd_varlink_replybo(
+                        link,
+                        SD_JSON_BUILD_PAIR_INTEGER("fileDescriptorIndex", idx),
+                        SD_JSON_BUILD_PAIR_STRING("type", inode_type_to_string(st.st_mode)),
+                        SD_JSON_BUILD_PAIR_BOOLEAN("readOnly", ro),
+                        SD_JSON_BUILD_PAIR_CONDITION(p.request_as == VOLUME_DIR, "baseUID", SD_JSON_BUILD_INTEGER(FOREIGN_UID_BASE)),
+                        SD_JSON_BUILD_PAIR_CONDITION(p.request_as == VOLUME_DIR, "baseGID", SD_JSON_BUILD_INTEGER(FOREIGN_UID_BASE)));
+}
+
+static int vl_server(void) {
+        int r;
+
+        _cleanup_(hashmap_freep) Hashmap *polkit_registry = NULL;
+        _cleanup_(sd_varlink_server_unrefp) sd_varlink_server *varlink_server = NULL;
+        r = varlink_server_new(
+                        &varlink_server,
+                        SD_VARLINK_SERVER_HANDLE_SIGINT|
+                        SD_VARLINK_SERVER_HANDLE_SIGTERM|
+                        SD_VARLINK_SERVER_ALLOW_FD_PASSING_OUTPUT|
+                        SD_VARLINK_SERVER_INHERIT_USERDATA,
+                        &polkit_registry);
+        if (r < 0)
+                return log_error_errno(r, "Failed to allocate Varlink server: %m");
+
+        r = sd_varlink_server_add_interface(varlink_server, &vl_interface_io_systemd_StorageProvider);
+        if (r < 0)
+                return log_error_errno(r, "Failed to add Varlink interface: %m");
+
+        r = sd_varlink_server_bind_method_many(
+                        varlink_server,
+                        "io.systemd.StorageProvider.Acquire", vl_method_acquire,
+                        "io.systemd.StorageProvider.ListVolumes", vl_method_list_volumes,
+                        "io.systemd.StorageProvider.ListTemplates", vl_method_list_templates);
+        if (r < 0)
+                return log_error_errno(r, "Failed to bind Varlink methods: %m");
+
+        r = sd_varlink_server_loop_auto(varlink_server);
+        if (r < 0)
+                return log_error_errno(r, "Failed to run Varlink event loop: %m");
+
+        return 0;
+}
+
+static int help(void) {
+        int r;
+
+        help_cmdline("[OPTIONS...]");
+        help_abstract("Simple file system backed storage provider");
+
+        _cleanup_(table_unrefp) Table *options = NULL;
+        r = option_parser_get_help_table(&options);
+        if (r < 0)
+                return r;
+
+        help_section("Options:");
+
+        r = table_print_or_warn(options);
+        if (r < 0)
+                return r;
+
+        help_man_page_reference("systemd-storage-fs", "8");
+        return 0;
+}
+
+static int parse_argv(int argc, char *argv[]) {
+        assert(argc >= 0);
+        assert(argv);
+
+        OptionParser opts = { argc, argv };
+        FOREACH_OPTION(c, &opts, /* on_error= */ return c)
+                switch (c) {
+
+                OPTION_COMMON_HELP:
+                        return help();
+
+                OPTION_COMMON_VERSION:
+                        return version();
+
+                OPTION_LONG("system", NULL, "Operate in system mode"):
+                        arg_runtime_scope = RUNTIME_SCOPE_SYSTEM;
+                        break;
+
+                OPTION_LONG("user", NULL, "Operate in user mode"):
+                        arg_runtime_scope = RUNTIME_SCOPE_USER;
+                        break;
+                }
+
+        if (option_parser_get_n_args(&opts) > 0)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "This program takes no arguments.");
+
+        return 1;
+}
+
+static int run(int argc, char* argv[]) {
+        int r;
+
+        log_setup();
+
+        r = parse_argv(argc, argv);
+        if (r <= 0)
+                return r;
+
+        return vl_server();
+}
+
+DEFINE_MAIN_FUNCTION(run);
index 3cac3c876ae1c5714ce6f5151235f077f50f3d69..0f7ce75bd89673b5f4885f404baf274bbad3bc3c 100644 (file)
@@ -811,6 +811,13 @@ units = [
         {
           'file' : 'systemd-storage-block@.service.in',
         },
+        {
+          'file' : 'systemd-storage-fs.socket',
+          'symlinks' : ['sockets.target.wants/']
+        },
+        {
+          'file' : 'systemd-storage-fs@.service.in',
+        },
         {
           'file' : 'systemd-storagetm.service.in',
           'conditions' : ['ENABLE_STORAGETM'],
diff --git a/units/systemd-storage-fs.socket b/units/systemd-storage-fs.socket
new file mode 100644 (file)
index 0000000..c83cf0a
--- /dev/null
@@ -0,0 +1,25 @@
+#  SPDX-License-Identifier: LGPL-2.1-or-later
+#
+#  This file is part of systemd.
+#
+#  systemd is free software; you can redistribute it and/or modify it
+#  under the terms of the GNU Lesser General Public License as published by
+#  the Free Software Foundation; either version 2.1 of the License, or
+#  (at your option) any later version.
+
+[Unit]
+Description=Simple File System Backed Storage Provider
+Documentation=man:systemd-storage-fs@.service(8)
+DefaultDependencies=no
+RequiresMountsFor=/var/lib/storage
+Before=sockets.target
+
+[Socket]
+ListenStream=/run/systemd/io.systemd.StorageProvider/fs
+FileDescriptorName=varlink
+SocketMode=0666
+Accept=yes
+MaxConnectionsPerSource=16
+
+[Install]
+WantedBy=sockets.target
diff --git a/units/systemd-storage-fs@.service.in b/units/systemd-storage-fs@.service.in
new file mode 100644 (file)
index 0000000..39b6da3
--- /dev/null
@@ -0,0 +1,19 @@
+#  SPDX-License-Identifier: LGPL-2.1-or-later
+#
+#  This file is part of systemd.
+#
+#  systemd is free software; you can redistribute it and/or modify it
+#  under the terms of the GNU Lesser General Public License as published by
+#  the Free Software Foundation; either version 2.1 of the License, or
+#  (at your option) any later version.
+
+[Unit]
+Description=Simple File System Backed Storage Provider
+Documentation=man:systemd-storage-fs@.service(8)
+DefaultDependencies=no
+RequiresMountsFor=/var/lib/storage
+Conflicts=shutdown.target initrd-switch-root.target
+Before=shutdown.target initrd-switch-root.target
+
+[Service]
+ExecStart=-{{LIBEXECDIR}}/systemd-storage-fs
index a9c6d44281c28c7af81e414d480b2264e3177d91..39c41a4c1cd8c0a44f9a709aa85ff07492a6164d 100644 (file)
@@ -61,6 +61,13 @@ units = [
           'file' : 'systemd-journalctl.socket',
           'symlinks' : ['sockets.target.wants/'],
         },
+        {
+          'file' : 'systemd-storage-fs.socket',
+          'symlinks' : ['sockets.target.wants/']
+        },
+        {
+          'file' : 'systemd-storage-fs@.service.in',
+        },
         { 'file' : 'systemd-tmpfiles-clean.service' },
         { 'file' : 'systemd-tmpfiles-clean.timer' },
         { 'file' : 'systemd-tmpfiles-setup.service' },
diff --git a/units/user/systemd-storage-fs.socket b/units/user/systemd-storage-fs.socket
new file mode 100644 (file)
index 0000000..fa8018b
--- /dev/null
@@ -0,0 +1,23 @@
+#  SPDX-License-Identifier: LGPL-2.1-or-later
+#
+#  This file is part of systemd.
+#
+#  systemd is free software; you can redistribute it and/or modify it
+#  under the terms of the GNU Lesser General Public License as published by
+#  the Free Software Foundation; either version 2.1 of the License, or
+#  (at your option) any later version.
+
+[Unit]
+Description=Simple File System Backed Storage Provider
+Documentation=man:systemd-storage-fs.service(8)
+Before=sockets.target
+
+[Socket]
+ListenStream=%t/systemd/io.systemd.StorageProvider/fs
+FileDescriptorName=varlink
+SocketMode=0600
+Accept=yes
+MaxConnectionsPerSource=16
+
+[Install]
+WantedBy=sockets.target
diff --git a/units/user/systemd-storage-fs@.service.in b/units/user/systemd-storage-fs@.service.in
new file mode 100644 (file)
index 0000000..95afa91
--- /dev/null
@@ -0,0 +1,15 @@
+#  SPDX-License-Identifier: LGPL-2.1-or-later
+#
+#  This file is part of systemd.
+#
+#  systemd is free software; you can redistribute it and/or modify it
+#  under the terms of the GNU Lesser General Public License as published by
+#  the Free Software Foundation; either version 2.1 of the License, or
+#  (at your option) any later version.
+
+[Unit]
+Description=Simple File System Backed Storage Provider
+Documentation=man:systemd-storage-fs.service(8)
+
+[Service]
+ExecStart=-{{LIBEXECDIR}}/systemd-storage-fs --user