git.ipfire.org Git - thirdparty/systemd.git/commitdiff
nspawn: add --suppress-sync=yes mode for turning sync() and friends into NOPs via seccomp (21051/head)
author Lennart Poettering <lennart@poettering.net>
Tue, 19 Oct 2021 12:56:49 +0000 (14:56 +0200)
committer Lennart Poettering <lennart@poettering.net>
Wed, 20 Oct 2021 09:35:15 +0000 (11:35 +0200)
This is supposed to be used by package/image builders such as mkosi to
speed up building, since it allows us to suppress sync() inside a
container.

This does what Debian's eatmydata tool does, but for a container, and
via seccomp (instead of LD_PRELOAD).

docs/ENVIRONMENT.md
man/systemd-nspawn.xml
man/systemd.nspawn.xml
shell-completion/bash/systemd-nspawn
src/nspawn/nspawn-gperf.gperf
src/nspawn/nspawn-settings.h
src/nspawn/nspawn.c
src/shared/seccomp-util.c
src/shared/seccomp-util.h

index 9a824820dabf711b4a674db907a90d71fc3e0e9d..328934cd17b89726e5ffdf52b95422ec3d9eb44f 100644 (file)
@@ -138,6 +138,12 @@ All tools:
 * `$SYSTEMD_NSPAWN_TMPFS_TMP=0` — if set, do not overmount `/tmp/` in the
   container with a tmpfs, but leave the directory from the image in place.
 
+* `$SYSTEMD_SUPPRESS_SYNC=1` — if set, all disk synchronization syscalls
+  (e.g. `sync()`, `fsync()`, `syncfs()`, …) are turned into no-ops for the
+  container payload, and the `O_SYNC`/`O_DSYNC` flags are made unavailable to
+  `open()` and friends. This is equivalent to passing `--suppress-sync=yes` on
+  the `systemd-nspawn` command line.
+
 `systemd-logind`:
 
 * `$SYSTEMD_BYPASS_HIBERNATION_MEMORY_CHECK=1` — if set, report that
index e84ac6ae42bc1c2e4ca081b64807485656f04115..aec0b0e1299661560a5d17b5abd04ea17410a8b2 100644 (file)
         before sending its own to systemd. For more details about notifications
         see <citerefentry><refentrytitle>sd_notify</refentrytitle><manvolnum>3</manvolnum></citerefentry>.</para></listitem>
       </varlistentry>
+
+      <varlistentry>
+        <term><option>--suppress-sync=</option></term>
+
+        <listitem><para>Expects a boolean argument. If true, turns off any form of on-disk file system
+        synchronization for the container payload. This means all system calls such as <citerefentry
+        project='man-pages'><refentrytitle>sync</refentrytitle><manvolnum>2</manvolnum></citerefentry>,
+        <function>fsync()</function>, <function>syncfs()</function>, … will execute no operation, and the
+        <constant>O_SYNC</constant>/<constant>O_DSYNC</constant> flags to <citerefentry
+        project='man-pages'><refentrytitle>open</refentrytitle><manvolnum>2</manvolnum></citerefentry> and
+        related calls will be made unavailable. This is potentially dangerous, as assumed data integrity
+        guarantees to the container payload are not actually enforced (i.e. data assumed to have been written
+        to disk might be lost if the system is shut down abnormally). However, this can dramatically improve
+        container runtime performance – as long as these guarantees are not required or desirable, for
+        example because any data written by the container is of temporary, redundant nature, or just an
+        intermediary artifact that will be further processed and finalized by a later step in a
+        pipeline. Defaults to false.</para></listitem>
+      </varlistentry>
     </variablelist>
 
     </refsect2><refsect2>
index dc0e2f9fd2141c9cfb58c59ecabd0efe9fe28392..bb9bf4b5d97d8ecc4fb0f98fdf53d4066de9fed6 100644 (file)
         details.</para></listitem>
       </varlistentry>
 
+      <varlistentry>
+        <term><varname>SuppressSync=</varname></term>
+
+        <listitem><para>Configures whether to suppress disk synchronization for the container payload. This
+        is equivalent to the <option>--suppress-sync=</option> command line switch, and takes the same
+        parameter. See
+        <citerefentry><refentrytitle>systemd-nspawn</refentrytitle><manvolnum>1</manvolnum></citerefentry>
+        for details.</para></listitem>
+      </varlistentry>
+
     </variablelist>
   </refsect1>
 
index f367c7d14cc3bcb85b77299a299619b598a27672..3b6d65d96a96915571bcb66e52fdbd2bdc0e2b6a 100644 (file)
@@ -63,7 +63,7 @@ _systemd_nspawn() {
 
     local -A OPTS=(
         [STANDALONE]='-h --help --version --private-network -b --boot --read-only -q --quiet --share-system
-                      --keep-unit -n --network-veth -j -x --ephemeral -a --as-pid2 -U'
+                      --keep-unit -n --network-veth -j -x --ephemeral -a --as-pid2 -U --suppress-sync=yes'
         [ARG]='-D --directory -u --user --uuid --capability --drop-capability --link-journal --bind --bind-ro
                       -M --machine -S --slice -E --setenv -Z --selinux-context -L --selinux-apifs-context
                       --register --network-interface --network-bridge --personality -i --image --tmpfs
index ea15e271482fb04101e2fae5b83991fd9268c34a..4af00c8d95b0114f0f0956d2d1e664c95ab1f60d 100644 (file)
@@ -59,6 +59,7 @@ Exec.CPUAffinity,             config_parse_cpu_affinity,   0,                 0
 Exec.ResolvConf,              config_parse_resolv_conf,    0,                 offsetof(Settings, resolv_conf)
 Exec.LinkJournal,             config_parse_link_journal,   0,                 0
 Exec.Timezone,                config_parse_timezone,       0,                 offsetof(Settings, timezone)
+Exec.SuppressSync,            config_parse_bool,           0,                 offsetof(Settings, suppress_sync)
 Files.ReadOnly,               config_parse_tristate,       0,                 offsetof(Settings, read_only)
 Files.Volatile,               config_parse_volatile_mode,  0,                 offsetof(Settings, volatile_mode)
 Files.Bind,                   config_parse_bind,           0,                 0
index 939e1c757b77323ab28cbe64830793546024d868..1b3ace5f8fa0dc783797da308202510c2fb57419 100644 (file)
@@ -127,9 +127,10 @@ typedef enum SettingsMask {
         SETTING_CONSOLE_MODE      = UINT64_C(1) << 29,
         SETTING_CREDENTIALS       = UINT64_C(1) << 30,
         SETTING_BIND_USER         = UINT64_C(1) << 31,
-        SETTING_RLIMIT_FIRST      = UINT64_C(1) << 32, /* we define one bit per resource limit here */
-        SETTING_RLIMIT_LAST       = UINT64_C(1) << (32 + _RLIMIT_MAX - 1),
-        _SETTINGS_MASK_ALL        = (UINT64_C(1) << (32 + _RLIMIT_MAX)) -1,
+        SETTING_SUPPRESS_SYNC     = UINT64_C(1) << 32,
+        SETTING_RLIMIT_FIRST      = UINT64_C(1) << 33, /* we define one bit per resource limit here */
+        SETTING_RLIMIT_LAST       = UINT64_C(1) << (33 + _RLIMIT_MAX - 1),
+        _SETTINGS_MASK_ALL        = (UINT64_C(1) << (33 + _RLIMIT_MAX)) -1,
         _SETTING_FORCE_ENUM_WIDTH = UINT64_MAX
 } SettingsMask;
 
@@ -189,6 +190,7 @@ typedef struct Settings {
         LinkJournal link_journal;
         bool link_journal_try;
         TimezoneMode timezone;
+        bool suppress_sync;
 
         /* [Files] */
         int read_only;
index 7b767fb2963696e02c4fd75f0e592c4e028e9061..7dbc84369b54e44826efb781ddfc0580b18e6eac 100644 (file)
@@ -229,6 +229,7 @@ static ConsoleMode arg_console_mode = _CONSOLE_MODE_INVALID;
 static Credential *arg_credentials = NULL;
 static size_t arg_n_credentials = 0;
 static char **arg_bind_user = NULL;
+static bool arg_suppress_sync = false;
 
 STATIC_DESTRUCTOR_REGISTER(arg_directory, freep);
 STATIC_DESTRUCTOR_REGISTER(arg_template, freep);
@@ -342,7 +343,9 @@ static int help(void) {
                "  -E --setenv=NAME[=VALUE]  Pass an environment variable to PID 1\n"
                "  -u --user=USER            Run the command under specified user or UID\n"
                "     --kill-signal=SIGNAL   Select signal to use for shutting down PID 1\n"
-               "     --notify-ready=BOOLEAN Receive notifications from the child init process\n\n"
+               "     --notify-ready=BOOLEAN Receive notifications from the child init process\n"
+               "     --suppress-sync=BOOLEAN\n"
+               "                            Suppress any form of disk data synchronization\n\n"
                "%3$sSystem Identity:%4$s\n"
                "  -M --machine=NAME         Set the machine name for the container\n"
                "     --hostname=NAME        Override the hostname for the container\n"
@@ -654,6 +657,12 @@ static int parse_environment(void) {
         if (e)
                 arg_container_service_name = e;
 
+        r = getenv_bool("SYSTEMD_SUPPRESS_SYNC");
+        if (r >= 0)
+                arg_suppress_sync = r;
+        else if (r != -ENXIO)
+                log_debug_errno(r, "Failed to parse $SYSTEMD_SUPPRESS_SYNC, ignoring: %m");
+
         return detect_unified_cgroup_hierarchy_from_environment();
 }
 
@@ -713,6 +722,7 @@ static int parse_argv(int argc, char *argv[]) {
                 ARG_SET_CREDENTIAL,
                 ARG_LOAD_CREDENTIAL,
                 ARG_BIND_USER,
+                ARG_SUPPRESS_SYNC,
         };
 
         static const struct option options[] = {
@@ -785,6 +795,7 @@ static int parse_argv(int argc, char *argv[]) {
                 { "set-credential",         required_argument, NULL, ARG_SET_CREDENTIAL         },
                 { "load-credential",        required_argument, NULL, ARG_LOAD_CREDENTIAL        },
                 { "bind-user",              required_argument, NULL, ARG_BIND_USER              },
+                { "suppress-sync",          required_argument, NULL, ARG_SUPPRESS_SYNC          },
                 {}
         };
 
@@ -1668,6 +1679,14 @@ static int parse_argv(int argc, char *argv[]) {
                         arg_settings_mask |= SETTING_BIND_USER;
                         break;
 
+                case ARG_SUPPRESS_SYNC:
+                        r = parse_boolean_argument("--suppress-sync=", optarg, &arg_suppress_sync);
+                        if (r < 0)
+                                return r;
+
+                        arg_settings_mask |= SETTING_SUPPRESS_SYNC;
+                        break;
+
                 case '?':
                         return -EINVAL;
 
@@ -3385,6 +3404,12 @@ static int inner_child(
                         return r;
         }
 
+        if (arg_suppress_sync) {
+                r = seccomp_suppress_sync();
+                if (r < 0)
+                        log_debug_errno(r, "Failed to install sync() suppression seccomp filter, ignoring: %m");
+        }
+
 #if HAVE_SELINUX
         if (arg_selinux_context)
                 if (setexeccon(arg_selinux_context) < 0)
@@ -4552,6 +4577,9 @@ static int merge_settings(Settings *settings, const char *path) {
                         arg_console_mode = settings->console_mode;
         }
 
+        if ((arg_settings_mask & SETTING_SUPPRESS_SYNC) == 0)
+                arg_suppress_sync = settings->suppress_sync;
+
         /* The following properties can only be set through the OCI settings logic, not from the command line, hence we
          * don't consult arg_settings_mask for them. */
 
index 31d6b542c03f6f37745a0962cde0f4f4e9aff2a5..ff90af538b390a144b44f48233de98164c5fa97c 100644 (file)
@@ -2205,3 +2205,98 @@ int parse_syscall_and_errno(const char *in, char **name, int *error) {
 
         return 0;
 }
+
+static int block_open_flag(scmp_filter_ctx seccomp, int flag) {
+        bool any = false;
+        int r;
+
+        /* Makes open() and openat() return EINVAL when all bits of the specified flag (O_SYNC or so) are
+         * set, in the hope the client code will retry without it then. Returns 0 if any rule was installed. */
+
+#if SCMP_SYS(open) > 0
+        r = seccomp_rule_add_exact(
+                        seccomp,
+                        SCMP_ACT_ERRNO(EINVAL),
+                        SCMP_SYS(open),
+                        1,
+                        SCMP_A1(SCMP_CMP_MASKED_EQ, flag, flag));
+        if (r < 0)
+                log_debug_errno(r, "Failed to add filter for open: %m");
+        else
+                any = true;
+#endif
+
+        r = seccomp_rule_add_exact(
+                        seccomp,
+                        SCMP_ACT_ERRNO(EINVAL),
+                        SCMP_SYS(openat),
+                        1,
+                        SCMP_A2(SCMP_CMP_MASKED_EQ, flag, flag));
+        if (r < 0)
+                log_debug_errno(r, "Failed to add filter for openat: %m");
+        else
+                any = true;
+
+#if defined(__SNR_openat2)
+        /* openat2() can't be filtered sensibly (see above), hence refuse it entirely with ENOSYS. */
+        r = seccomp_rule_add_exact(
+                        seccomp,
+                        SCMP_ACT_ERRNO(ENOSYS),
+                        SCMP_SYS(openat2),
+                        0);
+        if (r < 0)
+                log_debug_errno(r, "Failed to add filter for openat2: %m");
+        else
+                any = true;
+#endif
+
+        return any ? 0 : r;
+}
+
+int seccomp_suppress_sync(void) {
+        uint32_t arch;
+        int r;
+
+        /* Turns the @sync system calls into successful NOPs, similar to SystemCallFilter=~@sync:0, but
+         * simpler to use, separately manageable, and with O_SYNC/O_DSYNC refused on open() and friends. */
+
+        SECCOMP_FOREACH_LOCAL_ARCH(arch) {
+                _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
+                const char *c;
+
+                r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
+                if (r < 0)
+                        return r;
+
+                NULSTR_FOREACH(c, syscall_filter_sets[SYSCALL_FILTER_SET_SYNC].value) {
+                        int id;
+
+                        id = seccomp_syscall_resolve_name(c);
+                        if (id == __NR_SCMP_ERROR) {
+                                log_debug("System call %s is not known, ignoring.", c);
+                                continue;
+                        }
+
+                        r = seccomp_rule_add_exact(
+                                        seccomp,
+                                        SCMP_ACT_ERRNO(0), /* success → we want this to be a NOP after all */
+                                        id,
+                                        0);
+                        if (r < 0)
+                                log_debug_errno(r, "Failed to add filter for system call %s, ignoring: %m", c);
+                }
+
+                (void) block_open_flag(seccomp, O_SYNC);
+#if O_DSYNC != O_SYNC
+                (void) block_open_flag(seccomp, O_DSYNC);
+#endif
+
+                r = seccomp_load(seccomp);
+                if (ERRNO_IS_SECCOMP_FATAL(r))
+                        return r;
+                if (r < 0)
+                        log_debug_errno(r, "Failed to apply sync() suppression for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
+        }
+
+        return 0;
+}
index b3d25c9f3f3e5d2c7ca50b9768fe4efa13324779..4f4bc48431dea62ee7e904afa9119085a8a5cf80 100644 (file)
@@ -150,3 +150,5 @@ static inline const char *seccomp_errno_or_action_to_string(int num) {
 }
 
 int parse_syscall_and_errno(const char *in, char **name, int *error);
+
+int seccomp_suppress_sync(void);