* `$SYSTEMD_NSPAWN_TMPFS_TMP=0` — if set, do not overmount `/tmp/` in the
container with a tmpfs, but leave the directory from the image in place.
+* `$SYSTEMD_SUPPRESS_SYNC=1` — if set, all disk synchronization syscalls issued
+ by the container payload (e.g. `sync()`, `fsync()`, `syncfs()`, …) are turned
+ into no-ops that report success, and the `O_SYNC`/`O_DSYNC` flags are made
+ unavailable to `open()` and friends. This is equivalent to passing
+ `--suppress-sync=yes` on the `systemd-nspawn` command line.
+
`systemd-logind`:
* `$SYSTEMD_BYPASS_HIBERNATION_MEMORY_CHECK=1` — if set, report that
before sending its own to systemd. For more details about notifications
see <citerefentry><refentrytitle>sd_notify</refentrytitle><manvolnum>3</manvolnum></citerefentry>.</para></listitem>
</varlistentry>
+
+ <varlistentry>
+ <term><option>--suppress-sync=</option></term>
+
+ <listitem><para>Expects a boolean argument. If true, turns off any form of on-disk file system
+ synchronization for the container payload. This means all system calls such as <citerefentry
+ project='man-pages'><refentrytitle>sync</refentrytitle><manvolnum>2</manvolnum></citerefentry>,
+ <function>fsync()</function>, <function>syncfs()</function>, … will succeed without doing anything, and the
+ <constant>O_SYNC</constant>/<constant>O_DSYNC</constant> flags to <citerefentry
+ project='man-pages'><refentrytitle>open</refentrytitle><manvolnum>2</manvolnum></citerefentry> and
+ related calls will be made unavailable. This is potentially dangerous, as assumed data integrity
+ guarantees to the container payload are not actually enforced (i.e. data assumed to have been written
+ to disk might be lost if the system is shut down abnormally). However, this can dramatically improve
+ container runtime performance – as long as these guarantees are not required or desirable, for
+ example because any data written by the container is of temporary, redundant nature, or just an
+ intermediary artifact that will be further processed and finalized by a later step in a
+ pipeline. Defaults to false.</para></listitem>
+ </varlistentry>
</variablelist>
</refsect2><refsect2>
details.</para></listitem>
</varlistentry>
+ <varlistentry>
+ <term><varname>SuppressSync=</varname></term>
+
+ <listitem><para>Configures whether to suppress disk synchronization for the container payload. This
+ is equivalent to the <option>--suppress-sync=</option> command line switch, and takes the same
+ parameter. See
+ <citerefentry><refentrytitle>systemd-nspawn</refentrytitle><manvolnum>1</manvolnum></citerefentry>
+ for details.</para></listitem>
+ </varlistentry>
+
</variablelist>
</refsect1>
local -A OPTS=(
[STANDALONE]='-h --help --version --private-network -b --boot --read-only -q --quiet --share-system
- --keep-unit -n --network-veth -j -x --ephemeral -a --as-pid2 -U'
+ --keep-unit -n --network-veth -j -x --ephemeral -a --as-pid2 -U --suppress-sync=yes'
[ARG]='-D --directory -u --user --uuid --capability --drop-capability --link-journal --bind --bind-ro
-M --machine -S --slice -E --setenv -Z --selinux-context -L --selinux-apifs-context
--register --network-interface --network-bridge --personality -i --image --tmpfs
Exec.ResolvConf, config_parse_resolv_conf, 0, offsetof(Settings, resolv_conf)
Exec.LinkJournal, config_parse_link_journal, 0, 0
Exec.Timezone, config_parse_timezone, 0, offsetof(Settings, timezone)
+Exec.SuppressSync, config_parse_bool, 0, offsetof(Settings, suppress_sync)
Files.ReadOnly, config_parse_tristate, 0, offsetof(Settings, read_only)
Files.Volatile, config_parse_volatile_mode, 0, offsetof(Settings, volatile_mode)
Files.Bind, config_parse_bind, 0, 0
SETTING_CONSOLE_MODE = UINT64_C(1) << 29,
SETTING_CREDENTIALS = UINT64_C(1) << 30,
SETTING_BIND_USER = UINT64_C(1) << 31,
- SETTING_RLIMIT_FIRST = UINT64_C(1) << 32, /* we define one bit per resource limit here */
- SETTING_RLIMIT_LAST = UINT64_C(1) << (32 + _RLIMIT_MAX - 1),
- _SETTINGS_MASK_ALL = (UINT64_C(1) << (32 + _RLIMIT_MAX)) -1,
+ SETTING_SUPPRESS_SYNC = UINT64_C(1) << 32,
+ SETTING_RLIMIT_FIRST = UINT64_C(1) << 33, /* we define one bit per resource limit here */
+ SETTING_RLIMIT_LAST = UINT64_C(1) << (33 + _RLIMIT_MAX - 1),
+ _SETTINGS_MASK_ALL = (UINT64_C(1) << (33 + _RLIMIT_MAX)) -1,
_SETTING_FORCE_ENUM_WIDTH = UINT64_MAX
} SettingsMask;
LinkJournal link_journal;
bool link_journal_try;
TimezoneMode timezone;
+ bool suppress_sync;
/* [Files] */
int read_only;
static Credential *arg_credentials = NULL;
static size_t arg_n_credentials = 0;
static char **arg_bind_user = NULL;
+static bool arg_suppress_sync = false;
STATIC_DESTRUCTOR_REGISTER(arg_directory, freep);
STATIC_DESTRUCTOR_REGISTER(arg_template, freep);
" -E --setenv=NAME[=VALUE] Pass an environment variable to PID 1\n"
" -u --user=USER Run the command under specified user or UID\n"
" --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
- " --notify-ready=BOOLEAN Receive notifications from the child init process\n\n"
+ " --notify-ready=BOOLEAN Receive notifications from the child init process\n"
+ " --suppress-sync=BOOLEAN\n"
+ " Suppress any form of disk data synchronization\n\n"
"%3$sSystem Identity:%4$s\n"
" -M --machine=NAME Set the machine name for the container\n"
" --hostname=NAME Override the hostname for the container\n"
if (e)
arg_container_service_name = e;
+ r = getenv_bool("SYSTEMD_SUPPRESS_SYNC");
+ if (r >= 0)
+ arg_suppress_sync = r;
+ else if (r != -ENXIO)
+ log_debug_errno(r, "Failed to parse $SYSTEMD_SUPPRESS_SYNC, ignoring: %m");
+
return detect_unified_cgroup_hierarchy_from_environment();
}
ARG_SET_CREDENTIAL,
ARG_LOAD_CREDENTIAL,
ARG_BIND_USER,
+ ARG_SUPPRESS_SYNC,
};
static const struct option options[] = {
{ "set-credential", required_argument, NULL, ARG_SET_CREDENTIAL },
{ "load-credential", required_argument, NULL, ARG_LOAD_CREDENTIAL },
{ "bind-user", required_argument, NULL, ARG_BIND_USER },
+ { "suppress-sync", required_argument, NULL, ARG_SUPPRESS_SYNC },
{}
};
arg_settings_mask |= SETTING_BIND_USER;
break;
+ case ARG_SUPPRESS_SYNC:
+ r = parse_boolean_argument("--suppress-sync=", optarg, &arg_suppress_sync);
+ if (r < 0)
+ return r;
+
+ arg_settings_mask |= SETTING_SUPPRESS_SYNC;
+ break;
+
case '?':
return -EINVAL;
return r;
}
+ if (arg_suppress_sync) {
+ r = seccomp_suppress_sync();
+ if (r < 0)
+ log_debug_errno(r, "Failed to install sync() suppression seccomp filter, ignoring: %m");
+ }
+
#if HAVE_SELINUX
if (arg_selinux_context)
if (setexeccon(arg_selinux_context) < 0)
arg_console_mode = settings->console_mode;
}
+ if ((arg_settings_mask & SETTING_SUPPRESS_SYNC) == 0)
+ arg_suppress_sync = settings->suppress_sync;
+
/* The following properties can only be set through the OCI settings logic, not from the command line, hence we
* don't consult arg_settings_mask for them. */
return 0;
}
+
+/* Adds rules to the given seccomp filter context that reject open()-style calls carrying the specified
+ * flag (O_SYNC or similar). Matching open()/openat() calls fail with EINVAL, in the hope that the client
+ * code then retries without the flag. Where openat2() is known it is refused entirely with ENOSYS.
+ *
+ * A failure to add an individual rule is only logged at debug level. Returns 0 if at least one rule
+ * could be added, otherwise the error of the last rule that was attempted. */
+static int block_open_flag(scmp_filter_ctx seccomp, int flag) {
+        bool installed = false;
+        int r;
+
+#if SCMP_SYS(open) > 0
+        /* open(): the flags are argument 1; the masked comparison matches when all bits of 'flag' are set. */
+        r = seccomp_rule_add_exact(
+                        seccomp,
+                        SCMP_ACT_ERRNO(EINVAL),
+                        SCMP_SYS(open),
+                        1,
+                        SCMP_A1(SCMP_CMP_MASKED_EQ, flag, flag));
+        if (r >= 0)
+                installed = true;
+        else
+                log_debug_errno(r, "Failed to add filter for open: %m");
+#endif
+
+        /* openat(): same check, but here the flags are argument 2. */
+        r = seccomp_rule_add_exact(
+                        seccomp,
+                        SCMP_ACT_ERRNO(EINVAL),
+                        SCMP_SYS(openat),
+                        1,
+                        SCMP_A2(SCMP_CMP_MASKED_EQ, flag, flag));
+        if (r >= 0)
+                installed = true;
+        else
+                log_debug_errno(r, "Failed to add filter for openat: %m");
+
+#if defined(__SNR_openat2)
+        /* openat2() can't be filtered sensibly on its flags, hence refuse the call as a whole and make
+         * it look unimplemented. */
+        r = seccomp_rule_add_exact(
+                        seccomp,
+                        SCMP_ACT_ERRNO(ENOSYS),
+                        SCMP_SYS(openat2),
+                        0);
+        if (r >= 0)
+                installed = true;
+        else
+                log_debug_errno(r, "Failed to add filter for openat2: %m");
+#endif
+
+        if (installed)
+                return 0;
+
+        return r;
+}
+
+/* Installs, for each local architecture, a seccomp filter that turns all system calls of the @sync
+ * filter set into successful no-ops and additionally rejects the O_SYNC/O_DSYNC open flags.
+ *
+ * This is mostly identical to SystemCallFilter=~@sync:0, but simpler to use, separately manageable,
+ * and it masks O_SYNC/O_DSYNC on top.
+ *
+ * Returns 0 on success. Fails with the error of seccomp_init_for_arch() if a filter context cannot be
+ * set up, or with the seccomp_load() error if that one is considered fatal. Any other failure is only
+ * logged at debug level and skipped. */
+int seccomp_suppress_sync(void) {
+        uint32_t arch;
+        int r;
+
+        SECCOMP_FOREACH_LOCAL_ARCH(arch) {
+                _cleanup_(seccomp_releasep) scmp_filter_ctx ctx = NULL;
+                const char *name;
+
+                /* Default action is "allow": only the rules added below change anything. */
+                r = seccomp_init_for_arch(&ctx, arch, SCMP_ACT_ALLOW);
+                if (r < 0)
+                        return r;
+
+                NULSTR_FOREACH(name, syscall_filter_sets[SYSCALL_FILTER_SET_SYNC].value) {
+                        int nr = seccomp_syscall_resolve_name(name);
+
+                        if (nr != __NR_SCMP_ERROR) {
+                                /* Errno 0 means "success": the call returns without doing anything. */
+                                r = seccomp_rule_add_exact(ctx, SCMP_ACT_ERRNO(0), nr, 0);
+                                if (r < 0)
+                                        log_debug_errno(r, "Failed to add filter for system call %s, ignoring: %m", name);
+                        } else
+                                log_debug("System call %s is not known, ignoring.", name);
+                }
+
+                /* Best effort: failures are already logged by the helper, hence ignore its result. */
+                (void) block_open_flag(ctx, O_SYNC);
+#if O_DSYNC != O_SYNC
+                (void) block_open_flag(ctx, O_DSYNC);
+#endif
+
+                r = seccomp_load(ctx);
+                if (ERRNO_IS_SECCOMP_FATAL(r))
+                        return r;
+                else if (r < 0)
+                        log_debug_errno(r, "Failed to apply sync() suppression for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
+        }
+
+        return 0;
+}
}
int parse_syscall_and_errno(const char *in, char **name, int *error);
+
+int seccomp_suppress_sync(void);