personality of the host system's kernel.</para></listitem>
</varlistentry>
+ <varlistentry>
+ <term><varname>LockPersonality=</varname></term>
+
+ <listitem><para>Locks down the <citerefentry
+ project='man-pages'><refentrytitle>personality</refentrytitle><manvolnum>2</manvolnum></citerefentry> system
+ call so that the kernel execution domain may not be changed from the default or the personality selected with
+ the <varname>Personality=</varname> directive. This may be useful to improve security, because odd personality
+ emulations may be poorly tested and a source of vulnerabilities. If running in user mode, or in system mode but
+ without the <constant>CAP_SYS_ADMIN</constant> capability (e.g. setting <varname>User=</varname>),
+ <varname>NoNewPrivileges=yes</varname> is implied.</para></listitem>
+ </varlistentry>
+
<varlistentry>
<term><varname>RuntimeDirectory=</varname></term>
SD_BUS_PROPERTY("SystemCallArchitectures", "as", property_get_syscall_archs, 0, SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("SystemCallErrorNumber", "i", property_get_syscall_errno, 0, SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("Personality", "s", property_get_personality, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("LockPersonality", "b", bus_property_get_bool, offsetof(ExecContext, lock_personality), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("RestrictAddressFamilies", "(bas)", property_get_address_families, 0, SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("RuntimeDirectoryPreserve", "s", property_get_exec_preserve_mode, offsetof(ExecContext, runtime_directory_preserve_mode), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("RuntimeDirectoryMode", "u", bus_property_get_mode, offsetof(ExecContext, directories[EXEC_DIRECTORY_RUNTIME].mode), SD_BUS_VTABLE_PROPERTY_CONST),
c->protect_kernel_modules ||
c->private_devices ||
context_has_syscall_filters(c) ||
- !set_isempty(c->syscall_archs);
+ !set_isempty(c->syscall_archs) ||
+ c->lock_personality;
}
#ifdef HAVE_SECCOMP
return seccomp_restrict_namespaces(c->restrict_namespaces);
}
+/* Applies the LockPersonality= setting of an execution context.
+ *
+ * Returns 0 without installing anything when the context does not request the lock, or when
+ * skip_seccomp_unavailable() says the setting is to be skipped for this unit. Otherwise returns
+ * whatever seccomp_lock_personality() returns for the selected personality. */
+static int apply_lock_personality(const Unit* u, const ExecContext *c) {
+        unsigned long personality;
+
+        assert(u);
+        assert(c);
+
+        if (!c->lock_personality)
+                return 0;
+
+        if (skip_seccomp_unavailable(u, "LockPersonality="))
+                return 0;
+
+        /* Only read from the context once the assert() above has checked the pointer. */
+        personality = c->personality;
+
+        /* If personality is not specified, use the default (Linux) */
+        if (personality == PERSONALITY_INVALID)
+                personality = PER_LINUX;
+
+        return seccomp_lock_personality(personality);
+}
+
#endif
static void do_idle_pipe_dance(int idle_pipe[4]) {
return r;
}
+ r = apply_lock_personality(unit, context);
+ if (r < 0) {
+ *exit_status = EXIT_SECCOMP;
+ *error_message = strdup("Failed to lock personalities");
+ return r;
+ }
+
/* This really should remain the last step before the execve(), to make sure our own code is unaffected
* by the filter as little as possible. */
r = apply_syscall_filter(unit, context, needs_ambient_hack);
"%sPersonality: %s\n",
prefix, strna(personality_to_string(c->personality)));
+ fprintf(f,
+ "%sLockPersonality: %s\n",
+ prefix, yes_no(c->lock_personality));
+
if (c->syscall_filter) {
#ifdef HAVE_SECCOMP
Iterator j;
bool same_pgrp;
unsigned long personality;
+ bool lock_personality;
unsigned long restrict_namespaces; /* The CLONE_NEWxyz flags permitted to the unit's processes */
$1.MemoryDenyWriteExecute, config_parse_bool, 0, offsetof($1, exec_context.memory_deny_write_execute)
$1.RestrictNamespaces, config_parse_restrict_namespaces, 0, offsetof($1, exec_context)
$1.RestrictRealtime, config_parse_bool, 0, offsetof($1, exec_context.restrict_realtime)
-$1.RestrictAddressFamilies, config_parse_address_families, 0, offsetof($1, exec_context)',
+$1.RestrictAddressFamilies, config_parse_address_families, 0, offsetof($1, exec_context)
+$1.LockPersonality, config_parse_bool, 0, offsetof($1, exec_context.lock_personality)',
`$1.SystemCallFilter, config_parse_warn_compat, DISABLED_CONFIGURATION, 0
$1.SystemCallArchitectures, config_parse_warn_compat, DISABLED_CONFIGURATION, 0
$1.SystemCallErrorNumber, config_parse_warn_compat, DISABLED_CONFIGURATION, 0
$1.MemoryDenyWriteExecute, config_parse_warn_compat, DISABLED_CONFIGURATION, 0
$1.RestrictNamespaces, config_parse_warn_compat, DISABLED_CONFIGURATION, 0
$1.RestrictRealtime, config_parse_warn_compat, DISABLED_CONFIGURATION, 0
-$1.RestrictAddressFamilies, config_parse_warn_compat, DISABLED_CONFIGURATION, 0')
+$1.RestrictAddressFamilies, config_parse_warn_compat, DISABLED_CONFIGURATION, 0
+$1.LockPersonality, config_parse_warn_compat, DISABLED_CONFIGURATION, 0')
$1.LimitCPU, config_parse_limit, RLIMIT_CPU, offsetof($1, exec_context.rlimit)
$1.LimitFSIZE, config_parse_limit, RLIMIT_FSIZE, offsetof($1, exec_context.rlimit)
$1.LimitDATA, config_parse_limit, RLIMIT_DATA, offsetof($1, exec_context.rlimit)
#include "alloc-util.h"
#include "macro.h"
#include "nsflags.h"
+#include "process-util.h"
#include "seccomp-util.h"
#include "set.h"
#include "string-util.h"
return 0;
}
+
+/* Builds and loads a seccomp filter whose default action is SCMP_ACT_ALLOW and which makes the
+ * personality() system call fail with EPERM whenever its first argument differs from the
+ * 'personality' value passed in here.
+ *
+ * Returns -ENOMEM if the filter context cannot be created, the negative result of
+ * seccomp_rule_add_exact() if adding the rule fails, and otherwise the result of seccomp_load(). */
+int seccomp_lock_personality(unsigned long personality) {
+        _cleanup_(seccomp_releasep) scmp_filter_ctx filter = NULL;
+        int rc;
+
+        filter = seccomp_init(SCMP_ACT_ALLOW);
+        if (filter == NULL)
+                return -ENOMEM;
+
+        /* A single rule with one argument comparison: argument 0 != requested personality. */
+        rc = seccomp_rule_add_exact(
+                        filter,
+                        SCMP_ACT_ERRNO(EPERM),
+                        SCMP_SYS(personality),
+                        1,
+                        SCMP_A0(SCMP_CMP_NE, personality));
+        if (rc >= 0)
+                rc = seccomp_load(filter);
+
+        return rc;
+}
int seccomp_restrict_address_families(Set *address_families, bool whitelist);
int seccomp_restrict_realtime(void);
int seccomp_memory_deny_write_execute(void);
+int seccomp_lock_personality(unsigned long personality);
extern const uint32_t seccomp_local_archs[];
#include <stdlib.h>
#include <sys/eventfd.h>
#include <sys/mman.h>
+#include <sys/personality.h>
#include <sys/poll.h>
#include <sys/shm.h>
#include <sys/types.h>
assert_se(wait_for_terminate_and_warn("syscallrawseccomp", pid, true) == EXIT_SUCCESS);
}
+/* Checks seccomp_lock_personality() in a forked child: after locking to PER_LINUX, selecting
+ * PER_LINUX must still work, while every other value tried must fail with EPERM.
+ *
+ * The test is silently skipped when seccomp is unavailable or when not running with effective
+ * UID 0. */
+static void test_lock_personality(void) {
+        pid_t pid;
+
+        if (!is_seccomp_available() || geteuid() != 0)
+                return;
+
+        pid = fork();
+        assert_se(pid >= 0);
+
+        if (pid > 0) {
+                /* Parent: the child reports the outcome through its exit status. */
+                assert_se(wait_for_terminate_and_warn("lockpersonalityseccomp", pid, true) == EXIT_SUCCESS);
+                return;
+        }
+
+        /* Child: install the filter, then probe it. */
+        assert_se(seccomp_lock_personality(PER_LINUX) >= 0);
+
+        /* NOTE(review): this check and the final one expect personality(PER_LINUX) to return PER_LINUX,
+         * which presumably requires the test to be started in the PER_LINUX execution domain — confirm. */
+        assert_se(personality(PER_LINUX) == PER_LINUX);
+
+        /* PER_LINUX combined with additional flag bits no longer equals the locked value. */
+        assert_se(personality(PER_LINUX | ADDR_NO_RANDOMIZE) == -1 && errno == EPERM);
+        assert_se(personality(PER_LINUX | MMAP_PAGE_ZERO) == -1 && errno == EPERM);
+        assert_se(personality(PER_LINUX | ADDR_COMPAT_LAYOUT) == -1 && errno == EPERM);
+        assert_se(personality(PER_LINUX | READ_IMPLIES_EXEC) == -1 && errno == EPERM);
+
+        /* Other execution domains, plus an arbitrary value. */
+        assert_se(personality(PER_LINUX_32BIT) == -1 && errno == EPERM);
+        assert_se(personality(PER_SVR4) == -1 && errno == EPERM);
+        assert_se(personality(PER_BSD) == -1 && errno == EPERM);
+        assert_se(personality(PER_LINUX32) == -1 && errno == EPERM);
+        assert_se(personality(PER_LINUX32_3GB) == -1 && errno == EPERM);
+        assert_se(personality(PER_UW7) == -1 && errno == EPERM);
+        assert_se(personality(0x42) == -1 && errno == EPERM);
+
+        assert_se(personality(PERSONALITY_INVALID) == -1 && errno == EPERM); /* maybe remove this later */
+
+        /* The locked personality itself must remain selectable after all the rejected attempts. */
+        assert_se(personality(PER_LINUX) == PER_LINUX);
+        _exit(EXIT_SUCCESS);
+}
+
int main(int argc, char *argv[]) {
log_set_max_level(LOG_DEBUG);
test_memory_deny_write_execute_shmat();
test_restrict_archs();
test_load_syscall_filter_set_raw();
+ test_lock_personality();
return 0;
}