✓ TimerSlackNSec=
✓ NoNewPrivileges=
✓ KeyringMode=
+✓ ProtectProc=
+✓ ProcSubset=
✓ SystemCallFilter=
✓ SystemCallArchitectures=
✓ SystemCallErrorNumber=
static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_exec_utmp_mode, exec_utmp_mode, ExecUtmpMode);
static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_exec_preserve_mode, exec_preserve_mode, ExecPreserveMode);
static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_exec_keyring_mode, exec_keyring_mode, ExecKeyringMode);
+static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_protect_proc, protect_proc, ProtectProc);
+static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_proc_subset, proc_subset, ProcSubset);
static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_protect_home, protect_home, ProtectHome);
static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_protect_system, protect_system, ProtectSystem);
static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_personality, personality, unsigned long);
SD_BUS_PROPERTY("TemporaryFileSystem", "a(ss)", property_get_temporary_filesystems, 0, SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("MountAPIVFS", "b", bus_property_get_bool, offsetof(ExecContext, mount_apivfs), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("KeyringMode", "s", property_get_exec_keyring_mode, offsetof(ExecContext, keyring_mode), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("ProtectProc", "s", property_get_protect_proc, offsetof(ExecContext, protect_proc), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("ProcSubset", "s", property_get_proc_subset, offsetof(ExecContext, proc_subset), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("ProtectHostname", "b", bus_property_get_bool, offsetof(ExecContext, protect_hostname), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("NetworkNamespacePath", "s", NULL, offsetof(ExecContext, network_namespace_path), SD_BUS_VTABLE_PROPERTY_CONST),
static BUS_DEFINE_SET_TRANSIENT_PARSE(protect_system, ProtectSystem, protect_system_from_string);
static BUS_DEFINE_SET_TRANSIENT_PARSE(protect_home, ProtectHome, protect_home_from_string);
static BUS_DEFINE_SET_TRANSIENT_PARSE(keyring_mode, ExecKeyringMode, exec_keyring_mode_from_string);
+static BUS_DEFINE_SET_TRANSIENT_PARSE(protect_proc, ProtectProc, protect_proc_from_string);
+static BUS_DEFINE_SET_TRANSIENT_PARSE(proc_subset, ProcSubset, proc_subset_from_string);
static BUS_DEFINE_SET_TRANSIENT_PARSE(preserve_mode, ExecPreserveMode, exec_preserve_mode_from_string);
static BUS_DEFINE_SET_TRANSIENT_PARSE_PTR(personality, unsigned long, parse_personality);
static BUS_DEFINE_SET_TRANSIENT_TO_STRING_ALLOC(secure_bits, "i", int32_t, int, "%" PRIi32, secure_bits_to_string_alloc_with_check);
if (streq(name, "KeyringMode"))
return bus_set_transient_keyring_mode(u, name, &c->keyring_mode, message, flags, error);
+ if (streq(name, "ProtectProc"))
+ return bus_set_transient_protect_proc(u, name, &c->protect_proc, message, flags, error);
+
+ if (streq(name, "ProcSubset"))
+ return bus_set_transient_proc_subset(u, name, &c->proc_subset, message, flags, error);
+
if (streq(name, "RuntimeDirectoryPreserve"))
return bus_set_transient_preserve_mode(u, name, &c->runtime_directory_preserve_mode, message, flags, error);
context->protect_kernel_tunables ||
context->protect_kernel_modules ||
context->protect_kernel_logs ||
- context->protect_control_groups)
+ context->protect_control_groups ||
+ context->protect_proc != PROTECT_PROC_DEFAULT ||
+ context->proc_subset != PROC_SUBSET_ALL)
return true;
if (context->root_directory) {
.private_mounts = context->private_mounts,
.protect_home = context->protect_home,
.protect_system = context->protect_system,
+ .protect_proc = context->protect_proc,
+ .proc_subset = context->proc_subset,
};
} else if (!context->dynamic_user && root_dir)
/*
"%sRestrictRealtime: %s\n"
"%sRestrictSUIDSGID: %s\n"
"%sKeyringMode: %s\n"
- "%sProtectHostname: %s\n",
+ "%sProtectHostname: %s\n"
+ "%sProtectProc: %s\n"
+ "%sProcSubset: %s\n",
prefix, c->umask,
prefix, c->working_directory ? c->working_directory : "/",
prefix, c->root_directory ? c->root_directory : "/",
prefix, yes_no(c->restrict_realtime),
prefix, yes_no(c->restrict_suid_sgid),
prefix, exec_keyring_mode_to_string(c->keyring_mode),
- prefix, yes_no(c->protect_hostname));
+ prefix, yes_no(c->protect_hostname),
+ prefix, protect_proc_to_string(c->protect_proc),
+ prefix, proc_subset_to_string(c->proc_subset));
if (c->root_image)
fprintf(f, "%sRootImage: %s\n", prefix, c->root_image);
char *log_namespace;
+ ProtectProc protect_proc; /* hidepid= */
+ ProcSubset proc_subset; /* subset= */
+
bool private_tmp;
bool private_network;
bool private_devices;
$1.TimerSlackNSec, config_parse_nsec, 0, offsetof($1, exec_context.timer_slack_nsec)
$1.NoNewPrivileges, config_parse_bool, 0, offsetof($1, exec_context.no_new_privileges)
$1.KeyringMode, config_parse_exec_keyring_mode, 0, offsetof($1, exec_context.keyring_mode)
+$1.ProtectProc, config_parse_protect_proc, 0, offsetof($1, exec_context.protect_proc)
+$1.ProcSubset, config_parse_proc_subset, 0, offsetof($1, exec_context.proc_subset)
m4_ifdef(`HAVE_SECCOMP',
`$1.SystemCallFilter, config_parse_syscall_filter, 0, offsetof($1, exec_context)
$1.SystemCallArchitectures, config_parse_syscall_archs, 0, offsetof($1, exec_context.syscall_archs)
DEFINE_CONFIG_PARSE_ENUM(config_parse_collect_mode, collect_mode, CollectMode, "Failed to parse garbage collection mode");
DEFINE_CONFIG_PARSE_ENUM(config_parse_device_policy, cgroup_device_policy, CGroupDevicePolicy, "Failed to parse device policy");
DEFINE_CONFIG_PARSE_ENUM(config_parse_exec_keyring_mode, exec_keyring_mode, ExecKeyringMode, "Failed to parse keyring mode");
+DEFINE_CONFIG_PARSE_ENUM(config_parse_protect_proc, protect_proc, ProtectProc, "Failed to parse /proc/ protection mode");
+DEFINE_CONFIG_PARSE_ENUM(config_parse_proc_subset, proc_subset, ProcSubset, "Failed to parse /proc/ subset mode");
DEFINE_CONFIG_PARSE_ENUM(config_parse_exec_utmp_mode, exec_utmp_mode, ExecUtmpMode, "Failed to parse utmp mode");
DEFINE_CONFIG_PARSE_ENUM(config_parse_job_mode, job_mode, JobMode, "Failed to parse job mode");
DEFINE_CONFIG_PARSE_ENUM(config_parse_notify_access, notify_access, NotifyAccess, "Failed to parse notify access specifier");
CONFIG_PARSER_PROTOTYPE(config_parse_restrict_namespaces);
CONFIG_PARSER_PROTOTYPE(config_parse_bind_paths);
CONFIG_PARSER_PROTOTYPE(config_parse_exec_keyring_mode);
+CONFIG_PARSER_PROTOTYPE(config_parse_protect_proc);
+CONFIG_PARSER_PROTOTYPE(config_parse_proc_subset);
CONFIG_PARSER_PROTOTYPE(config_parse_job_timeout_sec);
CONFIG_PARSER_PROTOTYPE(config_parse_job_running_timeout_sec);
CONFIG_PARSER_PROTOTYPE(config_parse_log_extra_fields);
{ "/proc/latency_stats", READONLY, true },
{ "/proc/mtrr", READONLY, true },
{ "/proc/scsi", READONLY, true },
- { "/proc/sys", READONLY, false },
+ { "/proc/sys", READONLY, true },
{ "/proc/sysrq-trigger", READONLY, true },
{ "/proc/timer_stats", READONLY, true },
{ "/sys", READONLY, false },
return 1;
}
-static int mount_procfs(const MountEntry *m) {
- int r;
+static int mount_procfs(const MountEntry *m, const NamespaceInfo *ns_info) {
+ const char *entry_path;
assert(m);
+ assert(ns_info);
- (void) mkdir_p_label(mount_entry_path(m), 0755);
+ entry_path = mount_entry_path(m);
- r = path_is_mount_point(mount_entry_path(m), NULL, 0);
- if (r < 0)
- return log_debug_errno(r, "Unable to determine whether /proc is already mounted: %m");
- if (r > 0) /* make this a NOP if /proc is already a mount point */
- return 0;
+ /* Mount a new instance, so that we get the one that matches our user namespace, if we are running in
+ * one, i.e. we don't reuse existing mounts here under any condition: we want a new instance owned by
+ * our user namespace and with our hidepid= settings applied. Hence, let's get rid of everything
+ * mounted on /proc/ first.
+ *
+ * Creating the directory and unmounting whatever is currently there are both best-effort: their
+ * return values are explicitly discarded via the (void) casts below.
+ *
+ * Returns 1 once a procfs instance has been mounted (with or without options), -ENOMEM if the option
+ * string cannot be allocated, and otherwise the result of log_debug_errno() for the failed mount()
+ * (presumably the negated errno -- confirm against the logging macros). */
- /* Mount a new instance, so that we get the one that matches our user namespace, if we are running in one */
- if (mount("proc", mount_entry_path(m), "proc", MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL) < 0)
- return log_debug_errno(errno, "Failed to mount %s: %m", mount_entry_path(m));
+ (void) mkdir_p_label(entry_path, 0755);
+ (void) umount_recursive(entry_path, 0);
+
+ if (ns_info->protect_proc != PROTECT_PROC_DEFAULT ||
+ ns_info->proc_subset != PROC_SUBSET_ALL) {
+ _cleanup_free_ char *opts = NULL;
+
+ /* Starting with kernel 5.8 procfs' hidepid= logic is truly per-instance (previously it
+ * pretended to be per-instance but actually was per-namespace), hence let's make use of it
+ * if requested. To make sure this logic succeeds only on kernels where hidepid= is
+ * per-instance, we'll exclusively use the textual value for hidepid=, since support was
+ * added in the same commit: if it's supported it is thus also per-instance.
+ *
+ * When only subset= was requested (protect_proc is still the default), "hidepid=off" is
+ * passed explicitly, so the option string always carries a textual hidepid= value. */
+
+ opts = strjoin("hidepid=",
+ ns_info->protect_proc == PROTECT_PROC_DEFAULT ? "off" :
+ protect_proc_to_string(ns_info->protect_proc),
+ ns_info->proc_subset == PROC_SUBSET_PID ? ",subset=pid" : "");
+ if (!opts)
+ return -ENOMEM;
+
+ if (mount("proc", entry_path, "proc", MS_NOSUID|MS_NOEXEC|MS_NODEV, opts) < 0) {
+ if (errno != EINVAL)
+ return log_debug_errno(errno, "Failed to mount %s (options=%s): %m", mount_entry_path(m), opts);
+
+ /* If this failed with EINVAL then this likely means the textual hidepid= stuff is
+ * not supported by the kernel, and thus the per-instance hidepid= neither, which
+ * means we really don't want to use it, since it would affect our host's /proc
+ * mount. Hence let's gracefully fall back to a classic, unrestricted version: the
+ * mount() call after this block passes no options at all, so neither hidepid= nor
+ * subset= is applied in that case. */
+ } else
+ return 1;
+ }
+
+ if (mount("proc", entry_path, "proc", MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL) < 0)
+ return log_debug_errno(errno, "Failed to mount %s (no options): %m", mount_entry_path(m));
return 1;
}
static int apply_mount(
const char *root_directory,
- MountEntry *m) {
+ MountEntry *m,
+ const NamespaceInfo *ns_info) {
_cleanup_free_ char *inaccessible = NULL;
bool rbind = true, make = false;
int r;
assert(m);
+ assert(ns_info);
log_debug("Applying namespace mount on %s", mount_entry_path(m));
return mount_sysfs(m);
case PROCFS:
- return mount_procfs(m);
+ return mount_procfs(m, ns_info);
case MOUNT_IMAGES:
return mount_images(m);
return ns_info->mount_apivfs ||
ns_info->protect_control_groups ||
- ns_info->protect_kernel_tunables;
+ ns_info->protect_kernel_tunables ||
+ ns_info->protect_proc != PROTECT_PROC_DEFAULT ||
+ ns_info->proc_subset != PROC_SUBSET_ALL;
}
static size_t namespace_calculate_mounts(
break;
}
- r = apply_mount(root, m);
+ r = apply_mount(root, m, ns_info);
if (r < 0) {
if (error_path && mount_entry_path(m))
*error_path = strdup(mount_entry_path(m));
};
DEFINE_STRING_TABLE_LOOKUP(namespace_type, NamespaceType);
+
+static const char* const protect_proc_table[_PROTECT_PROC_MAX] = { /* Names of the ProtectProc values. Apart from
+ * being used when parsing and printing the setting, mount_procfs() appends the result of
+ * protect_proc_to_string() directly to "hidepid=" for every non-default value, so these strings
+ * also end up in the procfs mount options. */
+ [PROTECT_PROC_DEFAULT] = "default",
+ [PROTECT_PROC_NOACCESS] = "noaccess",
+ [PROTECT_PROC_INVISIBLE] = "invisible",
+ [PROTECT_PROC_PTRACEABLE] = "ptraceable",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(protect_proc, ProtectProc); /* presumably generates protect_proc_to_string()/protect_proc_from_string() -- confirm against the macro definition */
+
+static const char* const proc_subset_table[_PROC_SUBSET_MAX] = { /* Names of the ProcSubset values, used when parsing
+ * and printing the setting. Unlike the ProtectProc names these are not pasted into the mount
+ * options: mount_procfs() hard-codes ",subset=pid" for PROC_SUBSET_PID. */
+ [PROC_SUBSET_ALL] = "all",
+ [PROC_SUBSET_PID] = "pid",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(proc_subset, ProcSubset); /* presumably generates proc_subset_to_string()/proc_subset_from_string() -- confirm against the macro definition */
_PROTECT_SYSTEM_INVALID = -1
} ProtectSystem;
+typedef enum ProtectProc { /* Values of the ProtectProc= setting; mount_procfs() turns them into the procfs hidepid= mount option */
+ PROTECT_PROC_DEFAULT, /* no restriction requested; passed as hidepid=off only when a subset= option is set */
+ PROTECT_PROC_NOACCESS, /* hidepid=noaccess */
+ PROTECT_PROC_INVISIBLE, /* hidepid=invisible */
+ PROTECT_PROC_PTRACEABLE, /* hidepid=ptraceable */
+ _PROTECT_PROC_MAX, /* number of valid values; sizes protect_proc_table */
+ _PROTECT_PROC_INVALID = -1,
+} ProtectProc;
+
+typedef enum ProcSubset { /* Values of the ProcSubset= setting; mount_procfs() turns them into the procfs subset= mount option */
+ PROC_SUBSET_ALL, /* no subset= option is added to the mount options */
+ PROC_SUBSET_PID, /* subset=pid */
+ _PROC_SUBSET_MAX, /* number of valid values; sizes proc_subset_table */
+ _PROC_SUBSET_INVALID = -1,
+} ProcSubset;
+
struct NamespaceInfo {
bool ignore_protect_paths:1;
bool private_dev:1;
bool protect_hostname:1;
ProtectHome protect_home;
ProtectSystem protect_system;
+ ProtectProc protect_proc;
+ ProcSubset proc_subset;
};
struct BindMount {
const char* protect_system_to_string(ProtectSystem p) _const_;
ProtectSystem protect_system_from_string(const char *s) _pure_;
+const char* protect_proc_to_string(ProtectProc i) _const_;
+ProtectProc protect_proc_from_string(const char *s) _pure_;
+
+const char* proc_subset_to_string(ProcSubset i) _const_;
+ProcSubset proc_subset_from_string(const char *s) _pure_;
+
void bind_mount_free_many(BindMount *b, size_t n);
int bind_mount_add(BindMount **b, size_t *n, const BindMount *item);
"RuntimeDirectoryPreserve",
"Personality",
"KeyringMode",
+ "ProtectProc",
+ "ProcSubset",
"NetworkNamespacePath",
"LogNamespace"))
return bus_append_string(m, field, eq);
.protect_control_groups = true,
.protect_kernel_tunables = true,
.protect_kernel_modules = true,
+ .protect_proc = PROTECT_PROC_NOACCESS,
+ .proc_subset = PROC_SUBSET_PID,
};
char *root_directory;
KEYMAP_TOGGLE=
KeepFree=
KeyringMode=
+ProtectProc=
+ProcSubset=
KillExcludeUsers=
KillOnlyUsers=
KillSignal=