]> git.ipfire.org Git - thirdparty/systemd.git/commitdiff
core: introduce ProtectProc= and ProcSubset= to expose hidepid= and subset= procfs...
authorLennart Poettering <lennart@poettering.net>
Thu, 6 Aug 2020 10:51:50 +0000 (12:51 +0200)
committerLennart Poettering <lennart@poettering.net>
Mon, 24 Aug 2020 18:11:02 +0000 (20:11 +0200)
Kernel 5.8 gained a hidepid= implementation that is truly per procfs,
which allows us to mount a distinct one into every unit, with
individual hidepid= settings. Let's expose this via two new settings:
ProtectProc= (wrapping hidepid=) and ProcSubset= (wrapping subset=).

Replaces: #11670

12 files changed:
docs/TRANSIENT-SETTINGS.md
src/core/dbus-execute.c
src/core/execute.c
src/core/execute.h
src/core/load-fragment-gperf.gperf.m4
src/core/load-fragment.c
src/core/load-fragment.h
src/core/namespace.c
src/core/namespace.h
src/shared/bus-unit-util.c
src/test/test-ns.c
test/fuzz/fuzz-unit-file/directives.service

index 19944d08b804104c5ab69d13c1463fc9c01bd53a..2c0aea07da292a980a1da96bf2f139d6466e6043 100644 (file)
@@ -151,6 +151,8 @@ All execution-related settings are available for transient units.
 ✓ TimerSlackNSec=
 ✓ NoNewPrivileges=
 ✓ KeyringMode=
+✓ ProtectProc=
+✓ ProcSubset=
 ✓ SystemCallFilter=
 ✓ SystemCallArchitectures=
 ✓ SystemCallErrorNumber=
index 17d128c1b16420648f004f78b80337c3e11b6960..c96c654ff03e79deaf91e110918b2a4f36ad2781 100644 (file)
@@ -47,6 +47,8 @@ static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_exec_input, exec_input, ExecInp
 static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_exec_utmp_mode, exec_utmp_mode, ExecUtmpMode);
 static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_exec_preserve_mode, exec_preserve_mode, ExecPreserveMode);
 static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_exec_keyring_mode, exec_keyring_mode, ExecKeyringMode);
+static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_protect_proc, protect_proc, ProtectProc);
+static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_proc_subset, proc_subset, ProcSubset);
 static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_protect_home, protect_home, ProtectHome);
 static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_protect_system, protect_system, ProtectSystem);
 static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_personality, personality, unsigned long);
@@ -1016,6 +1018,8 @@ const sd_bus_vtable bus_exec_vtable[] = {
         SD_BUS_PROPERTY("TemporaryFileSystem", "a(ss)", property_get_temporary_filesystems, 0, SD_BUS_VTABLE_PROPERTY_CONST),
         SD_BUS_PROPERTY("MountAPIVFS", "b", bus_property_get_bool, offsetof(ExecContext, mount_apivfs), SD_BUS_VTABLE_PROPERTY_CONST),
         SD_BUS_PROPERTY("KeyringMode", "s", property_get_exec_keyring_mode, offsetof(ExecContext, keyring_mode), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("ProtectProc", "s", property_get_protect_proc, offsetof(ExecContext, protect_proc), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("ProcSubset", "s", property_get_proc_subset, offsetof(ExecContext, proc_subset), SD_BUS_VTABLE_PROPERTY_CONST),
         SD_BUS_PROPERTY("ProtectHostname", "b", bus_property_get_bool, offsetof(ExecContext, protect_hostname), SD_BUS_VTABLE_PROPERTY_CONST),
         SD_BUS_PROPERTY("NetworkNamespacePath", "s", NULL, offsetof(ExecContext, network_namespace_path), SD_BUS_VTABLE_PROPERTY_CONST),
 
@@ -1354,6 +1358,8 @@ static BUS_DEFINE_SET_TRANSIENT_PARSE(utmp_mode, ExecUtmpMode, exec_utmp_mode_fr
 static BUS_DEFINE_SET_TRANSIENT_PARSE(protect_system, ProtectSystem, protect_system_from_string);
 static BUS_DEFINE_SET_TRANSIENT_PARSE(protect_home, ProtectHome, protect_home_from_string);
 static BUS_DEFINE_SET_TRANSIENT_PARSE(keyring_mode, ExecKeyringMode, exec_keyring_mode_from_string);
+static BUS_DEFINE_SET_TRANSIENT_PARSE(protect_proc, ProtectProc, protect_proc_from_string);
+static BUS_DEFINE_SET_TRANSIENT_PARSE(proc_subset, ProcSubset, proc_subset_from_string);
 static BUS_DEFINE_SET_TRANSIENT_PARSE(preserve_mode, ExecPreserveMode, exec_preserve_mode_from_string);
 static BUS_DEFINE_SET_TRANSIENT_PARSE_PTR(personality, unsigned long, parse_personality);
 static BUS_DEFINE_SET_TRANSIENT_TO_STRING_ALLOC(secure_bits, "i", int32_t, int, "%" PRIi32, secure_bits_to_string_alloc_with_check);
@@ -1706,6 +1712,12 @@ int bus_exec_context_set_transient_property(
         if (streq(name, "KeyringMode"))
                 return bus_set_transient_keyring_mode(u, name, &c->keyring_mode, message, flags, error);
 
+        if (streq(name, "ProtectProc"))
+                return bus_set_transient_protect_proc(u, name, &c->protect_proc, message, flags, error);
+
+        if (streq(name, "ProcSubset"))
+                return bus_set_transient_proc_subset(u, name, &c->proc_subset, message, flags, error);
+
         if (streq(name, "RuntimeDirectoryPreserve"))
                 return bus_set_transient_preserve_mode(u, name, &c->runtime_directory_preserve_mode, message, flags, error);
 
index c3a87197f714d6fcb454e071e144499db2ca259a..d5107288a179a74d68fa35b8f9eb0f897f66566c 100644 (file)
@@ -1948,7 +1948,9 @@ static bool exec_needs_mount_namespace(
             context->protect_kernel_tunables ||
             context->protect_kernel_modules ||
             context->protect_kernel_logs ||
-            context->protect_control_groups)
+            context->protect_control_groups ||
+            context->protect_proc != PROTECT_PROC_DEFAULT ||
+            context->proc_subset != PROC_SUBSET_ALL)
                 return true;
 
         if (context->root_directory) {
@@ -2652,6 +2654,8 @@ static int apply_mount_namespace(
                         .private_mounts = context->private_mounts,
                         .protect_home = context->protect_home,
                         .protect_system = context->protect_system,
+                        .protect_proc = context->protect_proc,
+                        .proc_subset = context->proc_subset,
                 };
         } else if (!context->dynamic_user && root_dir)
                 /*
@@ -4601,7 +4605,9 @@ void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) {
                 "%sRestrictRealtime: %s\n"
                 "%sRestrictSUIDSGID: %s\n"
                 "%sKeyringMode: %s\n"
-                "%sProtectHostname: %s\n",
+                "%sProtectHostname: %s\n"
+                "%sProtectProc: %s\n"
+                "%sProcSubset: %s\n",
                 prefix, c->umask,
                 prefix, c->working_directory ? c->working_directory : "/",
                 prefix, c->root_directory ? c->root_directory : "/",
@@ -4623,7 +4629,9 @@ void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) {
                 prefix, yes_no(c->restrict_realtime),
                 prefix, yes_no(c->restrict_suid_sgid),
                 prefix, exec_keyring_mode_to_string(c->keyring_mode),
-                prefix, yes_no(c->protect_hostname));
+                prefix, yes_no(c->protect_hostname),
+                prefix, protect_proc_to_string(c->protect_proc),
+                prefix, proc_subset_to_string(c->proc_subset));
 
         if (c->root_image)
                 fprintf(f, "%sRootImage: %s\n", prefix, c->root_image);
index 631279038d324584d31e26e380da65075f9a6403..1ea7e51fd799fa70237c69b51e81afacc0812dbf 100644 (file)
@@ -260,6 +260,9 @@ struct ExecContext {
 
         char *log_namespace;
 
+        ProtectProc protect_proc;  /* hidepid= */
+        ProcSubset proc_subset;    /* subset= */
+
         bool private_tmp;
         bool private_network;
         bool private_devices;
index a191de62af37dd1bec40b79f4f83321a02486a94..7d5000c51fe723ef4efda149b51d9dce8cc4756a 100644 (file)
@@ -73,6 +73,8 @@ $1.AmbientCapabilities,          config_parse_capability_set,        0,
 $1.TimerSlackNSec,               config_parse_nsec,                  0,                             offsetof($1, exec_context.timer_slack_nsec)
 $1.NoNewPrivileges,              config_parse_bool,                  0,                             offsetof($1, exec_context.no_new_privileges)
 $1.KeyringMode,                  config_parse_exec_keyring_mode,     0,                             offsetof($1, exec_context.keyring_mode)
+$1.ProtectProc,                  config_parse_protect_proc,          0,                             offsetof($1, exec_context.protect_proc)
+$1.ProcSubset,                   config_parse_proc_subset,           0,                             offsetof($1, exec_context.proc_subset)
 m4_ifdef(`HAVE_SECCOMP',
 `$1.SystemCallFilter,            config_parse_syscall_filter,        0,                             offsetof($1, exec_context)
 $1.SystemCallArchitectures,      config_parse_syscall_archs,         0,                             offsetof($1, exec_context.syscall_archs)
index 75fed001a2fc86282fdfc92bd4bbafec4f30d568..df93fbb28fd731b2adfd645d8d15d40685fff47d 100644 (file)
@@ -118,6 +118,8 @@ DEFINE_CONFIG_PARSE(config_parse_exec_secure_bits, secure_bits_from_string, "Fai
 DEFINE_CONFIG_PARSE_ENUM(config_parse_collect_mode, collect_mode, CollectMode, "Failed to parse garbage collection mode");
 DEFINE_CONFIG_PARSE_ENUM(config_parse_device_policy, cgroup_device_policy, CGroupDevicePolicy, "Failed to parse device policy");
 DEFINE_CONFIG_PARSE_ENUM(config_parse_exec_keyring_mode, exec_keyring_mode, ExecKeyringMode, "Failed to parse keyring mode");
+DEFINE_CONFIG_PARSE_ENUM(config_parse_protect_proc, protect_proc, ProtectProc, "Failed to parse /proc/ protection mode");
+DEFINE_CONFIG_PARSE_ENUM(config_parse_proc_subset, proc_subset, ProcSubset, "Failed to parse /proc/ subset mode");
 DEFINE_CONFIG_PARSE_ENUM(config_parse_exec_utmp_mode, exec_utmp_mode, ExecUtmpMode, "Failed to parse utmp mode");
 DEFINE_CONFIG_PARSE_ENUM(config_parse_job_mode, job_mode, JobMode, "Failed to parse job mode");
 DEFINE_CONFIG_PARSE_ENUM(config_parse_notify_access, notify_access, NotifyAccess, "Failed to parse notify access specifier");
index cee5717d0fbc7608716f5fdab1828046712ab7f5..ae134610b1e149d6def0d77624031e2b8f373365 100644 (file)
@@ -108,6 +108,8 @@ CONFIG_PARSER_PROTOTYPE(config_parse_user_group_strv_compat);
 CONFIG_PARSER_PROTOTYPE(config_parse_restrict_namespaces);
 CONFIG_PARSER_PROTOTYPE(config_parse_bind_paths);
 CONFIG_PARSER_PROTOTYPE(config_parse_exec_keyring_mode);
+CONFIG_PARSER_PROTOTYPE(config_parse_protect_proc);
+CONFIG_PARSER_PROTOTYPE(config_parse_proc_subset);
 CONFIG_PARSER_PROTOTYPE(config_parse_job_timeout_sec);
 CONFIG_PARSER_PROTOTYPE(config_parse_job_running_timeout_sec);
 CONFIG_PARSER_PROTOTYPE(config_parse_log_extra_fields);
index 2e13b10d9cefaf4306d9ba2785709a6558132b21..1f78d66a34782dd56ec36d7449c0d624276af888 100644 (file)
@@ -97,7 +97,7 @@ static const MountEntry protect_kernel_tunables_table[] = {
         { "/proc/latency_stats", READONLY,           true  },
         { "/proc/mtrr",          READONLY,           true  },
         { "/proc/scsi",          READONLY,           true  },
-        { "/proc/sys",           READONLY,           false },
+        { "/proc/sys",           READONLY,           true  },
         { "/proc/sysrq-trigger", READONLY,           true  },
         { "/proc/timer_stats",   READONLY,           true  },
         { "/sys",                READONLY,           false },
@@ -863,22 +863,53 @@ static int mount_sysfs(const MountEntry *m) {
         return 1;
 }
 
-static int mount_procfs(const MountEntry *m) {
-        int r;
+static int mount_procfs(const MountEntry *m, const NamespaceInfo *ns_info) {
+        const char *entry_path;
 
         assert(m);
+        assert(ns_info);
 
-        (void) mkdir_p_label(mount_entry_path(m), 0755);
+        entry_path = mount_entry_path(m);
 
-        r = path_is_mount_point(mount_entry_path(m), NULL, 0);
-        if (r < 0)
-                return log_debug_errno(r, "Unable to determine whether /proc is already mounted: %m");
-        if (r > 0) /* make this a NOP if /proc is already a mount point */
-                return 0;
+        /* Mount a new instance, so that we get the one that matches our user namespace, if we are running in
+         * one. i.e we don't reuse existing mounts here under any condition, we want a new instance owned by
+         * our user namespace and with our hidepid= settings applied. Hence, let's get rid of everything
+         * mounted on /proc/ first. */
 
-        /* Mount a new instance, so that we get the one that matches our user namespace, if we are running in one */
-        if (mount("proc", mount_entry_path(m), "proc", MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL) < 0)
-                return log_debug_errno(errno, "Failed to mount %s: %m", mount_entry_path(m));
+        (void) mkdir_p_label(entry_path, 0755);
+        (void) umount_recursive(entry_path, 0);
+
+        if (ns_info->protect_proc != PROTECT_PROC_DEFAULT ||
+            ns_info->proc_subset != PROC_SUBSET_ALL) {
+                _cleanup_free_ char *opts = NULL;
+
+                /* Starting with kernel 5.8 procfs' hidepid= logic is truly per-instance (previously it
+                 * pretended to be per-instance but actually was per-namespace), hence let's make use of it
+                 * if requested. To make sure this logic succeeds only on kernels where hidepid= is
+                 * per-instance, we'll exclusively use the textual value for hidepid=, since support was
+                 * added in the same commit: if it's supported it is thus also per-instance. */
+
+                opts = strjoin("hidepid=",
+                               ns_info->protect_proc == PROTECT_PROC_DEFAULT ? "off" :
+                               protect_proc_to_string(ns_info->protect_proc),
+                               ns_info->proc_subset == PROC_SUBSET_PID ? ",subset=pid" : "");
+                if (!opts)
+                        return -ENOMEM;
+
+                if (mount("proc", entry_path, "proc", MS_NOSUID|MS_NOEXEC|MS_NODEV, opts) < 0) {
+                        if (errno != EINVAL)
+                                return log_debug_errno(errno, "Failed to mount %s (options=%s): %m", mount_entry_path(m), opts);
+
+                        /* If this failed with EINVAL then this likely means the textual hidepid= stuff is
+                         * not supported by the kernel, and thus the per-instance hidepid= neither, which
+                         * means we really don't want to use it, since it would affect our host's /proc
+                         * mount. Hence let's gracefully fallback to a classic, unrestricted version. */
+                } else
+                        return 1;
+        }
+
+        if (mount("proc", entry_path, "proc", MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL) < 0)
+                return log_debug_errno(errno, "Failed to mount %s (no options): %m", mount_entry_path(m));
 
         return 1;
 }
@@ -997,7 +1028,8 @@ static int follow_symlink(
 
 static int apply_mount(
                 const char *root_directory,
-                MountEntry *m) {
+                MountEntry *m,
+                const NamespaceInfo *ns_info) {
 
         _cleanup_free_ char *inaccessible = NULL;
         bool rbind = true, make = false;
@@ -1005,6 +1037,7 @@ static int apply_mount(
         int r;
 
         assert(m);
+        assert(ns_info);
 
         log_debug("Applying namespace mount on %s", mount_entry_path(m));
 
@@ -1109,7 +1142,7 @@ static int apply_mount(
                 return mount_sysfs(m);
 
         case PROCFS:
-                return mount_procfs(m);
+                return mount_procfs(m, ns_info);
 
         case MOUNT_IMAGES:
                 return mount_images(m);
@@ -1221,7 +1254,9 @@ static bool namespace_info_mount_apivfs(const NamespaceInfo *ns_info) {
 
         return ns_info->mount_apivfs ||
                 ns_info->protect_control_groups ||
-                ns_info->protect_kernel_tunables;
+                ns_info->protect_kernel_tunables ||
+                ns_info->protect_proc != PROTECT_PROC_DEFAULT ||
+                ns_info->proc_subset != PROC_SUBSET_ALL;
 }
 
 static size_t namespace_calculate_mounts(
@@ -1717,7 +1752,7 @@ int setup_namespace(
                                         break;
                                 }
 
-                                r = apply_mount(root, m);
+                                r = apply_mount(root, m, ns_info);
                                 if (r < 0) {
                                         if (error_path && mount_entry_path(m))
                                                 *error_path = strdup(mount_entry_path(m));
@@ -2237,3 +2272,19 @@ static const char* const namespace_type_table[] = {
 };
 
 DEFINE_STRING_TABLE_LOOKUP(namespace_type, NamespaceType);
+
+static const char* const protect_proc_table[_PROTECT_PROC_MAX] = {
+        [PROTECT_PROC_DEFAULT]    = "default",
+        [PROTECT_PROC_NOACCESS]   = "noaccess",
+        [PROTECT_PROC_INVISIBLE]  = "invisible",
+        [PROTECT_PROC_PTRACEABLE] = "ptraceable",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(protect_proc, ProtectProc);
+
+static const char* const proc_subset_table[_PROC_SUBSET_MAX] = {
+        [PROC_SUBSET_ALL] = "all",
+        [PROC_SUBSET_PID] = "pid",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(proc_subset, ProcSubset);
index ec1ab4e2a743d2618b2638c084964468ecfb0080..e682eae7942b7bc6eeb84a31779645b5f6f3e10e 100644 (file)
@@ -47,6 +47,22 @@ typedef enum ProtectSystem {
         _PROTECT_SYSTEM_INVALID = -1
 } ProtectSystem;
 
+typedef enum ProtectProc {
+        PROTECT_PROC_DEFAULT,
+        PROTECT_PROC_NOACCESS,   /* hidepid=noaccess */
+        PROTECT_PROC_INVISIBLE,  /* hidepid=invisible */
+        PROTECT_PROC_PTRACEABLE, /* hidepid=ptraceable */
+        _PROTECT_PROC_MAX,
+        _PROTECT_PROC_INVALID = -1,
+} ProtectProc;
+
+typedef enum ProcSubset {
+        PROC_SUBSET_ALL,
+        PROC_SUBSET_PID, /* subset=pid */
+        _PROC_SUBSET_MAX,
+        _PROC_SUBSET_INVALID = -1,
+} ProcSubset;
+
 struct NamespaceInfo {
         bool ignore_protect_paths:1;
         bool private_dev:1;
@@ -59,6 +75,8 @@ struct NamespaceInfo {
         bool protect_hostname:1;
         ProtectHome protect_home;
         ProtectSystem protect_system;
+        ProtectProc protect_proc;
+        ProcSubset proc_subset;
 };
 
 struct BindMount {
@@ -135,6 +153,12 @@ ProtectHome protect_home_from_string(const char *s) _pure_;
 const char* protect_system_to_string(ProtectSystem p) _const_;
 ProtectSystem protect_system_from_string(const char *s) _pure_;
 
+const char* protect_proc_to_string(ProtectProc i) _const_;
+ProtectProc protect_proc_from_string(const char *s) _pure_;
+
+const char* proc_subset_to_string(ProcSubset i) _const_;
+ProcSubset proc_subset_from_string(const char *s) _pure_;
+
 void bind_mount_free_many(BindMount *b, size_t n);
 int bind_mount_add(BindMount **b, size_t *n, const BindMount *item);
 
index 7fd2595c0b05e1891e1631d642e919ccf870fe12..d010d3bf3ecd02804066eab5d82fb166e3b432d4 100644 (file)
@@ -855,6 +855,8 @@ static int bus_append_execute_property(sd_bus_message *m, const char *field, con
                               "RuntimeDirectoryPreserve",
                               "Personality",
                               "KeyringMode",
+                              "ProtectProc",
+                              "ProcSubset",
                               "NetworkNamespacePath",
                               "LogNamespace"))
                 return bus_append_string(m, field, eq);
index d3804b50d7d7a0ccdd378085d923e4c78a15f62d..29f6dc5e1f196242776272e7ca1c16be0e63fa6d 100644 (file)
@@ -36,6 +36,8 @@ int main(int argc, char *argv[]) {
                 .protect_control_groups = true,
                 .protect_kernel_tunables = true,
                 .protect_kernel_modules = true,
+                .protect_proc = PROTECT_PROC_NOACCESS,
+                .proc_subset = PROC_SUBSET_PID,
         };
 
         char *root_directory;
index dbff9ab2cc7d6740ad625b99aaa709c77bdfe901..224ccffb929b57140844a18aec52d3af1fb9aaee 100644 (file)
@@ -782,6 +782,8 @@ KEYMAP=
 KEYMAP_TOGGLE=
 KeepFree=
 KeyringMode=
+ProtectProc=
+ProcSubset=
 KillExcludeUsers=
 KillOnlyUsers=
 KillSignal=