]> git.ipfire.org Git - thirdparty/systemd.git/blobdiff - src/nspawn/nspawn.c
Rework cpu affinity parsing
[thirdparty/systemd.git] / src / nspawn / nspawn.c
index 5ff02130d65815dd3e289bf1966ddc544613c80c..25184e11e9e30fc76d16075451475caaa5844265 100644 (file)
@@ -220,8 +220,7 @@ static struct rlimit *arg_rlimit[_RLIMIT_MAX] = {};
 static bool arg_no_new_privileges = false;
 static int arg_oom_score_adjust = 0;
 static bool arg_oom_score_adjust_set = false;
-static cpu_set_t *arg_cpuset = NULL;
-static unsigned arg_cpuset_ncpus = 0;
+static CPUSet arg_cpu_set = {};
 static ResolvConfMode arg_resolv_conf = RESOLV_CONF_AUTO;
 static TimezoneMode arg_timezone = TIMEZONE_AUTO;
 static unsigned arg_console_width = (unsigned) -1, arg_console_height = (unsigned) -1;
@@ -259,7 +258,7 @@ STATIC_DESTRUCTOR_REGISTER(arg_syscall_blacklist, strv_freep);
 #if HAVE_SECCOMP
 STATIC_DESTRUCTOR_REGISTER(arg_seccomp, seccomp_releasep);
 #endif
-STATIC_DESTRUCTOR_REGISTER(arg_cpuset, CPU_FREEp);
+STATIC_DESTRUCTOR_REGISTER(arg_cpu_set, cpu_set_reset);
 STATIC_DESTRUCTOR_REGISTER(arg_sysctl, strv_freep);
 
 static int help(void) {
@@ -285,11 +284,12 @@ static int help(void) {
                "                            if missing\n"
                "  -x --ephemeral            Run container with snapshot of root directory, and\n"
                "                            remove it after exit\n"
-               "  -i --image=PATH           File system device or disk image for the container\n"
+               "  -i --image=PATH           Root file system disk image (or device node) for\n"
+               "                            the container\n"
                "     --oci-bundle=PATH      OCI bundle directory\n"
                "     --read-only            Mount the root directory read-only\n"
                "     --volatile[=MODE]      Run the system in volatile mode\n"
-               "     --root-hash=HASH       Specify verity root hash\n"
+               "     --root-hash=HASH       Specify verity root hash for root disk image\n"
                "     --pivot-root=PATH[:PATH]\n"
                "                            Pivot root to given directory in the container\n\n"
                "%3$sExecution:%4$s\n"
@@ -948,10 +948,8 @@ static int parse_argv(int argc, char *argv[]) {
 
                 case ARG_LINK_JOURNAL:
                         r = parse_link_journal(optarg, &arg_link_journal, &arg_link_journal_try);
-                        if (r < 0) {
-                                log_error_errno(r, "Failed to parse link journal mode %s", optarg);
-                                return -EINVAL;
-                        }
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to parse link journal mode %s", optarg);
 
                         arg_settings_mask |= SETTING_LINK_JOURNAL;
                         break;
@@ -1242,9 +1240,8 @@ static int parse_argv(int argc, char *argv[]) {
                         if (r < 0)
                                 return log_error_errno(r, "Failed to parse root hash: %s", optarg);
                         if (l < sizeof(sd_id128_t)) {
-                                log_error("Root hash must be at least 128bit long: %s", optarg);
                                 free(k);
-                                return -EINVAL;
+                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Root hash must be at least 128bit long: %s", optarg);
                         }
 
                         free(arg_root_hash);
@@ -1331,17 +1328,14 @@ static int parse_argv(int argc, char *argv[]) {
                         break;
 
                 case ARG_CPU_AFFINITY: {
-                        _cleanup_cpu_free_ cpu_set_t *cpuset = NULL;
+                        CPUSet cpuset;
 
                         r = parse_cpu_set(optarg, &cpuset);
                         if (r < 0)
-                                return log_error_errno(r, "Failed to parse CPU affinity mask: %s", optarg);
-
-                        if (arg_cpuset)
-                                CPU_FREE(arg_cpuset);
+                                return log_error_errno(r, "Failed to parse CPU affinity mask %s: %m", optarg);
 
-                        arg_cpuset = TAKE_PTR(cpuset);
-                        arg_cpuset_ncpus = r;
+                        cpu_set_reset(&arg_cpu_set);
+                        arg_cpu_set = cpuset;
                         arg_settings_mask |= SETTING_CPU_AFFINITY;
                         break;
                 }
@@ -1388,10 +1382,8 @@ static int parse_argv(int argc, char *argv[]) {
                                      "read-only\n"
                                      "passive\n"
                                      "pipe");
-                        else {
-                                log_error("Unknown console mode: %s", optarg);
-                                return -EINVAL;
-                        }
+                        else
+                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Unknown console mode: %s", optarg);
 
                         arg_settings_mask |= SETTING_CONSOLE_MODE;
                         break;
@@ -1499,7 +1491,7 @@ static int verify_arguments(void) {
 
         /* We don't support --private-users-chown together with any of the volatile modes since we couldn't
          * change the read-only part of the tree (i.e. /usr) anyway, or because it would trigger a massive
-         * copy-up (in case of overlay) making the entire excercise pointless. */
+         * copy-up (in case of overlay) making the entire exercise pointless. */
         if (arg_userns_chown && arg_volatile_mode != VOLATILE_NO)
                 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--volatile= and --private-users-chown may not be combined.");
 
@@ -1700,7 +1692,7 @@ static int setup_timezone(const char *dest) {
 
         case TIMEZONE_COPY:
                 /* If mounting failed, try to copy */
-                r = copy_file_atomic("/etc/localtime", where, 0644, 0, COPY_REFLINK|COPY_REPLACE);
+                r = copy_file_atomic("/etc/localtime", where, 0644, 0, 0, COPY_REFLINK|COPY_REPLACE);
                 if (r < 0) {
                         log_full_errno(IN_SET(r, -EROFS, -EACCES, -EPERM) ? LOG_DEBUG : LOG_WARNING, r,
                                        "Failed to copy /etc/localtime to %s, ignoring: %m", where);
@@ -1827,7 +1819,7 @@ static int setup_resolv_conf(const char *dest) {
         }
 
         /* If that didn't work, let's copy the file */
-        r = copy_file(what, where, O_TRUNC|O_NOFOLLOW, 0644, 0, COPY_REFLINK);
+        r = copy_file(what, where, O_TRUNC|O_NOFOLLOW, 0644, 0, 0, COPY_REFLINK);
         if (r < 0) {
                 /* If the file already exists as symlink, let's suppress the warning, under the assumption that
                  * resolved or something similar runs inside and the symlink points there.
@@ -1853,10 +1845,9 @@ static int setup_boot_id(void) {
         const char *to;
         int r;
 
-        /* Generate a new randomized boot ID, so that each boot-up of
-         * the container gets a new one */
+        /* Generate a new randomized boot ID, so that each boot-up of the container gets a new one */
 
-        r = tempfn_random_child(NULL, "proc-sys-kernel-random-boot-id", &path);
+        r = tempfn_random_child("/run", "proc-sys-kernel-random-boot-id", &path);
         if (r < 0)
                 return log_error_errno(r, "Failed to generate random boot ID path: %m");
 
@@ -2100,19 +2091,18 @@ static int setup_kmsg(int kmsg_socket) {
         _cleanup_free_ char *fifo = NULL;
         _cleanup_close_ int fd = -1;
         _cleanup_umask_ mode_t u;
-        const char *to;
         int r;
 
         assert(kmsg_socket >= 0);
 
         u = umask(0000);
 
-        /* We create the kmsg FIFO as as temporary file in /tmp, but immediately delete it after bind mounting it to
+        /* We create the kmsg FIFO as as temporary file in /run, but immediately delete it after bind mounting it to
          * /proc/kmsg. While FIFOs on the reading side behave very similar to /proc/kmsg, their writing side behaves
          * differently from /dev/kmsg in that writing blocks when nothing is reading. In order to avoid any problems
          * with containers deadlocking due to this we simply make /dev/kmsg unavailable to the container. */
 
-        r = tempfn_random_child(NULL, "proc-kmsg", &fifo);
+        r = tempfn_random_child("/run", "proc-kmsg", &fifo);
         if (r < 0)
                 return log_error_errno(r, "Failed to generate kmsg path: %m");
 
@@ -2120,9 +2110,8 @@ static int setup_kmsg(int kmsg_socket) {
                 return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
 
         from = TAKE_PTR(fifo);
-        to = "/proc/kmsg";
 
-        r = mount_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL);
+        r = mount_verbose(LOG_ERR, from, "/proc/kmsg", NULL, MS_BIND, NULL);
         if (r < 0)
                 return r;
 
@@ -2617,10 +2606,8 @@ static int determine_names(void) {
                                 return log_error_errno(r, "Failed to determine current directory: %m");
                 }
 
-                if (!arg_directory && !arg_image) {
-                        log_error("Failed to determine path, please use -D or -i.");
-                        return -EINVAL;
-                }
+                if (!arg_directory && !arg_image)
+                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to determine path, please use -D or -i.");
         }
 
         if (!arg_machine) {
@@ -2643,10 +2630,8 @@ static int determine_names(void) {
                         return log_oom();
 
                 hostname_cleanup(arg_machine);
-                if (!machine_name_is_valid(arg_machine)) {
-                        log_error("Failed to determine machine name automatically, please use -M.");
-                        return -EINVAL;
-                }
+                if (!machine_name_is_valid(arg_machine))
+                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to determine machine name automatically, please use -M.");
 
                 if (arg_ephemeral) {
                         char *b;
@@ -2773,10 +2758,8 @@ static int patch_sysctl(void) {
                                 break;
                 }
 
-                if (!good) {
-                        log_error("Refusing to write to sysctl '%s', as it is not safe in the selected namespaces.", *k);
-                        return -EPERM;
-                }
+                if (!good)
+                        return log_error_errno(SYNTHETIC_ERRNO(EPERM), "Refusing to write to sysctl '%s', as it is not safe in the selected namespaces.", *k);
 
                 r = sysctl_write(*k, *v);
                 if (r < 0)
@@ -2935,8 +2918,8 @@ static int inner_child(
                         return log_error_errno(r, "Failed to adjust OOM score: %m");
         }
 
-        if (arg_cpuset)
-                if (sched_setaffinity(0, CPU_ALLOC_SIZE(arg_cpuset_ncpus), arg_cpuset) < 0)
+        if (arg_cpu_set.set)
+                if (sched_setaffinity(0, arg_cpu_set.allocated, arg_cpu_set.set) < 0)
                         return log_error_errno(errno, "Failed to set CPU affinity: %m");
 
         (void) setup_hostname();
@@ -2961,7 +2944,7 @@ static int inner_child(
                 if (is_seccomp_available()) {
 
                         r = seccomp_load(arg_seccomp);
-                        if (IN_SET(r, -EPERM, -EACCES))
+                        if (ERRNO_IS_SECCOMP_FATAL(r))
                                 return log_error_errno(r, "Failed to install seccomp filter: %m");
                         if (r < 0)
                                 log_debug_errno(r, "Failed to install seccomp filter: %m");
@@ -3275,7 +3258,7 @@ static int outer_child(
                         arg_userns_mode != USER_NAMESPACE_NO,
                         arg_uid_shift,
                         arg_uid_range,
-                        arg_selinux_context);
+                        arg_selinux_apifs_context);
         if (r < 0)
                 return r;
 
@@ -3306,7 +3289,7 @@ static int outer_child(
 
         /* Mark everything as shared so our mounts get propagated down. This is
          * required to make new bind mounts available in systemd services
-         * inside the containter that create a new mount namespace.
+         * inside the container that create a new mount namespace.
          * See https://github.com/systemd/systemd/issues/3860
          * Further submounts (such as /dev) done after this will inherit the
          * shared propagation mode. */
@@ -3323,7 +3306,7 @@ static int outer_child(
                 return r;
 
         if (arg_read_only && arg_volatile_mode == VOLATILE_NO) {
-                r = bind_remount_recursive(directory, true, NULL);
+                r = bind_remount_recursive(directory, MS_RDONLY, MS_RDONLY, NULL);
                 if (r < 0)
                         return log_error_errno(r, "Failed to make tree read-only: %m");
         }
@@ -3728,7 +3711,7 @@ static int merge_settings(Settings *settings, const char *path) {
                 /* Copy the full capabilities over too */
                 if (capability_quintet_is_set(&settings->full_capabilities)) {
                         if (!arg_settings_trusted)
-                                log_warning("Ignoring capabilitiy settings, file %s is not trusted.", path);
+                                log_warning("Ignoring capability settings, file %s is not trusted.", path);
                         else
                                 arg_full_capabilities = settings->full_capabilities;
                 }
@@ -3882,15 +3865,14 @@ static int merge_settings(Settings *settings, const char *path) {
         }
 
         if ((arg_settings_mask & SETTING_CPU_AFFINITY) == 0 &&
-            settings->cpuset) {
+            settings->cpu_set.set) {
 
                 if (!arg_settings_trusted)
                         log_warning("Ignoring CPUAffinity= setting, file '%s' is not trusted.", path);
                 else {
-                        if (arg_cpuset)
-                                CPU_FREE(arg_cpuset);
-                        arg_cpuset = TAKE_PTR(settings->cpuset);
-                        arg_cpuset_ncpus = settings->cpuset_ncpus;
+                        cpu_set_reset(&arg_cpu_set);
+                        arg_cpu_set = settings->cpu_set;
+                        settings->cpu_set = (CPUSet) {};
                 }
         }
 
@@ -4161,10 +4143,9 @@ static int run_container(int master,
                         log_debug_errno(r, "Cannot determine if passed network namespace path '%s' really refers to a network namespace, assuming it does.", arg_network_namespace_path);
                 else if (r < 0)
                         return log_error_errno(r, "Failed to check %s fs type: %m", arg_network_namespace_path);
-                else if (r == 0) {
-                        log_error("Path %s doesn't refer to a network namespace, refusing.", arg_network_namespace_path);
-                        return -EINVAL;
-                }
+                else if (r == 0)
+                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                               "Path %s doesn't refer to a network namespace, refusing.", arg_network_namespace_path);
         }
 
         *pid = raw_clone(SIGCHLD|CLONE_NEWNS);
@@ -4227,10 +4208,8 @@ static int run_container(int master,
                 l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, 0);
                 if (l < 0)
                         return log_error_errno(errno, "Failed to read UID shift: %m");
-                if (l != sizeof arg_uid_shift) {
-                        log_error("Short read while reading UID shift.");
-                        return -EIO;
-                }
+                if (l != sizeof arg_uid_shift)
+                        return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading UID shift.");
 
                 if (arg_userns_mode == USER_NAMESPACE_PICK) {
                         /* If we are supposed to pick the UID shift, let's try to use the shift read from the
@@ -4244,10 +4223,8 @@ static int run_container(int master,
                         l = send(uid_shift_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, MSG_NOSIGNAL);
                         if (l < 0)
                                 return log_error_errno(errno, "Failed to send UID shift: %m");
-                        if (l != sizeof arg_uid_shift) {
-                                log_error("Short write while writing UID shift.");
-                                return -EIO;
-                        }
+                        if (l != sizeof arg_uid_shift)
+                                return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short write while writing UID shift.");
                 }
         }
 
@@ -4256,11 +4233,9 @@ static int run_container(int master,
                 l = recv(unified_cgroup_hierarchy_socket_pair[0], &arg_unified_cgroup_hierarchy, sizeof(arg_unified_cgroup_hierarchy), 0);
                 if (l < 0)
                         return log_error_errno(errno, "Failed to read cgroup mode: %m");
-                if (l != sizeof(arg_unified_cgroup_hierarchy)) {
-                        log_error("Short read while reading cgroup mode (%zu bytes).%s",
-                                  l, l == 0 ? " The child is most likely dead." : "");
-                        return -EIO;
-                }
+                if (l != sizeof(arg_unified_cgroup_hierarchy))
+                        return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading cgroup mode (%zu bytes).%s",
+                                               l, l == 0 ? " The child is most likely dead." : "");
         }
 
         /* Wait for the outer child. */
@@ -4274,19 +4249,15 @@ static int run_container(int master,
         l = recv(pid_socket_pair[0], pid, sizeof *pid, 0);
         if (l < 0)
                 return log_error_errno(errno, "Failed to read inner child PID: %m");
-        if (l != sizeof *pid) {
-                log_error("Short read while reading inner child PID.");
-                return -EIO;
-        }
+        if (l != sizeof *pid)
+                return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading inner child PID.");
 
         /* We also retrieve container UUID in case it was generated by outer child */
         l = recv(uuid_socket_pair[0], &arg_uuid, sizeof arg_uuid, 0);
         if (l < 0)
                 return log_error_errno(errno, "Failed to read container machine ID: %m");
-        if (l != sizeof(arg_uuid)) {
-                log_error("Short read while reading container machined ID.");
-                return -EIO;
-        }
+        if (l != sizeof(arg_uuid))
+                return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading container machined ID.");
 
         /* We also retrieve the socket used for notifications generated by outer child */
         notify_socket = receive_one_fd(notify_socket_pair[0], 0);
@@ -4297,10 +4268,8 @@ static int run_container(int master,
         log_debug("Init process invoked as PID "PID_FMT, *pid);
 
         if (arg_userns_mode != USER_NAMESPACE_NO) {
-                if (!barrier_place_and_sync(&barrier)) { /* #1 */
-                        log_error("Child died too early.");
-                        return -ESRCH;
-                }
+                if (!barrier_place_and_sync(&barrier)) /* #1 */
+                        return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early.");
 
                 r = setup_uid_map(*pid);
                 if (r < 0)
@@ -4312,10 +4281,8 @@ static int run_container(int master,
         if (arg_private_network) {
                 if (!arg_network_namespace_path) {
                         /* Wait until the child has unshared its network namespace. */
-                        if (!barrier_place_and_sync(&barrier)) { /* #3 */
-                                log_error("Child died too early");
-                                return -ESRCH;
-                        }
+                        if (!barrier_place_and_sync(&barrier)) /* #3 */
+                                return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early");
                 }
 
                 r = move_network_interfaces(*pid, arg_network_interfaces);
@@ -4471,10 +4438,8 @@ static int run_container(int master,
                 return r;
 
         /* Let the child know that we are ready and wait that the child is completely ready now. */
-        if (!barrier_place_and_sync(&barrier)) { /* #5 */
-                log_error("Child died too early.");
-                return -ESRCH;
-        }
+        if (!barrier_place_and_sync(&barrier)) /* #5 */
+                return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early.");
 
         /* At this point we have made use of the UID we picked, and thus nss-mymachines
          * will make them appear in getpwuid(), thus we can release the /etc/passwd lock. */
@@ -4873,7 +4838,7 @@ static int run(int argc, char *argv[]) {
                                 goto finish;
                         }
 
-                        r = copy_file(arg_image, np, O_EXCL, arg_read_only ? 0400 : 0600, FS_NOCOW_FL, COPY_REFLINK|COPY_CRTIME);
+                        r = copy_file(arg_image, np, O_EXCL, arg_read_only ? 0400 : 0600, FS_NOCOW_FL, FS_NOCOW_FL, COPY_REFLINK|COPY_CRTIME);
                         if (r < 0) {
                                 r = log_error_errno(r, "Failed to copy image file: %m");
                                 goto finish;