nspawn: fix MS_SHARED mount propagation for userns containers

[thirdparty/systemd.git] / src / nspawn / nspawn.c
diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c

index 0f2d01c0aa8872928ff3d1a61e33f92ef9444988..3b9493f232e26f4c15a0b7e236f5288e14afe8e2 100644 (file)
--- a/src/nspawn/nspawn.c
+++ b/src/nspawn/nspawn.c
@@ -2931,7 +2931,8 @@ static int inner_child(
                  int kmsg_socket,
                  int rtnl_socket,
                  int master_pty_socket,
-                FDSet *fds) {
+                FDSet *fds,
+                char **os_release_pairs) {
  
          _cleanup_free_ char *home = NULL;
          char as_uuid[ID128_UUID_STRING_MAX];
@@ -2976,13 +2977,20 @@ static int inner_child(
  
                  /* Wait until the parent wrote the UID map */
                  if (!barrier_place_and_sync(barrier)) /* #2 */
-                        return log_error_errno(SYNTHETIC_ERRNO(ESRCH),
-                                               "Parent died too early");
-        }
+                        return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Parent died too early");
  
-        r = reset_uid_gid();
-        if (r < 0)
-                return log_error_errno(r, "Couldn't become new root: %m");
+                /* Become the new root user inside our namespace */
+                r = reset_uid_gid();
+                if (r < 0)
+                        return log_error_errno(r, "Couldn't become new root: %m");
+
+                /* Creating a new user namespace means all MS_SHARED mounts become MS_SLAVE. Let's put them
+                 * back to MS_SHARED here, since that's what we want as defaults. (This will not reconnect
+                 * propagation, but simply create new peer groups for all our mounts). */
+                r = mount_verbose(LOG_ERR, NULL, "/", NULL, MS_SHARED|MS_REC, NULL);
+                if (r < 0)
+                        return r;
+        }
  
          r = mount_all(NULL,
                        arg_mount_settings | MOUNT_IN_USERNS,
@@ -3190,7 +3198,7 @@ static int inner_child(
          if (asprintf((char **)(envp + n_env++), "NOTIFY_SOCKET=%s", NSPAWN_NOTIFY_SOCKET_PATH) < 0)
                  return log_oom();
  
-        env_use = strv_env_merge(2, envp, arg_setenv);
+        env_use = strv_env_merge(3, envp, os_release_pairs, arg_setenv);
          if (!env_use)
                  return log_oom();
  
@@ -3316,6 +3324,7 @@ static int outer_child(
                  FDSet *fds,
                  int netns_fd) {
  
+        _cleanup_strv_free_ char **os_release_pairs = NULL;
          _cleanup_close_ int fd = -1;
          const char *p;
          pid_t pid;
@@ -3337,6 +3346,10 @@ static int outer_child(
  
          log_debug("Outer child is initializing.");
  
+        r = load_os_release_pairs_with_prefix("/", "container_host_", &os_release_pairs);
+        if (r < 0)
+                log_debug_errno(r, "Failed to read os-release from host for container, ignoring: %m");
+
          if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
                  return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
  
@@ -3344,9 +3357,8 @@ static int outer_child(
          if (r < 0)
                  return r;
  
-        /* Mark everything as slave, so that we still
-         * receive mounts from the real root, but don't
-         * propagate mounts to the real root. */
+        /* Mark everything as slave, so that we still receive mounts from the real root, but don't propagate
+         * mounts to the real root. */
          r = mount_verbose(LOG_ERR, NULL, "/", NULL, MS_SLAVE|MS_REC, NULL);
          if (r < 0)
                  return r;
@@ -3592,9 +3604,8 @@ static int outer_child(
                  notify_socket = safe_close(notify_socket);
                  uid_shift_socket = safe_close(uid_shift_socket);
  
-                /* The inner child has all namespaces that are
-                 * requested, so that we all are owned by the user if
-                 * user namespaces are turned on. */
+                /* The inner child has all namespaces that are requested, so that we all are owned by the
+                 * user if user namespaces are turned on. */
  
                  if (arg_network_namespace_path) {
                          r = namespace_enter(-1, -1, netns_fd, -1, -1);
@@ -3602,7 +3613,7 @@ static int outer_child(
                                  return log_error_errno(r, "Failed to join network namespace: %m");
                  }
  
-                r = inner_child(barrier, directory, secondary, kmsg_socket, rtnl_socket, master_pty_socket, fds);
+                r = inner_child(barrier, directory, secondary, kmsg_socket, rtnl_socket, master_pty_socket, fds, os_release_pairs);
                  if (r < 0)
                          _exit(EXIT_FAILURE);
  
@@ -4630,7 +4641,7 @@ static int run_container(
          if (!barrier_place_and_sync(&barrier)) /* #5 */
                  return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early.");
  
-        /* At this point we have made use of the UID we picked, and thus nss-mymachines
+        /* At this point we have made use of the UID we picked, and thus nss-systemd/systemd-machined.service
           * will make them appear in getpwuid(), thus we can release the /etc/passwd lock. */
          etc_passwd_lock = safe_close(etc_passwd_lock);