#include <stdlib.h>
#include <sys/param.h>
#include <sys/types.h>
+#include <sys/socket.h>
#include <sys/wait.h>
#include "attach.h"
struct passwd *passwd;
struct lxc_proc_context_info *init_ctx;
struct lxc_handler *handler;
- void *cgroup_data = NULL;
uid_t uid;
char *curdir;
+ int cgroup_ipc_sockets[2];
ret = lxc_caps_init();
if (ret)
return -1;
}
- if (!elevated_privileges) {
- /* we have to do this now since /sys/fs/cgroup may not
- * be available inside the container or we may not have
- * the required permissions anymore
- */
- ret = lxc_cgroup_prepare_attach(my_args.name, &cgroup_data);
- if (ret < 0) {
- ERROR("failed to prepare attaching to cgroup");
- return -1;
- }
- }
-
curdir = getcwd(NULL, 0);
/* determine which namespaces the container was created with
}
}
+ /* For the cgroup attaching logic to work in conjunction with pid and user namespaces,
+ * we need to have the following hierarchy:
+ *
+ * lxc-attach [process executed externally]
+ * | socketpair(cgroup_ipc_sockets)
+ * | fork() -> child
+ * | | setns()
+ * | | fork() -> grandchild
+ * | | | initialize
+ * | | | signal parent
+ * | |<------------------|----+
+ * | | signal parent |
+ * |<----------------------|-----+ |
+ * | add to cgroups | |
+ * | signal child -------->| |
+ * | | signal child ---->|
+ * | waitpid() | waitpid() | exec()
+ * | |<------------------| exit()
+ * |<----------------------| exit()
+ * | exit()
+ *
+ * The rationale is the following: The first parent is needed because after
+ * setns() (mount + user namespace) we can't access the cgroup filesystem
+ * to add the pid to the corresponding cgroup. Therefore, we need to do that
+ * in a process executed on the host, so that's why we need to fork and wait
+ * for it to have done some initialization (cgroups may restrict certain
+ * operations so we have to do that in the end) and use IPC for signaling.
+ *
+ * Then in the child process we do the setns(). However, a process is never
+ * really attached to a pid namespace (never changes its pid, doesn't appear
+ * in the pid namespace /proc), only child processes of that process are
+ * truely inside the new pid namespace. That's why we need to fork() again
+ * after setns() before performing final initializations, then signal our
+ * parent, which signals the primary process, which does cgroup adding,
+ * which then signals to the grandchild that it can exec().
+ */
+ ret = socketpair(PF_LOCAL, SOCK_STREAM, 0, cgroup_ipc_sockets);
+ if (ret < 0) {
+ SYSERROR("could not set up required IPC mechanism for attaching");
+ return -1;
+ }
+
+ pid = fork();
+ if (pid < 0) {
+ SYSERROR("failed to create first subprocess");
+ return -1;
+ }
+
+ if (pid) {
+ int status;
+ pid_t grandchild;
+
+ close(cgroup_ipc_sockets[1]);
+
+ gparent_reread:
+ ret = read(cgroup_ipc_sockets[0], &grandchild, sizeof(grandchild));
+ if (ret <= 0) {
+ if (ret < 0 && (errno == EAGAIN || errno == EINTR))
+ goto gparent_reread;
+ ERROR("failed to get pid of attached process to add to cgroup");
+ return -1;
+ }
+
+ if (!elevated_privileges) {
+ ret = lxc_cgroup_attach(my_args.name, grandchild);
+ if (ret < 0) {
+ ERROR("failed to attach process to cgroup");
+ return -1;
+ }
+ }
+
+ status = 0;
+ ret = write(cgroup_ipc_sockets[0], &status, sizeof(status));
+ if (ret <= 0) {
+ ERROR("failed to signal child that cgroup logic has finished");
+ return -1;
+ }
+
+ close(cgroup_ipc_sockets[0]);
+
+ gparent_again:
+ ret = waitpid(pid, &status, 0);
+ if (ret < 0) {
+ if (errno == EINTR)
+ goto gparent_again;
+ SYSERROR("failed to wait for process '%d'", pid);
+ return -1;
+ }
+
+ if (WIFEXITED(status))
+ return WEXITSTATUS(status);
+
+ return -1;
+ }
+
+ /* at this point we are in the 'parent' process so we need to close the
+ * socket reserved for the 'grandparent' process
+ */
+ close(cgroup_ipc_sockets[0]);
+
/* we need to attach before we fork since certain namespaces
* (such as pid namespaces) only really affect children of the
* current process and not the process itself
free(curdir);
- /* hack: we need sync.h infrastructure - and that needs a handler */
+ /* hack: we need sync.h infrastructure - and that needs a handler
+ * FIXME: perhaps we should also just use a very simple socketpair()
+ * here? - like with the grandparent <-> parent communication?
+ */
handler = calloc(1, sizeof(*handler));
if (lxc_sync_init(handler)) {
if (lxc_sync_wait_child(handler, LXC_SYNC_CONFIGURE))
return -1;
- /* now that we are done with all privileged operations,
- * we can add ourselves to the cgroup. Since we smuggled in
- * the fds earlier, we still have write permission
+ /* ask grandparent to add child to cgroups, the grandparent will
+ * itself check whether that's actually necessary
*/
- if (!elevated_privileges) {
- /* since setns() for pid namespaces only really
- * affects child processes, the pid we have is
- * still valid outside the container, so this is
- * fine
+ ret = write(cgroup_ipc_sockets[1], &pid, sizeof(pid));
+ if (ret != sizeof(pid)) {
+ ERROR("error using IPC to notify main process of pid to add to the cgroups of the container");
+ return -1;
+ }
+
+ parent_reread:
+ /* we need some mechanism to check whether the grandparent could
+ * add us to the cgroups or not - so we await a dummy integer
+ * on the same socket (that's why we don't use a pipe - we need
+ * two-way communication). So if the parent fails and exits, that
+ * will close the socket, which will cause a read of 0 bytes for
+ * us, so we just terminate. If we read at least a byte, we don't
+ * care about the contents...
+ */
+ ret = read(cgroup_ipc_sockets[1], &status, sizeof(status));
+ if (ret <= 0) {
+ if (ret < 0 && (errno == EAGAIN || errno == EINTR))
+ goto parent_reread;
+ /* only print someting if we can't assume the parent already
+ * gave an error message, that will reduce confusion for the
+ * user
*/
- ret = lxc_cgroup_finish_attach(cgroup_data, pid);
- if (ret < 0) {
- ERROR("failed to attach process to cgroup");
- return -1;
- }
+ if (ret != 0)
+ ERROR("failed to get notification that the child process was added to the container's cgroups");
+ return -1;
}
+ /* we don't need that IPC interface anymore */
+ close(cgroup_ipc_sockets[1]);
+
/* tell the child we are done initializing */
if (lxc_sync_wake_child(handler, LXC_SYNC_POST_CONFIGURE))
return -1;
if (!pid) {
lxc_sync_fini_parent(handler);
- lxc_cgroup_dispose_attach(cgroup_data);
+ close(cgroup_ipc_sockets[1]);
if (attach_apparmor(init_ctx->aa_profile) < 0) {
ERROR("failed switching apparmor profiles");
lxc_sync_fini(handler);
+ if (namespace_flags & CLONE_NEWUSER) {
+ /* XXX FIXME this should get the uid of the container init and setuid to that */
+ /* XXX FIXME or perhaps try to map in the lxc-attach caller's uid? */
+ if (setgid(0)) {
+ SYSERROR("switching to container gid");
+ return -1;
+ }
+ if (setuid(0)) {
+ SYSERROR("switching to container uid");
+ return -1;
+ }
+ }
+
if (my_args.argc) {
execvp(my_args.argv[0], my_args.argv);
SYSERROR("failed to exec '%s'", my_args.argv[0]);