]> git.ipfire.org Git - thirdparty/lxc.git/commitdiff
namespace: add lxc_raw_clone()
authorChristian Brauner <christian.brauner@ubuntu.com>
Thu, 14 Dec 2017 14:31:54 +0000 (15:31 +0100)
committerChristian Brauner <christian.brauner@ubuntu.com>
Sun, 17 Dec 2017 15:56:48 +0000 (16:56 +0100)
This is based on raw_clone in systemd but adapted to our needs. The main reason
is that we need an implementation of fork()/clone() that guarantees that no
pthread_atfork() handlers are run. clone() in glibc currently doesn't run
pthread_atfork() handlers, so we would be fine for now, but there's no
guarantee that this will remain the case in the future. So let's do the syscall
directly - or as directly as we can. An additional nice feature is that we get
fork() behavior, i.e. lxc_raw_clone() returns 0 in the child and the child pid
in the parent.

Our implementation tries to make sure that we cover all cases according to
kernel sources. Note that we are not interested in any arguments that could be
passed after the stack.

Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
src/lxc/namespace.c
src/lxc/namespace.h
src/tests/Makefile.am
src/tests/lxc_raw_clone.c [new file with mode: 0644]

index e760c0d9a0ff37eaea5871b638d7c2e788b7fbe4..1b4d7179672e905d66673834cb52de613c1ba5ff 100644 (file)
 #include <alloca.h>
 #include <errno.h>
 #include <fcntl.h>
+#include <sched.h>
 #include <signal.h>
 #include <unistd.h>
 #include <sys/param.h>
 #include <sys/stat.h>
+#include <sys/syscall.h>
 #include <sys/types.h>
 
 #include "log.h"
@@ -59,8 +61,7 @@ pid_t lxc_clone(int (*fn)(void *), void *arg, int flags)
        pid_t ret;
 
 #ifdef __ia64__
-       ret = __clone2(do_clone, stack,
-                      stack_size, flags | SIGCHLD, &clone_arg);
+       ret = __clone2(do_clone, stack, stack_size, flags | SIGCHLD, &clone_arg);
 #else
        ret = clone(do_clone, stack  + stack_size, flags | SIGCHLD, &clone_arg);
 #endif
@@ -70,6 +71,62 @@ pid_t lxc_clone(int (*fn)(void *), void *arg, int flags)
        return ret;
 }
 
+/**
+ * lxc_raw_clone - create a child process by issuing the clone system call
+ * via syscall() (or inline assembly on sparc64) instead of going through the
+ * libc clone() wrapper.
+ *
+ * This is based on raw_clone in systemd but adapted to our needs. This uses
+ * copy on write semantics and doesn't pass a stack. CLONE_VM is tricky and
+ * doesn't really matter to us so disallow it.
+ *
+ * The nice thing about this is that we get fork() behavior. That is
+ * lxc_raw_clone() returns 0 in the child and the child pid in the parent.
+ *
+ * @param flags  Clone flags. SIGCHLD is always OR-ed in before the flags are
+ *               handed to the kernel. CLONE_VM, CLONE_PARENT_SETTID,
+ *               CLONE_CHILD_SETTID, CLONE_CHILD_CLEARTID and CLONE_SETTLS
+ *               are rejected.
+ *
+ * @return  -EINVAL if one of the rejected flags is set; no system call is
+ *          made in that case. Otherwise the result of the system call. On
+ *          every architecture except sparc64 this is the return value of
+ *          syscall() cast to int.
+ *
+ * NOTE(review): errno is assigned EINVAL before the flag check, so it is
+ * written even on calls that pass the check and go on to issue the system
+ * call.
+ */
+pid_t lxc_raw_clone(unsigned long flags)
+{
+
+       /* These flags don't interest us at all so we don't jump through any
+        * hoops of retrieving any additional arguments and passing them to
+        * the kernel. errno is set up front so that it already reads EINVAL
+        * when we bail out below.
+        */
+       errno = EINVAL;
+       if ((flags & (CLONE_VM | CLONE_PARENT_SETTID | CLONE_CHILD_SETTID |
+                     CLONE_CHILD_CLEARTID | CLONE_SETTLS)))
+               return -EINVAL;
+
+#if defined(__s390x__) || defined(__s390__) || defined(__CRIS__)
+       /* On s390/s390x and cris the order of the first and second arguments
+        * of the system call is reversed, so the (NULL) stack comes first and
+        * the flags second.
+        */
+       return (int)syscall(__NR_clone, NULL, flags | SIGCHLD);
+#elif defined(__sparc__) && defined(__arch64__)
+       {
+               /**
+                * sparc64 always returns the other process id in %o0, and
+                * a boolean flag whether this is the child or the parent in
+                * %o1. Inline assembly is needed to get the flag returned
+                * in %o1.
+                *
+                * The assembly loads the syscall number into %g1, the flags
+                * (with SIGCHLD) into %o0 and 0 into %o1, traps with
+                * "t 0x6d", and then copies %o1 and %o0 into in_child and
+                * child_pid respectively.
+                */
+               int in_child;
+               int child_pid;
+               asm volatile("mov %2, %%g1\n\t"
+                            "mov %3, %%o0\n\t"
+                            "mov 0 , %%o1\n\t"
+                            "t 0x6d\n\t"
+                            "mov %%o1, %0\n\t"
+                            "mov %%o0, %1"
+                            : "=r"(in_child), "=r"(child_pid)
+                            : "i"(__NR_clone), "r"(flags | SIGCHLD)
+                            : "%o1", "%o0", "%g1");
+               if (in_child)
+                       return 0;
+               else
+                       return child_pid;
+       }
+#elif defined(__ia64__)
+       /* On ia64 the stack and stack size are passed as separate arguments;
+        * we pass NULL and 0 for them.
+        */
+       return (int)syscall(__NR_clone, flags | SIGCHLD, NULL, 0);
+#else
+       /* Everywhere else: flags first, then the (NULL) stack. */
+       return (int)syscall(__NR_clone, flags | SIGCHLD, NULL);
+#endif
+}
+
 /* Leave the user namespace at the first position in the array of structs so
  * that we always attach to it first when iterating over the struct and using
  * setns() to switch namespaces. This especially affects lxc_attach(): Suppose
index 4916950c15d2a0e935cafd610f140ae2fcd7ed75..5064b27c0a667d9fe66ee47323f1a88ef1453876 100644 (file)
 
 #include "config.h"
 
+#ifndef CLONE_PARENT_SETTID /* fallbacks: each guard here defines a clone flag only if it is not defined yet */
+#define CLONE_PARENT_SETTID 0x00100000
+#endif
+
+#ifndef CLONE_CHILD_CLEARTID
+#define CLONE_CHILD_CLEARTID 0x00200000
+#endif
+
+#ifndef CLONE_CHILD_SETTID
+#define CLONE_CHILD_SETTID 0x01000000
+#endif
+
+#ifndef CLONE_VFORK
+#define CLONE_VFORK 0x00004000
+#endif
+
+#ifndef CLONE_THREAD
+#define CLONE_THREAD 0x00010000
+#endif
+
+#ifndef CLONE_SETTLS
+#define CLONE_SETTLS 0x00080000
+#endif
+
+#ifndef CLONE_VM
+#define CLONE_VM 0x00000100
+#endif
+
+#ifndef CLONE_FILES
+#define CLONE_FILES 0x00000400
+#endif
+
 #ifndef CLONE_FS
 #  define CLONE_FS                0x00000200
 #endif
@@ -80,6 +112,7 @@ int clone(int (*fn)(void *), void *child_stack,
 #endif
 
 extern pid_t lxc_clone(int (*fn)(void *), void *arg, int flags);
+extern pid_t lxc_raw_clone(unsigned long flags);
 
 extern int lxc_namespace_2_cloneflag(char *namespace);
 extern int lxc_fill_namespace_flags(char *flaglist, int *flags);
index af9c490f2ffb1e8fc179aa293728cf6064506b42..e2e73c941f30f37cff3573a559b571af68d5658b 100644 (file)
@@ -28,6 +28,7 @@ lxc_test_parse_config_file_SOURCES = parse_config_file.c lxctest.h
 lxc_test_config_jump_table_SOURCES = config_jump_table.c lxctest.h
 lxc_test_shortlived_SOURCES = shortlived.c
 lxc_test_state_server_SOURCES = state_server.c lxctest.h
+lxc_test_raw_clone_SOURCES = lxc_raw_clone.c lxctest.h
 
 AM_CFLAGS=-DLXCROOTFSMOUNT=\"$(LXCROOTFSMOUNT)\" \
        -DLXCPATH=\"$(LXCPATH)\" \
@@ -56,7 +57,8 @@ bin_PROGRAMS = lxc-test-containertests lxc-test-locktests lxc-test-startone \
        lxc-test-snapshot lxc-test-concurrent lxc-test-may-control \
        lxc-test-reboot lxc-test-list lxc-test-attach lxc-test-device-add-remove \
        lxc-test-apparmor lxc-test-utils lxc-test-parse-config-file \
-       lxc-test-config-jump-table lxc-test-shortlived lxc-test-state-server
+       lxc-test-config-jump-table lxc-test-shortlived lxc-test-state-server \
+       lxc-test-raw-clone
 
 bin_SCRIPTS = lxc-test-automount \
              lxc-test-autostart \
@@ -93,6 +95,7 @@ EXTRA_DIST = \
        list.c \
        locktests.c \
        lxcpath.c \
+       lxc_raw_clone.c \
        lxc-test-lxc-attach \
        lxc-test-automount \
        lxc-test-rootfs \
diff --git a/src/tests/lxc_raw_clone.c b/src/tests/lxc_raw_clone.c
new file mode 100644 (file)
index 0000000..2eadf98
--- /dev/null
@@ -0,0 +1,173 @@
+/*
+ * lxc: linux Container library
+ *
+ * Copyright © 2017 Canonical Ltd.
+ *
+ * Authors:
+ * Christian Brauner <christian.brauner@ubuntu.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#define _GNU_SOURCE
+#define __STDC_FORMAT_MACROS
+#include <errno.h>
+#include <fcntl.h>
+#include <inttypes.h>
+#include <limits.h>
+#include <sched.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/mount.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+
+#include "lxctest.h"
+#include "namespace.h"
+#include "utils.h"
+
+/*
+ * Test driver for lxc_raw_clone().
+ *
+ * The test runs in three stages:
+ *  1. Each flag that lxc_raw_clone() refuses must yield exactly -EINVAL.
+ *  2. A plain lxc_raw_clone(0) must behave like fork(): the child exits with
+ *     a chosen status and the parent checks, via wait_for_pid(), that it
+ *     sees success resp. failure.
+ *  3. The same success/failure round trip is repeated with namespace flags.
+ *
+ * Exits with EXIT_SUCCESS only if every stage passes, EXIT_FAILURE otherwise.
+ * Note that the forked children also run this function and report their
+ * intended exit status via lxc_error() before exiting.
+ */
+int main(int argc, char *argv[])
+{
+       int status;
+       pid_t pid;
+       int flags = 0;
+
+       /* Stage 1: rejected flags. Any return value other than -EINVAL is a
+        * failure; this covers an unexpectedly successful clone (pid >= 0)
+        * as well as any other error value.
+        */
+       pid = lxc_raw_clone(CLONE_PARENT_SETTID);
+       if (pid != -EINVAL) {
+               lxc_error("%s\n", "Calling lxc_raw_clone(CLONE_PARENT_SETTID) "
+                                 "should not be possible");
+               exit(EXIT_FAILURE);
+       }
+
+       pid = lxc_raw_clone(CLONE_CHILD_SETTID);
+       if (pid != -EINVAL) {
+               lxc_error("%s\n", "Calling lxc_raw_clone(CLONE_CHILD_SETTID) "
+                                 "should not be possible");
+               exit(EXIT_FAILURE);
+       }
+
+       pid = lxc_raw_clone(CLONE_CHILD_CLEARTID);
+       if (pid != -EINVAL) {
+               lxc_error("%s\n", "Calling lxc_raw_clone(CLONE_CHILD_CLEARTID) "
+                                 "should not be possible");
+               exit(EXIT_FAILURE);
+       }
+
+       pid = lxc_raw_clone(CLONE_SETTLS);
+       if (pid != -EINVAL) {
+               lxc_error("%s\n", "Calling lxc_raw_clone(CLONE_SETTLS) should "
+                                 "not be possible");
+               exit(EXIT_FAILURE);
+       }
+
+       pid = lxc_raw_clone(CLONE_VM);
+       if (pid != -EINVAL) {
+               lxc_error("%s\n", "Calling lxc_raw_clone(CLONE_VM) should "
+                         "not be possible");
+               exit(EXIT_FAILURE);
+       }
+
+       /* Stage 2a: plain clone, child exits successfully. */
+       pid = lxc_raw_clone(0);
+       if (pid < 0) {
+               lxc_error("%s\n", "Failed to call lxc_raw_clone(0)");
+               exit(EXIT_FAILURE);
+       }
+
+       if (pid == 0) {
+               lxc_error("%s\n", "Child will exit(EXIT_SUCCESS)");
+               exit(EXIT_SUCCESS);
+       }
+
+       /* wait_for_pid() is expected to report 0 for a child that exited
+        * successfully (assumption based on how it is used in this test).
+        */
+       status = wait_for_pid(pid);
+       if (status != 0) {
+               lxc_error("%s\n", "Failed to retrieve correct exit status");
+               exit(EXIT_FAILURE);
+       }
+
+       /* Stage 2b: plain clone, child exits with failure. */
+       pid = lxc_raw_clone(0);
+       if (pid < 0) {
+               lxc_error("%s\n", "Failed to call lxc_raw_clone(0)");
+               exit(EXIT_FAILURE);
+       }
+
+       if (pid == 0) {
+               lxc_error("%s\n", "Child will exit(EXIT_FAILURE)");
+               exit(EXIT_FAILURE);
+       }
+
+       status = wait_for_pid(pid);
+       if (status == 0) {
+               lxc_error("%s\n", "Failed to retrieve correct exit status");
+               exit(EXIT_FAILURE);
+       }
+
+       /* Stage 3a: clone into new namespaces, child exits successfully.
+        * NOTE(review): CLONE_NEWCGROUP is passed unconditionally here while
+        * stage 3b guards it with cgns_supported() - confirm this is
+        * intended.
+        */
+       pid = lxc_raw_clone(CLONE_NEWUSER | CLONE_NEWCGROUP | CLONE_NEWNS |
+                           CLONE_NEWIPC | CLONE_NEWNET | CLONE_NEWIPC |
+                           CLONE_NEWPID | CLONE_NEWUTS);
+       if (pid < 0) {
+               lxc_error("%s\n", "Failed to call lxc_raw_clone(CLONE_NEWUSER "
+                                 "| CLONE_NEWCGROUP | CLONE_NEWNS | "
+                                 "CLONE_NEWIPC | CLONE_NEWNET | CLONE_NEWIPC "
+                                 "| CLONE_NEWPID | CLONE_NEWUTS);");
+               exit(EXIT_FAILURE);
+       }
+
+       if (pid == 0) {
+               lxc_error("%s\n", "Child will exit(EXIT_SUCCESS)");
+               exit(EXIT_SUCCESS);
+       }
+
+       status = wait_for_pid(pid);
+       if (status != 0) {
+               lxc_error("%s\n", "Failed to retrieve correct exit status");
+               exit(EXIT_FAILURE);
+       }
+
+       /* Stage 3b: clone into new namespaces, child exits with failure.
+        * The cgroup namespace flag is only added when cgns_supported()
+        * reports support.
+        */
+       flags |= CLONE_NEWUSER;
+       if (cgns_supported())
+               flags |= CLONE_NEWCGROUP;
+       flags |= CLONE_NEWNS;
+       flags |= CLONE_NEWIPC;
+       flags |= CLONE_NEWNET;
+       flags |= CLONE_NEWIPC;
+       flags |= CLONE_NEWPID;
+       flags |= CLONE_NEWUTS;
+       pid = lxc_raw_clone(flags);
+       if (pid < 0) {
+               lxc_error("%s\n", "Failed to call lxc_raw_clone(CLONE_NEWUSER "
+                                 "| CLONE_NEWCGROUP | CLONE_NEWNS | "
+                                 "CLONE_NEWIPC | CLONE_NEWNET | CLONE_NEWIPC "
+                                 "| CLONE_NEWPID | CLONE_NEWUTS);");
+               exit(EXIT_FAILURE);
+       }
+
+
+       if (pid == 0) {
+               lxc_error("%s\n", "Child will exit(EXIT_FAILURE)");
+               exit(EXIT_FAILURE);
+       }
+
+       /* The child exited with EXIT_FAILURE, so a status of 0 means the
+        * wrong exit status was retrieved and the test must fail.
+        */
+       status = wait_for_pid(pid);
+       if (status == 0) {
+               lxc_error("%s\n", "Failed to retrieve correct exit status");
+               exit(EXIT_FAILURE);
+       }
+
+       lxc_debug("%s\n", "All lxc_raw_clone() tests successful");
+       exit(EXIT_SUCCESS);
+}