]> git.ipfire.org Git - thirdparty/lxc.git/commitdiff
namespace: add lxc_raw_clone()
authorChristian Brauner <christian.brauner@ubuntu.com>
Thu, 14 Dec 2017 14:31:54 +0000 (15:31 +0100)
committerChristian Brauner <christian.brauner@ubuntu.com>
Thu, 14 Dec 2017 21:18:28 +0000 (22:18 +0100)
This is based on raw_clone in systemd but adapted to our needs. The main reason
is that we need an implementation of fork()/clone() that guarantees that no
pthread_atfork() handlers are run. As long as clone() in glibc doesn't run
pthread_atfork() handlers we are fine, but there's no guarantee that it will
stay that way in the future. So let's do the syscall directly - or as directly
as we can. An additional nice feature is that we get fork() behavior, i.e.
lxc_raw_clone() returns 0 in the child and the child pid in the parent.

Our implementation tries to make sure that we cover all cases according to
kernel sources. Note that we are not interested in any arguments that could be
passed after the stack.

Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
src/lxc/namespace.c
src/lxc/namespace.h
src/tests/Makefile.am
src/tests/lxc_raw_clone.c [new file with mode: 0644]

index d716676ed9391b790cee5f63b326622ce9ac2d9b..e9c0ddd0f09dc7a9fa57a76b7e148cc715a0f6b4 100644 (file)
 #include <alloca.h>
 #include <errno.h>
 #include <fcntl.h>
+#include <sched.h>
 #include <signal.h>
 #include <unistd.h>
 #include <sys/param.h>
 #include <sys/stat.h>
+#include <sys/syscall.h>
 #include <sys/types.h>
 
 #include "log.h"
@@ -59,8 +61,7 @@ pid_t lxc_clone(int (*fn)(void *), void *arg, int flags)
        pid_t ret;
 
 #ifdef __ia64__
-       ret = __clone2(do_clone, stack,
-                      stack_size, flags | SIGCHLD, &clone_arg);
+       ret = __clone2(do_clone, stack, stack_size, flags | SIGCHLD, &clone_arg);
 #else
        ret = clone(do_clone, stack  + stack_size, flags | SIGCHLD, &clone_arg);
 #endif
@@ -70,6 +71,62 @@ pid_t lxc_clone(int (*fn)(void *), void *arg, int flags)
        return ret;
 }
 
+/**
+ * lxc_raw_clone - fork()-like wrapper around the raw clone() system call
+ *
+ * This is based on raw_clone in systemd but adapted to our needs. The system
+ * call is issued via syscall() (or inline assembly on sparc64) rather than
+ * through the libc clone() wrapper. It uses copy on write semantics and
+ * doesn't pass a stack. CLONE_VM is tricky and doesn't really matter to us so
+ * it is disallowed.
+ *
+ * @flags: clone flags to pass to the kernel. SIGCHLD is always OR-ed in.
+ *         CLONE_VM, CLONE_PARENT_SETTID, CLONE_CHILD_SETTID,
+ *         CLONE_CHILD_CLEARTID and CLONE_SETTLS are rejected.
+ *
+ * Returns 0 in the child and the child pid in the parent, i.e. we get fork()
+ * behavior. If a rejected flag is passed, -EINVAL is returned and errno is set
+ * to EINVAL. If the system call itself fails, the syscall()-based paths return
+ * the (negative) value reported by syscall() with errno set by it.
+ */
+pid_t lxc_raw_clone(unsigned long flags)
+{
+       /* These flags don't interest us at all so we don't jump through any
+        * hoops of retrieving them and passing them to the kernel.
+        */
+       if ((flags & (CLONE_VM | CLONE_PARENT_SETTID | CLONE_CHILD_SETTID |
+                     CLONE_CHILD_CLEARTID | CLONE_SETTLS))) {
+               /* Only touch errno when we actually fail so that a successful
+                * clone doesn't leave a stale EINVAL behind for the caller.
+                */
+               errno = EINVAL;
+               return -EINVAL;
+       }
+
+#if defined(__s390x__) || defined(__s390__) || defined(__CRIS__)
+       /* On s390/s390x and cris the order of the first and second arguments
+        * of the system call is reversed: the (NULL) stack comes first, the
+        * flags second.
+        */
+       return (int)syscall(__NR_clone, NULL, flags | SIGCHLD);
+#elif defined(__sparc__) && defined(__arch64__)
+       {
+               /**
+                * sparc64 always returns the other process id in %o0, and
+                * a boolean flag whether this is the child or the parent in
+                * %o1. Inline assembly is needed to get the flag returned
+                * in %o1.
+                */
+               int in_child;
+               int child_pid;
+               asm volatile("mov %2, %%g1\n\t"
+                            "mov %3, %%o0\n\t"
+                            "mov 0 , %%o1\n\t"
+                            "t 0x6d\n\t"
+                            "mov %%o1, %0\n\t"
+                            "mov %%o0, %1"
+                            : "=r"(in_child), "=r"(child_pid)
+                            : "i"(__NR_clone), "r"(flags | SIGCHLD)
+                            : "%o1", "%o0", "%g1");
+               /* Translate the (pid, is-child) pair into fork() semantics. */
+               if (in_child)
+                       return 0;
+               else
+                       return child_pid;
+       }
+#elif defined(__ia64__)
+       /* On ia64 the stack and stack size are passed as separate arguments. */
+       return (int)syscall(__NR_clone, flags | SIGCHLD, NULL, 0);
+#else
+       /* Common case: flags first, then a NULL stack. */
+       return (int)syscall(__NR_clone, flags | SIGCHLD, NULL);
+#endif
+}
+
 /* Leave the user namespace at the first position in the array of structs so
  * that we always attach to it first when iterating over the struct and using
  * setns() to switch namespaces. This especially affects lxc_attach(): Suppose
index 7644fcd60cf05795f962806bc02a51925a774dc0..84453d9f0df5c3c54ea6c921a6d60b640304a8e9 100644 (file)
 
 #include "config.h"
 
+/* Fallback values for clone flags, listed in ascending order of their bit
+ * value. Each one is only defined when the headers included so far have not
+ * already defined it.
+ */
+#ifndef CLONE_VM
+#  define CLONE_VM                0x00000100
+#endif
+
+#ifndef CLONE_FILES
+#  define CLONE_FILES             0x00000400
+#endif
+
+#ifndef CLONE_VFORK
+#  define CLONE_VFORK             0x00004000
+#endif
+
+#ifndef CLONE_THREAD
+#  define CLONE_THREAD            0x00010000
+#endif
+
+#ifndef CLONE_SETTLS
+#  define CLONE_SETTLS            0x00080000
+#endif
+
+#ifndef CLONE_PARENT_SETTID
+#  define CLONE_PARENT_SETTID     0x00100000
+#endif
+
+#ifndef CLONE_CHILD_CLEARTID
+#  define CLONE_CHILD_CLEARTID    0x00200000
+#endif
+
+#ifndef CLONE_CHILD_SETTID
+#  define CLONE_CHILD_SETTID      0x01000000
+#endif
+
 #ifndef CLONE_FS
 #  define CLONE_FS                0x00000200
 #endif
@@ -81,6 +113,7 @@ int clone(int (*fn)(void *), void *child_stack,
 #endif
 
 extern pid_t lxc_clone(int (*fn)(void *), void *arg, int flags);
+/* fork()-like raw clone(): returns 0 in the child and the child pid in the
+ * parent. No stack is passed; see the definition in namespace.c for the flags
+ * that are rejected.
+ */
+extern pid_t lxc_raw_clone(unsigned long flags);
 
 extern int lxc_namespace_2_cloneflag(const char *namespace);
 extern int lxc_namespace_2_ns_idx(const char *namespace);
index 0525ca909110102b3c7a60a8761a310b5dcab12e..6df1a9fa63fdf1296fe3152bc521b77ab133fb3e 100644 (file)
@@ -33,6 +33,7 @@ lxc_test_livepatch_SOURCES = livepatch.c lxctest.h
 lxc_test_state_server_SOURCES = state_server.c lxctest.h
 lxc_test_share_ns_SOURCES = share_ns.c lxctest.h
 lxc_test_criu_check_feature_SOURCES = criu_check_feature.c lxctest.h
+lxc_test_raw_clone_SOURCES = lxc_raw_clone.c lxctest.h
 
 AM_CFLAGS=-DLXCROOTFSMOUNT=\"$(LXCROOTFSMOUNT)\" \
        -DLXCPATH=\"$(LXCPATH)\" \
@@ -63,7 +64,7 @@ bin_PROGRAMS = lxc-test-containertests lxc-test-locktests lxc-test-startone \
        lxc-test-apparmor lxc-test-utils lxc-test-parse-config-file \
        lxc-test-config-jump-table lxc-test-shortlived lxc-test-livepatch \
        lxc-test-api-reboot lxc-test-state-server lxc-test-share-ns \
-       lxc-test-criu-check-feature
+       lxc-test-criu-check-feature lxc-test-raw-clone
 
 bin_SCRIPTS = lxc-test-automount \
              lxc-test-autostart \
@@ -104,6 +105,7 @@ EXTRA_DIST = \
        livepatch.c \
        locktests.c \
        lxcpath.c \
+       lxc_raw_clone.c \
        lxc-test-lxc-attach \
        lxc-test-automount \
        lxc-test-rootfs \
diff --git a/src/tests/lxc_raw_clone.c b/src/tests/lxc_raw_clone.c
new file mode 100644 (file)
index 0000000..2eadf98
--- /dev/null
@@ -0,0 +1,173 @@
+/*
+ * lxc: linux Container library
+ *
+ * Copyright © 2017 Canonical Ltd.
+ *
+ * Authors:
+ * Christian Brauner <christian.brauner@ubuntu.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#define _GNU_SOURCE
+#define __STDC_FORMAT_MACROS
+#include <errno.h>
+#include <fcntl.h>
+#include <inttypes.h>
+#include <limits.h>
+#include <sched.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/mount.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+
+#include "lxctest.h"
+#include "namespace.h"
+#include "utils.h"
+
+/*
+ * Test driver for lxc_raw_clone().
+ *
+ * First checks that each flag lxc_raw_clone() is expected to refuse yields
+ * -EINVAL, then clones children with and without namespace flags and checks
+ * that the status returned by wait_for_pid() matches how the child exited.
+ * Exits with EXIT_SUCCESS only if every check passes, EXIT_FAILURE otherwise.
+ *
+ * NOTE(review): wait_for_pid() comes from utils.h; the checks below assume it
+ * returns 0 only when the child exited with status 0 - confirm.
+ */
+int main(int argc, char *argv[])
+{
+       int status;
+       pid_t pid;
+       int flags = 0;
+
+       /* Every unsupported flag must be refused with exactly -EINVAL. Any
+        * other return value (including a valid pid or 0 in a child) counts
+        * as a failure.
+        */
+       pid = lxc_raw_clone(CLONE_PARENT_SETTID);
+       if (pid != -EINVAL) {
+               lxc_error("%s\n", "Calling lxc_raw_clone(CLONE_PARENT_SETTID) "
+                                 "should not be possible");
+               exit(EXIT_FAILURE);
+       }
+
+       pid = lxc_raw_clone(CLONE_CHILD_SETTID);
+       if (pid != -EINVAL) {
+               lxc_error("%s\n", "Calling lxc_raw_clone(CLONE_CHILD_SETTID) "
+                                 "should not be possible");
+               exit(EXIT_FAILURE);
+       }
+
+       pid = lxc_raw_clone(CLONE_CHILD_CLEARTID);
+       if (pid != -EINVAL) {
+               lxc_error("%s\n", "Calling lxc_raw_clone(CLONE_CHILD_CLEARTID) "
+                                 "should not be possible");
+               exit(EXIT_FAILURE);
+       }
+
+       pid = lxc_raw_clone(CLONE_SETTLS);
+       if (pid != -EINVAL) {
+               lxc_error("%s\n", "Calling lxc_raw_clone(CLONE_SETTLS) should "
+                                 "not be possible");
+               exit(EXIT_FAILURE);
+       }
+
+       pid = lxc_raw_clone(CLONE_VM);
+       if (pid != -EINVAL) {
+               lxc_error("%s\n", "Calling lxc_raw_clone(CLONE_VM) should "
+                         "not be possible");
+               exit(EXIT_FAILURE);
+       }
+
+       /* Plain clone, child exits successfully: parent must see status 0. */
+       pid = lxc_raw_clone(0);
+       if (pid < 0) {
+               lxc_error("%s\n", "Failed to call lxc_raw_clone(0)");
+               exit(EXIT_FAILURE);
+       }
+
+       if (pid == 0) {
+               lxc_error("%s\n", "Child will exit(EXIT_SUCCESS)");
+               exit(EXIT_SUCCESS);
+       }
+
+       status = wait_for_pid(pid);
+       if (status != 0) {
+               lxc_error("%s\n", "Failed to retrieve correct exit status");
+               exit(EXIT_FAILURE);
+       }
+
+       /* Plain clone, child exits with failure: parent must see non-zero. */
+       pid = lxc_raw_clone(0);
+       if (pid < 0) {
+               lxc_error("%s\n", "Failed to call lxc_raw_clone(0)");
+               exit(EXIT_FAILURE);
+       }
+
+       if (pid == 0) {
+               lxc_error("%s\n", "Child will exit(EXIT_FAILURE)");
+               exit(EXIT_FAILURE);
+       }
+
+       status = wait_for_pid(pid);
+       if (status == 0) {
+               lxc_error("%s\n", "Failed to retrieve correct exit status");
+               exit(EXIT_FAILURE);
+       }
+
+       /* Clone into a full set of new namespaces, child exits successfully. */
+       pid = lxc_raw_clone(CLONE_NEWUSER | CLONE_NEWCGROUP | CLONE_NEWNS |
+                           CLONE_NEWIPC | CLONE_NEWNET | CLONE_NEWIPC |
+                           CLONE_NEWPID | CLONE_NEWUTS);
+       if (pid < 0) {
+               lxc_error("%s\n", "Failed to call lxc_raw_clone(CLONE_NEWUSER "
+                                 "| CLONE_NEWCGROUP | CLONE_NEWNS | "
+                                 "CLONE_NEWIPC | CLONE_NEWNET | CLONE_NEWIPC "
+                                 "| CLONE_NEWPID | CLONE_NEWUTS);");
+               exit(EXIT_FAILURE);
+       }
+
+       if (pid == 0) {
+               lxc_error("%s\n", "Child will exit(EXIT_SUCCESS)");
+               exit(EXIT_SUCCESS);
+       }
+
+       status = wait_for_pid(pid);
+       if (status != 0) {
+               lxc_error("%s\n", "Failed to retrieve correct exit status");
+               exit(EXIT_FAILURE);
+       }
+
+       /* Same namespace set again, but CLONE_NEWCGROUP is only requested when
+        * cgns_supported() says so, and this time the child exits with
+        * failure.
+        */
+       flags |= CLONE_NEWUSER;
+       if (cgns_supported())
+               flags |= CLONE_NEWCGROUP;
+       flags |= CLONE_NEWNS;
+       flags |= CLONE_NEWIPC;
+       flags |= CLONE_NEWNET;
+       flags |= CLONE_NEWIPC;
+       flags |= CLONE_NEWPID;
+       flags |= CLONE_NEWUTS;
+       pid = lxc_raw_clone(flags);
+       if (pid < 0) {
+               lxc_error("%s\n", "Failed to call lxc_raw_clone(CLONE_NEWUSER "
+                                 "| CLONE_NEWCGROUP | CLONE_NEWNS | "
+                                 "CLONE_NEWIPC | CLONE_NEWNET | CLONE_NEWIPC "
+                                 "| CLONE_NEWPID | CLONE_NEWUTS);");
+               exit(EXIT_FAILURE);
+       }
+
+       if (pid == 0) {
+               lxc_error("%s\n", "Child will exit(EXIT_FAILURE)");
+               exit(EXIT_FAILURE);
+       }
+
+       status = wait_for_pid(pid);
+       if (status == 0) {
+               /* The child exited with failure, so a zero status here means
+                * the test itself failed and must not report success.
+                */
+               lxc_error("%s\n", "Failed to retrieve correct exit status");
+               exit(EXIT_FAILURE);
+       }
+
+       lxc_debug("%s\n", "All lxc_raw_clone() tests successful");
+       exit(EXIT_SUCCESS);
+}