From: Maria Matejka Date: Thu, 29 Aug 2024 05:53:47 +0000 (+0200) Subject: Flock: Creating the hypervisor and the external-contact process X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=d79693b0f3e8c4bcd5eb0618048b8fd12a902014;p=thirdparty%2Fbird.git Flock: Creating the hypervisor and the external-contact process This is the first part of rewriting Flock to C to significantly reduce memory footprint of individual machines from 20+M in Python to (goal) less than 1M. Now the process eats ~460k and I suspect that this won't even be the consumption per machine in total as fork() is involved and some parts of the memory will be heavily shared. --- diff --git a/Makefile.in b/Makefile.in index aed6dbc22..9f16bf545 100644 --- a/Makefile.in +++ b/Makefile.in @@ -78,7 +78,7 @@ cli: $(client) $(daemon): LIBS += $(DAEMON_LIBS) # Include directories -dirs := client conf doc filter lib nest test $(addprefix proto/,$(protocols)) @sysdep_dirs@ +dirs := client conf doc filter flock lib nest test $(addprefix proto/,$(protocols)) @sysdep_dirs@ # conf/Makefile declarations needed for all other modules conf-lex-targets := $(addprefix $(objdir)/conf/,cf-lex.o) diff --git a/flock/Makefile b/flock/Makefile new file mode 100644 index 000000000..debbd66cb --- /dev/null +++ b/flock/Makefile @@ -0,0 +1,12 @@ +src := flock.c hypervisor.c +obj := $(src-o-files) + +flock=$(exedir)/flock-sim + +$(flock): $(obj) +$(flock): $(common-lib) +$(flock): LIBS += $(COMMON_LIBS) + +$(flock): + $(E)echo LD $(LDFLAGS) -o $@ $^ $(LIBS) + $(Q)$(CC) $(LDFLAGS) -o $@ $(patsubst $(common-lib),$(shell cat $(common-lib)),$^) $(LIBS) diff --git a/flock/flock.c b/flock/flock.c new file mode 100644 index 000000000..3320cd28e --- /dev/null +++ b/flock/flock.c @@ -0,0 +1,209 @@ +#include "flock/flock.h" + +#include "lib/string.h" + +#include "lib/timer.h" +#include "sysdep/unix/unix.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include 
+#include
+#include
+
+/* Overall configuration */
+struct flock_config flock_config;
+
+/**
+ * Signal handling
+ *
+ * We want to behave as the init process inside the newly created PID namespace,
+ * which means that the signals have different meanings than for other processes.
+ * For more information, see pid_namespaces(7).
+ *
+ * The reboot and poweroff handlers only record a request bit in signal_received;
+ * the main loop picks the bits up and acts on them outside of signal context.
+ */
+
+/* Request bits; modified from signal handlers, hence volatile sig_atomic_t */
+static volatile sig_atomic_t signal_received;
+#define SIGREQ_REBOOT 1
+#define SIGREQ_POWEROFF 2
+#define SIGREQ_FAIL 4
+
+/* Installed for SIGHUP: record a reboot request */
+static void
+hypervisor_reboot_sighandler(int signo UNUSED)
+{
+  signal_received |= SIGREQ_REBOOT;
+}
+
+/* Installed for SIGTERM and SIGINT: record a poweroff request */
+static void
+hypervisor_poweroff_sighandler(int signo UNUSED)
+{
+  signal_received |= SIGREQ_POWEROFF;
+}
+
+/* Installed for SIGQUIT: fork a child which abort()s with the default SIGABRT
+ * disposition, wait for it and terminate this process with exit status 1.
+ * This handler never returns. */
+static void
+hypervisor_fail_sighandler(int signo UNUSED)
+{
+  signal_received |= SIGREQ_FAIL;
+
+  int e = fork();
+  if (e == 0)
+  {
+    signal(SIGABRT, SIG_DFL);
+    abort();
+  }
+
+  if (e > 0)
+    waitpid(e, NULL, 0);
+
+  _exit(1);
+}
+
+
+/*
+ * The Main.
+ *
+ * Bootstrapping and all the fiddling around before anything can actually
+ * be really executed.
+ */
+
+/* Call x(...) and die() with the call name and source location if the result
+ * is negative; otherwise the expression evaluates to the returned value. */
+#define SYSCALL(x, ...) ({ int e = x(__VA_ARGS__); if (e < 0) die("Failed to run %s at %s:%d: %m", #x, __FILE__, __LINE__); e; })
+
+/* Signals blocked during bootstrap and handled explicitly afterwards */
+#define KILLABLE_SIGNALS SIGINT, SIGTERM, SIGHUP, SIGQUIT
+
+/* Print a short usage message to the given stream */
+static inline void
+usage(FILE *f)
+{
+  fprintf(f,
+      "Usage: %s name\n\n"
+      "Runs hypervisor with the given name.\n",
+      flock_config.exec_name);
+}
+
+/**
+ * Entry point.
+ *
+ * Takes exactly one positional argument, the hypervisor name. Unshares the
+ * PID, mount and user namespaces, forks to become PID 1 of the new PID
+ * namespace, maps the original effective UID/GID to local root, remounts
+ * /proc, forks the exposed process and finally unshares the time and network
+ * namespaces before waiting for signals.
+ *
+ * Returns 2 on bad usage and 0 both in the original process right after
+ * the init process has been forked and in the init process on poweroff.
+ */
+int
+main(int argc, char **argv, char **argh UNUSED)
+{
+  /* Prepare necessary infrastructure */
+  the_bird_lock();
+  times_update();
+  resource_init();
+  random_init();
+
+  birdloop_init();
+  boot_time = current_time();
+
+  log_switch(1, NULL, NULL);
+
+  /* Parse args */
+  flock_config.exec_name = argv[0] ?: "flock-sim";
+  int opt;
+  while ((opt = getopt(argc, argv, "")) != -1)
+  {
+    /* TODO: add some options */
+    usage(stderr);
+    return 2;
+  }
+
+  /* Get hypervisor name */
+  if (optind != argc - 1)
+  {
+    usage(stderr);
+    return 2;
+  }
+
+  flock_config.hypervisor_name = argv[optind];
+
+  /* Mask signals for forking and other fragile stuff */
+  sigset_t oldmask;
+  sigset_t newmask;
+  sigemptyset(&newmask);
+#define FROB(x) sigaddset(&newmask, x);
+  MACRO_FOREACH(FROB, KILLABLE_SIGNALS);
+#undef FROB
+  sigprocmask(SIG_BLOCK, &newmask, &oldmask);
+
+  /* Keep the original UID/GIDs */
+  uid_t euid = geteuid();
+  gid_t egid = getegid();
+
+  /* First we need to create the PID + mount + user namespace to acquire capabilities */
+  SYSCALL(unshare, CLONE_NEWPID | CLONE_NEWNS | CLONE_NEWUSER);
+
+  /* Then we have to fork() to become PID 1 of the new PID namespace */
+  pid_t init_pid = fork();
+  if (init_pid < 0)
+    die("Failed to become init: %m");
+
+  /* The parent process may end now
+   * TODO: allow wait() and/or writing PIDfile
+   * instead of just ending */
+  if (init_pid > 0)
+    return 0;
+
+  /* We also need to fix some UID/GID mappings to become local root.
+   * TODO: this will need an upgrade for full-scale containers. */
+#define WRITE_ONCE(file, data, len) do { \
+  int fd = SYSCALL(open, file, O_WRONLY); \
+  int e = write(fd, data, len); \
+  if (e != len) die("Failed to write %s to %s", data, file); \
+  close(fd); \
+} while (0)
+
+  {
+    char fixer[256];
+
+    /* The IDs are unsigned, print them as such */
+    int len = bsnprintf(fixer, sizeof fixer, "0 %u 1", (uint) euid);
+    WRITE_ONCE("/proc/self/uid_map", fixer, len);
+
+    WRITE_ONCE("/proc/self/setgroups", "deny", sizeof "deny");
+
+    len = bsnprintf(fixer, sizeof fixer, "0 %u 1", (uint) egid);
+    WRITE_ONCE("/proc/self/gid_map", fixer, len);
+  }
+#undef WRITE_ONCE
+
+  /* Remounting proc to reflect the new PID namespace */
+  SYSCALL(mount, "none", "/", NULL, MS_REC | MS_PRIVATE, NULL);
+  SYSCALL(mount, "proc", "/proc", "proc", MS_NOSUID | MS_NODEV | MS_NOEXEC, NULL);
+
+  /* Now we are init but in the original network namespace,
+   * let's spawn a child to do external communication before unsharing */
+  hypervisor_exposed_fork();
+
+  /* And now finally we can go for unsharing the rest -- networks and time */
+  SYSCALL(unshare, CLONE_NEWTIME | CLONE_NEWNET);
+
+  /* Set signal handlers as this process is init in its PID namespace */
+  signal(SIGTERM, hypervisor_poweroff_sighandler);
+  signal(SIGINT, hypervisor_poweroff_sighandler);
+  signal(SIGHUP, hypervisor_reboot_sighandler);
+  signal(SIGQUIT, hypervisor_fail_sighandler);
+
+  /* The signals stay blocked here. They are delivered only inside
+   * sigsuspend() below, so that no request can slip in between
+   * checking the request bits and going to sleep. */
+
+  /* Check limits */
+  struct rlimit corelimit;
+  if (getrlimit(RLIMIT_CORE, &corelimit) < 0)
+    log(L_WARN "Failed to read core limit: %m");
+  else
+    log(L_INFO "Core limit %lu %lu", (u64) corelimit.rlim_cur, (u64) corelimit.rlim_max);
+
+  /* Wait for Godot */
+  log(L_INFO "Hypervisor running");
+  while (1)
+  {
+    /* Atomically switch to the original signal mask and sleep until
+     * a handler has run; the blocking mask is restored on return. */
+    sigsuspend(&oldmask);
+
+    /* Signals are blocked again, so this read-and-clear can not race */
+    uint s = signal_received;
+    signal_received &= ~s;
+
+    if (s & SIGREQ_FAIL)
+      bug("Fail flag should never propagate from signal");
+    else if (s & SIGREQ_POWEROFF)
+      return 0;
+    else if (s & SIGREQ_REBOOT)
+      log(L_ERR "Reboot requested but not implemented");
+  }
+}
diff --git a/flock/flock.h b/flock/flock.h
new file mode 100644
index 000000000..285339ae5
---
/dev/null
+++ b/flock/flock.h
@@ -0,0 +1,18 @@
+#define _GNU_SOURCE
+
+#ifndef INCLUDE_FLOCK_H
+#define INCLUDE_FLOCK_H
+#include "lib/birdlib.h"
+
+/* Fork the external-contact ("exposed") process; defined in hypervisor.c */
+void hypervisor_exposed_fork(void);
+
+/* Overall configuration, filled in by main() from the command line */
+struct flock_config {
+  const char *hypervisor_name;  /* The only positional argument */
+  const char *exec_name;        /* argv[0] or a fallback, shown by usage() */
+};
+
+extern struct flock_config flock_config;
+
+#endif
diff --git a/flock/hypervisor.c b/flock/hypervisor.c
new file mode 100644
index 000000000..8b2f955ae
--- /dev/null
+++ b/flock/hypervisor.c
@@ -0,0 +1,123 @@
+#include "flock/flock.h"
+
+#include "lib/birdlib.h"
+
+#include "lib/resource.h"
+#include "lib/io-loop.h"
+
+#include
+
+/* Local communication structure */
+static struct hypervisor_exposed {
+  pool *p;                      /* Pool the socket is allocated from */
+  sock *s;                      /* This process' end of the socketpair */
+  struct birdloop *loop;        /* Loop the socket is opened in */
+} he;
+
+/**
+ * Read one chunk of pending data from the interlink socket and throw it away.
+ *
+ * There is no protocol on the interlink yet; the data is consumed only so
+ * that it does not stay queued on the socket.
+ * NOTE(review): end of stream (recvmsg() returning 0) is not handled yet.
+ */
+static void
+hypervisor_exposed_discard(sock *sk)
+{
+  char buf[256];
+  struct iovec iov = { .iov_base = buf, .iov_len = sizeof buf };
+  struct msghdr msg = { .msg_iov = &iov, .msg_iovlen = 1 };
+
+  /* recvmsg() needs a real msghdr; a NULL one only yields EFAULT */
+  if (recvmsg(sk->fd, &msg, 0) < 0)
+    log(L_WARN "Exposed interlink: recvmsg failed: %m");
+}
+
+/**
+ * Exposed process' parent side (requestor)
+ **/
+
+/* RX hook of the parent's end: log the event and consume the data */
+static int
+hypervisor_exposed_parent_rx(sock *sk, uint size UNUSED)
+{
+  log(L_INFO "HV EP RX");
+  hypervisor_exposed_discard(sk);
+  return 0;
+}
+
+/* Error hook of the parent's end: currently empty */
+static void
+hypervisor_exposed_parent_err(sock *sk UNUSED, int e UNUSED)
+{
+}
+
+/**
+ * Exposed process' child side (executor)
+ **/
+
+/* RX hook of the child's end: log the event and consume the data */
+static int
+hypervisor_exposed_child_rx(sock *sk, uint size UNUSED)
+{
+  log(L_INFO "HV EC RX");
+  hypervisor_exposed_discard(sk);
+  return 0;
+}
+
+/* Error hook of the child's end: currently empty */
+static void
+hypervisor_exposed_child_err(sock *sk UNUSED, int e UNUSED)
+{
+}
+
+/**
+ * Common init code
+ *
+ * Creates a socketpair, forks, and in both resulting processes sets up
+ * a birdloop with a socket on the respective end of the pair. The parent
+ * then returns to the caller; the child never returns.
+ */
+void
+hypervisor_exposed_fork(void)
+{
+  int fds[2], e;
+
+  /* create socketpair before forking to do communication */
+  e = socketpair(AF_UNIX, SOCK_STREAM, 0, fds);
+  if (e < 0)
+    die("Failed to create internal socketpair: %m");
+
+  e = fork();
+  if (e < 0)
+    die("Failed to fork exposed: %m");
+
+  /* Create the communication channel (both sides at once) */
+  he.loop = birdloop_new(&root_pool, DOMAIN_ORDER(proto), 0, "Exposed interlink");
+
+  birdloop_enter(he.loop);
+  he.p = rp_new(birdloop_pool(he.loop), birdloop_domain(he.loop), "Exposed interlink pool");
+  he.s = sk_new(he.p);
+  he.s->type = SK_MAGIC;
+  he.s->rx_hook = e ? hypervisor_exposed_parent_rx : hypervisor_exposed_child_rx;
+  he.s->err_hook = e ? hypervisor_exposed_parent_err : hypervisor_exposed_child_err;
+
+  /* The parent (e != 0) keeps fds[1], the child keeps fds[0]; close the other end */
+  he.s->fd = fds[!!e];
+  close(fds[!e]);
+
+  if (sk_open(he.s, he.loop) < 0)
+    bug("Exposed %s: sk_open failed", e ? "parent" : "child");
+
+  birdloop_leave(he.loop);
+
+  /* Now there is a loop both in child and parent, prepared to read the socket.
+   * There is only one difference. Whereas the parent has to continue its run
+   * to do other duties, the child is stuck here forever. */
+  if (e)
+    return;
+
+  /* Child-only */
+  while (1)
+    pause();
+}