From: Michael Tremer Date: Wed, 3 Aug 2022 08:42:18 +0000 (+0000) Subject: execute: Replace old code with new jail X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=6d97c6e54c1c3aa6a246995471ec23133dcc28d5;p=people%2Fstevee%2Fpakfire.git execute: Replace old code with new jail Signed-off-by: Michael Tremer --- diff --git a/src/libpakfire/execute.c b/src/libpakfire/execute.c index 97badfe8..531d1a51 100644 --- a/src/libpakfire/execute.c +++ b/src/libpakfire/execute.c @@ -18,257 +18,19 @@ # # #############################################################################*/ -#include #include -#include -#include -#include -#include -#include #include -#include -#include -#include -#include -#include #include -#include -#include -#include #include -// libseccomp -#include - -#include -#include #include #include #include -#include #include #include -#define ENVIRON_SIZE 128 -#define BUFFER_SIZE 1024 * 64 -#define EPOLL_MAX_EVENTS 2 #define LDCONFIG "/sbin/ldconfig" -// The default environment that will be set for every command -static const struct environ { - const char* key; - const char* val; -} default_environ[] = { - { "LANG", "en_US.utf-8" }, - { "TERM", "vt100" }, - { NULL, NULL }, -}; - -struct pakfire_execute { - struct pakfire* pakfire; - - // Flags - int flags; - - // Environment - const char** argv; - char* envp[ENVIRON_SIZE]; - - char cgroup[PATH_MAX]; - - // File descriptors - int stdout[2]; - int stderr[2]; -}; - -struct pakfire_execute_buffer { - char data[BUFFER_SIZE]; - size_t used; -}; - -static int clone3(struct clone_args* args, size_t size) { - return syscall(__NR_clone3, args, size); -} - -static int pakfire_execute_buffer_is_full(const struct pakfire_execute_buffer* buffer) { - return (sizeof(buffer->data) == buffer->used); -} - -/* - This function reads as much data as it can from the file descriptor. - If it finds a whole line in it, it will send it to the logger and repeat the process. - If not newline character is found, it will try to read more data until it finds one. -*/ -static int pakfire_execute_logger_proxy(struct pakfire* pakfire, int fd, - pakfire_execute_logging_callback logging_callback, void* data, int priority, - struct pakfire_execute_buffer* buffer) { - char line[BUFFER_SIZE + 1]; - - // Fill up buffer from fd - if (buffer->used < sizeof(buffer->data)) { - ssize_t bytes_read = read(fd, buffer->data + buffer->used, - sizeof(buffer->data) - buffer->used); - - // Handle errors - if (bytes_read < 0) { - ERROR(pakfire, "Could not read from fd %d: %m\n", fd); - return -1; - } - - // Update buffer size - buffer->used += bytes_read; - } - - // See if we have any lines that we can write - while (buffer->used) { - // Search for the end of the first line - char* eol = memchr(buffer->data, '\n', buffer->used); - - // No newline found - if (!eol) { - // If the buffer is full, we send the content to the logger and try again - // This should not happen in practise - if (pakfire_execute_buffer_is_full(buffer)) { - ERROR(pakfire, "Logging buffer is full. Sending all content\n"); - eol = buffer->data + sizeof(buffer->data) - 1; - - // Otherwise we might have only read parts of the output - } else - break; - } - - // Find the length of the string - size_t length = eol - buffer->data + 1; - - // Copy the line into the buffer - memcpy(line, buffer->data, length); - - // Terminate the string - line[length] = '\0'; - - // Log the line - int r = logging_callback(pakfire, data, priority, line, length); - if (r) { - ERROR(pakfire, "The logging callback returned an error: %d\n", r); - return r; - } - - // Remove line from buffer - memmove(buffer->data, buffer->data + length, buffer->used - length); - buffer->used -= length; - } - - return 0; -} - -static int pakfire_execute_logger(struct pakfire* pakfire, pakfire_execute_logging_callback logging_callback, - void* data, pid_t pid, int stdout, int stderr, int* status) { - int epollfd = -1; - struct epoll_event ev; - struct epoll_event events[EPOLL_MAX_EVENTS]; - int r = 0; - - int fds[2] = { - stdout, stderr, - }; - - // Allocate buffers - struct buffers { - struct pakfire_execute_buffer stdout; - struct pakfire_execute_buffer stderr; - } buffers; - buffers.stdout.used = buffers.stderr.used = 0; - - // Setup epoll - epollfd = epoll_create1(0); - if (epollfd < 0) { - ERROR(pakfire, "Could not initialize epoll(): %m\n"); - r = -errno; - goto OUT; - } - - ev.events = EPOLLIN; - - // Turn file descriptors into non-blocking mode and add them to epoll() - for (unsigned int i = 0; i < 2; i++) { - int fd = fds[i]; - - // Read flags - int flags = fcntl(fd, F_GETFL, 0); - - // Set modified flags - if (fcntl(fd, F_SETFL, flags | O_NONBLOCK) < 0) { - ERROR(pakfire, "Could not set file descriptor %d into non-blocking mode: %m\n", - fd); - r = -errno; - goto OUT; - } - - ev.data.fd = fd; - - if (epoll_ctl(epollfd, EPOLL_CTL_ADD, fd, &ev) < 0) { - ERROR(pakfire, "Could not add file descriptor %d to epoll(): %m\n", fd); - r = -errno; - goto OUT; - } - } - - int ended = 0; - - // Loop for as long as the process is alive - while (!ended) { - // If waitpid() returns non-zero, the process has ended, but we want to perform - // one last iteration over the loop to read any remaining content from the file - // descriptor buffers. - r = waitpid(pid, status, WNOHANG); - if (r) - ended = 1; - - int num = epoll_wait(epollfd, events, EPOLL_MAX_EVENTS, -1); - if (num < 1) { - // Ignore if epoll_wait() has been interrupted - if (errno == EINTR) - continue; - - ERROR(pakfire, "epoll_wait() failed: %m\n"); - r = -errno; - - goto OUT; - } - - struct pakfire_execute_buffer* buffer; - int priority; - - for (int i = 0; i < num; i++) { - int fd = events[i].data.fd; - - if (fd == stdout) { - buffer = &buffers.stdout; - priority = LOG_INFO; - - } else if (fd == stderr) { - buffer = &buffers.stderr; - priority = LOG_ERR; - - } else { - DEBUG(pakfire, "Received invalid file descriptor %d\n", fd); - continue; - } - - // Send everything to the logger - r = pakfire_execute_logger_proxy(pakfire, fd, logging_callback, data, priority, buffer); - if (r) - goto OUT; - } - } - -OUT: - if (epollfd > 0) - close(epollfd); - - return r; -} - static int default_logging_callback(struct pakfire* pakfire, void* data, int priority, const char* line, size_t length) { switch (priority) { @@ -337,572 +99,34 @@ int pakfire_execute_capture_stdout_to_array(struct pakfire* pakfire, void* data, return default_logging_callback(pakfire, NULL, priority, line, length); } -static int pakfire_drop_capabilities(struct pakfire* pakfire) { - const int capabilities[] = { - // Deny access to the kernel's audit system - CAP_AUDIT_CONTROL, - CAP_AUDIT_READ, - CAP_AUDIT_WRITE, - - // Deny suspending block devices - CAP_BLOCK_SUSPEND, - - // Deny any stuff with BPF - CAP_BPF, - - // Deny checkpoint restore - CAP_CHECKPOINT_RESTORE, - - // Deny opening files by inode number (open_by_handle_at) - CAP_DAC_READ_SEARCH, - - // Deny setting SUID bits - CAP_FSETID, - - // Deny locking more memory - CAP_IPC_LOCK, - - // Deny modifying any Apparmor/SELinux/SMACK configuration - CAP_MAC_ADMIN, - CAP_MAC_OVERRIDE, - - // Deny creating any special devices - CAP_MKNOD, - - // Deny setting any capabilities - CAP_SETFCAP, - - // Deny reading from syslog - CAP_SYSLOG, - - // Deny any admin actions (mount, sethostname, ...) - CAP_SYS_ADMIN, - - // Deny rebooting the system - CAP_SYS_BOOT, - - // Deny loading kernel modules - CAP_SYS_MODULE, - - // Deny setting nice level - CAP_SYS_NICE, - - // Deny access to /proc/kcore, /dev/mem, /dev/kmem - CAP_SYS_RAWIO, - - // Deny circumventing any resource limits - CAP_SYS_RESOURCE, - - // Deny setting the system time - CAP_SYS_TIME, - - // Deny playing with suspend - CAP_WAKE_ALARM, - - 0, - }; - - size_t num_caps = 0; - int r; - - // Drop any capabilities - for (const int* cap = capabilities; *cap; cap++) { - r = prctl(PR_CAPBSET_DROP, *cap, 0, 0, 0); - if (r) { - ERROR(pakfire, "Could not drop capability %d: %m\n", *cap); - return r; - } - - num_caps++; - } - - // Fetch any capabilities - cap_t caps = cap_get_proc(); - if (!caps) { - ERROR(pakfire, "Could not read capabilities: %m\n"); - return 1; - } - - /* - Set inheritable capabilities - - This ensures that no processes will be able to gain any of the listed - capabilities again. - */ - r = cap_set_flag(caps, CAP_INHERITABLE, num_caps, capabilities, CAP_CLEAR); - if (r) { - ERROR(pakfire, "cap_set_flag() failed: %m\n"); - goto ERROR; - } - - // Restore capabilities - r = cap_set_proc(caps); - if (r) { - ERROR(pakfire, "Could not restore capabilities: %m\n"); - goto ERROR; - } - -ERROR: - if (caps) - cap_free(caps); - - return r; -} - -static int pakfire_limit_syscalls(struct pakfire* pakfire) { - const int syscalls[] = { - // The kernel's keyring isn't namespaced - SCMP_SYS(keyctl), - SCMP_SYS(add_key), - SCMP_SYS(request_key), - - // Disable userfaultfd - SCMP_SYS(userfaultfd), - - // Disable perf which could leak a lot of information about the host - SCMP_SYS(perf_event_open), - - 0, - }; - int r = 1; - - // Setup a syscall filter which allows everything by default - scmp_filter_ctx ctx = seccomp_init(SCMP_ACT_ALLOW); - if (!ctx) { - ERROR(pakfire, "Could not setup seccomp filter: %m\n"); - goto ERROR; - } - - // All all syscalls - for (const int* syscall = syscalls; *syscall; syscall++) { - r = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), *syscall, 0); - if (r) { - ERROR(pakfire, "Could not configure syscall %d: %m\n", *syscall); - goto ERROR; - } - } - - // Load syscall filter into the kernel - r = seccomp_load(ctx); - if (r) { - ERROR(pakfire, "Could not load syscall filter into the kernel: %m\n"); - goto ERROR; - } - -ERROR: - if (ctx) - seccomp_release(ctx); - - return r; -} - -static int find_environ(struct pakfire_execute* env, const char* key) { - if (!key) { - errno = EINVAL; - return -1; - } - - char buffer[strlen(key) + 2]; - unsigned int i = 0; - - pakfire_string_format(buffer, "%s=", key); - - for (i = 0; env->envp[i]; i++) { - if (pakfire_string_startswith(env->envp[i], buffer)) - return i; - } - - // Return -ENOSPC when the environment is full - if (i >= ENVIRON_SIZE) { - errno = ENOSPC; - return -1; - } - - // Return the index of the next free slot - return i; -} - -static int set_environ(struct pakfire_execute* env, const char* key, const char* value) { - // Find the index where to write this value to - int idx = find_environ(env, key); - if (idx < 0) - return idx; - - // Free any previous value - if (env->envp[idx]) - free(env->envp[idx]); - - // Format and set environment variable - asprintf(&env->envp[idx], "%s=%s", key, value); - - DEBUG(env->pakfire, "Set environment variable: %s\n", env->envp[idx]); - - return 0; -} - -static int pakfire_execute_write_uidgid_mapping(struct pakfire* pakfire, - const char* path, uid_t mapped_id, size_t length) { - int r = 1; - - // Open file for writing - FILE* f = fopen(path, "w"); - if (!f) { - ERROR(pakfire, "Could not open %s for writing: %m\n", path); - goto ERROR; - } - - // Write configuration - int bytes_written = fprintf(f, "%d %d %ld\n", 0, mapped_id, length); - if (bytes_written < 0) { - ERROR(pakfire, "Could not write UID mapping: %m\n"); - goto ERROR; - } - - // Success - r = 0; - -ERROR: - if (f) - fclose(f); - - return r; -} - -static int pakfire_execute_setup_uid_mapping(struct pakfire* pakfire) { - // XXX hard-coded values - const uid_t mapped_uid = 100000; - const size_t length = 64536; - - return pakfire_execute_write_uidgid_mapping(pakfire, - "/proc/self/uid_map", mapped_uid, length); -} - -static int pakfire_execute_setup_gid_mapping(struct pakfire* pakfire) { - // XXX hard-coded values - const uid_t mapped_gid = 100000; - const size_t length = 64536; - - return pakfire_execute_write_uidgid_mapping(pakfire, - "/proc/self/gid_map", mapped_gid, length); -} - -static int pakfire_execute_fork2(void* data) { - struct pakfire_execute* env = (struct pakfire_execute*)data; - int r; - - struct pakfire* pakfire = env->pakfire; - - const char* root = pakfire_get_path(pakfire); - const char* arch = pakfire_get_arch(pakfire); - - DEBUG(pakfire, "Execution environment has been forked as PID %d\n", getpid()); - DEBUG(pakfire, " root : %s\n", root); - DEBUG(pakfire, " cgroup : %s\n", env->cgroup); - - for (unsigned int i = 0; env->argv[i]; i++) - DEBUG(pakfire, " argv[%u] : %s\n", i, env->argv[i]); - - for (unsigned int i = 0; env->envp[i]; i++) - DEBUG(pakfire, " env : %s\n", env->envp[i]); - - // Change root (unless root is /) - if (!pakfire_on_root(pakfire)) { - // Mount everything - r = pakfire_mount_all(pakfire); - if (r) - return r; - - // Log all mountpoints - pakfire_mount_list(pakfire); - - // Call chroot() - r = chroot(root); - if (r) { - ERROR(pakfire, "chroot() to %s failed: %m\n", root); - return 1; - } - - // Change directory to / - r = chdir("/"); - if (r) { - ERROR(pakfire, "chdir() after chroot() failed: %m\n"); - return 1; - } - } - - // Set personality - unsigned long persona = pakfire_arch_personality(arch); - if (persona) { - r = personality(persona); - if (r < 0) { - ERROR(pakfire, "Could not set personality (%x)\n", (unsigned int)persona); - - return 1; - } - } - - // Connect standard output and error - if (env->stdout[1] && env->stderr[1]) { - if (dup2(env->stdout[1], STDOUT_FILENO) < 0) { - ERROR(pakfire, "Could not connect fd %d to stdout: %m\n", - env->stdout[1]); - - return 1; - } - - if (dup2(env->stderr[1], STDERR_FILENO) < 0) { - ERROR(pakfire, "Could not connect fd %d to stderr: %m\n", - env->stderr[1]); - - return 1; - } - - // Close the reading sides of the pipe - close(env->stdout[0]); - close(env->stderr[0]); - } - - // Reset open file limit (http://0pointer.net/blog/file-descriptor-limits.html) - r = pakfire_rlimit_reset_nofile(pakfire); - if (r) - return r; - - // Drop capabilities - r = pakfire_drop_capabilities(pakfire); - if (r) - return r; - - // Filter syscalls - r = pakfire_limit_syscalls(pakfire); - if (r) - return r; - - // exec() command - r = execvpe(env->argv[0], (char**)env->argv, env->envp); - if (r < 0) { - ERROR(pakfire, "Could not execve(): %m\n"); - } - - // Translate errno into regular exit code - switch (errno) { - case ENOENT: - r = 127; - break; - - default: - r = 1; - } - - // We should not get here - return r; -} - -/* - This function is launched in a new process that has its own user namespace. - - It will set up the container. -*/ -static int pakfire_execute_fork1(struct pakfire_execute* env) { - struct pakfire* pakfire = env->pakfire; - int r; - - DEBUG(pakfire, "Launched a new container as PID %d\n", getpid()); - - // Setup UID mapping - r = pakfire_execute_setup_uid_mapping(pakfire); - if (r) - return r; - - // Setup GID mapping - r = pakfire_execute_setup_gid_mapping(pakfire); - if (r) - return r; - - // Configure child process - struct clone_args args = { - .flags = - CLONE_NEWCGROUP | - CLONE_NEWIPC | - CLONE_NEWNS | - CLONE_NEWPID | - CLONE_NEWUTS, - .exit_signal = SIGCHLD, - }; - - // Enable network? - if (!(env->flags & PAKFIRE_EXECUTE_ENABLE_NETWORK)) - args.flags |= CLONE_NEWNET; - - // Fork this process - pid_t pid = clone3(&args, sizeof(args)); - if (pid < 0) { - ERROR(pakfire, "Could not fork: %m\n"); - return -errno; - - // Child process - } else if (pid == 0) { - r = pakfire_execute_fork2(env); - _exit(r); - } - - int status = 0; - - // Wait until the child process has finished - waitpid(pid, &status, 0); - - // Pass the exit code of the child process - return WEXITSTATUS(status); -} - PAKFIRE_EXPORT int pakfire_execute(struct pakfire* pakfire, const char* argv[], char* envp[], int flags, pakfire_execute_logging_callback logging_callback, void* data) { + struct pakfire_jail* jail = NULL; int r; - struct pakfire_execute env = { - .pakfire = pakfire, - .flags = flags, - .argv = argv, - }; - - // argv is invalid - if (!argv || !argv[0]) - return -EINVAL; + // Create a new jail + r = pakfire_jail_create(&jail, pakfire, flags); + if (r) + goto ERROR; - // Set default environment - for (const struct environ* e = default_environ; e->key; e++) { - r = set_environ(&env, e->key, e->val); + // Setup logging + if (logging_callback) { + r = pakfire_jail_set_log_callback(jail, logging_callback, data); if (r) goto ERROR; } - // Setup interactive environment - if (flags & PAKFIRE_EXECUTE_INTERACTIVE) { - // Set environment - r = set_environ(&env, "PS1", "pakfire-chroot \\w> "); - if (r) - goto ERROR; - - // Copy TERM - char* TERM = secure_getenv("TERM"); - if (TERM) { - r = set_environ(&env, "TERM", TERM); - if (r) - goto ERROR; - } - - // Copy LANG - char* LANG = secure_getenv("LANG"); - if (LANG) { - r = set_environ(&env, "LANG", LANG); - if (r) - goto ERROR; - } - - // Make some file descriptors for stdout & stderr - } else { - if (pipe(env.stdout) < 0) { - ERROR(pakfire, "Could not create file descriptors for stdout: %m\n"); - r = -1; - goto ERROR; - } - - if (pipe(env.stderr) < 0) { - ERROR(pakfire, "Could not create file descriptors for stderr: %m\n"); - r = -1; - goto ERROR; - } - } - - // Copy user environment - if (envp) { - char* key; - char* val; - - // Copy environment variables - for (unsigned int i = 0; envp[i]; i++) { - r = pakfire_string_partition(envp[i], "=", &key, &val); - if (r) - continue; - - // Set value - set_environ(&env, key, val); - - if (key) - free(key); - if (val) - free(val); - } - } - - if (!logging_callback) - logging_callback = &default_logging_callback; - - // Lauch a new user namespace - struct clone_args args = { - .flags = - CLONE_VFORK | - CLONE_NEWUSER, - .exit_signal = SIGCHLD, - }; - - // Fork this process - pid_t pid = clone3(&args, sizeof(args)); - if (pid < 0) { - ERROR(pakfire, "Could not fork: %m\n"); - return -errno; - - // Child process - } else if (pid == 0) { - r = pakfire_execute_fork1(&env); - exit(r); - } - - // Set some useful error code - int exit = -ESRCH; - int status = 0; - - DEBUG(pakfire, "Waiting for PID %d to finish its work\n", pid); - - if (!(flags & PAKFIRE_EXECUTE_INTERACTIVE)) { - // Close any unused file descriptors - if (env.stdout[1]) - close(env.stdout[1]); - if (env.stderr[1]) - close(env.stderr[1]); - - if (pakfire_execute_logger(pakfire, logging_callback, data, pid, env.stdout[0], env.stderr[0], &status)) { - ERROR(pakfire, "Log reading aborted: %m\n"); - } - } - - if (!status) - waitpid(pid, &status, 0); - - if (WIFEXITED(status)) { - exit = WEXITSTATUS(status); - - DEBUG(pakfire, "Child process exited with code: %d\n", exit); - } else { - ERROR(pakfire, "Could not determine the exit status of process %d\n", pid); - } + // Import environment + r = pakfire_jail_import_env(jail, (const char**)envp); + if (r) + goto ERROR; - // Return the exit code of the application - r = exit; + // Execute the command + r = pakfire_jail_exec(jail, argv); ERROR: - // Close any file descriptors - if (env.stdout[0]) - close(env.stdout[0]); - if (env.stderr[0]) - close(env.stderr[0]); - - // Umount everything - if (!pakfire_on_root(pakfire)) - pakfire_umount_all(pakfire); - - // Free environment - for (unsigned int i = 0; env.envp[i]; i++) - free(env.envp[i]); + if (jail) + pakfire_jail_unref(jail); return r; }