# #
#############################################################################*/
-#include <dirent.h>
#include <errno.h>
-#include <fcntl.h>
-#include <linux/capability.h>
-#include <linux/limits.h>
-#include <linux/sched.h>
-#include <sched.h>
#include <stdlib.h>
-#include <string.h>
-#include <sys/capability.h>
-#include <sys/epoll.h>
-#include <sys/mount.h>
-#include <sys/personality.h>
#include <sys/prctl.h>
-#include <sys/types.h>
-#include <sys/user.h>
-#include <sys/wait.h>
#include <unistd.h>
-// libseccomp
-#include <seccomp.h>
-
-#include <pakfire/arch.h>
-#include <pakfire/cgroup.h>
#include <pakfire/execute.h>
#include <pakfire/jail.h>
#include <pakfire/logging.h>
-#include <pakfire/mount.h>
#include <pakfire/private.h>
#include <pakfire/util.h>
-#define ENVIRON_SIZE 128
-#define BUFFER_SIZE 1024 * 64
-#define EPOLL_MAX_EVENTS 2
#define LDCONFIG "/sbin/ldconfig"
-// The default environment that will be set for every command
-static const struct environ {
- const char* key;
- const char* val;
-} default_environ[] = {
- { "LANG", "en_US.utf-8" },
- { "TERM", "vt100" },
- { NULL, NULL },
-};
-
-struct pakfire_execute {
- struct pakfire* pakfire;
-
- // Flags
- int flags;
-
- // Environment
- const char** argv;
- char* envp[ENVIRON_SIZE];
-
- char cgroup[PATH_MAX];
-
- // File descriptors
- int stdout[2];
- int stderr[2];
-};
-
-struct pakfire_execute_buffer {
- char data[BUFFER_SIZE];
- size_t used;
-};
-
-static int clone3(struct clone_args* args, size_t size) {
- return syscall(__NR_clone3, args, size);
-}
-
-static int pakfire_execute_buffer_is_full(const struct pakfire_execute_buffer* buffer) {
- return (sizeof(buffer->data) == buffer->used);
-}
-
-/*
- This function reads as much data as it can from the file descriptor.
- If it finds a whole line in it, it will send it to the logger and repeat the process.
- If not newline character is found, it will try to read more data until it finds one.
-*/
-static int pakfire_execute_logger_proxy(struct pakfire* pakfire, int fd,
- pakfire_execute_logging_callback logging_callback, void* data, int priority,
- struct pakfire_execute_buffer* buffer) {
- char line[BUFFER_SIZE + 1];
-
- // Fill up buffer from fd
- if (buffer->used < sizeof(buffer->data)) {
- ssize_t bytes_read = read(fd, buffer->data + buffer->used,
- sizeof(buffer->data) - buffer->used);
-
- // Handle errors
- if (bytes_read < 0) {
- ERROR(pakfire, "Could not read from fd %d: %m\n", fd);
- return -1;
- }
-
- // Update buffer size
- buffer->used += bytes_read;
- }
-
- // See if we have any lines that we can write
- while (buffer->used) {
- // Search for the end of the first line
- char* eol = memchr(buffer->data, '\n', buffer->used);
-
- // No newline found
- if (!eol) {
- // If the buffer is full, we send the content to the logger and try again
- // This should not happen in practise
- if (pakfire_execute_buffer_is_full(buffer)) {
- ERROR(pakfire, "Logging buffer is full. Sending all content\n");
- eol = buffer->data + sizeof(buffer->data) - 1;
-
- // Otherwise we might have only read parts of the output
- } else
- break;
- }
-
- // Find the length of the string
- size_t length = eol - buffer->data + 1;
-
- // Copy the line into the buffer
- memcpy(line, buffer->data, length);
-
- // Terminate the string
- line[length] = '\0';
-
- // Log the line
- int r = logging_callback(pakfire, data, priority, line, length);
- if (r) {
- ERROR(pakfire, "The logging callback returned an error: %d\n", r);
- return r;
- }
-
- // Remove line from buffer
- memmove(buffer->data, buffer->data + length, buffer->used - length);
- buffer->used -= length;
- }
-
- return 0;
-}
-
-static int pakfire_execute_logger(struct pakfire* pakfire, pakfire_execute_logging_callback logging_callback,
- void* data, pid_t pid, int stdout, int stderr, int* status) {
- int epollfd = -1;
- struct epoll_event ev;
- struct epoll_event events[EPOLL_MAX_EVENTS];
- int r = 0;
-
- int fds[2] = {
- stdout, stderr,
- };
-
- // Allocate buffers
- struct buffers {
- struct pakfire_execute_buffer stdout;
- struct pakfire_execute_buffer stderr;
- } buffers;
- buffers.stdout.used = buffers.stderr.used = 0;
-
- // Setup epoll
- epollfd = epoll_create1(0);
- if (epollfd < 0) {
- ERROR(pakfire, "Could not initialize epoll(): %m\n");
- r = -errno;
- goto OUT;
- }
-
- ev.events = EPOLLIN;
-
- // Turn file descriptors into non-blocking mode and add them to epoll()
- for (unsigned int i = 0; i < 2; i++) {
- int fd = fds[i];
-
- // Read flags
- int flags = fcntl(fd, F_GETFL, 0);
-
- // Set modified flags
- if (fcntl(fd, F_SETFL, flags | O_NONBLOCK) < 0) {
- ERROR(pakfire, "Could not set file descriptor %d into non-blocking mode: %m\n",
- fd);
- r = -errno;
- goto OUT;
- }
-
- ev.data.fd = fd;
-
- if (epoll_ctl(epollfd, EPOLL_CTL_ADD, fd, &ev) < 0) {
- ERROR(pakfire, "Could not add file descriptor %d to epoll(): %m\n", fd);
- r = -errno;
- goto OUT;
- }
- }
-
- int ended = 0;
-
- // Loop for as long as the process is alive
- while (!ended) {
- // If waitpid() returns non-zero, the process has ended, but we want to perform
- // one last iteration over the loop to read any remaining content from the file
- // descriptor buffers.
- r = waitpid(pid, status, WNOHANG);
- if (r)
- ended = 1;
-
- int num = epoll_wait(epollfd, events, EPOLL_MAX_EVENTS, -1);
- if (num < 1) {
- // Ignore if epoll_wait() has been interrupted
- if (errno == EINTR)
- continue;
-
- ERROR(pakfire, "epoll_wait() failed: %m\n");
- r = -errno;
-
- goto OUT;
- }
-
- struct pakfire_execute_buffer* buffer;
- int priority;
-
- for (int i = 0; i < num; i++) {
- int fd = events[i].data.fd;
-
- if (fd == stdout) {
- buffer = &buffers.stdout;
- priority = LOG_INFO;
-
- } else if (fd == stderr) {
- buffer = &buffers.stderr;
- priority = LOG_ERR;
-
- } else {
- DEBUG(pakfire, "Received invalid file descriptor %d\n", fd);
- continue;
- }
-
- // Send everything to the logger
- r = pakfire_execute_logger_proxy(pakfire, fd, logging_callback, data, priority, buffer);
- if (r)
- goto OUT;
- }
- }
-
-OUT:
- if (epollfd > 0)
- close(epollfd);
-
- return r;
-}
-
static int default_logging_callback(struct pakfire* pakfire, void* data, int priority,
const char* line, size_t length) {
switch (priority) {
return default_logging_callback(pakfire, NULL, priority, line, length);
}
-static int pakfire_drop_capabilities(struct pakfire* pakfire) {
- const int capabilities[] = {
- // Deny access to the kernel's audit system
- CAP_AUDIT_CONTROL,
- CAP_AUDIT_READ,
- CAP_AUDIT_WRITE,
-
- // Deny suspending block devices
- CAP_BLOCK_SUSPEND,
-
- // Deny any stuff with BPF
- CAP_BPF,
-
- // Deny checkpoint restore
- CAP_CHECKPOINT_RESTORE,
-
- // Deny opening files by inode number (open_by_handle_at)
- CAP_DAC_READ_SEARCH,
-
- // Deny setting SUID bits
- CAP_FSETID,
-
- // Deny locking more memory
- CAP_IPC_LOCK,
-
- // Deny modifying any Apparmor/SELinux/SMACK configuration
- CAP_MAC_ADMIN,
- CAP_MAC_OVERRIDE,
-
- // Deny creating any special devices
- CAP_MKNOD,
-
- // Deny setting any capabilities
- CAP_SETFCAP,
-
- // Deny reading from syslog
- CAP_SYSLOG,
-
- // Deny any admin actions (mount, sethostname, ...)
- CAP_SYS_ADMIN,
-
- // Deny rebooting the system
- CAP_SYS_BOOT,
-
- // Deny loading kernel modules
- CAP_SYS_MODULE,
-
- // Deny setting nice level
- CAP_SYS_NICE,
-
- // Deny access to /proc/kcore, /dev/mem, /dev/kmem
- CAP_SYS_RAWIO,
-
- // Deny circumventing any resource limits
- CAP_SYS_RESOURCE,
-
- // Deny setting the system time
- CAP_SYS_TIME,
-
- // Deny playing with suspend
- CAP_WAKE_ALARM,
-
- 0,
- };
-
- size_t num_caps = 0;
- int r;
-
- // Drop any capabilities
- for (const int* cap = capabilities; *cap; cap++) {
- r = prctl(PR_CAPBSET_DROP, *cap, 0, 0, 0);
- if (r) {
- ERROR(pakfire, "Could not drop capability %d: %m\n", *cap);
- return r;
- }
-
- num_caps++;
- }
-
- // Fetch any capabilities
- cap_t caps = cap_get_proc();
- if (!caps) {
- ERROR(pakfire, "Could not read capabilities: %m\n");
- return 1;
- }
-
- /*
- Set inheritable capabilities
-
- This ensures that no processes will be able to gain any of the listed
- capabilities again.
- */
- r = cap_set_flag(caps, CAP_INHERITABLE, num_caps, capabilities, CAP_CLEAR);
- if (r) {
- ERROR(pakfire, "cap_set_flag() failed: %m\n");
- goto ERROR;
- }
-
- // Restore capabilities
- r = cap_set_proc(caps);
- if (r) {
- ERROR(pakfire, "Could not restore capabilities: %m\n");
- goto ERROR;
- }
-
-ERROR:
- if (caps)
- cap_free(caps);
-
- return r;
-}
-
-static int pakfire_limit_syscalls(struct pakfire* pakfire) {
- const int syscalls[] = {
- // The kernel's keyring isn't namespaced
- SCMP_SYS(keyctl),
- SCMP_SYS(add_key),
- SCMP_SYS(request_key),
-
- // Disable userfaultfd
- SCMP_SYS(userfaultfd),
-
- // Disable perf which could leak a lot of information about the host
- SCMP_SYS(perf_event_open),
-
- 0,
- };
- int r = 1;
-
- // Setup a syscall filter which allows everything by default
- scmp_filter_ctx ctx = seccomp_init(SCMP_ACT_ALLOW);
- if (!ctx) {
- ERROR(pakfire, "Could not setup seccomp filter: %m\n");
- goto ERROR;
- }
-
- // All all syscalls
- for (const int* syscall = syscalls; *syscall; syscall++) {
- r = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), *syscall, 0);
- if (r) {
- ERROR(pakfire, "Could not configure syscall %d: %m\n", *syscall);
- goto ERROR;
- }
- }
-
- // Load syscall filter into the kernel
- r = seccomp_load(ctx);
- if (r) {
- ERROR(pakfire, "Could not load syscall filter into the kernel: %m\n");
- goto ERROR;
- }
-
-ERROR:
- if (ctx)
- seccomp_release(ctx);
-
- return r;
-}
-
-static int find_environ(struct pakfire_execute* env, const char* key) {
- if (!key) {
- errno = EINVAL;
- return -1;
- }
-
- char buffer[strlen(key) + 2];
- unsigned int i = 0;
-
- pakfire_string_format(buffer, "%s=", key);
-
- for (i = 0; env->envp[i]; i++) {
- if (pakfire_string_startswith(env->envp[i], buffer))
- return i;
- }
-
- // Return -ENOSPC when the environment is full
- if (i >= ENVIRON_SIZE) {
- errno = ENOSPC;
- return -1;
- }
-
- // Return the index of the next free slot
- return i;
-}
-
-static int set_environ(struct pakfire_execute* env, const char* key, const char* value) {
- // Find the index where to write this value to
- int idx = find_environ(env, key);
- if (idx < 0)
- return idx;
-
- // Free any previous value
- if (env->envp[idx])
- free(env->envp[idx]);
-
- // Format and set environment variable
- asprintf(&env->envp[idx], "%s=%s", key, value);
-
- DEBUG(env->pakfire, "Set environment variable: %s\n", env->envp[idx]);
-
- return 0;
-}
-
-static int pakfire_execute_write_uidgid_mapping(struct pakfire* pakfire,
- const char* path, uid_t mapped_id, size_t length) {
- int r = 1;
-
- // Open file for writing
- FILE* f = fopen(path, "w");
- if (!f) {
- ERROR(pakfire, "Could not open %s for writing: %m\n", path);
- goto ERROR;
- }
-
- // Write configuration
- int bytes_written = fprintf(f, "%d %d %ld\n", 0, mapped_id, length);
- if (bytes_written < 0) {
- ERROR(pakfire, "Could not write UID mapping: %m\n");
- goto ERROR;
- }
-
- // Success
- r = 0;
-
-ERROR:
- if (f)
- fclose(f);
-
- return r;
-}
-
-static int pakfire_execute_setup_uid_mapping(struct pakfire* pakfire) {
- // XXX hard-coded values
- const uid_t mapped_uid = 100000;
- const size_t length = 64536;
-
- return pakfire_execute_write_uidgid_mapping(pakfire,
- "/proc/self/uid_map", mapped_uid, length);
-}
-
-static int pakfire_execute_setup_gid_mapping(struct pakfire* pakfire) {
- // XXX hard-coded values
- const uid_t mapped_gid = 100000;
- const size_t length = 64536;
-
- return pakfire_execute_write_uidgid_mapping(pakfire,
- "/proc/self/gid_map", mapped_gid, length);
-}
-
-static int pakfire_execute_fork2(void* data) {
- struct pakfire_execute* env = (struct pakfire_execute*)data;
- int r;
-
- struct pakfire* pakfire = env->pakfire;
-
- const char* root = pakfire_get_path(pakfire);
- const char* arch = pakfire_get_arch(pakfire);
-
- DEBUG(pakfire, "Execution environment has been forked as PID %d\n", getpid());
- DEBUG(pakfire, " root : %s\n", root);
- DEBUG(pakfire, " cgroup : %s\n", env->cgroup);
-
- for (unsigned int i = 0; env->argv[i]; i++)
- DEBUG(pakfire, " argv[%u] : %s\n", i, env->argv[i]);
-
- for (unsigned int i = 0; env->envp[i]; i++)
- DEBUG(pakfire, " env : %s\n", env->envp[i]);
-
- // Change root (unless root is /)
- if (!pakfire_on_root(pakfire)) {
- // Mount everything
- r = pakfire_mount_all(pakfire);
- if (r)
- return r;
-
- // Log all mountpoints
- pakfire_mount_list(pakfire);
-
- // Call chroot()
- r = chroot(root);
- if (r) {
- ERROR(pakfire, "chroot() to %s failed: %m\n", root);
- return 1;
- }
-
- // Change directory to /
- r = chdir("/");
- if (r) {
- ERROR(pakfire, "chdir() after chroot() failed: %m\n");
- return 1;
- }
- }
-
- // Set personality
- unsigned long persona = pakfire_arch_personality(arch);
- if (persona) {
- r = personality(persona);
- if (r < 0) {
- ERROR(pakfire, "Could not set personality (%x)\n", (unsigned int)persona);
-
- return 1;
- }
- }
-
- // Connect standard output and error
- if (env->stdout[1] && env->stderr[1]) {
- if (dup2(env->stdout[1], STDOUT_FILENO) < 0) {
- ERROR(pakfire, "Could not connect fd %d to stdout: %m\n",
- env->stdout[1]);
-
- return 1;
- }
-
- if (dup2(env->stderr[1], STDERR_FILENO) < 0) {
- ERROR(pakfire, "Could not connect fd %d to stderr: %m\n",
- env->stderr[1]);
-
- return 1;
- }
-
- // Close the reading sides of the pipe
- close(env->stdout[0]);
- close(env->stderr[0]);
- }
-
- // Reset open file limit (http://0pointer.net/blog/file-descriptor-limits.html)
- r = pakfire_rlimit_reset_nofile(pakfire);
- if (r)
- return r;
-
- // Drop capabilities
- r = pakfire_drop_capabilities(pakfire);
- if (r)
- return r;
-
- // Filter syscalls
- r = pakfire_limit_syscalls(pakfire);
- if (r)
- return r;
-
- // exec() command
- r = execvpe(env->argv[0], (char**)env->argv, env->envp);
- if (r < 0) {
- ERROR(pakfire, "Could not execve(): %m\n");
- }
-
- // Translate errno into regular exit code
- switch (errno) {
- case ENOENT:
- r = 127;
- break;
-
- default:
- r = 1;
- }
-
- // We should not get here
- return r;
-}
-
-/*
- This function is launched in a new process that has its own user namespace.
-
- It will set up the container.
-*/
-static int pakfire_execute_fork1(struct pakfire_execute* env) {
- struct pakfire* pakfire = env->pakfire;
- int r;
-
- DEBUG(pakfire, "Launched a new container as PID %d\n", getpid());
-
- // Setup UID mapping
- r = pakfire_execute_setup_uid_mapping(pakfire);
- if (r)
- return r;
-
- // Setup GID mapping
- r = pakfire_execute_setup_gid_mapping(pakfire);
- if (r)
- return r;
-
- // Configure child process
- struct clone_args args = {
- .flags =
- CLONE_NEWCGROUP |
- CLONE_NEWIPC |
- CLONE_NEWNS |
- CLONE_NEWPID |
- CLONE_NEWUTS,
- .exit_signal = SIGCHLD,
- };
-
- // Enable network?
- if (!(env->flags & PAKFIRE_EXECUTE_ENABLE_NETWORK))
- args.flags |= CLONE_NEWNET;
-
- // Fork this process
- pid_t pid = clone3(&args, sizeof(args));
- if (pid < 0) {
- ERROR(pakfire, "Could not fork: %m\n");
- return -errno;
-
- // Child process
- } else if (pid == 0) {
- r = pakfire_execute_fork2(env);
- _exit(r);
- }
-
- int status = 0;
-
- // Wait until the child process has finished
- waitpid(pid, &status, 0);
-
- // Pass the exit code of the child process
- return WEXITSTATUS(status);
-}
-
PAKFIRE_EXPORT int pakfire_execute(struct pakfire* pakfire, const char* argv[], char* envp[],
int flags, pakfire_execute_logging_callback logging_callback, void* data) {
+ struct pakfire_jail* jail = NULL; // Initialised to NULL so the ERROR path can tell whether there is a jail to unref
int r;
- struct pakfire_execute env = {
- .pakfire = pakfire,
- .flags = flags,
- .argv = argv,
- };
-
- // argv is invalid
- if (!argv || !argv[0])
- return -EINVAL;
+ // Create a new jail, handing the caller's flags through unchanged
+ r = pakfire_jail_create(&jail, pakfire, flags);
+ if (r)
+ goto ERROR;
- // Set default environment
- for (const struct environ* e = default_environ; e->key; e++) {
- r = set_environ(&env, e->key, e->val);
+ // Setup logging (only when the caller supplied a callback; otherwise nothing is configured here)
+ if (logging_callback) {
+ r = pakfire_jail_set_log_callback(jail, logging_callback, data);
if (r)
goto ERROR;
}
- // Setup interactive environment
- if (flags & PAKFIRE_EXECUTE_INTERACTIVE) {
- // Set environment
- r = set_environ(&env, "PS1", "pakfire-chroot \\w> ");
- if (r)
- goto ERROR;
-
- // Copy TERM
- char* TERM = secure_getenv("TERM");
- if (TERM) {
- r = set_environ(&env, "TERM", TERM);
- if (r)
- goto ERROR;
- }
-
- // Copy LANG
- char* LANG = secure_getenv("LANG");
- if (LANG) {
- r = set_environ(&env, "LANG", LANG);
- if (r)
- goto ERROR;
- }
-
- // Make some file descriptors for stdout & stderr
- } else {
- if (pipe(env.stdout) < 0) {
- ERROR(pakfire, "Could not create file descriptors for stdout: %m\n");
- r = -1;
- goto ERROR;
- }
-
- if (pipe(env.stderr) < 0) {
- ERROR(pakfire, "Could not create file descriptors for stderr: %m\n");
- r = -1;
- goto ERROR;
- }
- }
-
- // Copy user environment
- if (envp) {
- char* key;
- char* val;
-
- // Copy environment variables
- for (unsigned int i = 0; envp[i]; i++) {
- r = pakfire_string_partition(envp[i], "=", &key, &val);
- if (r)
- continue;
-
- // Set value
- set_environ(&env, key, val);
-
- if (key)
- free(key);
- if (val)
- free(val);
- }
- }
-
- if (!logging_callback)
- logging_callback = &default_logging_callback;
-
- // Lauch a new user namespace
- struct clone_args args = {
- .flags =
- CLONE_VFORK |
- CLONE_NEWUSER,
- .exit_signal = SIGCHLD,
- };
-
- // Fork this process
- pid_t pid = clone3(&args, sizeof(args));
- if (pid < 0) {
- ERROR(pakfire, "Could not fork: %m\n");
- return -errno;
-
- // Child process
- } else if (pid == 0) {
- r = pakfire_execute_fork1(&env);
- exit(r);
- }
-
- // Set some useful error code
- int exit = -ESRCH;
- int status = 0;
-
- DEBUG(pakfire, "Waiting for PID %d to finish its work\n", pid);
-
- if (!(flags & PAKFIRE_EXECUTE_INTERACTIVE)) {
- // Close any unused file descriptors
- if (env.stdout[1])
- close(env.stdout[1]);
- if (env.stderr[1])
- close(env.stderr[1]);
-
- if (pakfire_execute_logger(pakfire, logging_callback, data, pid, env.stdout[0], env.stderr[0], &status)) {
- ERROR(pakfire, "Log reading aborted: %m\n");
- }
- }
-
- if (!status)
- waitpid(pid, &status, 0);
-
- if (WIFEXITED(status)) {
- exit = WEXITSTATUS(status);
-
- DEBUG(pakfire, "Child process exited with code: %d\n", exit);
- } else {
- ERROR(pakfire, "Could not determine the exit status of process %d\n", pid);
- }
+ // Import environment; envp is passed on without a NULL check here.
+ r = pakfire_jail_import_env(jail, (const char**)envp); // NOTE(review): confirm pakfire_jail_import_env() accepts a NULL envp
+ if (r)
+ goto ERROR;
- // Return the exit code of the application
- r = exit;
+ // Execute the command; its result is returned to the caller as-is after the jail is released.
+ r = pakfire_jail_exec(jail, argv); // NOTE(review): argv is not validated in this function any more — confirm pakfire_jail_exec() rejects NULL/empty argv
ERROR:
- // Close any file descriptors
- if (env.stdout[0])
- close(env.stdout[0]);
- if (env.stderr[0])
- close(env.stderr[0]);
-
- // Umount everything
- if (!pakfire_on_root(pakfire))
- pakfire_umount_all(pakfire);
-
- // Free environment
- for (unsigned int i = 0; env.envp[i]; i++)
- free(env.envp[i]);
+ if (jail)
+ pakfire_jail_unref(jail);
return r;
}