1 /*#############################################################################
3 # Pakfire - The IPFire package management system #
4 # Copyright (C) 2022 Pakfire development team #
6 # This program is free software: you can redistribute it and/or modify #
7 # it under the terms of the GNU General Public License as published by #
8 # the Free Software Foundation, either version 3 of the License, or #
9 # (at your option) any later version. #
11 # This program is distributed in the hope that it will be useful, #
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of #
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
14 # GNU General Public License for more details. #
16 # You should have received a copy of the GNU General Public License #
17 # along with this program. If not, see <http://www.gnu.org/licenses/>. #
19 #############################################################################*/
23 #include <linux/capability.h>
24 #include <linux/sched.h>
26 #include <linux/wait.h>
31 #include <sys/capability.h>
32 #include <sys/epoll.h>
33 #include <sys/eventfd.h>
34 #include <sys/mount.h>
35 #include <sys/personality.h>
36 #include <sys/prctl.h>
37 #include <sys/resource.h>
38 #include <sys/timerfd.h>
39 #include <sys/types.h>
48 #include <pakfire/arch.h>
49 #include <pakfire/cgroup.h>
50 #include <pakfire/jail.h>
51 #include <pakfire/logging.h>
52 #include <pakfire/mount.h>
53 #include <pakfire/pakfire.h>
54 #include <pakfire/private.h>
55 #include <pakfire/pwd.h>
56 #include <pakfire/string.h>
57 #include <pakfire/util.h>
59 #define BUFFER_SIZE 1024 * 64
60 #define ENVIRON_SIZE 128
61 #define EPOLL_MAX_EVENTS 2
62 #define MAX_MOUNTPOINTS 8
64 // The default environment that will be set for every command
65 static const struct environ
{
70 { "LANG", "C.utf-8" },
71 { "PATH", "/usr/local/sbin:/usr/sbin:/sbin:/usr/local/bin:/usr/bin:/bin", },
74 // Tell everything that it is running inside a Pakfire container
75 { "container", "pakfire" },
79 struct pakfire_jail_mountpoint
{
80 char source
[PATH_MAX
];
81 char target
[PATH_MAX
];
86 struct pakfire
* pakfire
;
89 // A unique ID for each jail
91 char __uuid
[UUID_STR_LEN
];
100 struct itimerspec timeout
;
103 struct pakfire_cgroup
* cgroup
;
106 char* env
[ENVIRON_SIZE
];
109 struct pakfire_jail_mountpoint mountpoints
[MAX_MOUNTPOINTS
];
110 unsigned int num_mountpoints
;
113 struct pakfire_log_buffer
{
114 char data
[BUFFER_SIZE
];
118 enum pakfire_jail_exec_flags
{
119 PAKFIRE_JAIL_HAS_NETWORKING
= (1 << 0),
122 struct pakfire_jail_exec
{
125 // PID (of the child)
129 // Process status (from waitid)
132 // FD to notify the child that the parent has finished initialization
136 struct pakfire_jail_pipes
{
148 struct pakfire_jail_communicate
{
149 pakfire_jail_communicate_in in
;
150 pakfire_jail_communicate_out out
;
155 struct pakfire_jail_buffers
{
156 struct pakfire_log_buffer stdout
;
157 struct pakfire_log_buffer stderr
;
160 struct pakfire_log_buffer log_INFO
;
161 struct pakfire_log_buffer log_ERROR
;
162 struct pakfire_log_buffer log_DEBUG
;
165 struct pakfire_cgroup
* cgroup
;
166 struct pakfire_cgroup_stats cgroup_stats
;
/*
	Thin wrapper around the clone3() system call, which is invoked through
	syscall() with __NR_clone3.

	args and size are handed to syscall() unmodified and whatever it returns
	is passed back to the caller as an int.
*/
static int clone3(struct clone_args* args, size_t size) {
	const long result = syscall(__NR_clone3, args, size);

	return result;
}
173 static int pidfd_send_signal(int pidfd
, int sig
, siginfo_t
* info
, unsigned int flags
) {
174 return syscall(SYS_pidfd_send_signal
, pidfd
, sig
, info
, flags
);
177 static int pakfire_jail_exec_has_flag(
178 const struct pakfire_jail_exec
* ctx
, const enum pakfire_jail_exec_flags flag
) {
179 return ctx
->flags
& flag
;
182 static void pakfire_jail_free(struct pakfire_jail
* jail
) {
183 DEBUG(jail
->pakfire
, "Freeing jail at %p\n", jail
);
186 for (unsigned int i
= 0; jail
->env
[i
]; i
++)
190 pakfire_cgroup_unref(jail
->cgroup
);
192 pakfire_unref(jail
->pakfire
);
197 Passes any log messages on to the default pakfire log callback
199 static int pakfire_jail_default_log_callback(struct pakfire
* pakfire
, void* data
,
200 int priority
, const char* line
, size_t length
) {
203 INFO(pakfire
, "%s", line
);
207 ERROR(pakfire
, "%s", line
);
212 DEBUG(pakfire
, "%s", line
);
220 static const char* pakfire_jail_uuid(struct pakfire_jail
* jail
) {
222 uuid_unparse_lower(jail
->uuid
, jail
->__uuid
);
227 static int pakfire_jail_setup_interactive_env(struct pakfire_jail
* jail
) {
229 int r
= pakfire_jail_set_env(jail
, "PS1", "pakfire-jail \\w> ");
234 char* TERM
= secure_getenv("TERM");
236 r
= pakfire_jail_set_env(jail
, "TERM", TERM
);
242 char* LANG
= secure_getenv("LANG");
244 r
= pakfire_jail_set_env(jail
, "LANG", LANG
);
252 PAKFIRE_EXPORT
int pakfire_jail_create(struct pakfire_jail
** jail
,
253 struct pakfire
* pakfire
, int flags
) {
256 const char* arch
= pakfire_get_arch(pakfire
);
258 // Allocate a new jail
259 struct pakfire_jail
* j
= calloc(1, sizeof(*j
));
264 j
->pakfire
= pakfire_ref(pakfire
);
266 // Initialize reference counter
272 // Generate a random UUID
273 uuid_generate_random(j
->uuid
);
275 DEBUG(j
->pakfire
, "Allocated new jail at %p\n", j
);
277 // Set default environment
278 for (const struct environ
* e
= ENV
; e
->key
; e
++) {
279 r
= pakfire_jail_set_env(j
, e
->key
, e
->val
);
284 // Enable all CPU features that the CPU has to offer
285 if (!pakfire_arch_supported_by_host(arch
)) {
286 r
= pakfire_jail_set_env(j
, "QEMU_CPU", "max");
291 // Set container UUID
292 r
= pakfire_jail_set_env(j
, "container_uuid", pakfire_jail_uuid(j
));
296 // Prevent systemctl from talking to systemd
297 if (!pakfire_on_root(j
->pakfire
)) {
298 r
= pakfire_jail_set_env(j
, "SYSTEMD_OFFLINE", "1");
308 pakfire_jail_free(j
);
313 PAKFIRE_EXPORT
struct pakfire_jail
* pakfire_jail_ref(struct pakfire_jail
* jail
) {
319 PAKFIRE_EXPORT
struct pakfire_jail
* pakfire_jail_unref(struct pakfire_jail
* jail
) {
320 if (--jail
->nrefs
> 0)
323 pakfire_jail_free(jail
);
329 PAKFIRE_EXPORT
int pakfire_jail_nice(struct pakfire_jail
* jail
, int nice
) {
330 // Check if nice level is in range
331 if (nice
< -19 || nice
> 20) {
342 int pakfire_jail_set_cgroup(struct pakfire_jail
* jail
, struct pakfire_cgroup
* cgroup
) {
343 // Free any previous cgroup
345 pakfire_cgroup_unref(jail
->cgroup
);
349 // Set any new cgroup
351 DEBUG(jail
->pakfire
, "Setting cgroup %p\n", cgroup
);
353 jail
->cgroup
= pakfire_cgroup_ref(cgroup
);
362 // Returns the length of the environment
363 static unsigned int pakfire_jail_env_length(struct pakfire_jail
* jail
) {
366 // Count everything in the environment
367 for (char** e
= jail
->env
; *e
; e
++)
373 // Finds an existing environment variable and returns its index or -1 if not found
374 static int pakfire_jail_find_env(struct pakfire_jail
* jail
, const char* key
) {
380 char buffer
[strlen(key
) + 2];
381 pakfire_string_format(buffer
, "%s=", key
);
383 for (unsigned int i
= 0; jail
->env
[i
]; i
++) {
384 if (pakfire_string_startswith(jail
->env
[i
], buffer
))
392 // Returns the value of an environment variable or NULL
393 PAKFIRE_EXPORT
const char* pakfire_jail_get_env(struct pakfire_jail
* jail
,
395 int i
= pakfire_jail_find_env(jail
, key
);
399 return jail
->env
[i
] + strlen(key
) + 1;
402 // Sets an environment variable
403 PAKFIRE_EXPORT
int pakfire_jail_set_env(struct pakfire_jail
* jail
,
404 const char* key
, const char* value
) {
405 // Find the index where to write this value to
406 int i
= pakfire_jail_find_env(jail
, key
);
408 i
= pakfire_jail_env_length(jail
);
410 // Return -ENOSPC when the environment is full
411 if (i
>= ENVIRON_SIZE
) {
416 // Free any previous value
420 // Format and set environment variable
421 asprintf(&jail
->env
[i
], "%s=%s", key
, value
);
423 DEBUG(jail
->pakfire
, "Set environment variable: %s\n", jail
->env
[i
]);
428 // Imports an environment
429 PAKFIRE_EXPORT
int pakfire_jail_import_env(struct pakfire_jail
* jail
, const char* env
[]) {
437 // Copy environment variables
438 for (unsigned int i
= 0; env
[i
]; i
++) {
439 r
= pakfire_string_partition(env
[i
], "=", &key
, &val
);
444 r
= pakfire_jail_set_env(jail
, key
, val
);
461 PAKFIRE_EXPORT
int pakfire_jail_set_timeout(
462 struct pakfire_jail
* jail
, unsigned int timeout
) {
464 jail
->timeout
.it_value
.tv_sec
= timeout
;
467 DEBUG(jail
->pakfire
, "Timeout set to %d second(s)\n", timeout
);
469 DEBUG(jail
->pakfire
, "Timeout disabled\n");
474 static int pakfire_jail_create_timer(struct pakfire_jail
* jail
) {
477 // Nothing to do if no timeout has been set
478 if (!jail
->timeout
.it_value
.tv_sec
)
481 // Create a new timer
482 const int fd
= timerfd_create(CLOCK_MONOTONIC
, 0);
484 ERROR(jail
->pakfire
, "Could not create timer: %m\n");
489 r
= timerfd_settime(fd
, 0, &jail
->timeout
, NULL
);
491 ERROR(jail
->pakfire
, "Could not arm timer: %m\n");
505 This function replaces any logging in the child process.
507 All log messages will be sent to the parent process through their respective pipes.
509 static void pakfire_jail_log(void* data
, int priority
, const char* file
,
510 int line
, const char* fn
, const char* format
, va_list args
) {
511 struct pakfire_jail_pipes
* pipes
= (struct pakfire_jail_pipes
*)data
;
516 fd
= pipes
->log_INFO
[1];
520 fd
= pipes
->log_ERROR
[1];
525 fd
= pipes
->log_DEBUG
[1];
527 #endif /* ENABLE_DEBUG */
529 // Ignore any messages of an unknown priority
534 // Send the log message
536 vdprintf(fd
, format
, args
);
539 static int pakfire_jail_log_buffer_is_full(const struct pakfire_log_buffer
* buffer
) {
540 return (sizeof(buffer
->data
) == buffer
->used
);
544 This function reads as much data as it can from the file descriptor.
545 If it finds a whole line in it, it will send it to the logger and repeat the process.
546 If no newline character is found, it will try to read more data until it finds one.
548 static int pakfire_jail_handle_log(struct pakfire_jail
* jail
,
549 struct pakfire_jail_exec
* ctx
, int priority
, int fd
,
550 struct pakfire_log_buffer
* buffer
, pakfire_jail_communicate_out callback
, void* data
) {
551 char line
[BUFFER_SIZE
+ 1];
553 // Fill up buffer from fd
554 if (buffer
->used
< sizeof(buffer
->data
)) {
555 ssize_t bytes_read
= read(fd
, buffer
->data
+ buffer
->used
,
556 sizeof(buffer
->data
) - buffer
->used
);
559 if (bytes_read
< 0) {
560 ERROR(jail
->pakfire
, "Could not read from fd %d: %m\n", fd
);
564 // Update buffer size
565 buffer
->used
+= bytes_read
;
568 // See if we have any lines that we can write
569 while (buffer
->used
) {
570 // Search for the end of the first line
571 char* eol
= memchr(buffer
->data
, '\n', buffer
->used
);
575 // If the buffer is full, we send the content to the logger and try again
576 // This should not happen in practice
577 if (pakfire_jail_log_buffer_is_full(buffer
)) {
578 DEBUG(jail
->pakfire
, "Logging buffer is full. Sending all content\n");
580 eol
= buffer
->data
+ sizeof(buffer
->data
) - 1;
582 // Otherwise we might have only read parts of the output
587 // Find the length of the string
588 size_t length
= eol
- buffer
->data
+ 1;
590 // Copy the line into the buffer
591 memcpy(line
, buffer
->data
, length
);
593 // Terminate the string
598 int r
= callback(jail
->pakfire
, data
, priority
, line
, length
);
600 ERROR(jail
->pakfire
, "The logging callback returned an error: %d\n", r
);
605 // Remove line from buffer
606 memmove(buffer
->data
, buffer
->data
+ length
, buffer
->used
- length
);
607 buffer
->used
-= length
;
613 static int pakfire_jail_stream_stdin(struct pakfire_jail
* jail
,
614 struct pakfire_jail_exec
* ctx
, const int fd
) {
617 // Nothing to do if there is no stdin callback set
618 if (!ctx
->communicate
.in
) {
619 DEBUG(jail
->pakfire
, "Callback for standard input is not set\n");
623 // Skip if the writing pipe has already been closed
624 if (!ctx
->pipes
.stdin
[1])
627 DEBUG(jail
->pakfire
, "Streaming standard input...\n");
629 // Calling the callback
630 r
= ctx
->communicate
.in(jail
->pakfire
, ctx
->communicate
.data
, fd
);
632 DEBUG(jail
->pakfire
, "Standard input callback finished: %d\n", r
);
634 // The callback signaled that it has written everything
636 DEBUG(jail
->pakfire
, "Closing standard input pipe\n");
638 // Close the file-descriptor
641 // Reset the file-descriptor so it won't be closed again later
642 ctx
->pipes
.stdin
[1] = 0;
651 static int pakfire_jail_setup_pipe(struct pakfire_jail
* jail
, int (*fds
)[2], const int flags
) {
652 int r
= pipe2(*fds
, flags
);
654 ERROR(jail
->pakfire
, "Could not setup pipe: %m\n");
661 static void pakfire_jail_close_pipe(struct pakfire_jail
* jail
, int fds
[2]) {
662 for (unsigned int i
= 0; i
< 2; i
++)
668 This is a convenience function that fetches the read end of a pipe and
669 closes the write end.
671 static int pakfire_jail_get_pipe_to_read(struct pakfire_jail
* jail
, int (*fds
)[2]) {
672 // Give the variables easier names to avoid confusion
673 int* fd_read
= &(*fds
)[0];
674 int* fd_write
= &(*fds
)[1];
676 // Close the write end of the pipe
682 // Return the read end
686 static int pakfire_jail_get_pipe_to_write(struct pakfire_jail
* jail
, int (*fds
)[2]) {
687 // Give the variables easier names to avoid confusion
688 int* fd_read
= &(*fds
)[0];
689 int* fd_write
= &(*fds
)[1];
691 // Close the read end of the pipe
697 // Return the write end
701 static int pakfire_jail_wait(struct pakfire_jail
* jail
, struct pakfire_jail_exec
* ctx
) {
703 struct epoll_event ev
;
704 struct epoll_event events
[EPOLL_MAX_EVENTS
];
708 // Fetch file descriptors from context
709 const int stdin
= pakfire_jail_get_pipe_to_write(jail
, &ctx
->pipes
.stdin
);
710 const int stdout
= pakfire_jail_get_pipe_to_read(jail
, &ctx
->pipes
.stdout
);
711 const int stderr
= pakfire_jail_get_pipe_to_read(jail
, &ctx
->pipes
.stderr
);
712 const int pidfd
= ctx
->pidfd
;
715 const int timerfd
= pakfire_jail_create_timer(jail
);
718 const int log_INFO
= pakfire_jail_get_pipe_to_read(jail
, &ctx
->pipes
.log_INFO
);
719 const int log_ERROR
= pakfire_jail_get_pipe_to_read(jail
, &ctx
->pipes
.log_ERROR
);
720 const int log_DEBUG
= pakfire_jail_get_pipe_to_read(jail
, &ctx
->pipes
.log_DEBUG
);
722 // Make a list of all file descriptors we are interested in
724 stdin
, stdout
, stderr
, pidfd
, timerfd
, log_INFO
, log_ERROR
, log_DEBUG
,
728 epollfd
= epoll_create1(0);
730 ERROR(jail
->pakfire
, "Could not initialize epoll(): %m\n");
735 // Turn file descriptors into non-blocking mode and add them to epoll()
736 for (unsigned int i
= 0; i
< sizeof(fds
) / sizeof(*fds
); i
++) {
739 // Skip fds which were not initialized
743 ev
.events
= EPOLLHUP
;
746 ev
.events
|= EPOLLOUT
;
748 ev
.events
|= EPOLLIN
;
751 int flags
= fcntl(fd
, F_GETFL
, 0);
753 // Set modified flags
754 if (fcntl(fd
, F_SETFL
, flags
|O_NONBLOCK
) < 0) {
756 "Could not set file descriptor %d into non-blocking mode: %m\n", fd
);
763 if (epoll_ctl(epollfd
, EPOLL_CTL_ADD
, fd
, &ev
) < 0) {
764 ERROR(jail
->pakfire
, "Could not add file descriptor %d to epoll(): %m\n", fd
);
772 // Loop for as long as the process is alive
774 int num
= epoll_wait(epollfd
, events
, EPOLL_MAX_EVENTS
, -1);
776 // Ignore if epoll_wait() has been interrupted
780 ERROR(jail
->pakfire
, "epoll_wait() failed: %m\n");
786 for (int i
= 0; i
< num
; i
++) {
787 int e
= events
[i
].events
;
788 int fd
= events
[i
].data
.fd
;
790 struct pakfire_log_buffer
* buffer
= NULL
;
791 pakfire_jail_communicate_out callback
= NULL
;
795 // Check if there is any data to be read
797 // Handle any changes to the PIDFD
799 // Call waitid() and store the result
800 r
= waitid(P_PIDFD
, ctx
->pidfd
, &ctx
->status
, WEXITED
);
802 ERROR(jail
->pakfire
, "waitid() failed: %m\n");
806 // Mark that we have ended so that we will process the remaining
807 // events from epoll() now, but won't restart the outer loop.
811 // Handle timer events
812 } else if (fd
== timerfd
) {
813 DEBUG(jail
->pakfire
, "Timer event received\n");
816 r
= read(timerfd
, garbage
, sizeof(garbage
));
818 ERROR(jail
->pakfire
, "Could not disarm timer: %m\n");
823 // Terminate the process if it hasn't already ended
825 DEBUG(jail
->pakfire
, "Terminating process...\n");
827 // Send SIGKILL to the process
828 r
= pidfd_send_signal(pidfd
, SIGKILL
, NULL
, 0);
830 ERROR(jail
->pakfire
, "Could not kill process: %m\n");
835 // There is nothing else to do
838 // Handle logging messages
839 } else if (fd
== log_INFO
) {
840 buffer
= &ctx
->buffers
.log_INFO
;
843 callback
= pakfire_jail_default_log_callback
;
845 } else if (fd
== log_ERROR
) {
846 buffer
= &ctx
->buffers
.log_ERROR
;
849 callback
= pakfire_jail_default_log_callback
;
851 } else if (fd
== log_DEBUG
) {
852 buffer
= &ctx
->buffers
.log_DEBUG
;
853 priority
= LOG_DEBUG
;
855 callback
= pakfire_jail_default_log_callback
;
857 // Handle anything from the standard output and standard error pipes
858 } else if (fd
== stdout
) {
859 buffer
= &ctx
->buffers
.stdout
;
862 callback
= ctx
->communicate
.out
;
863 data
= ctx
->communicate
.data
;
865 } else if (fd
== stderr
) {
866 buffer
= &ctx
->buffers
.stderr
;
869 callback
= ctx
->communicate
.out
;
870 data
= ctx
->communicate
.data
;
873 DEBUG(jail
->pakfire
, "Received invalid file descriptor %d\n", fd
);
878 r
= pakfire_jail_handle_log(jail
, ctx
, priority
, fd
, buffer
, callback
, data
);
884 // Handle standard input
886 r
= pakfire_jail_stream_stdin(jail
, ctx
, fd
);
889 // Ignore if we filled up the buffer
894 ERROR(jail
->pakfire
, "Could not write to stdin: %m\n");
901 // Check if any file descriptors have been closed
903 // Remove the file descriptor
904 r
= epoll_ctl(epollfd
, EPOLL_CTL_DEL
, fd
, NULL
);
906 ERROR(jail
->pakfire
, "Could not remove closed file-descriptor %d: %m\n", fd
);
922 int pakfire_jail_capture_stdout(struct pakfire
* pakfire
, void* data
,
923 int priority
, const char* line
, size_t length
) {
924 char** output
= (char**)data
;
927 // Append everything from stdout to a buffer
928 if (output
&& priority
== LOG_INFO
) {
929 r
= asprintf(output
, "%s%s", (output
&& *output
) ? *output
: "", line
);
935 // Send everything else to the default logger
936 return pakfire_jail_default_log_callback(pakfire
, NULL
, priority
, line
, length
);
941 static int pakfire_jail_drop_capabilities(struct pakfire_jail
* jail
) {
942 const int capabilities
[] = {
943 // Deny access to the kernel's audit system
948 // Deny suspending block devices
951 // Deny any stuff with BPF
954 // Deny checkpoint restore
955 CAP_CHECKPOINT_RESTORE
,
957 // Deny opening files by inode number (open_by_handle_at)
960 // Deny setting SUID bits
963 // Deny locking more memory
966 // Deny modifying any Apparmor/SELinux/SMACK configuration
970 // Deny creating any special devices
973 // Deny setting any capabilities
976 // Deny reading from syslog
979 // Deny any admin actions (mount, sethostname, ...)
982 // Deny rebooting the system
985 // Deny loading kernel modules
988 // Deny setting nice level
991 // Deny access to /proc/kcore, /dev/mem, /dev/kmem
994 // Deny circumventing any resource limits
997 // Deny setting the system time
1000 // Deny playing with suspend
1006 DEBUG(jail
->pakfire
, "Dropping capabilities...\n");
1008 size_t num_caps
= 0;
1011 // Drop any capabilities
1012 for (const int* cap
= capabilities
; *cap
; cap
++) {
1013 r
= prctl(PR_CAPBSET_DROP
, *cap
, 0, 0, 0);
1015 ERROR(jail
->pakfire
, "Could not drop capability %d: %m\n", *cap
);
1022 // Fetch any capabilities
1023 cap_t caps
= cap_get_proc();
1025 ERROR(jail
->pakfire
, "Could not read capabilities: %m\n");
1030 Set inheritable capabilities
1032 This ensures that no processes will be able to gain any of the listed
1035 r
= cap_set_flag(caps
, CAP_INHERITABLE
, num_caps
, capabilities
, CAP_CLEAR
);
1037 ERROR(jail
->pakfire
, "cap_set_flag() failed: %m\n");
1041 // Restore capabilities
1042 r
= cap_set_proc(caps
);
1044 ERROR(jail
->pakfire
, "Could not restore capabilities: %m\n");
1057 static int pakfire_jail_limit_syscalls(struct pakfire_jail
* jail
) {
1058 const int syscalls
[] = {
1059 // The kernel's keyring isn't namespaced
1062 SCMP_SYS(request_key
),
1064 // Disable userfaultfd
1065 SCMP_SYS(userfaultfd
),
1067 // Disable perf which could leak a lot of information about the host
1068 SCMP_SYS(perf_event_open
),
1074 DEBUG(jail
->pakfire
, "Applying syscall filter...\n");
1076 // Setup a syscall filter which allows everything by default
1077 scmp_filter_ctx ctx
= seccomp_init(SCMP_ACT_ALLOW
);
1079 ERROR(jail
->pakfire
, "Could not setup seccomp filter: %m\n");
1084 for (const int* syscall
= syscalls
; *syscall
; syscall
++) {
1085 r
= seccomp_rule_add(ctx
, SCMP_ACT_ERRNO(EPERM
), *syscall
, 0);
1087 ERROR(jail
->pakfire
, "Could not configure syscall %d: %m\n", *syscall
);
1092 // Load syscall filter into the kernel
1093 r
= seccomp_load(ctx
);
1095 ERROR(jail
->pakfire
, "Could not load syscall filter into the kernel: %m\n");
1101 seccomp_release(ctx
);
1108 PAKFIRE_EXPORT
int pakfire_jail_bind(struct pakfire_jail
* jail
,
1109 const char* source
, const char* target
, int flags
) {
1110 struct pakfire_jail_mountpoint
* mp
= NULL
;
1113 // Check if there is any space left
1114 if (jail
->num_mountpoints
>= MAX_MOUNTPOINTS
) {
1119 // Check for valid inputs
1120 if (!source
|| !target
) {
1125 // Select the next free slot
1126 mp
= &jail
->mountpoints
[jail
->num_mountpoints
];
1129 r
= pakfire_string_set(mp
->source
, source
);
1131 ERROR(jail
->pakfire
, "Could not copy source: %m\n");
1136 r
= pakfire_string_set(mp
->target
, target
);
1138 ERROR(jail
->pakfire
, "Could not copy target: %m\n");
1145 // Increment counter
1146 jail
->num_mountpoints
++;
1151 static int pakfire_jail_mount_networking(struct pakfire_jail
* jail
) {
1154 const char* paths
[] = {
1160 // Bind-mount all paths read-only
1161 for (const char** path
= paths
; *path
; path
++) {
1162 r
= pakfire_bind(jail
->pakfire
, *path
, NULL
, MS_RDONLY
);
1171 Mounts everything that we require in the new namespace
1173 static int pakfire_jail_mount(struct pakfire_jail
* jail
, struct pakfire_jail_exec
* ctx
) {
1174 struct pakfire_jail_mountpoint
* mp
= NULL
;
1177 // Mount all default stuff
1178 r
= pakfire_mount_all(jail
->pakfire
);
1182 // Mount networking stuff
1183 if (pakfire_jail_exec_has_flag(ctx
, PAKFIRE_JAIL_HAS_NETWORKING
)) {
1184 r
= pakfire_jail_mount_networking(jail
);
1189 // Mount all custom stuff
1190 for (unsigned int i
= 0; i
< jail
->num_mountpoints
; i
++) {
1192 mp
= &jail
->mountpoints
[i
];
1195 r
= pakfire_bind(jail
->pakfire
, mp
->source
, mp
->target
, mp
->flags
);
1200 // Log all mountpoints
1201 pakfire_mount_list(jail
->pakfire
);
1208 static int pakfire_jail_setup_uid_mapping(struct pakfire_jail
* jail
, pid_t pid
) {
1209 char path
[PATH_MAX
];
1212 // Skip mapping anything when running on /
1213 if (pakfire_on_root(jail
->pakfire
))
1217 r
= pakfire_string_format(path
, "/proc/%d/uid_map", pid
);
1222 const uid_t uid
= pakfire_uid(jail
->pakfire
);
1225 const struct pakfire_subid
* subuid
= pakfire_subuid(jail
->pakfire
);
1229 /* When running as root, we will map the entire range.
1231 When running as a non-privileged user, we will map the root user inside the jail
1232 to the user's UID outside of the jail, and we will map the rest starting from one.
1237 r
= pakfire_file_write(jail
->pakfire
, path
, 0, 0, 0,
1238 "0 %lu %lu\n", subuid
->id
, subuid
->length
);
1240 r
= pakfire_file_write(jail
->pakfire
, path
, 0, 0, 0,
1241 "0 %lu 1\n1 %lu %lu\n", uid
, subuid
->id
, subuid
->length
);
1245 ERROR(jail
->pakfire
, "Could not map UIDs: %m\n");
1252 static int pakfire_jail_setup_gid_mapping(struct pakfire_jail
* jail
, pid_t pid
) {
1253 char path
[PATH_MAX
];
1256 // Skip mapping anything when running on /
1257 if (pakfire_on_root(jail
->pakfire
))
1261 const gid_t gid
= pakfire_gid(jail
->pakfire
);
1264 const struct pakfire_subid
* subgid
= pakfire_subgid(jail
->pakfire
);
1269 r
= pakfire_string_format(path
, "/proc/%d/gid_map", pid
);
1275 r
= pakfire_file_write(jail
->pakfire
, path
, 0, 0, 0,
1276 "0 %lu %lu\n", subgid
->id
, subgid
->length
);
1278 r
= pakfire_file_write(jail
->pakfire
, path
, 0, 0, 0,
1279 "0 %lu 1\n%1 %lu %lu\n", gid
, subgid
->id
, subgid
->length
);
1283 ERROR(jail
->pakfire
, "Could not map GIDs: %m\n");
1290 static int pakfire_jail_setgroups(struct pakfire_jail
* jail
, pid_t pid
) {
1291 char path
[PATH_MAX
];
1295 r
= pakfire_string_format(path
, "/proc/%d/setgroups", pid
);
1299 // Open file for writing
1300 FILE* f
= fopen(path
, "w");
1302 ERROR(jail
->pakfire
, "Could not open %s for writing: %m\n", path
);
1307 int bytes_written
= fprintf(f
, "deny\n");
1308 if (bytes_written
<= 0) {
1309 ERROR(jail
->pakfire
, "Could not write to %s: %m\n", path
);
1316 ERROR(jail
->pakfire
, "Could not close %s: %m\n", path
);
1327 static int pakfire_jail_send_signal(struct pakfire_jail
* jail
, int fd
) {
1328 const uint64_t val
= 1;
1331 DEBUG(jail
->pakfire
, "Sending signal...\n");
1333 // Write to the file descriptor
1334 ssize_t bytes_written
= write(fd
, &val
, sizeof(val
));
1335 if (bytes_written
< 0 || (size_t)bytes_written
< sizeof(val
)) {
1336 ERROR(jail
->pakfire
, "Could not send signal: %m\n");
1340 // Close the file descriptor
1346 static int pakfire_jail_wait_for_signal(struct pakfire_jail
* jail
, int fd
) {
1350 DEBUG(jail
->pakfire
, "Waiting for signal...\n");
1352 ssize_t bytes_read
= read(fd
, &val
, sizeof(val
));
1353 if (bytes_read
< 0 || (size_t)bytes_read
< sizeof(val
)) {
1354 ERROR(jail
->pakfire
, "Error waiting for signal: %m\n");
1358 // Close the file descriptor
1365 Performs the initialisation that needs to happen in the parent part
1367 static int pakfire_jail_parent(struct pakfire_jail
* jail
, struct pakfire_jail_exec
* ctx
) {
1370 // Setup UID mapping
1371 r
= pakfire_jail_setup_uid_mapping(jail
, ctx
->pid
);
1375 // Write "deny" to /proc/PID/setgroups
1376 r
= pakfire_jail_setgroups(jail
, ctx
->pid
);
1380 // Setup GID mapping
1381 r
= pakfire_jail_setup_gid_mapping(jail
, ctx
->pid
);
1385 // Parent has finished initialisation
1386 DEBUG(jail
->pakfire
, "Parent has finished initialization\n");
1388 // Send signal to child
1389 r
= pakfire_jail_send_signal(jail
, ctx
->completed_fd
);
1396 static int pakfire_jail_child(struct pakfire_jail
* jail
, struct pakfire_jail_exec
* ctx
,
1397 const char* argv
[]) {
1400 // Redirect any logging to our log pipe
1401 pakfire_set_log_callback(jail
->pakfire
, pakfire_jail_log
, &ctx
->pipes
);
1404 r
= prctl(PR_SET_PDEATHSIG
, SIGKILL
, 0, 0, 0);
1406 ERROR(jail
->pakfire
, "Could not configure to die with parent: %m\n");
1411 pid_t pid
= getpid();
1413 DEBUG(jail
->pakfire
, "Launched child process in jail with PID %d\n", pid
);
1415 // Wait for the parent to finish initialization
1416 r
= pakfire_jail_wait_for_signal(jail
, ctx
->completed_fd
);
1420 // Perform further initialization
1423 uid_t uid
= getuid();
1424 gid_t gid
= getgid();
1427 uid_t euid
= geteuid();
1428 gid_t egid
= getegid();
1430 DEBUG(jail
->pakfire
, " UID: %d (effective %d)\n", uid
, euid
);
1431 DEBUG(jail
->pakfire
, " GID: %d (effective %d)\n", gid
, egid
);
1433 // Check if we are (effectively) running as root
1434 if (uid
|| gid
|| euid
|| egid
) {
1435 ERROR(jail
->pakfire
, "Child process is not running as root\n");
1439 const char* root
= pakfire_get_path(jail
->pakfire
);
1440 const char* arch
= pakfire_get_arch(jail
->pakfire
);
1442 // Change root (unless root is /)
1443 if (!pakfire_on_root(jail
->pakfire
)) {
1445 r
= pakfire_jail_mount(jail
, ctx
);
1452 ERROR(jail
->pakfire
, "chroot() to %s failed: %m\n", root
);
1456 // Change directory to /
1459 ERROR(jail
->pakfire
, "chdir() after chroot() failed: %m\n");
1465 unsigned long persona
= pakfire_arch_personality(arch
);
1467 r
= personality(persona
);
1469 ERROR(jail
->pakfire
, "Could not set personality (%x)\n", (unsigned int)persona
);
1476 DEBUG(jail
->pakfire
, "Setting nice level to %d\n", jail
->nice
);
1478 r
= setpriority(PRIO_PROCESS
, pid
, jail
->nice
);
1480 ERROR(jail
->pakfire
, "Could not set nice level: %m\n");
1485 // Close other end of log pipes
1486 close(ctx
->pipes
.log_INFO
[0]);
1487 close(ctx
->pipes
.log_ERROR
[0]);
1489 close(ctx
->pipes
.log_DEBUG
[0]);
1490 #endif /* ENABLE_DEBUG */
1492 // Connect standard input
1493 if (ctx
->pipes
.stdin
[0]) {
1494 r
= dup2(ctx
->pipes
.stdin
[0], STDIN_FILENO
);
1496 ERROR(jail
->pakfire
, "Could not connect fd %d to stdin: %m\n",
1497 ctx
->pipes
.stdin
[0]);
1503 // Connect standard output and error
1504 if (ctx
->pipes
.stdout
[1] && ctx
->pipes
.stderr
[1]) {
1505 r
= dup2(ctx
->pipes
.stdout
[1], STDOUT_FILENO
);
1507 ERROR(jail
->pakfire
, "Could not connect fd %d to stdout: %m\n",
1508 ctx
->pipes
.stdout
[1]);
1513 r
= dup2(ctx
->pipes
.stderr
[1], STDERR_FILENO
);
1515 ERROR(jail
->pakfire
, "Could not connect fd %d to stderr: %m\n",
1516 ctx
->pipes
.stderr
[1]);
1521 // Close the pipe (as we have moved the original file descriptors)
1522 pakfire_jail_close_pipe(jail
, ctx
->pipes
.stdin
);
1523 pakfire_jail_close_pipe(jail
, ctx
->pipes
.stdout
);
1524 pakfire_jail_close_pipe(jail
, ctx
->pipes
.stderr
);
1527 // Reset open file limit (http://0pointer.net/blog/file-descriptor-limits.html)
1528 r
= pakfire_rlimit_reset_nofile(jail
->pakfire
);
1532 // Drop capabilities
1533 r
= pakfire_jail_drop_capabilities(jail
);
1538 r
= pakfire_jail_limit_syscalls(jail
);
1542 DEBUG(jail
->pakfire
, "Child process initialization done\n");
1543 DEBUG(jail
->pakfire
, "Launching command:\n");
1546 for (unsigned int i
= 0; argv
[i
]; i
++)
1547 DEBUG(jail
->pakfire
, " argv[%d] = %s\n", i
, argv
[i
]);
1550 r
= execvpe(argv
[0], (char**)argv
, jail
->env
);
1552 ERROR(jail
->pakfire
, "Could not execve(): %m\n");
1554 // Translate errno into regular exit code
1564 // We should not get here
1568 // Run a command in the jail
1569 static int __pakfire_jail_exec(struct pakfire_jail
* jail
, const char* argv
[],
1570 const int interactive
,
1571 pakfire_jail_communicate_in communicate_in
,
1572 pakfire_jail_communicate_out communicate_out
,
1577 // Check if argv is valid
1578 if (!argv
|| !argv
[0]) {
1583 // Send any output to the default logger if no callback is set
1584 if (!communicate_out
)
1585 communicate_out
= pakfire_jail_default_log_callback
;
1587 // Initialize context for this call
1588 struct pakfire_jail_exec ctx
= {
1598 .in
= communicate_in
,
1599 .out
= communicate_out
,
1604 DEBUG(jail
->pakfire
, "Executing jail...\n");
1606 // Enable networking in interactive mode
1608 ctx
.flags
|= PAKFIRE_JAIL_HAS_NETWORKING
;
1611 Set up a file descriptor which can be used to notify the child that the parent
1612 has completed configuration.
1614 ctx
.completed_fd
= eventfd(0, EFD_CLOEXEC
);
1615 if (ctx
.completed_fd
< 0) {
1616 ERROR(jail
->pakfire
, "eventfd() failed: %m\n");
1620 // Create pipes to communicate with child process if we are not running interactively
1622 // stdin (only if callback is set)
1623 if (ctx
.communicate
.in
) {
1624 r
= pakfire_jail_setup_pipe(jail
, &ctx
.pipes
.stdin
, 0);
1630 r
= pakfire_jail_setup_pipe(jail
, &ctx
.pipes
.stdout
, 0);
1635 r
= pakfire_jail_setup_pipe(jail
, &ctx
.pipes
.stderr
, 0);
1640 // Setup pipes for logging
1642 r
= pakfire_jail_setup_pipe(jail
, &ctx
.pipes
.log_INFO
, O_CLOEXEC
);
1647 r
= pakfire_jail_setup_pipe(jail
, &ctx
.pipes
.log_ERROR
, O_CLOEXEC
);
1653 r
= pakfire_jail_setup_pipe(jail
, &ctx
.pipes
.log_DEBUG
, O_CLOEXEC
);
1656 #endif /* ENABLE_DEBUG */
1658 // Configure child process
1659 struct clone_args args
= {
1668 .exit_signal
= SIGCHLD
,
1669 .pidfd
= (long long unsigned int)&ctx
.pidfd
,
1672 // Launch the process in a cgroup that is a leaf of the configured cgroup
1674 args
.flags
|= CLONE_INTO_CGROUP
;
1677 const char* uuid
= pakfire_jail_uuid(jail
);
1679 // Create a temporary cgroup
1680 r
= pakfire_cgroup_child(&ctx
.cgroup
, jail
->cgroup
, uuid
, 0);
1682 ERROR(jail
->pakfire
, "Could not create cgroup for jail: %m\n");
1686 // Clone into this cgroup
1687 args
.cgroup
= pakfire_cgroup_fd(ctx
.cgroup
);
1691 if (!pakfire_jail_exec_has_flag(&ctx
, PAKFIRE_JAIL_HAS_NETWORKING
)) {
1692 args
.flags
|= CLONE_NEWNET
;
1695 // Fork this process
1696 ctx
.pid
= clone3(&args
, sizeof(args
));
1698 ERROR(jail
->pakfire
, "Could not clone: %m\n");
1702 } else if (ctx
.pid
== 0) {
1703 r
= pakfire_jail_child(jail
, &ctx
, argv
);
1708 r
= pakfire_jail_parent(jail
, &ctx
);
1712 DEBUG(jail
->pakfire
, "Waiting for PID %d to finish its work\n", ctx
.pid
);
1714 // Read output of the child process
1715 r
= pakfire_jail_wait(jail
, &ctx
);
1719 // Handle exit status
1720 switch (ctx
.status
.si_code
) {
1722 DEBUG(jail
->pakfire
, "The child process exited with code %d\n",
1723 ctx
.status
.si_status
);
1726 exit
= ctx
.status
.si_status
;
1730 ERROR(jail
->pakfire
, "The child process was killed\n");
1735 ERROR(jail
->pakfire
, "The child process terminated abnormally\n");
1738 // Log anything else
1740 ERROR(jail
->pakfire
, "Unknown child exit code: %d\n", ctx
.status
.si_code
);
1745 // Destroy the temporary cgroup (if any)
1747 // Read cgroup stats
1748 r
= pakfire_cgroup_stat(ctx
.cgroup
, &ctx
.cgroup_stats
);
1750 ERROR(jail
->pakfire
, "Could not read cgroup stats: %m\n");
1752 pakfire_cgroup_stat_dump(ctx
.cgroup
, &ctx
.cgroup_stats
);
1755 pakfire_cgroup_destroy(ctx
.cgroup
);
1756 pakfire_cgroup_unref(ctx
.cgroup
);
1759 // Close any file descriptors
1760 pakfire_jail_close_pipe(jail
, ctx
.pipes
.stdin
);
1761 pakfire_jail_close_pipe(jail
, ctx
.pipes
.stdout
);
1762 pakfire_jail_close_pipe(jail
, ctx
.pipes
.stderr
);
1765 pakfire_jail_close_pipe(jail
, ctx
.pipes
.log_INFO
);
1766 pakfire_jail_close_pipe(jail
, ctx
.pipes
.log_ERROR
);
1767 pakfire_jail_close_pipe(jail
, ctx
.pipes
.log_DEBUG
);
1772 PAKFIRE_EXPORT
int pakfire_jail_exec(
1773 struct pakfire_jail
* jail
,
1775 pakfire_jail_communicate_in callback_in
,
1776 pakfire_jail_communicate_out callback_out
,
1778 return __pakfire_jail_exec(jail
, argv
, 0, callback_in
, callback_out
, data
);
1781 static int pakfire_jail_exec_interactive(
1782 struct pakfire_jail
* jail
, const char* argv
[]) {
1785 // Setup interactive stuff
1786 r
= pakfire_jail_setup_interactive_env(jail
);
1790 return __pakfire_jail_exec(jail
, argv
, 1, NULL
, NULL
, NULL
);
1793 int pakfire_jail_exec_script(struct pakfire_jail
* jail
,
1797 pakfire_jail_communicate_in callback_in
,
1798 pakfire_jail_communicate_out callback_out
,
1800 char path
[PATH_MAX
];
1801 const char** argv
= NULL
;
1805 const char* root
= pakfire_get_path(jail
->pakfire
);
1807 // Write the scriptlet to disk
1808 r
= pakfire_path_join(path
, root
, PAKFIRE_TMP_DIR
"/pakfire-script.XXXXXX");
1812 // Create a temporary file
1813 f
= pakfire_mktemp(path
, 0700);
1815 ERROR(jail
->pakfire
, "Could not create temporary file: %m\n");
1819 DEBUG(jail
->pakfire
, "Writing script to %s:\n%.*s\n", path
, (int)size
, script
);
1822 r
= fprintf(f
, "%s", script
);
1824 ERROR(jail
->pakfire
, "Could not write script to file %s: %m\n", path
);
1831 ERROR(jail
->pakfire
, "Could not close script file %s: %m\n", path
);
1837 // Count how many arguments were passed
1838 unsigned int argc
= 1;
1840 for (const char** arg
= args
; *arg
; arg
++)
1844 argv
= calloc(argc
+ 1, sizeof(*argv
));
1846 ERROR(jail
->pakfire
, "Could not allocate argv: %m\n");
1851 argv
[0] = (root
) ? pakfire_path_relpath(root
, path
) : path
;
1854 for (unsigned int i
= 1; i
< argc
; i
++)
1855 argv
[i
] = args
[i
-1];
1858 r
= pakfire_jail_exec(jail
, argv
, callback_in
, callback_out
, data
);
1866 // Remove script from disk
1874 A convenience function that creates a new jail, runs the given command and destroys
1877 int pakfire_jail_run(struct pakfire
* pakfire
, const char* argv
[], int flags
, char** output
) {
1878 struct pakfire_jail
* jail
= NULL
;
1881 // Create a new jail
1882 r
= pakfire_jail_create(&jail
, pakfire
, flags
);
1886 // Execute the command
1887 r
= pakfire_jail_exec(jail
, argv
, NULL
, pakfire_jail_capture_stdout
, output
);
1891 pakfire_jail_unref(jail
);
/*
	Creates a new jail, executes the given script in it and releases the jail
	again.

	script and length are handed to pakfire_jail_exec_script() together with
	argv as the script's arguments; no communication callbacks are installed.
	Returns the result of creating the jail if that failed, and the result of
	the script execution otherwise.
*/
int pakfire_jail_run_script(struct pakfire* pakfire,
		const char* script, const size_t length, const char* argv[], int flags) {
	struct pakfire_jail* jail = NULL;

	// Create a new jail
	int r = pakfire_jail_create(&jail, pakfire, flags);

	// Execute the command (only if the jail could be created)
	if (r == 0)
		r = pakfire_jail_exec_script(jail, script, length, argv, NULL, NULL, NULL);

	// Release the jail on every path
	if (jail)
		pakfire_jail_unref(jail);

	return r;
}
/*
	Launches "/bin/bash --login" inside the jail through
	pakfire_jail_exec_interactive() and returns its result.
*/
int pakfire_jail_shell(struct pakfire_jail* jail) {
	// NULL-terminated command line of the shell
	const char* shell[] = { "/bin/bash", "--login", NULL };

	// Execute /bin/bash
	return pakfire_jail_exec_interactive(jail, shell);
}
1925 int pakfire_jail_ldconfig(struct pakfire
* pakfire
) {
1926 char path
[PATH_MAX
];
1928 const char* ldconfig
= "/sbin/ldconfig";
1930 // Check if ldconfig exists before calling it to avoid overhead
1931 int r
= pakfire_path(pakfire
, path
, "%s", ldconfig
);
1935 // Check if ldconfig is executable
1936 r
= access(path
, X_OK
);
1938 DEBUG(pakfire
, "%s is not executable. Skipping...\n", ldconfig
);
1942 const char* argv
[] = {
1947 return pakfire_jail_run(pakfire
, argv
, 0, NULL
);