1 /*#############################################################################
3 # Pakfire - The IPFire package management system #
4 # Copyright (C) 2022 Pakfire development team #
6 # This program is free software: you can redistribute it and/or modify #
7 # it under the terms of the GNU General Public License as published by #
8 # the Free Software Foundation, either version 3 of the License, or #
9 # (at your option) any later version. #
11 # This program is distributed in the hope that it will be useful, #
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of #
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
14 # GNU General Public License for more details. #
16 # You should have received a copy of the GNU General Public License #
17 # along with this program. If not, see <http://www.gnu.org/licenses/>. #
19 #############################################################################*/
23 #include <linux/capability.h>
24 #include <linux/sched.h>
26 #include <linux/wait.h>
31 #include <sys/capability.h>
32 #include <sys/epoll.h>
33 #include <sys/eventfd.h>
34 #include <sys/mount.h>
35 #include <sys/personality.h>
36 #include <sys/prctl.h>
37 #include <sys/resource.h>
38 #include <sys/timerfd.h>
39 #include <sys/types.h>
48 #include <pakfire/arch.h>
49 #include <pakfire/cgroup.h>
50 #include <pakfire/jail.h>
51 #include <pakfire/logging.h>
52 #include <pakfire/mount.h>
53 #include <pakfire/pakfire.h>
54 #include <pakfire/private.h>
55 #include <pakfire/pwd.h>
56 #include <pakfire/string.h>
57 #include <pakfire/util.h>
// Capacity in bytes (64 KiB) of each log buffer and of the line scratch
// buffer. Parenthesised so that the macro expands safely inside larger
// expressions (e.g. "x % BUFFER_SIZE").
#define BUFFER_SIZE      (1024 * 64)

// Number of slots in a jail's environment array
#define ENVIRON_SIZE     128

// Maximum number of events fetched by a single epoll_wait() call
#define EPOLL_MAX_EVENTS 2

// Maximum number of custom bind-mounts that can be configured per jail
#define MAX_MOUNTPOINTS  8
64 // The default environment that will be set for every command
65 static const struct environ
{
70 { "LANG", "C.utf-8" },
71 { "PATH", "/usr/local/sbin:/usr/sbin:/sbin:/usr/local/bin:/usr/bin:/bin", },
74 // Tell everything that it is running inside a Pakfire container
75 { "container", "pakfire" },
79 struct pakfire_jail_mountpoint
{
80 char source
[PATH_MAX
];
81 char target
[PATH_MAX
];
86 struct pakfire
* pakfire
;
89 // A unique ID for each jail
91 char __uuid
[UUID_STR_LEN
];
100 struct itimerspec timeout
;
103 struct pakfire_cgroup
* cgroup
;
106 char* env
[ENVIRON_SIZE
];
109 struct pakfire_jail_mountpoint mountpoints
[MAX_MOUNTPOINTS
];
110 unsigned int num_mountpoints
;
113 struct pakfire_log_buffer
{
114 char data
[BUFFER_SIZE
];
// Option bits that are stored in the flags field of an execution context
enum pakfire_jail_exec_flags {
	// Bit 0: the command is launched with networking available
	PAKFIRE_JAIL_HAS_NETWORKING = 0x01,
};
122 struct pakfire_jail_exec
{
125 // PID (of the child)
129 // Process status (from waitid)
132 // FD to notify the client that the parent has finished initialization
136 struct pakfire_jail_pipes
{
148 struct pakfire_jail_communicate
{
149 pakfire_jail_communicate_in in
;
150 pakfire_jail_communicate_out out
;
155 struct pakfire_jail_buffers
{
156 struct pakfire_log_buffer stdout
;
157 struct pakfire_log_buffer stderr
;
160 struct pakfire_log_buffer log_INFO
;
161 struct pakfire_log_buffer log_ERROR
;
162 struct pakfire_log_buffer log_DEBUG
;
165 struct pakfire_cgroup
* cgroup
;
166 struct pakfire_cgroup_stats cgroup_stats
;
/*
	Invokes the clone3() system call directly through syscall().

	args/size are passed through unchanged; the raw result of the
	system call is returned.
*/
static int clone3(struct clone_args* args, size_t size) {
	const long result = syscall(__NR_clone3, args, size);

	return result;
}
173 static int pidfd_send_signal(int pidfd
, int sig
, siginfo_t
* info
, unsigned int flags
) {
174 return syscall(SYS_pidfd_send_signal
, pidfd
, sig
, info
, flags
);
177 static int pakfire_jail_exec_has_flag(
178 const struct pakfire_jail_exec
* ctx
, const enum pakfire_jail_exec_flags flag
) {
179 return ctx
->flags
& flag
;
182 static void pakfire_jail_free(struct pakfire_jail
* jail
) {
183 DEBUG(jail
->pakfire
, "Freeing jail at %p\n", jail
);
186 for (unsigned int i
= 0; jail
->env
[i
]; i
++)
190 pakfire_cgroup_unref(jail
->cgroup
);
192 pakfire_unref(jail
->pakfire
);
/*
	Forwards a line of output to the pakfire logger that matches its priority.

	Lines with any other priority are dropped. data and length are not used.
	Always returns zero.
*/
static int pakfire_jail_default_log_callback(struct pakfire* pakfire, void* data,
		int priority, const char* line, size_t length) {
	if (priority == LOG_INFO) {
		INFO(pakfire, "%s", line);

	} else if (priority == LOG_ERR) {
		ERROR(pakfire, "%s", line);
	}
#ifdef ENABLE_DEBUG
	else if (priority == LOG_DEBUG) {
		DEBUG(pakfire, "%s", line);
	}
#endif /* ENABLE_DEBUG */

	return 0;
}
220 static const char* pakfire_jail_uuid(struct pakfire_jail
* jail
) {
222 uuid_unparse_lower(jail
->uuid
, jail
->__uuid
);
/*
	Prepares the environment of the jail for an interactive session.

	Sets the prompt and copies TERM and LANG from the calling environment
	if they are set there. Returns zero on success or the error returned
	by pakfire_jail_set_env().
*/
static int pakfire_jail_setup_interactive_env(struct pakfire_jail* jail) {
	// Variables that are passed through from the calling environment
	static const char* const passthrough[] = { "TERM", "LANG", NULL };

	// Set the prompt
	int r = pakfire_jail_set_env(jail, "PS1", "pakfire-jail \\w> ");
	if (r)
		return r;

	for (const char* const* name = passthrough; *name; name++) {
		const char* value = secure_getenv(*name);

		// Skip anything that is not set
		if (!value)
			continue;

		r = pakfire_jail_set_env(jail, *name, value);
		if (r)
			return r;
	}

	return 0;
}
252 PAKFIRE_EXPORT
int pakfire_jail_create(struct pakfire_jail
** jail
,
253 struct pakfire
* pakfire
, int flags
) {
256 const char* arch
= pakfire_get_arch(pakfire
);
258 // Allocate a new jail
259 struct pakfire_jail
* j
= calloc(1, sizeof(*j
));
264 j
->pakfire
= pakfire_ref(pakfire
);
266 // Initialize reference counter
272 // Generate a random UUID
273 uuid_generate_random(j
->uuid
);
275 DEBUG(j
->pakfire
, "Allocated new jail at %p\n", j
);
277 // Set default environment
278 for (const struct environ
* e
= ENV
; e
->key
; e
++) {
279 r
= pakfire_jail_set_env(j
, e
->key
, e
->val
);
284 // Enable all CPU features that CPU has to offer
285 if (!pakfire_arch_supported_by_host(arch
)) {
286 r
= pakfire_jail_set_env(j
, "QEMU_CPU", "max");
291 // Set container UUID
292 r
= pakfire_jail_set_env(j
, "container_uuid", pakfire_jail_uuid(j
));
296 // Disable systemctl to talk to systemd
297 if (!pakfire_on_root(j
->pakfire
)) {
298 r
= pakfire_jail_set_env(j
, "SYSTEMD_OFFLINE", "1");
308 pakfire_jail_free(j
);
313 PAKFIRE_EXPORT
struct pakfire_jail
* pakfire_jail_ref(struct pakfire_jail
* jail
) {
319 PAKFIRE_EXPORT
struct pakfire_jail
* pakfire_jail_unref(struct pakfire_jail
* jail
) {
320 if (--jail
->nrefs
> 0)
323 pakfire_jail_free(jail
);
329 PAKFIRE_EXPORT
int pakfire_jail_nice(struct pakfire_jail
* jail
, int nice
) {
330 // Check if nice level is in range
331 if (nice
< -19 || nice
> 20) {
342 int pakfire_jail_set_cgroup(struct pakfire_jail
* jail
, struct pakfire_cgroup
* cgroup
) {
343 // Free any previous cgroup
345 pakfire_cgroup_unref(jail
->cgroup
);
349 // Set any new cgroup
351 DEBUG(jail
->pakfire
, "Setting cgroup %p\n", cgroup
);
353 jail
->cgroup
= pakfire_cgroup_ref(cgroup
);
362 // Returns the length of the environment
363 static unsigned int pakfire_jail_env_length(struct pakfire_jail
* jail
) {
366 // Count everything in the environment
367 for (char** e
= jail
->env
; *e
; e
++)
373 // Finds an existing environment variable and returns its index or -1 if not found
374 static int pakfire_jail_find_env(struct pakfire_jail
* jail
, const char* key
) {
380 char buffer
[strlen(key
) + 2];
381 pakfire_string_format(buffer
, "%s=", key
);
383 for (unsigned int i
= 0; jail
->env
[i
]; i
++) {
384 if (pakfire_string_startswith(jail
->env
[i
], buffer
))
392 // Returns the value of an environment variable or NULL
393 PAKFIRE_EXPORT
const char* pakfire_jail_get_env(struct pakfire_jail
* jail
,
395 int i
= pakfire_jail_find_env(jail
, key
);
399 return jail
->env
[i
] + strlen(key
) + 1;
402 // Sets an environment variable
403 PAKFIRE_EXPORT
int pakfire_jail_set_env(struct pakfire_jail
* jail
,
404 const char* key
, const char* value
) {
405 // Find the index where to write this value to
406 int i
= pakfire_jail_find_env(jail
, key
);
408 i
= pakfire_jail_env_length(jail
);
410 // Return -ENOSPC when the environment is full
411 if (i
>= ENVIRON_SIZE
) {
416 // Free any previous value
420 // Format and set environment variable
421 asprintf(&jail
->env
[i
], "%s=%s", key
, value
);
423 DEBUG(jail
->pakfire
, "Set environment variable: %s\n", jail
->env
[i
]);
428 // Imports an environment
429 PAKFIRE_EXPORT
int pakfire_jail_import_env(struct pakfire_jail
* jail
, const char* env
[]) {
437 // Copy environment variables
438 for (unsigned int i
= 0; env
[i
]; i
++) {
439 r
= pakfire_string_partition(env
[i
], "=", &key
, &val
);
444 r
= pakfire_jail_set_env(jail
, key
, val
);
461 PAKFIRE_EXPORT
int pakfire_jail_set_timeout(
462 struct pakfire_jail
* jail
, unsigned int timeout
) {
464 jail
->timeout
.it_value
.tv_sec
= timeout
;
467 DEBUG(jail
->pakfire
, "Timeout set to %d second(s)\n", timeout
);
469 DEBUG(jail
->pakfire
, "Timeout disabled\n");
474 static int pakfire_jail_create_timer(struct pakfire_jail
* jail
) {
477 // Nothing to do if no timeout has been set
478 if (!jail
->timeout
.it_value
.tv_sec
)
481 // Create a new timer
482 const int fd
= timerfd_create(CLOCK_MONOTONIC
, 0);
484 ERROR(jail
->pakfire
, "Could not create timer: %m\n");
489 r
= timerfd_settime(fd
, 0, &jail
->timeout
, NULL
);
491 ERROR(jail
->pakfire
, "Could not arm timer: %m\n");
505 This function replaces any logging in the child process.
507 All log messages will be sent to the parent process through their respective pipes.
509 static void pakfire_jail_log(void* data
, int priority
, const char* file
,
510 int line
, const char* fn
, const char* format
, va_list args
) {
511 struct pakfire_jail_pipes
* pipes
= (struct pakfire_jail_pipes
*)data
;
516 fd
= pipes
->log_INFO
[1];
520 fd
= pipes
->log_ERROR
[1];
525 fd
= pipes
->log_DEBUG
[1];
527 #endif /* ENABLE_DEBUG */
529 // Ignore any messages of an unknown priority
534 // Send the log message
536 vdprintf(fd
, format
, args
);
539 static int pakfire_jail_log_buffer_is_full(const struct pakfire_log_buffer
* buffer
) {
540 return (sizeof(buffer
->data
) == buffer
->used
);
544 This function reads as much data as it can from the file descriptor.
545 If it finds a whole line in it, it will send it to the logger and repeat the process.
546 If no newline character is found, it will try to read more data until it finds one.
548 static int pakfire_jail_handle_log(struct pakfire_jail
* jail
,
549 struct pakfire_jail_exec
* ctx
, int priority
, int fd
,
550 struct pakfire_log_buffer
* buffer
, pakfire_jail_communicate_out callback
, void* data
) {
551 char line
[BUFFER_SIZE
+ 1];
553 // Fill up buffer from fd
554 if (buffer
->used
< sizeof(buffer
->data
)) {
555 ssize_t bytes_read
= read(fd
, buffer
->data
+ buffer
->used
,
556 sizeof(buffer
->data
) - buffer
->used
);
559 if (bytes_read
< 0) {
560 ERROR(jail
->pakfire
, "Could not read from fd %d: %m\n", fd
);
564 // Update buffer size
565 buffer
->used
+= bytes_read
;
568 // See if we have any lines that we can write
569 while (buffer
->used
) {
570 // Search for the end of the first line
571 char* eol
= memchr(buffer
->data
, '\n', buffer
->used
);
575 // If the buffer is full, we send the content to the logger and try again
576 // This should not happen in practice
577 if (pakfire_jail_log_buffer_is_full(buffer
)) {
578 DEBUG(jail
->pakfire
, "Logging buffer is full. Sending all content\n");
580 eol
= buffer
->data
+ sizeof(buffer
->data
) - 1;
582 // Otherwise we might have only read parts of the output
587 // Find the length of the string
588 size_t length
= eol
- buffer
->data
+ 1;
590 // Copy the line into the buffer
591 memcpy(line
, buffer
->data
, length
);
593 // Terminate the string
598 int r
= callback(jail
->pakfire
, data
, priority
, line
, length
);
600 ERROR(jail
->pakfire
, "The logging callback returned an error: %d\n", r
);
605 // Remove line from buffer
606 memmove(buffer
->data
, buffer
->data
+ length
, buffer
->used
- length
);
607 buffer
->used
-= length
;
613 static int pakfire_jail_stream_stdin(struct pakfire_jail
* jail
,
614 struct pakfire_jail_exec
* ctx
, const int fd
) {
617 // Nothing to do if there is no stdin callback set
618 if (!ctx
->communicate
.in
) {
619 DEBUG(jail
->pakfire
, "Callback for standard input is not set\n");
623 // Skip if the writing pipe has already been closed
624 if (!ctx
->pipes
.stdin
[1])
627 DEBUG(jail
->pakfire
, "Streaming standard input...\n");
629 // Calling the callback
630 r
= ctx
->communicate
.in(jail
->pakfire
, ctx
->communicate
.data
, fd
);
632 DEBUG(jail
->pakfire
, "Standard input callback finished: %d\n", r
);
634 // The callback signaled that it has written everything
636 DEBUG(jail
->pakfire
, "Closing standard input pipe\n");
638 // Close the file-descriptor
641 // Reset the file-descriptor so it won't be closed again later
642 ctx
->pipes
.stdin
[1] = 0;
651 static int pakfire_jail_setup_pipe(struct pakfire_jail
* jail
, int (*fds
)[2], const int flags
) {
652 int r
= pipe2(*fds
, flags
);
654 ERROR(jail
->pakfire
, "Could not setup pipe: %m\n");
661 static void pakfire_jail_close_pipe(struct pakfire_jail
* jail
, int fds
[2]) {
662 for (unsigned int i
= 0; i
< 2; i
++)
/*
	This is a convenience function which closes the write end of a pipe
	and returns the read end.

	The closed end is reset to zero so that it won't be closed again later.
*/
static int pakfire_jail_get_pipe_to_read(struct pakfire_jail* jail, int (*fds)[2]) {
	// Index 0 is the read end, index 1 is the write end
	int* const pipefd = *fds;

	// Close the write end of the pipe
	if (pipefd[1]) {
		close(pipefd[1]);
		pipefd[1] = 0;
	}

	// Return the read end
	return pipefd[0];
}
/*
	This is a convenience function which closes the read end of a pipe
	and returns the write end.

	The closed end is reset to zero so that it won't be closed again later.
*/
static int pakfire_jail_get_pipe_to_write(struct pakfire_jail* jail, int (*fds)[2]) {
	// Index 0 is the read end, index 1 is the write end
	int* const pipefd = *fds;

	// Close the read end of the pipe
	if (pipefd[0]) {
		close(pipefd[0]);
		pipefd[0] = 0;
	}

	// Return the write end
	return pipefd[1];
}
701 static int pakfire_jail_wait(struct pakfire_jail
* jail
, struct pakfire_jail_exec
* ctx
) {
703 struct epoll_event ev
;
704 struct epoll_event events
[EPOLL_MAX_EVENTS
];
708 // Fetch file descriptors from context
709 const int stdin
= pakfire_jail_get_pipe_to_write(jail
, &ctx
->pipes
.stdin
);
710 const int stdout
= pakfire_jail_get_pipe_to_read(jail
, &ctx
->pipes
.stdout
);
711 const int stderr
= pakfire_jail_get_pipe_to_read(jail
, &ctx
->pipes
.stderr
);
712 const int pidfd
= ctx
->pidfd
;
715 const int timerfd
= pakfire_jail_create_timer(jail
);
718 const int log_INFO
= pakfire_jail_get_pipe_to_read(jail
, &ctx
->pipes
.log_INFO
);
719 const int log_ERROR
= pakfire_jail_get_pipe_to_read(jail
, &ctx
->pipes
.log_ERROR
);
720 const int log_DEBUG
= pakfire_jail_get_pipe_to_read(jail
, &ctx
->pipes
.log_DEBUG
);
722 // Make a list of all file descriptors we are interested in
724 stdin
, stdout
, stderr
, pidfd
, timerfd
, log_INFO
, log_ERROR
, log_DEBUG
,
728 epollfd
= epoll_create1(0);
730 ERROR(jail
->pakfire
, "Could not initialize epoll(): %m\n");
735 // Turn file descriptors into non-blocking mode and add them to epoll()
736 for (unsigned int i
= 0; i
< sizeof(fds
) / sizeof(*fds
); i
++) {
739 // Skip fds which were not initialized
743 ev
.events
= EPOLLHUP
;
746 ev
.events
|= EPOLLOUT
;
748 ev
.events
|= EPOLLIN
;
751 int flags
= fcntl(fd
, F_GETFL
, 0);
753 // Set modified flags
754 if (fcntl(fd
, F_SETFL
, flags
|O_NONBLOCK
) < 0) {
756 "Could not set file descriptor %d into non-blocking mode: %m\n", fd
);
763 if (epoll_ctl(epollfd
, EPOLL_CTL_ADD
, fd
, &ev
) < 0) {
764 ERROR(jail
->pakfire
, "Could not add file descriptor %d to epoll(): %m\n", fd
);
772 // Loop for as long as the process is alive
774 int num
= epoll_wait(epollfd
, events
, EPOLL_MAX_EVENTS
, -1);
776 // Ignore if epoll_wait() has been interrupted
780 ERROR(jail
->pakfire
, "epoll_wait() failed: %m\n");
786 for (int i
= 0; i
< num
; i
++) {
787 int e
= events
[i
].events
;
788 int fd
= events
[i
].data
.fd
;
790 struct pakfire_log_buffer
* buffer
= NULL
;
791 pakfire_jail_communicate_out callback
= NULL
;
795 // Check if there is any data to be read
797 // Handle any changes to the PIDFD
799 // Call waitid() and store the result
800 r
= waitid(P_PIDFD
, ctx
->pidfd
, &ctx
->status
, WEXITED
);
802 ERROR(jail
->pakfire
, "waitid() failed: %m\n");
806 // Mark that we have ended so that we will process the remaining
807 // events from epoll() now, but won't restart the outer loop.
811 // Handle timer events
812 } else if (fd
== timerfd
) {
813 DEBUG(jail
->pakfire
, "Timer event received\n");
816 r
= read(timerfd
, garbage
, sizeof(garbage
));
818 ERROR(jail
->pakfire
, "Could not disarm timer: %m\n");
823 // Terminate the process if it hasn't already ended
825 DEBUG(jail
->pakfire
, "Terminating process...\n");
827 // Send SIGKILL to the process
828 r
= pidfd_send_signal(pidfd
, SIGKILL
, NULL
, 0);
830 ERROR(jail
->pakfire
, "Could not kill process: %m\n");
835 // There is nothing else to do
838 // Handle logging messages
839 } else if (fd
== log_INFO
) {
840 buffer
= &ctx
->buffers
.log_INFO
;
843 callback
= pakfire_jail_default_log_callback
;
845 } else if (fd
== log_ERROR
) {
846 buffer
= &ctx
->buffers
.log_ERROR
;
849 callback
= pakfire_jail_default_log_callback
;
851 } else if (fd
== log_DEBUG
) {
852 buffer
= &ctx
->buffers
.log_DEBUG
;
853 priority
= LOG_DEBUG
;
855 callback
= pakfire_jail_default_log_callback
;
857 // Handle anything from standard output and standard error
858 } else if (fd
== stdout
) {
859 buffer
= &ctx
->buffers
.stdout
;
862 callback
= ctx
->communicate
.out
;
863 data
= ctx
->communicate
.data
;
865 } else if (fd
== stderr
) {
866 buffer
= &ctx
->buffers
.stderr
;
869 callback
= ctx
->communicate
.out
;
870 data
= ctx
->communicate
.data
;
873 DEBUG(jail
->pakfire
, "Received invalid file descriptor %d\n", fd
);
878 r
= pakfire_jail_handle_log(jail
, ctx
, priority
, fd
, buffer
, callback
, data
);
884 // Handle standard input
886 r
= pakfire_jail_stream_stdin(jail
, ctx
, fd
);
889 // Ignore if we filled up the buffer
894 ERROR(jail
->pakfire
, "Could not write to stdin: %m\n");
901 // Check if any file descriptors have been closed
903 // Remove the file descriptor
904 r
= epoll_ctl(epollfd
, EPOLL_CTL_DEL
, fd
, NULL
);
906 ERROR(jail
->pakfire
, "Could not remove closed file-descriptor %d: %m\n", fd
);
/*
	An output callback which collects everything that has been written to
	standard output (i.e. lines with priority LOG_INFO) in one string.

	data must point to a char* which is either NULL or holds a string that
	has been allocated on the heap. It is replaced by a newly allocated
	string on every call and has to be freed by the caller in the end.

	Anything else is passed on to the default logger.

	Returns 0 on success, or 1 if the new string could not be allocated
	in which case the previously collected output is left untouched.
*/
int pakfire_jail_capture_stdout(struct pakfire* pakfire, void* data,
		int priority, const char* line, size_t length) {
	char** output = (char**)data;
	char* joined = NULL;
	int r;

	// Append everything from stdout to a buffer
	if (output && priority == LOG_INFO) {
		// Write into a temporary pointer so that *output stays valid
		// if asprintf() fails
		r = asprintf(&joined, "%s%s", (*output) ? *output : "", line);
		if (r < 0)
			return 1;

		// Release the previous string which would be leaked otherwise
		free(*output);
		*output = joined;

		return 0;
	}

	// Send everything else to the default logger
	return pakfire_jail_default_log_callback(pakfire, NULL, priority, line, length);
}
941 static int pakfire_jail_drop_capabilities(struct pakfire_jail
* jail
) {
942 const int capabilities
[] = {
943 // Deny access to the kernel's audit system
948 // Deny suspending block devices
951 // Deny any stuff with BPF
954 // Deny checkpoint restore
955 CAP_CHECKPOINT_RESTORE
,
957 // Deny opening files by inode number (open_by_handle_at)
960 // Deny setting SUID bits
963 // Deny locking more memory
966 // Deny modifying any Apparmor/SELinux/SMACK configuration
970 // Deny creating any special devices
973 // Deny reading from syslog
976 // Deny any admin actions (mount, sethostname, ...)
979 // Deny rebooting the system
982 // Deny loading kernel modules
985 // Deny setting nice level
988 // Deny access to /proc/kcore, /dev/mem, /dev/kmem
991 // Deny circumventing any resource limits
994 // Deny setting the system time
997 // Deny playing with suspend
1003 DEBUG(jail
->pakfire
, "Dropping capabilities...\n");
1005 size_t num_caps
= 0;
1008 // Drop any capabilities
1009 for (const int* cap
= capabilities
; *cap
; cap
++) {
1010 r
= prctl(PR_CAPBSET_DROP
, *cap
, 0, 0, 0);
1012 ERROR(jail
->pakfire
, "Could not drop capability %d: %m\n", *cap
);
1019 // Fetch any capabilities
1020 cap_t caps
= cap_get_proc();
1022 ERROR(jail
->pakfire
, "Could not read capabilities: %m\n");
1027 Set inheritable capabilities
1029 This ensures that no processes will be able to gain any of the listed
1032 r
= cap_set_flag(caps
, CAP_INHERITABLE
, num_caps
, capabilities
, CAP_CLEAR
);
1034 ERROR(jail
->pakfire
, "cap_set_flag() failed: %m\n");
1038 // Restore capabilities
1039 r
= cap_set_proc(caps
);
1041 ERROR(jail
->pakfire
, "Could not restore capabilities: %m\n");
1054 static int pakfire_jail_limit_syscalls(struct pakfire_jail
* jail
) {
1055 const int syscalls
[] = {
1056 // The kernel's keyring isn't namespaced
1059 SCMP_SYS(request_key
),
1061 // Disable userfaultfd
1062 SCMP_SYS(userfaultfd
),
1064 // Disable perf which could leak a lot of information about the host
1065 SCMP_SYS(perf_event_open
),
1071 DEBUG(jail
->pakfire
, "Applying syscall filter...\n");
1073 // Setup a syscall filter which allows everything by default
1074 scmp_filter_ctx ctx
= seccomp_init(SCMP_ACT_ALLOW
);
1076 ERROR(jail
->pakfire
, "Could not setup seccomp filter: %m\n");
1081 for (const int* syscall
= syscalls
; *syscall
; syscall
++) {
1082 r
= seccomp_rule_add(ctx
, SCMP_ACT_ERRNO(EPERM
), *syscall
, 0);
1084 ERROR(jail
->pakfire
, "Could not configure syscall %d: %m\n", *syscall
);
1089 // Load syscall filter into the kernel
1090 r
= seccomp_load(ctx
);
1092 ERROR(jail
->pakfire
, "Could not load syscall filter into the kernel: %m\n");
1098 seccomp_release(ctx
);
1105 PAKFIRE_EXPORT
int pakfire_jail_bind(struct pakfire_jail
* jail
,
1106 const char* source
, const char* target
, int flags
) {
1107 struct pakfire_jail_mountpoint
* mp
= NULL
;
1110 // Check if there is any space left
1111 if (jail
->num_mountpoints
>= MAX_MOUNTPOINTS
) {
1116 // Check for valid inputs
1117 if (!source
|| !target
) {
1122 // Select the next free slot
1123 mp
= &jail
->mountpoints
[jail
->num_mountpoints
];
1126 r
= pakfire_string_set(mp
->source
, source
);
1128 ERROR(jail
->pakfire
, "Could not copy source: %m\n");
1133 r
= pakfire_string_set(mp
->target
, target
);
1135 ERROR(jail
->pakfire
, "Could not copy target: %m\n");
1142 // Increment counter
1143 jail
->num_mountpoints
++;
1148 static int pakfire_jail_mount_networking(struct pakfire_jail
* jail
) {
1151 const char* paths
[] = {
1157 // Bind-mount all paths read-only
1158 for (const char** path
= paths
; *path
; path
++) {
1159 r
= pakfire_bind(jail
->pakfire
, *path
, NULL
, MS_RDONLY
);
1168 Mounts everything that we require in the new namespace
1170 static int pakfire_jail_mount(struct pakfire_jail
* jail
, struct pakfire_jail_exec
* ctx
) {
1171 struct pakfire_jail_mountpoint
* mp
= NULL
;
1174 // Mount all default stuff
1175 r
= pakfire_mount_all(jail
->pakfire
);
1179 // Mount networking stuff
1180 if (pakfire_jail_exec_has_flag(ctx
, PAKFIRE_JAIL_HAS_NETWORKING
)) {
1181 r
= pakfire_jail_mount_networking(jail
);
1186 // Mount all custom stuff
1187 for (unsigned int i
= 0; i
< jail
->num_mountpoints
; i
++) {
1189 mp
= &jail
->mountpoints
[i
];
1192 r
= pakfire_bind(jail
->pakfire
, mp
->source
, mp
->target
, mp
->flags
);
1197 // Log all mountpoints
1198 pakfire_mount_list(jail
->pakfire
);
1205 static int pakfire_jail_setup_uid_mapping(struct pakfire_jail
* jail
, pid_t pid
) {
1206 char path
[PATH_MAX
];
1209 // Skip mapping anything when running on /
1210 if (pakfire_on_root(jail
->pakfire
))
1214 r
= pakfire_string_format(path
, "/proc/%d/uid_map", pid
);
1219 const uid_t uid
= pakfire_uid(jail
->pakfire
);
1222 const struct pakfire_subid
* subuid
= pakfire_subuid(jail
->pakfire
);
1226 /* When running as root, we will map the entire range.
1228 When running as a non-privileged user, we will map the root user inside the jail
1229 to the user's UID outside of the jail, and we will map the rest starting from one.
1234 r
= pakfire_file_write(jail
->pakfire
, path
, 0, 0, 0,
1235 "0 %lu %lu\n", subuid
->id
, subuid
->length
);
1237 r
= pakfire_file_write(jail
->pakfire
, path
, 0, 0, 0,
1238 "0 %lu 1\n1 %lu %lu\n", uid
, subuid
->id
, subuid
->length
);
1242 ERROR(jail
->pakfire
, "Could not map UIDs: %m\n");
1249 static int pakfire_jail_setup_gid_mapping(struct pakfire_jail
* jail
, pid_t pid
) {
1250 char path
[PATH_MAX
];
1253 // Skip mapping anything when running on /
1254 if (pakfire_on_root(jail
->pakfire
))
1258 const gid_t gid
= pakfire_gid(jail
->pakfire
);
1261 const struct pakfire_subid
* subgid
= pakfire_subgid(jail
->pakfire
);
1266 r
= pakfire_string_format(path
, "/proc/%d/gid_map", pid
);
1272 r
= pakfire_file_write(jail
->pakfire
, path
, 0, 0, 0,
1273 "0 %lu %lu\n", subgid
->id
, subgid
->length
);
1275 r
= pakfire_file_write(jail
->pakfire
, path
, 0, 0, 0,
1276 "0 %lu 1\n%1 %lu %lu\n", gid
, subgid
->id
, subgid
->length
);
1280 ERROR(jail
->pakfire
, "Could not map GIDs: %m\n");
1287 static int pakfire_jail_setgroups(struct pakfire_jail
* jail
, pid_t pid
) {
1288 char path
[PATH_MAX
];
1292 r
= pakfire_string_format(path
, "/proc/%d/setgroups", pid
);
1296 // Open file for writing
1297 FILE* f
= fopen(path
, "w");
1299 ERROR(jail
->pakfire
, "Could not open %s for writing: %m\n", path
);
1304 int bytes_written
= fprintf(f
, "deny\n");
1305 if (bytes_written
<= 0) {
1306 ERROR(jail
->pakfire
, "Could not write to %s: %m\n", path
);
1313 ERROR(jail
->pakfire
, "Could not close %s: %m\n", path
);
1324 static int pakfire_jail_send_signal(struct pakfire_jail
* jail
, int fd
) {
1325 const uint64_t val
= 1;
1328 DEBUG(jail
->pakfire
, "Sending signal...\n");
1330 // Write to the file descriptor
1331 ssize_t bytes_written
= write(fd
, &val
, sizeof(val
));
1332 if (bytes_written
< 0 || (size_t)bytes_written
< sizeof(val
)) {
1333 ERROR(jail
->pakfire
, "Could not send signal: %m\n");
1337 // Close the file descriptor
1343 static int pakfire_jail_wait_for_signal(struct pakfire_jail
* jail
, int fd
) {
1347 DEBUG(jail
->pakfire
, "Waiting for signal...\n");
1349 ssize_t bytes_read
= read(fd
, &val
, sizeof(val
));
1350 if (bytes_read
< 0 || (size_t)bytes_read
< sizeof(val
)) {
1351 ERROR(jail
->pakfire
, "Error waiting for signal: %m\n");
1355 // Close the file descriptor
1362 Performs the initialisation that needs to happen in the parent part
1364 static int pakfire_jail_parent(struct pakfire_jail
* jail
, struct pakfire_jail_exec
* ctx
) {
1367 // Setup UID mapping
1368 r
= pakfire_jail_setup_uid_mapping(jail
, ctx
->pid
);
1372 // Write "deny" to /proc/PID/setgroups
1373 r
= pakfire_jail_setgroups(jail
, ctx
->pid
);
1377 // Setup GID mapping
1378 r
= pakfire_jail_setup_gid_mapping(jail
, ctx
->pid
);
1382 // Parent has finished initialisation
1383 DEBUG(jail
->pakfire
, "Parent has finished initialization\n");
1385 // Send signal to client
1386 r
= pakfire_jail_send_signal(jail
, ctx
->completed_fd
);
1393 static int pakfire_jail_child(struct pakfire_jail
* jail
, struct pakfire_jail_exec
* ctx
,
1394 const char* argv
[]) {
1397 // Redirect any logging to our log pipe
1398 pakfire_set_log_callback(jail
->pakfire
, pakfire_jail_log
, &ctx
->pipes
);
1401 r
= prctl(PR_SET_PDEATHSIG
, SIGKILL
, 0, 0, 0);
1403 ERROR(jail
->pakfire
, "Could not configure to die with parent: %m\n");
1408 pid_t pid
= getpid();
1410 DEBUG(jail
->pakfire
, "Launched child process in jail with PID %d\n", pid
);
1412 // Wait for the parent to finish initialization
1413 r
= pakfire_jail_wait_for_signal(jail
, ctx
->completed_fd
);
1417 // Perform further initialization
1420 uid_t uid
= getuid();
1421 gid_t gid
= getgid();
1424 uid_t euid
= geteuid();
1425 gid_t egid
= getegid();
1427 DEBUG(jail
->pakfire
, " UID: %d (effective %d)\n", uid
, euid
);
1428 DEBUG(jail
->pakfire
, " GID: %d (effective %d)\n", gid
, egid
);
1430 // Check if we are (effectively) running as root
1431 if (uid
|| gid
|| euid
|| egid
) {
1432 ERROR(jail
->pakfire
, "Child process is not running as root\n");
1436 const char* root
= pakfire_get_path(jail
->pakfire
);
1437 const char* arch
= pakfire_get_arch(jail
->pakfire
);
1439 // Change root (unless root is /)
1440 if (!pakfire_on_root(jail
->pakfire
)) {
1442 r
= pakfire_jail_mount(jail
, ctx
);
1449 ERROR(jail
->pakfire
, "chroot() to %s failed: %m\n", root
);
1453 // Change directory to /
1456 ERROR(jail
->pakfire
, "chdir() after chroot() failed: %m\n");
1462 unsigned long persona
= pakfire_arch_personality(arch
);
1464 r
= personality(persona
);
1466 ERROR(jail
->pakfire
, "Could not set personality (%x)\n", (unsigned int)persona
);
1473 DEBUG(jail
->pakfire
, "Setting nice level to %d\n", jail
->nice
);
1475 r
= setpriority(PRIO_PROCESS
, pid
, jail
->nice
);
1477 ERROR(jail
->pakfire
, "Could not set nice level: %m\n");
1482 // Close other end of log pipes
1483 close(ctx
->pipes
.log_INFO
[0]);
1484 close(ctx
->pipes
.log_ERROR
[0]);
1486 close(ctx
->pipes
.log_DEBUG
[0]);
1487 #endif /* ENABLE_DEBUG */
1489 // Connect standard input
1490 if (ctx
->pipes
.stdin
[0]) {
1491 r
= dup2(ctx
->pipes
.stdin
[0], STDIN_FILENO
);
1493 ERROR(jail
->pakfire
, "Could not connect fd %d to stdin: %m\n",
1494 ctx
->pipes
.stdin
[0]);
1500 // Connect standard output and error
1501 if (ctx
->pipes
.stdout
[1] && ctx
->pipes
.stderr
[1]) {
1502 r
= dup2(ctx
->pipes
.stdout
[1], STDOUT_FILENO
);
1504 ERROR(jail
->pakfire
, "Could not connect fd %d to stdout: %m\n",
1505 ctx
->pipes
.stdout
[1]);
1510 r
= dup2(ctx
->pipes
.stderr
[1], STDERR_FILENO
);
1512 ERROR(jail
->pakfire
, "Could not connect fd %d to stderr: %m\n",
1513 ctx
->pipes
.stderr
[1]);
1518 // Close the pipe (as we have moved the original file descriptors)
1519 pakfire_jail_close_pipe(jail
, ctx
->pipes
.stdin
);
1520 pakfire_jail_close_pipe(jail
, ctx
->pipes
.stdout
);
1521 pakfire_jail_close_pipe(jail
, ctx
->pipes
.stderr
);
1524 // Reset open file limit (http://0pointer.net/blog/file-descriptor-limits.html)
1525 r
= pakfire_rlimit_reset_nofile(jail
->pakfire
);
1529 // Drop capabilities
1530 r
= pakfire_jail_drop_capabilities(jail
);
1535 r
= pakfire_jail_limit_syscalls(jail
);
1539 DEBUG(jail
->pakfire
, "Child process initialization done\n");
1540 DEBUG(jail
->pakfire
, "Launching command:\n");
1543 for (unsigned int i
= 0; argv
[i
]; i
++)
1544 DEBUG(jail
->pakfire
, " argv[%d] = %s\n", i
, argv
[i
]);
1547 r
= execvpe(argv
[0], (char**)argv
, jail
->env
);
1549 ERROR(jail
->pakfire
, "Could not execve(%s): %m\n", argv
[0]);
1551 // Translate errno into regular exit code
1561 // We should not get here
1565 // Run a command in the jail
1566 static int __pakfire_jail_exec(struct pakfire_jail
* jail
, const char* argv
[],
1567 const int interactive
,
1568 pakfire_jail_communicate_in communicate_in
,
1569 pakfire_jail_communicate_out communicate_out
,
1574 // Check if argv is valid
1575 if (!argv
|| !argv
[0]) {
1580 // Send any output to the default logger if no callback is set
1581 if (!communicate_out
)
1582 communicate_out
= pakfire_jail_default_log_callback
;
1584 // Initialize context for this call
1585 struct pakfire_jail_exec ctx
= {
1595 .in
= communicate_in
,
1596 .out
= communicate_out
,
1601 DEBUG(jail
->pakfire
, "Executing jail...\n");
1603 // Enable networking in interactive mode
1605 ctx
.flags
|= PAKFIRE_JAIL_HAS_NETWORKING
;
1608 Setup a file descriptor which can be used to notify the client that the parent
1609 has completed configuration.
1611 ctx
.completed_fd
= eventfd(0, EFD_CLOEXEC
);
1612 if (ctx
.completed_fd
< 0) {
1613 ERROR(jail
->pakfire
, "eventfd() failed: %m\n");
1617 // Create pipes to communicate with child process if we are not running interactively
1619 // stdin (only if callback is set)
1620 if (ctx
.communicate
.in
) {
1621 r
= pakfire_jail_setup_pipe(jail
, &ctx
.pipes
.stdin
, 0);
1627 r
= pakfire_jail_setup_pipe(jail
, &ctx
.pipes
.stdout
, 0);
1632 r
= pakfire_jail_setup_pipe(jail
, &ctx
.pipes
.stderr
, 0);
1637 // Setup pipes for logging
1639 r
= pakfire_jail_setup_pipe(jail
, &ctx
.pipes
.log_INFO
, O_CLOEXEC
);
1644 r
= pakfire_jail_setup_pipe(jail
, &ctx
.pipes
.log_ERROR
, O_CLOEXEC
);
1650 r
= pakfire_jail_setup_pipe(jail
, &ctx
.pipes
.log_DEBUG
, O_CLOEXEC
);
1653 #endif /* ENABLE_DEBUG */
1655 // Configure child process
1656 struct clone_args args
= {
1665 .exit_signal
= SIGCHLD
,
1666 .pidfd
= (long long unsigned int)&ctx
.pidfd
,
1669 // Launch the process in a cgroup that is a leaf of the configured cgroup
1671 args
.flags
|= CLONE_INTO_CGROUP
;
1674 const char* uuid
= pakfire_jail_uuid(jail
);
1676 // Create a temporary cgroup
1677 r
= pakfire_cgroup_child(&ctx
.cgroup
, jail
->cgroup
, uuid
, 0);
1679 ERROR(jail
->pakfire
, "Could not create cgroup for jail: %m\n");
1683 // Clone into this cgroup
1684 args
.cgroup
= pakfire_cgroup_fd(ctx
.cgroup
);
1688 if (!pakfire_jail_exec_has_flag(&ctx
, PAKFIRE_JAIL_HAS_NETWORKING
)) {
1689 args
.flags
|= CLONE_NEWNET
;
1692 // Fork this process
1693 ctx
.pid
= clone3(&args
, sizeof(args
));
1695 ERROR(jail
->pakfire
, "Could not clone: %m\n");
1699 } else if (ctx
.pid
== 0) {
1700 r
= pakfire_jail_child(jail
, &ctx
, argv
);
1705 r
= pakfire_jail_parent(jail
, &ctx
);
1709 DEBUG(jail
->pakfire
, "Waiting for PID %d to finish its work\n", ctx
.pid
);
1711 // Read output of the child process
1712 r
= pakfire_jail_wait(jail
, &ctx
);
1716 // Handle exit status
1717 switch (ctx
.status
.si_code
) {
1719 DEBUG(jail
->pakfire
, "The child process exited with code %d\n",
1720 ctx
.status
.si_status
);
1723 exit
= ctx
.status
.si_status
;
1727 ERROR(jail
->pakfire
, "The child process was killed\n");
1732 ERROR(jail
->pakfire
, "The child process terminated abnormally\n");
1735 // Log anything else
1737 ERROR(jail
->pakfire
, "Unknown child exit code: %d\n", ctx
.status
.si_code
);
1742 // Destroy the temporary cgroup (if any)
1744 // Read cgroup stats
1745 r
= pakfire_cgroup_stat(ctx
.cgroup
, &ctx
.cgroup_stats
);
1747 ERROR(jail
->pakfire
, "Could not read cgroup stats: %m\n");
1749 pakfire_cgroup_stat_dump(ctx
.cgroup
, &ctx
.cgroup_stats
);
1752 pakfire_cgroup_destroy(ctx
.cgroup
);
1753 pakfire_cgroup_unref(ctx
.cgroup
);
1756 // Close any file descriptors
1757 pakfire_jail_close_pipe(jail
, ctx
.pipes
.stdin
);
1758 pakfire_jail_close_pipe(jail
, ctx
.pipes
.stdout
);
1759 pakfire_jail_close_pipe(jail
, ctx
.pipes
.stderr
);
1762 pakfire_jail_close_pipe(jail
, ctx
.pipes
.log_INFO
);
1763 pakfire_jail_close_pipe(jail
, ctx
.pipes
.log_ERROR
);
1764 pakfire_jail_close_pipe(jail
, ctx
.pipes
.log_DEBUG
);
1769 PAKFIRE_EXPORT
int pakfire_jail_exec(
1770 struct pakfire_jail
* jail
,
1772 pakfire_jail_communicate_in callback_in
,
1773 pakfire_jail_communicate_out callback_out
,
1775 return __pakfire_jail_exec(jail
, argv
, 0, callback_in
, callback_out
, data
);
/*
	Prepares the interactive environment of the jail and then executes argv
	through __pakfire_jail_exec() without any communication callbacks.

	Returns the result of pakfire_jail_setup_interactive_env() if that is
	non-zero, otherwise the result of __pakfire_jail_exec().

	NOTE(review): the check after the setup call was reconstructed from a
	damaged listing - confirm the exact condition against upstream.
*/
static int pakfire_jail_exec_interactive(
		struct pakfire_jail* jail, const char* argv[]) {
	// Setup interactive stuff
	int r = pakfire_jail_setup_interactive_env(jail);
	if (r)
		return r;

	return __pakfire_jail_exec(jail, argv, 1, NULL, NULL, NULL);
}
1790 int pakfire_jail_exec_script(struct pakfire_jail
* jail
,
1794 pakfire_jail_communicate_in callback_in
,
1795 pakfire_jail_communicate_out callback_out
,
1797 char path
[PATH_MAX
];
1798 const char** argv
= NULL
;
1802 const char* root
= pakfire_get_path(jail
->pakfire
);
1804 // Write the scriptlet to disk
1805 r
= pakfire_path_join(path
, root
, PAKFIRE_TMP_DIR
"/pakfire-script.XXXXXX");
1809 // Create a temporary file
1810 f
= pakfire_mktemp(path
, 0700);
1812 ERROR(jail
->pakfire
, "Could not create temporary file: %m\n");
1816 DEBUG(jail
->pakfire
, "Writing script to %s:\n%.*s\n", path
, (int)size
, script
);
1819 r
= fprintf(f
, "%s", script
);
1821 ERROR(jail
->pakfire
, "Could not write script to file %s: %m\n", path
);
1828 ERROR(jail
->pakfire
, "Could not close script file %s: %m\n", path
);
1834 // Count how many arguments were passed
1835 unsigned int argc
= 1;
1837 for (const char** arg
= args
; *arg
; arg
++)
1841 argv
= calloc(argc
+ 1, sizeof(*argv
));
1843 ERROR(jail
->pakfire
, "Could not allocate argv: %m\n");
1848 argv
[0] = (root
) ? pakfire_path_relpath(root
, path
) : path
;
1851 for (unsigned int i
= 1; i
< argc
; i
++)
1852 argv
[i
] = args
[i
-1];
1855 r
= pakfire_jail_exec(jail
, argv
, callback_in
, callback_out
, data
);
1863 // Remove script from disk
1871 A convenience function that creates a new jail, runs the given command and destroys
1874 int pakfire_jail_run(struct pakfire
* pakfire
, const char* argv
[], int flags
, char** output
) {
1875 struct pakfire_jail
* jail
= NULL
;
1878 // Create a new jail
1879 r
= pakfire_jail_create(&jail
, pakfire
, flags
);
1883 // Execute the command
1884 r
= pakfire_jail_exec(jail
, argv
, NULL
, pakfire_jail_capture_stdout
, output
);
1888 pakfire_jail_unref(jail
);
/*
	Creates a new jail, runs the given script in it through
	pakfire_jail_exec_script() (without communication callbacks) and releases
	the jail again.

	script / length  The script text and its length.
	argv             Arguments passed on to the script.
	flags            Passed to pakfire_jail_create().

	Returns the result of pakfire_jail_create() if that is non-zero,
	otherwise the result of pakfire_jail_exec_script().

	NOTE(review): the error check and cleanup were reconstructed from a
	damaged listing - confirm against upstream.
*/
int pakfire_jail_run_script(struct pakfire* pakfire,
		const char* script, const size_t length, const char* argv[], int flags) {
	struct pakfire_jail* jail = NULL;

	// Create a new jail
	int r = pakfire_jail_create(&jail, pakfire, flags);

	// Execute the command
	if (r == 0)
		r = pakfire_jail_exec_script(jail, script, length, argv, NULL, NULL, NULL);

	if (jail)
		pakfire_jail_unref(jail);

	return r;
}
/*
	Starts "/bin/bash --login" in the jail through
	pakfire_jail_exec_interactive() and returns its result.
*/
int pakfire_jail_shell(struct pakfire_jail* jail) {
	const char* argv[] = {
		"/bin/bash", "--login", NULL,
	};

	// Execute /bin/bash
	return pakfire_jail_exec_interactive(jail, argv);
}
1922 static int pakfire_jail_run_if_possible(struct pakfire
* pakfire
, const char** argv
) {
1923 char path
[PATH_MAX
];
1926 r
= pakfire_path(pakfire
, path
, "%s", *argv
);
1930 // Check if the file is executable
1931 r
= access(path
, X_OK
);
1933 DEBUG(pakfire
, "%s is not executable. Skipping...\n", *argv
);
1937 return pakfire_jail_run(pakfire
, argv
, 0, NULL
);
// Hands a fixed argv to pakfire_jail_run_if_possible() for the given pakfire
// instance and returns that function's result.
// NOTE(review): the entries of the argv initialiser are not visible in this
// view - confirm the command that is run against the complete file.
1940 int pakfire_jail_ldconfig(struct pakfire
* pakfire
) {
1941 const char* argv
[] = {
1946 return pakfire_jail_run_if_possible(pakfire
, argv
);
// Hands an argv starting with "/usr/bin/systemd-tmpfiles" to
// pakfire_jail_run_if_possible() for the given pakfire instance and returns
// that function's result.
// NOTE(review): any further argv entries and the terminator are not visible
// in this view - confirm them against the complete file.
1949 int pakfire_jail_run_systemd_tmpfiles(struct pakfire
* pakfire
) {
1950 const char* argv
[] = {
1951 "/usr/bin/systemd-tmpfiles",
1956 return pakfire_jail_run_if_possible(pakfire
, argv
);