]> git.ipfire.org Git - people/ms/pakfire.git/blob - src/libpakfire/jail.c
jail: Don't create own cgroups any more
[people/ms/pakfire.git] / src / libpakfire / jail.c
1 /*#############################################################################
2 # #
3 # Pakfire - The IPFire package management system #
4 # Copyright (C) 2022 Pakfire development team #
5 # #
6 # This program is free software: you can redistribute it and/or modify #
7 # it under the terms of the GNU General Public License as published by #
8 # the Free Software Foundation, either version 3 of the License, or #
9 # (at your option) any later version. #
10 # #
11 # This program is distributed in the hope that it will be useful, #
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of #
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
14 # GNU General Public License for more details. #
15 # #
16 # You should have received a copy of the GNU General Public License #
17 # along with this program. If not, see <http://www.gnu.org/licenses/>. #
18 # #
19 #############################################################################*/
20
21 #include <errno.h>
22 #include <linux/capability.h>
23 #include <linux/fcntl.h>
24 #include <linux/sched.h>
25 #include <linux/wait.h>
26 #include <sched.h>
27 #include <signal.h>
28 #include <stdlib.h>
29 #include <syscall.h>
30 #include <sys/capability.h>
31 #include <sys/epoll.h>
32 #include <sys/eventfd.h>
33 #include <sys/personality.h>
34 #include <sys/prctl.h>
35 #include <sys/resource.h>
36 #include <sys/types.h>
37 #include <sys/wait.h>
38
39 // libseccomp
40 #include <seccomp.h>
41
42 #include <pakfire/arch.h>
43 #include <pakfire/cgroup.h>
44 #include <pakfire/jail.h>
45 #include <pakfire/logging.h>
46 #include <pakfire/mount.h>
47 #include <pakfire/pakfire.h>
48 #include <pakfire/private.h>
49 #include <pakfire/util.h>
50
// Size in bytes of every log buffer (64 KiB). The expression is parenthesized
// so that the macro expands safely next to other operators (e.g. BUFFER_SIZE % n).
#define BUFFER_SIZE (1024 * 64)
// Number of slots in the environment array of a jail
#define ENVIRON_SIZE 128
// Maximum number of events that are fetched by a single call to epoll_wait()
#define EPOLL_MAX_EVENTS 2
54
// The default environment that will be set for every command.
// Every entry is a key/value pair. The table is terminated by an entry
// whose key is NULL, which is what the loop in pakfire_jail_create() stops at.
static const struct environ {
	// Name of the variable
	const char* key;
	// Value that is assigned to the variable
	const char* val;
} ENV[] = {
	{ "LANG", "en_US.utf-8" },
	{ "TERM", "vt100" },
	// Sentinel
	{ NULL, NULL },
};
64
/*
	A jail describes the environment in which commands are executed.

	Objects are reference counted (see pakfire_jail_ref/pakfire_jail_unref).
*/
struct pakfire_jail {
	// The pakfire instance this jail belongs to (a reference is held)
	struct pakfire* pakfire;
	// Reference counter
	int nrefs;

	// Flags (as passed to pakfire_jail_create())
	int flags;

	// Resource Limits
	// Nice level that is applied to the child process (0 = leave unchanged)
	int nice;

	// CGroup
	// Optional cgroup into which the child process is cloned (NULL if unset)
	struct pakfire_cgroup* cgroup;

	// Environment
	// Array of "KEY=VALUE" strings which is walked until the first NULL entry
	char* env[ENVIRON_SIZE];

	// Logging
	// Callback (and its private data) that receives stdout/stderr of the child
	pakfire_jail_log_callback log_callback;
	void* log_data;
};
85
/*
	A fixed-size buffer which collects output read from a pipe until
	a complete line is available.
*/
struct pakfire_log_buffer {
	// Raw bytes read from the file descriptor (not NUL-terminated)
	char data[BUFFER_SIZE];
	// Number of bytes in data that are currently in use
	size_t used;
};
90
/*
	Per-execution context which holds everything that belongs to
	one command that is being run in the jail.
*/
struct pakfire_jail_exec {
	// PID (of the child)
	pid_t pid;
	// PID file descriptor of the child (filled in by clone3() through CLONE_PIDFD)
	int pidfd;

	// Process status (from waitid)
	siginfo_t status;

	// FD to notify the client that the parent has finished initialization
	int completed_fd;

	// Log pipes
	// Index 0 is the read end and index 1 is the write end of each pipe.
	// A value of zero is treated as "not set up" throughout this file.
	struct pakfire_jail_pipes {
		// Standard output and standard error of the command
		int stdout[2];
		int stderr[2];

		// Logging
		// One pipe per log priority for messages logged by the child itself
		int log_INFO[2];
		int log_ERROR[2];
		int log_DEBUG[2];
	} pipes;

	// Log buffers
	// One buffer per pipe to assemble complete lines
	struct pakfire_jail_buffers {
		struct pakfire_log_buffer stdout;
		struct pakfire_log_buffer stderr;

		// Logging
		struct pakfire_log_buffer log_INFO;
		struct pakfire_log_buffer log_ERROR;
		struct pakfire_log_buffer log_DEBUG;
	} buffers;
};
124
/*
	Thin wrapper around the clone3() system call.

	Returns whatever the system call returned.
*/
static int clone3(struct clone_args* args, size_t size) {
	const long result = syscall(__NR_clone3, args, size);

	return result;
}
128
129 static void pakfire_jail_free(struct pakfire_jail* jail) {
130 DEBUG(jail->pakfire, "Freeing jail at %p\n", jail);
131
132 // Free environment
133 for (unsigned int i = 0; jail->env[i]; i++)
134 free(jail->env[i]);
135
136 pakfire_unref(jail->pakfire);
137 free(jail);
138 }
139
/*
	The default log callback.

	It forwards the line to the pakfire logger using the macro that matches
	the priority. Lines with any other priority are silently dropped.
	Always returns zero.
*/
static int pakfire_jail_default_log_callback(struct pakfire* pakfire, void* data,
		int priority, const char* line, size_t length) {
	if (priority == LOG_INFO) {
		INFO(pakfire, "%s", line);

	} else if (priority == LOG_ERR) {
		ERROR(pakfire, "%s", line);

#ifdef ENABLE_DEBUG
	} else if (priority == LOG_DEBUG) {
		DEBUG(pakfire, "%s", line);
#endif
	}

	return 0;
}
163
/*
	Prepares the environment for interactive sessions.

	Sets a prompt and copies TERM and LANG from the environment of the
	calling process if they are set there.

	Returns zero on success or the error returned by pakfire_jail_set_env().
*/
static int pakfire_jail_setup_interactive_env(struct pakfire_jail* jail) {
	// Variables that are copied from the calling process (in this order)
	static const char* const inherited[] = { "TERM", "LANG", NULL };

	// Set PS1
	int r = pakfire_jail_set_env(jail, "PS1", "pakfire-jail \\w> ");
	if (r)
		return r;

	for (const char* const* name = inherited; *name; name++) {
		const char* value = secure_getenv(*name);

		// Skip anything that is not set
		if (!value)
			continue;

		r = pakfire_jail_set_env(jail, *name, value);
		if (r)
			return r;
	}

	return 0;
}
188
189 PAKFIRE_EXPORT int pakfire_jail_create(struct pakfire_jail** jail,
190 struct pakfire* pakfire, int flags) {
191 int r;
192
193 // Allocate a new jail
194 struct pakfire_jail* j = calloc(1, sizeof(*j));
195 if (!j)
196 return 1;
197
198 // Reference Pakfire
199 j->pakfire = pakfire_ref(pakfire);
200
201 // Initialize reference counter
202 j->nrefs = 1;
203
204 // Store flags
205 j->flags = flags;
206
207 DEBUG(j->pakfire, "Allocated new jail at %p\n", j);
208
209 // Set default log callback
210 r = pakfire_jail_set_log_callback(j, pakfire_jail_default_log_callback, NULL);
211 if (r)
212 goto ERROR;
213
214 // Set default environment
215 for (const struct environ* e = ENV; e->key; e++) {
216 r = pakfire_jail_set_env(j, e->key, e->val);
217 if (r)
218 goto ERROR;
219 }
220
221 // Setup interactive stuff
222 if (j->flags & PAKFIRE_JAIL_INTERACTIVE) {
223 r = pakfire_jail_setup_interactive_env(j);
224 if (r)
225 goto ERROR;
226 }
227
228 // Done
229 *jail = j;
230 return 0;
231
232 ERROR:
233 pakfire_jail_free(j);
234
235 return r;
236 }
237
238 PAKFIRE_EXPORT struct pakfire_jail* pakfire_jail_ref(struct pakfire_jail* jail) {
239 ++jail->nrefs;
240
241 return jail;
242 }
243
244 PAKFIRE_EXPORT struct pakfire_jail* pakfire_jail_unref(struct pakfire_jail* jail) {
245 if (--jail->nrefs > 0)
246 return jail;
247
248 pakfire_jail_free(jail);
249 return NULL;
250 }
251
252 static int pakfire_jail_has_flag(struct pakfire_jail* jail, int flag) {
253 return jail->flags & flag;
254 }
255
256 // Resource Limits
257
258 PAKFIRE_EXPORT int pakfire_jail_nice(struct pakfire_jail* jail, int nice) {
259 // Check if nice level is in range
260 if (nice < -19 || nice > 20) {
261 errno = EINVAL;
262 return 1;
263 }
264
265 // Store nice level
266 jail->nice = nice;
267
268 return 0;
269 }
270
271 int pakfire_jail_set_cgroup(struct pakfire_jail* jail, struct pakfire_cgroup* cgroup) {
272 // Free any previous cgroup
273 if (jail->cgroup) {
274 pakfire_cgroup_unref(jail->cgroup);
275 jail->cgroup = NULL;
276 }
277
278 // Set any new cgroup
279 if (cgroup) {
280 DEBUG(jail->pakfire, "Setting cgroup %p\n", cgroup);
281
282 jail->cgroup = pakfire_cgroup_ref(cgroup);
283 }
284
285 // Done
286 return 0;
287 }
288
289 // Environment
290
291 // Returns the length of the environment
292 static unsigned int pakfire_jail_env_length(struct pakfire_jail* jail) {
293 unsigned int i = 0;
294
295 // Count everything in the environment
296 for (char** e = jail->env; *e; e++)
297 i++;
298
299 return i;
300 }
301
302 // Finds an existing environment variable and returns its index or -1 if not found
303 static int pakfire_jail_find_env(struct pakfire_jail* jail, const char* key) {
304 if (!key) {
305 errno = EINVAL;
306 return -1;
307 }
308
309 char buffer[strlen(key) + 2];
310 pakfire_string_format(buffer, "%s=", key);
311
312 for (unsigned int i = 0; jail->env[i]; i++) {
313 if (pakfire_string_startswith(jail->env[i], buffer))
314 return i;
315 }
316
317 // Nothing found
318 return -1;
319 }
320
321 // Returns the value of an environment variable or NULL
322 PAKFIRE_EXPORT const char* pakfire_jail_get_env(struct pakfire_jail* jail,
323 const char* key) {
324 int i = pakfire_jail_find_env(jail, key);
325 if (i < 0)
326 return NULL;
327
328 return jail->env[i] + strlen(key) + 1;
329 }
330
331 // Sets an environment variable
332 PAKFIRE_EXPORT int pakfire_jail_set_env(struct pakfire_jail* jail,
333 const char* key, const char* value) {
334 // Find the index where to write this value to
335 int i = pakfire_jail_find_env(jail, key);
336 if (i < 0)
337 i = pakfire_jail_env_length(jail);
338
339 // Return -ENOSPC when the environment is full
340 if (i >= ENVIRON_SIZE) {
341 errno = ENOSPC;
342 return -1;
343 }
344
345 // Free any previous value
346 if (jail->env[i])
347 free(jail->env[i]);
348
349 // Format and set environment variable
350 asprintf(&jail->env[i], "%s=%s", key, value);
351
352 DEBUG(jail->pakfire, "Set environment variable: %s\n", jail->env[i]);
353
354 return 0;
355 }
356
357 // Imports an environment
358 PAKFIRE_EXPORT int pakfire_jail_import_env(struct pakfire_jail* jail, const char* env[]) {
359 if (!env)
360 return 0;
361
362 char* key;
363 char* val;
364 int r;
365
366 // Copy environment variables
367 for (unsigned int i = 0; env[i]; i++) {
368 r = pakfire_string_partition(env[i], "=", &key, &val);
369 if (r)
370 continue;
371
372 // Set value
373 r = pakfire_jail_set_env(jail, key, val);
374
375 if (key)
376 free(key);
377 if (val)
378 free(val);
379
380 // Break on error
381 if (r)
382 return r;
383 }
384
385 return 0;
386 }
387
388 // Logging
389
390 PAKFIRE_EXPORT int pakfire_jail_set_log_callback(struct pakfire_jail* jail,
391 pakfire_jail_log_callback callback, void* data) {
392 jail->log_callback = callback;
393 jail->log_data = data;
394
395 return 0;
396 }
397
398 /*
399 This function replaces any logging in the child process.
400
401 All log messages will be sent to the parent process through their respective pipes.
402 */
403 static void pakfire_jail_log(void* data, int priority, const char* file,
404 int line, const char* fn, const char* format, va_list args) {
405 struct pakfire_jail_pipes* pipes = (struct pakfire_jail_pipes*)data;
406 int fd;
407
408 switch (priority) {
409 case LOG_INFO:
410 fd = pipes->log_INFO[1];
411 break;
412
413 case LOG_ERR:
414 fd = pipes->log_ERROR[1];
415 break;
416
417 #ifdef ENABLE_DEBUG
418 case LOG_DEBUG:
419 fd = pipes->log_DEBUG[1];
420 break;
421 #endif /* ENABLE_DEBUG */
422
423 // Ignore any messages of an unknown priority
424 default:
425 return;
426 }
427
428 // Send the log message
429 if (fd)
430 vdprintf(fd, format, args);
431 }
432
433 static int pakfire_jail_log_buffer_is_full(const struct pakfire_log_buffer* buffer) {
434 return (sizeof(buffer->data) == buffer->used);
435 }
436
437 /*
438 This function reads as much data as it can from the file descriptor.
439 If it finds a whole line in it, it will send it to the logger and repeat the process.
440 If not newline character is found, it will try to read more data until it finds one.
441 */
442 static int pakfire_jail_handle_log(struct pakfire_jail* jail,
443 struct pakfire_jail_exec* ctx, int priority, int fd,
444 struct pakfire_log_buffer* buffer, pakfire_jail_log_callback callback, void* data) {
445 char line[BUFFER_SIZE + 1];
446
447 // Fill up buffer from fd
448 if (buffer->used < sizeof(buffer->data)) {
449 ssize_t bytes_read = read(fd, buffer->data + buffer->used,
450 sizeof(buffer->data) - buffer->used);
451
452 // Handle errors
453 if (bytes_read < 0) {
454 ERROR(jail->pakfire, "Could not read from fd %d: %m\n", fd);
455 return -1;
456 }
457
458 // Update buffer size
459 buffer->used += bytes_read;
460 }
461
462 // See if we have any lines that we can write
463 while (buffer->used) {
464 // Search for the end of the first line
465 char* eol = memchr(buffer->data, '\n', buffer->used);
466
467 // No newline found
468 if (!eol) {
469 // If the buffer is full, we send the content to the logger and try again
470 // This should not happen in practise
471 if (pakfire_jail_log_buffer_is_full(buffer)) {
472 DEBUG(jail->pakfire, "Logging buffer is full. Sending all content\n");
473
474 eol = buffer->data + sizeof(buffer->data) - 1;
475
476 // Otherwise we might have only read parts of the output
477 } else
478 break;
479 }
480
481 // Find the length of the string
482 size_t length = eol - buffer->data + 1;
483
484 // Copy the line into the buffer
485 memcpy(line, buffer->data, length);
486
487 // Terminate the string
488 line[length] = '\0';
489
490 // Log the line
491 if (callback) {
492 int r = callback(jail->pakfire, data, priority, line, length);
493 if (r) {
494 ERROR(jail->pakfire, "The logging callback returned an error: %d\n", r);
495 return r;
496 }
497 }
498
499 // Remove line from buffer
500 memmove(buffer->data, buffer->data + length, buffer->used - length);
501 buffer->used -= length;
502 }
503
504 return 0;
505 }
506
507 static int pakfire_jail_setup_pipe(struct pakfire_jail* jail, int (*fds)[2], const int flags) {
508 int r = pipe2(*fds, flags);
509 if (r < 0) {
510 ERROR(jail->pakfire, "Could not setup pipe: %m\n");
511 return 1;
512 }
513
514 return 0;
515 }
516
/*
	Closes both ends of a pipe. Ends that are zero are skipped.
*/
static void pakfire_jail_close_pipe(struct pakfire_jail* jail, int fds[2]) {
	// Read end
	if (fds[0])
		close(fds[0]);

	// Write end
	if (fds[1])
		close(fds[1]);
}
522
/*
	Closes the write end of the pipe (if it is set) and marks it as unset.

	Returns the file descriptor of the read end.
*/
static int pakfire_jail_get_pipe(struct pakfire_jail* jail, int (*fds)[2]) {
	int* const ends = *fds;

	// The write end is at index one
	if (ends[1]) {
		close(ends[1]);
		ends[1] = 0;
	}

	// The read end is at index zero
	return ends[0];
}
541
542 static int pakfire_jail_wait(struct pakfire_jail* jail, struct pakfire_jail_exec* ctx) {
543 int epollfd = -1;
544 struct epoll_event ev;
545 struct epoll_event events[EPOLL_MAX_EVENTS];
546 int r = 0;
547
548 // Fetch file descriptors from context
549 const int stdout = pakfire_jail_get_pipe(jail, &ctx->pipes.stdout);
550 const int stderr = pakfire_jail_get_pipe(jail, &ctx->pipes.stderr);
551 const int pidfd = ctx->pidfd;
552
553 // Logging
554 const int log_INFO = pakfire_jail_get_pipe(jail, &ctx->pipes.log_INFO);
555 const int log_ERROR = pakfire_jail_get_pipe(jail, &ctx->pipes.log_ERROR);
556 const int log_DEBUG = pakfire_jail_get_pipe(jail, &ctx->pipes.log_DEBUG);
557
558 // Make a list of all file descriptors we are interested in
559 int fds[] = {
560 stdout, stderr, pidfd, log_INFO, log_ERROR, log_DEBUG,
561 };
562
563 // Setup epoll
564 epollfd = epoll_create1(0);
565 if (epollfd < 0) {
566 ERROR(jail->pakfire, "Could not initialize epoll(): %m\n");
567 r = 1;
568 goto ERROR;
569 }
570
571 ev.events = EPOLLIN;
572
573 // Turn file descriptors into non-blocking mode and add them to epoll()
574 for (unsigned int i = 0; i < sizeof(fds) / sizeof(*fds); i++) {
575 int fd = fds[i];
576
577 // Skip fds which were not initialized
578 if (fd <= 0)
579 continue;
580
581 ev.data.fd = fd;
582
583 if (epoll_ctl(epollfd, EPOLL_CTL_ADD, fd, &ev) < 0) {
584 ERROR(jail->pakfire, "Could not add file descriptor %d to epoll(): %m\n", fd);
585 r = 1;
586 goto ERROR;
587 }
588 }
589
590 int ended = 0;
591
592 // Loop for as long as the process is alive
593 while (!ended) {
594 int num = epoll_wait(epollfd, events, EPOLL_MAX_EVENTS, -1);
595 if (num < 1) {
596 // Ignore if epoll_wait() has been interrupted
597 if (errno == EINTR)
598 continue;
599
600 ERROR(jail->pakfire, "epoll_wait() failed: %m\n");
601 r = 1;
602
603 goto ERROR;
604 }
605
606 for (int i = 0; i < num; i++) {
607 int fd = events[i].data.fd;
608
609 struct pakfire_log_buffer* buffer = NULL;
610 pakfire_jail_log_callback callback = NULL;
611 void* data = NULL;
612 int priority;
613
614 // Handle any changes to the PIDFD
615 if (fd == pidfd) {
616 // Call waidid() and store the result
617 r = waitid(P_PIDFD, ctx->pidfd, &ctx->status, WEXITED);
618 if (r) {
619 ERROR(jail->pakfire, "waitid() failed: %m\n");
620 goto ERROR;
621 }
622
623 // Mark that we have ended so that we will process the remaining
624 // events from epoll() now, but won't restart the outer loop.
625 ended = 1;
626 continue;
627
628 // Handle logging messages
629 } else if (fd == log_INFO) {
630 buffer = &ctx->buffers.log_INFO;
631 priority = LOG_INFO;
632
633 callback = pakfire_jail_default_log_callback;
634
635 } else if (fd == log_ERROR) {
636 buffer = &ctx->buffers.log_ERROR;
637 priority = LOG_ERR;
638
639 callback = pakfire_jail_default_log_callback;
640
641 } else if (fd == log_DEBUG) {
642 buffer = &ctx->buffers.log_DEBUG;
643 priority = LOG_DEBUG;
644
645 callback = pakfire_jail_default_log_callback;
646
647 // Handle anything from the log pipes
648 } else if (fd == stdout) {
649 buffer = &ctx->buffers.stdout;
650 priority = LOG_INFO;
651
652 callback = jail->log_callback;
653 data = jail->log_data;
654
655 } else if (fd == stderr) {
656 buffer = &ctx->buffers.stderr;
657 priority = LOG_ERR;
658
659 callback = jail->log_callback;
660 data = jail->log_data;
661
662 } else {
663 DEBUG(jail->pakfire, "Received invalid file descriptor %d\n", fd);
664 continue;
665 }
666
667 // Handle log event
668 r = pakfire_jail_handle_log(jail, ctx, priority, fd, buffer, callback, data);
669 if (r)
670 goto ERROR;
671 }
672 }
673
674 ERROR:
675 if (epollfd > 0)
676 close(epollfd);
677
678 return r;
679 }
680
/*
	Log callback which collects everything that is sent with priority
	LOG_INFO in an array of strings.

	data has to point to a char** which is either NULL or a NULL-terminated
	array. Every line is copied and appended to the array, which is resized
	as needed. Lines of any other priority are passed on to the default
	log callback.

	Returns zero on success and 1 if memory could not be allocated. In that
	case, the array is left as it was.
*/
static int pakfire_jail_capture_stdout(struct pakfire* pakfire, void* data, int priority,
		const char* line, size_t length) {
	char*** array = (char***)data;

	// Send everything else to the default logger
	if (priority != LOG_INFO)
		return pakfire_jail_default_log_callback(pakfire, NULL, priority, line, length);

	// Create a copy of line
	char* message = strdup(line);
	if (!message)
		return 1;

	// Determine the length of the existing array
	size_t count = 0;

	if (*array) {
		for (char** element = *array; *element; element++)
			count++;
	}

	// Allocate space for the new element and the terminating NULL.
	// The result is stored in a temporary variable so that the existing
	// array is not lost if the allocation fails.
	char** grown = reallocarray(*array, count + 2, sizeof(**array));
	if (!grown) {
		free(message);
		return 1;
	}

	// Append message and terminate the array
	grown[count] = message;
	grown[count + 1] = NULL;

	*array = grown;

	return 0;
}
715
716 // Capabilities
717
718 static int pakfire_jail_drop_capabilities(struct pakfire_jail* jail) {
719 const int capabilities[] = {
720 // Deny access to the kernel's audit system
721 CAP_AUDIT_CONTROL,
722 CAP_AUDIT_READ,
723 CAP_AUDIT_WRITE,
724
725 // Deny suspending block devices
726 CAP_BLOCK_SUSPEND,
727
728 // Deny any stuff with BPF
729 CAP_BPF,
730
731 // Deny checkpoint restore
732 CAP_CHECKPOINT_RESTORE,
733
734 // Deny opening files by inode number (open_by_handle_at)
735 CAP_DAC_READ_SEARCH,
736
737 // Deny setting SUID bits
738 CAP_FSETID,
739
740 // Deny locking more memory
741 CAP_IPC_LOCK,
742
743 // Deny modifying any Apparmor/SELinux/SMACK configuration
744 CAP_MAC_ADMIN,
745 CAP_MAC_OVERRIDE,
746
747 // Deny creating any special devices
748 CAP_MKNOD,
749
750 // Deny setting any capabilities
751 CAP_SETFCAP,
752
753 // Deny reading from syslog
754 CAP_SYSLOG,
755
756 // Deny any admin actions (mount, sethostname, ...)
757 CAP_SYS_ADMIN,
758
759 // Deny rebooting the system
760 CAP_SYS_BOOT,
761
762 // Deny loading kernel modules
763 CAP_SYS_MODULE,
764
765 // Deny setting nice level
766 CAP_SYS_NICE,
767
768 // Deny access to /proc/kcore, /dev/mem, /dev/kmem
769 CAP_SYS_RAWIO,
770
771 // Deny circumventing any resource limits
772 CAP_SYS_RESOURCE,
773
774 // Deny setting the system time
775 CAP_SYS_TIME,
776
777 // Deny playing with suspend
778 CAP_WAKE_ALARM,
779
780 0,
781 };
782
783 DEBUG(jail->pakfire, "Dropping capabilities...\n");
784
785 size_t num_caps = 0;
786 int r;
787
788 // Drop any capabilities
789 for (const int* cap = capabilities; *cap; cap++) {
790 r = prctl(PR_CAPBSET_DROP, *cap, 0, 0, 0);
791 if (r) {
792 ERROR(jail->pakfire, "Could not drop capability %d: %m\n", *cap);
793 return r;
794 }
795
796 num_caps++;
797 }
798
799 // Fetch any capabilities
800 cap_t caps = cap_get_proc();
801 if (!caps) {
802 ERROR(jail->pakfire, "Could not read capabilities: %m\n");
803 return 1;
804 }
805
806 /*
807 Set inheritable capabilities
808
809 This ensures that no processes will be able to gain any of the listed
810 capabilities again.
811 */
812 r = cap_set_flag(caps, CAP_INHERITABLE, num_caps, capabilities, CAP_CLEAR);
813 if (r) {
814 ERROR(jail->pakfire, "cap_set_flag() failed: %m\n");
815 goto ERROR;
816 }
817
818 // Restore capabilities
819 r = cap_set_proc(caps);
820 if (r) {
821 ERROR(jail->pakfire, "Could not restore capabilities: %m\n");
822 goto ERROR;
823 }
824
825 ERROR:
826 if (caps)
827 cap_free(caps);
828
829 return r;
830 }
831
832 // Syscall Filter
833
834 static int pakfire_jail_limit_syscalls(struct pakfire_jail* jail) {
835 const int syscalls[] = {
836 // The kernel's keyring isn't namespaced
837 SCMP_SYS(keyctl),
838 SCMP_SYS(add_key),
839 SCMP_SYS(request_key),
840
841 // Disable userfaultfd
842 SCMP_SYS(userfaultfd),
843
844 // Disable perf which could leak a lot of information about the host
845 SCMP_SYS(perf_event_open),
846
847 0,
848 };
849 int r = 1;
850
851 DEBUG(jail->pakfire, "Applying syscall filter...\n");
852
853 // Setup a syscall filter which allows everything by default
854 scmp_filter_ctx ctx = seccomp_init(SCMP_ACT_ALLOW);
855 if (!ctx) {
856 ERROR(jail->pakfire, "Could not setup seccomp filter: %m\n");
857 goto ERROR;
858 }
859
860 // All all syscalls
861 for (const int* syscall = syscalls; *syscall; syscall++) {
862 r = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), *syscall, 0);
863 if (r) {
864 ERROR(jail->pakfire, "Could not configure syscall %d: %m\n", *syscall);
865 goto ERROR;
866 }
867 }
868
869 // Load syscall filter into the kernel
870 r = seccomp_load(ctx);
871 if (r) {
872 ERROR(jail->pakfire, "Could not load syscall filter into the kernel: %m\n");
873 goto ERROR;
874 }
875
876 ERROR:
877 if (ctx)
878 seccomp_release(ctx);
879
880 return r;
881 }
882
883 // UID/GID Mapping
884
885 static int pakfire_jail_write_uidgid_mapping(struct pakfire_jail* jail,
886 const char* path, uid_t mapped_id, size_t length) {
887 int r = 1;
888
889 // Open file for writing
890 FILE* f = fopen(path, "w");
891 if (!f) {
892 ERROR(jail->pakfire, "Could not open %s for writing: %m\n", path);
893 goto ERROR;
894 }
895
896 // Write configuration
897 int bytes_written = fprintf(f, "%d %d %ld\n", 0, mapped_id, length);
898 if (bytes_written <= 0) {
899 ERROR(jail->pakfire, "Could not write UID/GID mapping: %m\n");
900 goto ERROR;
901 }
902
903 // Close the file
904 r = fclose(f);
905 f = NULL;
906 if (r) {
907 ERROR(jail->pakfire, "Could not write UID/GID mapping: %m\n");
908
909 goto ERROR;
910 }
911
912 // Success
913 r = 0;
914
915 ERROR:
916 if (f)
917 fclose(f);
918
919 return r;
920 }
921
922 static int pakfire_jail_setup_uid_mapping(struct pakfire_jail* jail, pid_t pid) {
923 char path[PATH_MAX];
924 int r;
925
926 uid_t mapped_uid = 0;
927 const size_t length = 1;
928
929 // Fetch the UID of the calling process
930 uid_t uid = getuid();
931
932 // Have we been called by root?
933 if (uid == 0) {
934 mapped_uid = 0;
935
936 // Have we been called by an unprivileged user?
937 } else {
938 // XXX fetch SUBUID
939 mapped_uid = uid;
940 }
941
942 // Make path
943 r = pakfire_string_format(path, "/proc/%d/uid_map", pid);
944 if (r < 0)
945 return 1;
946
947 DEBUG(jail->pakfire, "Mapping UID range (%u - %lu)\n", mapped_uid, mapped_uid + length);
948
949 return pakfire_jail_write_uidgid_mapping(jail, path, mapped_uid, length);
950 }
951
952 static int pakfire_jail_setup_gid_mapping(struct pakfire_jail* jail, pid_t pid) {
953 char path[PATH_MAX];
954 int r;
955
956 gid_t mapped_gid = 0;
957 const size_t length = 1;
958
959 // Fetch the GID of the calling process
960 gid_t gid = getgid();
961
962 // Have we been called from the root group?
963 if (gid == 0) {
964 mapped_gid = 0;
965
966 // Have we been called by an unprivileged group?
967 } else {
968 // XXX fetch SUBGID
969 mapped_gid = gid;
970 }
971
972 // Make path
973 r = pakfire_string_format(path, "/proc/%d/gid_map", pid);
974 if (r < 0)
975 return 1;
976
977 DEBUG(jail->pakfire, "Mapping GID range (%u - %lu)\n", mapped_gid, mapped_gid + length);
978
979 return pakfire_jail_write_uidgid_mapping(jail, path, mapped_gid, length);
980 }
981
982 static int pakfire_jail_setgroups(struct pakfire_jail* jail, pid_t pid) {
983 char path[PATH_MAX];
984 int r = 1;
985
986 // Make path
987 r = pakfire_string_format(path, "/proc/%d/setgroups", pid);
988 if (r < 0)
989 return 1;
990
991 // Open file for writing
992 FILE* f = fopen(path, "w");
993 if (!f) {
994 ERROR(jail->pakfire, "Could not open %s for writing: %m\n", path);
995 goto ERROR;
996 }
997
998 // Write content
999 int bytes_written = fprintf(f, "deny\n");
1000 if (bytes_written <= 0) {
1001 ERROR(jail->pakfire, "Could not write to %s: %m\n", path);
1002 goto ERROR;
1003 }
1004
1005 r = fclose(f);
1006 f = NULL;
1007 if (r) {
1008 ERROR(jail->pakfire, "Could not close %s: %m\n", path);
1009 goto ERROR;
1010 }
1011
1012 ERROR:
1013 if (f)
1014 fclose(f);
1015
1016 return r;
1017 }
1018
1019 static int pakfire_jail_send_signal(struct pakfire_jail* jail, int fd) {
1020 const uint64_t val = 1;
1021 int r = 0;
1022
1023 DEBUG(jail->pakfire, "Sending signal...\n");
1024
1025 // Write to the file descriptor
1026 ssize_t bytes_written = write(fd, &val, sizeof(val));
1027 if (bytes_written < 0 || (size_t)bytes_written < sizeof(val)) {
1028 ERROR(jail->pakfire, "Could not send signal: %m\n");
1029 r = 1;
1030 }
1031
1032 // Close the file descriptor
1033 close(fd);
1034
1035 return r;
1036 }
1037
1038 static int pakfire_jail_wait_for_signal(struct pakfire_jail* jail, int fd) {
1039 uint64_t val = 0;
1040 int r = 0;
1041
1042 DEBUG(jail->pakfire, "Waiting for signal...\n");
1043
1044 ssize_t bytes_read = read(fd, &val, sizeof(val));
1045 if (bytes_read < 0 || (size_t)bytes_read < sizeof(val)) {
1046 ERROR(jail->pakfire, "Error waiting for signal: %m\n");
1047 r = 1;
1048 }
1049
1050 // Close the file descriptor
1051 close(fd);
1052
1053 return r;
1054 }
1055
1056 /*
1057 Performs the initialisation that needs to happen in the parent part
1058 */
1059 static int pakfire_jail_parent(struct pakfire_jail* jail, struct pakfire_jail_exec* ctx) {
1060 int r;
1061
1062 // Setup UID mapping
1063 r = pakfire_jail_setup_uid_mapping(jail, ctx->pid);
1064 if (r)
1065 return r;
1066
1067 // Write "deny" to /proc/PID/setgroups
1068 r = pakfire_jail_setgroups(jail, ctx->pid);
1069 if (r)
1070 return r;
1071
1072 // Setup GID mapping
1073 r = pakfire_jail_setup_gid_mapping(jail, ctx->pid);
1074 if (r)
1075 return r;
1076
1077 // Parent has finished initialisation
1078 DEBUG(jail->pakfire, "Parent has finished initialization\n");
1079
1080 // Send signal to client
1081 r = pakfire_jail_send_signal(jail, ctx->completed_fd);
1082 if (r)
1083 return r;
1084
1085 return 0;
1086 }
1087
1088 static int pakfire_jail_child(struct pakfire_jail* jail, struct pakfire_jail_exec* ctx,
1089 const char* argv[]) {
1090 int r;
1091
1092 // Redirect any logging to our log pipe
1093 pakfire_set_log_callback(jail->pakfire, pakfire_jail_log, &ctx->pipes);
1094
1095 // Fetch my own PID
1096 pid_t pid = getpid();
1097
1098 DEBUG(jail->pakfire, "Launched child process in jail with PID %d\n", pid);
1099
1100 // Wait for the parent to finish initialization
1101 r = pakfire_jail_wait_for_signal(jail, ctx->completed_fd);
1102 if (r)
1103 return r;
1104
1105 // Perform further initialization
1106
1107 // Fetch UID/GID
1108 uid_t uid = getuid();
1109 gid_t gid = getgid();
1110
1111 // Fetch EUID/EGID
1112 uid_t euid = geteuid();
1113 gid_t egid = getegid();
1114
1115 DEBUG(jail->pakfire, " UID: %d (effective %d)\n", uid, euid);
1116 DEBUG(jail->pakfire, " GID: %d (effective %d)\n", gid, egid);
1117
1118 // Check if we are (effectively running as root)
1119 if (uid != 0 || gid != 0) {
1120 ERROR(jail->pakfire, "Child process is not running as root\n");
1121 return 126;
1122 }
1123
1124 const char* root = pakfire_get_path(jail->pakfire);
1125 const char* arch = pakfire_get_arch(jail->pakfire);
1126
1127 // Change root (unless root is /)
1128 if (!pakfire_on_root(jail->pakfire)) {
1129 // Mount everything
1130 r = pakfire_mount_all(jail->pakfire);
1131 if (r)
1132 return r;
1133
1134 // Log all mountpoints
1135 pakfire_mount_list(jail->pakfire);
1136
1137 // Call chroot()
1138 r = chroot(root);
1139 if (r) {
1140 ERROR(jail->pakfire, "chroot() to %s failed: %m\n", root);
1141 return 1;
1142 }
1143
1144 // Change directory to /
1145 r = chdir("/");
1146 if (r) {
1147 ERROR(jail->pakfire, "chdir() after chroot() failed: %m\n");
1148 return 1;
1149 }
1150 }
1151
1152 // Set personality
1153 unsigned long persona = pakfire_arch_personality(arch);
1154 if (persona) {
1155 r = personality(persona);
1156 if (r < 0) {
1157 ERROR(jail->pakfire, "Could not set personality (%x)\n", (unsigned int)persona);
1158 return 1;
1159 }
1160 }
1161
1162 // Set nice level
1163 if (jail->nice) {
1164 DEBUG(jail->pakfire, "Setting nice level to %d\n", jail->nice);
1165
1166 r = setpriority(PRIO_PROCESS, pid, jail->nice);
1167 if (r) {
1168 ERROR(jail->pakfire, "Could not set nice level: %m\n");
1169 return 1;
1170 }
1171 }
1172
1173 // Close other end of log pipes
1174 close(ctx->pipes.log_INFO[0]);
1175 close(ctx->pipes.log_ERROR[0]);
1176 #ifdef ENABLE_DEBUG
1177 close(ctx->pipes.log_DEBUG[0]);
1178 #endif /* ENABLE_DEBUG */
1179
1180 // Connect standard output and error
1181 if (ctx->pipes.stdout[1] && ctx->pipes.stderr[1]) {
1182 r = dup2(ctx->pipes.stdout[1], STDOUT_FILENO);
1183 if (r < 0) {
1184 ERROR(jail->pakfire, "Could not connect fd %d to stdout: %m\n",
1185 ctx->pipes.stdout[1]);
1186
1187 return 1;
1188 }
1189
1190 r = dup2(ctx->pipes.stderr[1], STDERR_FILENO);
1191 if (r < 0) {
1192 ERROR(jail->pakfire, "Could not connect fd %d to stderr: %m\n",
1193 ctx->pipes.stderr[1]);
1194
1195 return 1;
1196 }
1197
1198 // Close the pipe (as we have moved the original file descriptors)
1199 pakfire_jail_close_pipe(jail, ctx->pipes.stdout);
1200 pakfire_jail_close_pipe(jail, ctx->pipes.stderr);
1201 }
1202
1203 // Reset open file limit (http://0pointer.net/blog/file-descriptor-limits.html)
1204 r = pakfire_rlimit_reset_nofile(jail->pakfire);
1205 if (r)
1206 return r;
1207
1208 // Drop capabilities
1209 r = pakfire_jail_drop_capabilities(jail);
1210 if (r)
1211 return r;
1212
1213 // Filter syscalls
1214 r = pakfire_jail_limit_syscalls(jail);
1215 if (r)
1216 return r;
1217
1218 // exec() command
1219 r = execvpe(argv[0], (char**)argv, jail->env);
1220 if (r < 0)
1221 ERROR(jail->pakfire, "Could not execve(): %m\n");
1222
1223 // Translate errno into regular exit code
1224 switch (errno) {
1225 case ENOENT:
1226 r = 127;
1227 break;
1228
1229 default:
1230 r = 1;
1231 }
1232
1233 // We should not get here
1234 return r;
1235 }
1236
1237 // Run a command in the jail
1238 static int __pakfire_jail_exec(struct pakfire_jail* jail, const char* argv[]) {
1239 int exit = -1;
1240 int r;
1241
1242 // Check if argv is valid
1243 if (!argv || !argv[0]) {
1244 errno = EINVAL;
1245 return -1;
1246 }
1247
1248 // Initialize context for this call
1249 struct pakfire_jail_exec ctx = {
1250 .pipes = {
1251 .stdout = { 0, 0 },
1252 .stderr = { 0, 0 },
1253 },
1254 };
1255
1256 DEBUG(jail->pakfire, "Executing jail...\n");
1257
1258 /*
1259 Setup a file descriptor which can be used to notify the client that the parent
1260 has completed configuration.
1261 */
1262 ctx.completed_fd = eventfd(0, EFD_CLOEXEC);
1263 if (ctx.completed_fd < 0) {
1264 ERROR(jail->pakfire, "eventfd() failed: %m\n");
1265 return -1;
1266 }
1267
1268 // Create pipes to communicate with child process if we are not running interactively
1269 if (!pakfire_jail_has_flag(jail, PAKFIRE_JAIL_INTERACTIVE)) {
1270 // stdout
1271 r = pakfire_jail_setup_pipe(jail, &ctx.pipes.stdout, 0);
1272 if (r)
1273 goto ERROR;
1274
1275 // stderr
1276 r = pakfire_jail_setup_pipe(jail, &ctx.pipes.stderr, 0);
1277 if (r)
1278 goto ERROR;
1279 }
1280
1281 // Setup pipes for logging
1282 // INFO
1283 r = pakfire_jail_setup_pipe(jail, &ctx.pipes.log_INFO, O_CLOEXEC);
1284 if (r)
1285 goto ERROR;
1286
1287 // ERROR
1288 r = pakfire_jail_setup_pipe(jail, &ctx.pipes.log_ERROR, O_CLOEXEC);
1289 if (r)
1290 goto ERROR;
1291
1292 #ifdef ENABLE_DEBUG
1293 // DEBUG
1294 r = pakfire_jail_setup_pipe(jail, &ctx.pipes.log_DEBUG, O_CLOEXEC);
1295 if (r)
1296 goto ERROR;
1297 #endif /* ENABLE_DEBUG */
1298
1299 // Configure child process
1300 struct clone_args args = {
1301 .flags =
1302 CLONE_NEWCGROUP |
1303 CLONE_NEWIPC |
1304 CLONE_NEWNS |
1305 CLONE_NEWPID |
1306 CLONE_NEWUSER |
1307 CLONE_NEWUTS |
1308 CLONE_PIDFD,
1309 .exit_signal = SIGCHLD,
1310 .pidfd = (long long unsigned int)&ctx.pidfd,
1311 };
1312
1313 // Launch the process in a cgroup (if requested)
1314 if (jail->cgroup) {
1315 args.flags |= CLONE_INTO_CGROUP;
1316
1317 // Clone into this cgroup
1318 args.cgroup = pakfire_cgroup_fd(jail->cgroup);
1319 }
1320
1321 // Fork this process
1322 ctx.pid = clone3(&args, sizeof(args));
1323 if (ctx.pid < 0) {
1324 ERROR(jail->pakfire, "Could not clone: %m\n");
1325 return -1;
1326
1327 // Child process
1328 } else if (ctx.pid == 0) {
1329 r = pakfire_jail_child(jail, &ctx, argv);
1330 _exit(r);
1331 }
1332
1333 // Parent process
1334 r = pakfire_jail_parent(jail, &ctx);
1335 if (r)
1336 goto ERROR;
1337
1338 DEBUG(jail->pakfire, "Waiting for PID %d to finish its work\n", ctx.pid);
1339
1340 // Read output of the child process
1341 r = pakfire_jail_wait(jail, &ctx);
1342 if (r)
1343 goto ERROR;
1344
1345 // Handle exit status
1346 switch (ctx.status.si_code) {
1347 case CLD_EXITED:
1348 DEBUG(jail->pakfire, "The child process exited with code %d\n",
1349 ctx.status.si_status);
1350
1351 // Pass exit code
1352 exit = ctx.status.si_status;
1353 break;
1354
1355 case CLD_KILLED:
1356 case CLD_DUMPED:
1357 ERROR(jail->pakfire, "The child process was killed\n");
1358 break;
1359
1360 // Log anything else
1361 default:
1362 ERROR(jail->pakfire, "Unknown child exit code: %d\n", ctx.status.si_code);
1363 break;
1364 }
1365
1366 ERROR:
1367 // Close any file descriptors
1368 pakfire_jail_close_pipe(jail, ctx.pipes.stdout);
1369 pakfire_jail_close_pipe(jail, ctx.pipes.stderr);
1370 if (ctx.pidfd)
1371 close(ctx.pidfd);
1372 pakfire_jail_close_pipe(jail, ctx.pipes.log_INFO);
1373 pakfire_jail_close_pipe(jail, ctx.pipes.log_ERROR);
1374 pakfire_jail_close_pipe(jail, ctx.pipes.log_DEBUG);
1375
1376 // Umount everything
1377 if (!pakfire_on_root(jail->pakfire))
1378 pakfire_umount_all(jail->pakfire);
1379
1380 return exit;
1381 }
1382
1383 PAKFIRE_EXPORT int pakfire_jail_exec(struct pakfire_jail* jail,
1384 const char* argv[], char*** output) {
1385 int r;
1386
1387 // Store logging callback
1388 pakfire_jail_log_callback log_callback = jail->log_callback;
1389 void* log_data = jail->log_data;
1390
1391 // Capture output if requested by user
1392 if (output)
1393 pakfire_jail_set_log_callback(jail, pakfire_jail_capture_stdout, output);
1394
1395 // Run exec()
1396 r = __pakfire_jail_exec(jail, argv);
1397
1398 // Restore log callback
1399 pakfire_jail_set_log_callback(jail, log_callback, log_data);
1400
1401 return r;
1402 }
1403
1404 PAKFIRE_EXPORT int pakfire_jail_exec_script(struct pakfire_jail* jail,
1405 const char* script, const size_t size, const char* args[], char*** output) {
1406 char path[PATH_MAX];
1407 const char** argv = NULL;
1408 int r;
1409
1410 const char* root = pakfire_get_path(jail->pakfire);
1411
1412 // Write the scriptlet to disk
1413 r = pakfire_path_join(path, root, "pakfire-script.XXXXXX");
1414 if (r < 0)
1415 goto ERROR;
1416
1417 // Open a temporary file
1418 int fd = mkstemp(path);
1419 if (fd < 0) {
1420 ERROR(jail->pakfire, "Could not open a temporary file: %m\n");
1421 r = 1;
1422 goto ERROR;
1423 }
1424
1425 DEBUG(jail->pakfire, "Writing script to %s:\n%.*s\n", path, (int)size, script);
1426
1427 // Write data
1428 ssize_t bytes_written = write(fd, script, size);
1429 if (bytes_written < (ssize_t)size) {
1430 ERROR(jail->pakfire, "Could not write script to file %s: %m\n", path);
1431 r = 1;
1432 goto ERROR;
1433 }
1434
1435 // Make the script executable
1436 r = fchmod(fd, S_IRUSR|S_IWUSR|S_IXUSR);
1437 if (r) {
1438 ERROR(jail->pakfire, "Could not set executable permissions on %s: %m\n", path);
1439 goto ERROR;
1440 }
1441
1442 // Close file
1443 r = close(fd);
1444 if (r) {
1445 ERROR(jail->pakfire, "Could not close script file %s: %m\n", path);
1446 r = 1;
1447 goto ERROR;
1448 }
1449
1450 // Count how many arguments were passed
1451 unsigned int argc = 1;
1452 if (args) {
1453 for (const char** arg = args; *arg; arg++)
1454 argc++;
1455 }
1456
1457 argv = calloc(argc + 1, sizeof(*argv));
1458 if (!argv) {
1459 ERROR(jail->pakfire, "Could not allocate argv: %m\n");
1460 goto ERROR;
1461 }
1462
1463 // Set command
1464 argv[0] = (root) ? pakfire_path_relpath(root, path) : path;
1465
1466 // Copy args
1467 for (unsigned int i = 1; i < argc; i++)
1468 argv[i] = args[i-1];
1469
1470 // Run the script
1471 r = pakfire_jail_exec(jail, argv, output);
1472
1473 ERROR:
1474 if (argv)
1475 free(argv);
1476
1477 // Remove script from disk
1478 if (*path)
1479 unlink(path);
1480
1481 return r;
1482 }
1483
/*
	Convenience wrapper: creates a jail with the given flags, executes argv in it
	and drops the reference to the jail again.

	Returns the non-zero result of pakfire_jail_create() if the jail could not be
	created, otherwise the result of pakfire_jail_exec().
*/
int pakfire_jail_run(struct pakfire* pakfire, const char* argv[], int flags, char*** output) {
	struct pakfire_jail* jail = NULL;

	// Only execute the command if the jail could be created
	int r = pakfire_jail_create(&jail, pakfire, flags);
	if (r == 0)
		r = pakfire_jail_exec(jail, argv, output);

	// Release the jail (if any)
	if (jail != NULL)
		pakfire_jail_unref(jail);

	return r;
}
1506
/*
	Convenience wrapper: creates a jail with the given flags, runs the script of
	the given length with argv as its arguments and drops the reference to the
	jail again.

	Returns the non-zero result of pakfire_jail_create() if the jail could not be
	created, otherwise the result of pakfire_jail_exec_script().
*/
int pakfire_jail_run_script(struct pakfire* pakfire,
		const char* script, const size_t length, const char* argv[], int flags, char*** output) {
	struct pakfire_jail* jail = NULL;

	// Only run the script if the jail could be created
	int r = pakfire_jail_create(&jail, pakfire, flags);
	if (r == 0)
		r = pakfire_jail_exec_script(jail, script, length, argv, output);

	// Release the jail (if any)
	if (jail != NULL)
		pakfire_jail_unref(jail);

	return r;
}
1526
1527
1528 int pakfire_jail_shell(struct pakfire* pakfire) {
1529 const char* argv[] = {
1530 "/bin/bash", "--login", NULL,
1531 };
1532
1533 // Execute /bin/bash
1534 return pakfire_jail_run(pakfire, argv, PAKFIRE_JAIL_INTERACTIVE, NULL);
1535 }
1536
1537 int pakfire_jail_ldconfig(struct pakfire* pakfire) {
1538 char path[PATH_MAX];
1539
1540 const char* ldconfig = "/sbin/ldconfig";
1541
1542 // Check if ldconfig exists before calling it to avoid overhead
1543 int r = pakfire_make_path(pakfire, path, ldconfig);
1544 if (r < 0)
1545 return 1;
1546
1547 // Check if ldconfig is executable
1548 r = access(path, X_OK);
1549 if (r) {
1550 DEBUG(pakfire, "%s is not executable. Skipping...\n", ldconfig);
1551 return 0;
1552 }
1553
1554 const char* argv[] = {
1555 ldconfig, NULL,
1556 };
1557
1558 // Run ldconfig
1559 return pakfire_jail_run(pakfire, argv, 0, NULL);
1560 }
1561
1562 // Utility functions
1563
1564 PAKFIRE_EXPORT char* pakfire_jail_concat_output(struct pakfire_jail* jail,
1565 const char** input, size_t* length) {
1566 // Return nothing on no input
1567 if (!input)
1568 return NULL;
1569
1570 // XXX Maybe there is a more efficient way to do this
1571
1572 char* output = pakfire_string_join((char**)input, "");
1573 if (!output)
1574 return NULL;
1575
1576 // Store the length of the result
1577 if (length)
1578 *length = strlen(output);
1579
1580 return output;
1581 }