]> git.ipfire.org Git - people/ms/pakfire.git/blob - src/libpakfire/jail.c
jail: Create a leaf cgroup
[people/ms/pakfire.git] / src / libpakfire / jail.c
1 /*#############################################################################
2 # #
3 # Pakfire - The IPFire package management system #
4 # Copyright (C) 2022 Pakfire development team #
5 # #
6 # This program is free software: you can redistribute it and/or modify #
7 # it under the terms of the GNU General Public License as published by #
8 # the Free Software Foundation, either version 3 of the License, or #
9 # (at your option) any later version. #
10 # #
11 # This program is distributed in the hope that it will be useful, #
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of #
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
14 # GNU General Public License for more details. #
15 # #
16 # You should have received a copy of the GNU General Public License #
17 # along with this program. If not, see <http://www.gnu.org/licenses/>. #
18 # #
19 #############################################################################*/
20
21 #include <errno.h>
22 #include <linux/capability.h>
23 #include <linux/fcntl.h>
24 #include <linux/sched.h>
25 #include <linux/wait.h>
26 #include <sched.h>
27 #include <signal.h>
28 #include <stdlib.h>
29 #include <syscall.h>
30 #include <sys/capability.h>
31 #include <sys/epoll.h>
32 #include <sys/eventfd.h>
33 #include <sys/personality.h>
34 #include <sys/prctl.h>
35 #include <sys/resource.h>
36 #include <sys/types.h>
37 #include <sys/wait.h>
38
39 // libseccomp
40 #include <seccomp.h>
41
42 #include <pakfire/arch.h>
43 #include <pakfire/cgroup.h>
44 #include <pakfire/jail.h>
45 #include <pakfire/logging.h>
46 #include <pakfire/mount.h>
47 #include <pakfire/pakfire.h>
48 #include <pakfire/private.h>
49 #include <pakfire/util.h>
50
// Size (in bytes) of each log buffer. The expression is parenthesised so
// that the macro expands safely inside larger expressions.
#define BUFFER_SIZE (1024 * 64)

// Number of slots in the environment array of a jail
#define ENVIRON_SIZE 128

// Maximum number of events that are fetched by one call of epoll_wait()
#define EPOLL_MAX_EVENTS 2
54
/*
	Default environment variables which are set in every newly created jail.

	The table is terminated by an entry whose key is NULL.
*/
static const struct environ {
	const char* key;
	const char* val;
} ENV[] = {
	{ .key = "LANG", .val = "en_US.utf-8" },
	{ .key = "TERM", .val = "vt100" },

	// Sentinel
	{ .key = NULL, .val = NULL },
};
64
65 struct pakfire_jail {
66 struct pakfire* pakfire;
67 int nrefs;
68
69 // Flags
70 int flags;
71
72 // Resource Limits
73 int nice;
74
75 // CGroup
76 struct pakfire_cgroup* cgroup;
77
78 // Environment
79 char* env[ENVIRON_SIZE];
80
81 // Logging
82 pakfire_jail_log_callback log_callback;
83 void* log_data;
84 };
85
86 struct pakfire_log_buffer {
87 char data[BUFFER_SIZE];
88 size_t used;
89 };
90
91 struct pakfire_jail_exec {
92 // PID (of the child)
93 pid_t pid;
94 int pidfd;
95
96 // Process status (from waitid)
97 siginfo_t status;
98
99 // FD to notify the client that the parent has finished initialization
100 int completed_fd;
101
102 // Log pipes
103 struct pakfire_jail_pipes {
104 int stdout[2];
105 int stderr[2];
106
107 // Logging
108 int log_INFO[2];
109 int log_ERROR[2];
110 int log_DEBUG[2];
111 } pipes;
112
113 // Log buffers
114 struct pakfire_jail_buffers {
115 struct pakfire_log_buffer stdout;
116 struct pakfire_log_buffer stderr;
117
118 // Logging
119 struct pakfire_log_buffer log_INFO;
120 struct pakfire_log_buffer log_ERROR;
121 struct pakfire_log_buffer log_DEBUG;
122 } buffers;
123
124 struct pakfire_cgroup* cgroup;
125 };
126
/*
	Thin wrapper which invokes the clone3 system call through syscall()
	and returns its result.
*/
static int clone3(struct clone_args* args, size_t size) {
	const long result = syscall(__NR_clone3, args, size);

	return result;
}
130
131 static void pakfire_jail_free(struct pakfire_jail* jail) {
132 DEBUG(jail->pakfire, "Freeing jail at %p\n", jail);
133
134 // Free environment
135 for (unsigned int i = 0; jail->env[i]; i++)
136 free(jail->env[i]);
137
138 if (jail->cgroup)
139 pakfire_cgroup_unref(jail->cgroup);
140
141 pakfire_unref(jail->pakfire);
142 free(jail);
143 }
144
/*
	The default log callback which forwards a line to pakfire's own logging
	using the macro that matches the priority.

	Lines of any other priority are dropped. This function always returns zero.
*/
static int pakfire_jail_default_log_callback(struct pakfire* pakfire, void* data,
		int priority, const char* line, size_t length) {
	if (priority == LOG_INFO) {
		INFO(pakfire, "%s", line);

	} else if (priority == LOG_ERR) {
		ERROR(pakfire, "%s", line);

#ifdef ENABLE_DEBUG
	} else if (priority == LOG_DEBUG) {
		DEBUG(pakfire, "%s", line);
#endif
	}

	return 0;
}
168
/*
	Prepares the environment for an interactive session: sets a prompt and
	copies TERM and LANG from the calling process if they are set there.

	Returns zero on success or the error of pakfire_jail_set_env().
*/
static int pakfire_jail_setup_interactive_env(struct pakfire_jail* jail) {
	// Variables which are copied from the environment of the caller
	static const char* const inherited[] = { "TERM", "LANG", NULL };

	// Set PS1
	int r = pakfire_jail_set_env(jail, "PS1", "pakfire-jail \\w> ");
	if (r)
		return r;

	for (const char* const* name = inherited; *name; name++) {
		const char* value = secure_getenv(*name);

		// Skip anything that is not set
		if (!value)
			continue;

		r = pakfire_jail_set_env(jail, *name, value);
		if (r)
			return r;
	}

	return 0;
}
193
194 PAKFIRE_EXPORT int pakfire_jail_create(struct pakfire_jail** jail,
195 struct pakfire* pakfire, int flags) {
196 int r;
197
198 // Allocate a new jail
199 struct pakfire_jail* j = calloc(1, sizeof(*j));
200 if (!j)
201 return 1;
202
203 // Reference Pakfire
204 j->pakfire = pakfire_ref(pakfire);
205
206 // Initialize reference counter
207 j->nrefs = 1;
208
209 // Store flags
210 j->flags = flags;
211
212 DEBUG(j->pakfire, "Allocated new jail at %p\n", j);
213
214 // Set default log callback
215 r = pakfire_jail_set_log_callback(j, pakfire_jail_default_log_callback, NULL);
216 if (r)
217 goto ERROR;
218
219 // Set default environment
220 for (const struct environ* e = ENV; e->key; e++) {
221 r = pakfire_jail_set_env(j, e->key, e->val);
222 if (r)
223 goto ERROR;
224 }
225
226 // Setup interactive stuff
227 if (j->flags & PAKFIRE_JAIL_INTERACTIVE) {
228 r = pakfire_jail_setup_interactive_env(j);
229 if (r)
230 goto ERROR;
231 }
232
233 // Done
234 *jail = j;
235 return 0;
236
237 ERROR:
238 pakfire_jail_free(j);
239
240 return r;
241 }
242
243 PAKFIRE_EXPORT struct pakfire_jail* pakfire_jail_ref(struct pakfire_jail* jail) {
244 ++jail->nrefs;
245
246 return jail;
247 }
248
249 PAKFIRE_EXPORT struct pakfire_jail* pakfire_jail_unref(struct pakfire_jail* jail) {
250 if (--jail->nrefs > 0)
251 return jail;
252
253 pakfire_jail_free(jail);
254 return NULL;
255 }
256
257 static int pakfire_jail_has_flag(struct pakfire_jail* jail, int flag) {
258 return jail->flags & flag;
259 }
260
261 // Resource Limits
262
263 PAKFIRE_EXPORT int pakfire_jail_nice(struct pakfire_jail* jail, int nice) {
264 // Check if nice level is in range
265 if (nice < -19 || nice > 20) {
266 errno = EINVAL;
267 return 1;
268 }
269
270 // Store nice level
271 jail->nice = nice;
272
273 return 0;
274 }
275
276 int pakfire_jail_set_cgroup(struct pakfire_jail* jail, struct pakfire_cgroup* cgroup) {
277 // Free any previous cgroup
278 if (jail->cgroup) {
279 pakfire_cgroup_unref(jail->cgroup);
280 jail->cgroup = NULL;
281 }
282
283 // Set any new cgroup
284 if (cgroup) {
285 DEBUG(jail->pakfire, "Setting cgroup %p\n", cgroup);
286
287 jail->cgroup = pakfire_cgroup_ref(cgroup);
288 }
289
290 // Done
291 return 0;
292 }
293
294 // Environment
295
296 // Returns the length of the environment
297 static unsigned int pakfire_jail_env_length(struct pakfire_jail* jail) {
298 unsigned int i = 0;
299
300 // Count everything in the environment
301 for (char** e = jail->env; *e; e++)
302 i++;
303
304 return i;
305 }
306
307 // Finds an existing environment variable and returns its index or -1 if not found
308 static int pakfire_jail_find_env(struct pakfire_jail* jail, const char* key) {
309 if (!key) {
310 errno = EINVAL;
311 return -1;
312 }
313
314 char buffer[strlen(key) + 2];
315 pakfire_string_format(buffer, "%s=", key);
316
317 for (unsigned int i = 0; jail->env[i]; i++) {
318 if (pakfire_string_startswith(jail->env[i], buffer))
319 return i;
320 }
321
322 // Nothing found
323 return -1;
324 }
325
326 // Returns the value of an environment variable or NULL
327 PAKFIRE_EXPORT const char* pakfire_jail_get_env(struct pakfire_jail* jail,
328 const char* key) {
329 int i = pakfire_jail_find_env(jail, key);
330 if (i < 0)
331 return NULL;
332
333 return jail->env[i] + strlen(key) + 1;
334 }
335
336 // Sets an environment variable
337 PAKFIRE_EXPORT int pakfire_jail_set_env(struct pakfire_jail* jail,
338 const char* key, const char* value) {
339 // Find the index where to write this value to
340 int i = pakfire_jail_find_env(jail, key);
341 if (i < 0)
342 i = pakfire_jail_env_length(jail);
343
344 // Return -ENOSPC when the environment is full
345 if (i >= ENVIRON_SIZE) {
346 errno = ENOSPC;
347 return -1;
348 }
349
350 // Free any previous value
351 if (jail->env[i])
352 free(jail->env[i]);
353
354 // Format and set environment variable
355 asprintf(&jail->env[i], "%s=%s", key, value);
356
357 DEBUG(jail->pakfire, "Set environment variable: %s\n", jail->env[i]);
358
359 return 0;
360 }
361
362 // Imports an environment
363 PAKFIRE_EXPORT int pakfire_jail_import_env(struct pakfire_jail* jail, const char* env[]) {
364 if (!env)
365 return 0;
366
367 char* key;
368 char* val;
369 int r;
370
371 // Copy environment variables
372 for (unsigned int i = 0; env[i]; i++) {
373 r = pakfire_string_partition(env[i], "=", &key, &val);
374 if (r)
375 continue;
376
377 // Set value
378 r = pakfire_jail_set_env(jail, key, val);
379
380 if (key)
381 free(key);
382 if (val)
383 free(val);
384
385 // Break on error
386 if (r)
387 return r;
388 }
389
390 return 0;
391 }
392
393 // Logging
394
395 PAKFIRE_EXPORT int pakfire_jail_set_log_callback(struct pakfire_jail* jail,
396 pakfire_jail_log_callback callback, void* data) {
397 jail->log_callback = callback;
398 jail->log_data = data;
399
400 return 0;
401 }
402
403 /*
404 This function replaces any logging in the child process.
405
406 All log messages will be sent to the parent process through their respective pipes.
407 */
408 static void pakfire_jail_log(void* data, int priority, const char* file,
409 int line, const char* fn, const char* format, va_list args) {
410 struct pakfire_jail_pipes* pipes = (struct pakfire_jail_pipes*)data;
411 int fd;
412
413 switch (priority) {
414 case LOG_INFO:
415 fd = pipes->log_INFO[1];
416 break;
417
418 case LOG_ERR:
419 fd = pipes->log_ERROR[1];
420 break;
421
422 #ifdef ENABLE_DEBUG
423 case LOG_DEBUG:
424 fd = pipes->log_DEBUG[1];
425 break;
426 #endif /* ENABLE_DEBUG */
427
428 // Ignore any messages of an unknown priority
429 default:
430 return;
431 }
432
433 // Send the log message
434 if (fd)
435 vdprintf(fd, format, args);
436 }
437
438 static int pakfire_jail_log_buffer_is_full(const struct pakfire_log_buffer* buffer) {
439 return (sizeof(buffer->data) == buffer->used);
440 }
441
442 /*
443 This function reads as much data as it can from the file descriptor.
444 If it finds a whole line in it, it will send it to the logger and repeat the process.
445 If not newline character is found, it will try to read more data until it finds one.
446 */
447 static int pakfire_jail_handle_log(struct pakfire_jail* jail,
448 struct pakfire_jail_exec* ctx, int priority, int fd,
449 struct pakfire_log_buffer* buffer, pakfire_jail_log_callback callback, void* data) {
450 char line[BUFFER_SIZE + 1];
451
452 // Fill up buffer from fd
453 if (buffer->used < sizeof(buffer->data)) {
454 ssize_t bytes_read = read(fd, buffer->data + buffer->used,
455 sizeof(buffer->data) - buffer->used);
456
457 // Handle errors
458 if (bytes_read < 0) {
459 ERROR(jail->pakfire, "Could not read from fd %d: %m\n", fd);
460 return -1;
461 }
462
463 // Update buffer size
464 buffer->used += bytes_read;
465 }
466
467 // See if we have any lines that we can write
468 while (buffer->used) {
469 // Search for the end of the first line
470 char* eol = memchr(buffer->data, '\n', buffer->used);
471
472 // No newline found
473 if (!eol) {
474 // If the buffer is full, we send the content to the logger and try again
475 // This should not happen in practise
476 if (pakfire_jail_log_buffer_is_full(buffer)) {
477 DEBUG(jail->pakfire, "Logging buffer is full. Sending all content\n");
478
479 eol = buffer->data + sizeof(buffer->data) - 1;
480
481 // Otherwise we might have only read parts of the output
482 } else
483 break;
484 }
485
486 // Find the length of the string
487 size_t length = eol - buffer->data + 1;
488
489 // Copy the line into the buffer
490 memcpy(line, buffer->data, length);
491
492 // Terminate the string
493 line[length] = '\0';
494
495 // Log the line
496 if (callback) {
497 int r = callback(jail->pakfire, data, priority, line, length);
498 if (r) {
499 ERROR(jail->pakfire, "The logging callback returned an error: %d\n", r);
500 return r;
501 }
502 }
503
504 // Remove line from buffer
505 memmove(buffer->data, buffer->data + length, buffer->used - length);
506 buffer->used -= length;
507 }
508
509 return 0;
510 }
511
512 static int pakfire_jail_setup_pipe(struct pakfire_jail* jail, int (*fds)[2], const int flags) {
513 int r = pipe2(*fds, flags);
514 if (r < 0) {
515 ERROR(jail->pakfire, "Could not setup pipe: %m\n");
516 return 1;
517 }
518
519 return 0;
520 }
521
/*
	Closes both ends of a pipe. Ends that are zero are skipped.
*/
static void pakfire_jail_close_pipe(struct pakfire_jail* jail, int fds[2]) {
	for (const int* fd = fds; fd < fds + 2; fd++) {
		if (*fd)
			close(*fd);
	}
}
527
/*
	This is a convenience function which closes the write end of a pipe
	(and resets it to zero) and returns the read end.
*/
static int pakfire_jail_get_pipe(struct pakfire_jail* jail, int (*fds)[2]) {
	// ends[0] is being read from, ends[1] is being written to
	int* ends = *fds;

	// Close the write end of the pipe
	if (ends[1]) {
		close(ends[1]);
		ends[1] = 0;
	}

	// Return the read end
	return ends[0];
}
546
547 static int pakfire_jail_wait(struct pakfire_jail* jail, struct pakfire_jail_exec* ctx) {
548 int epollfd = -1;
549 struct epoll_event ev;
550 struct epoll_event events[EPOLL_MAX_EVENTS];
551 int r = 0;
552
553 // Fetch file descriptors from context
554 const int stdout = pakfire_jail_get_pipe(jail, &ctx->pipes.stdout);
555 const int stderr = pakfire_jail_get_pipe(jail, &ctx->pipes.stderr);
556 const int pidfd = ctx->pidfd;
557
558 // Logging
559 const int log_INFO = pakfire_jail_get_pipe(jail, &ctx->pipes.log_INFO);
560 const int log_ERROR = pakfire_jail_get_pipe(jail, &ctx->pipes.log_ERROR);
561 const int log_DEBUG = pakfire_jail_get_pipe(jail, &ctx->pipes.log_DEBUG);
562
563 // Make a list of all file descriptors we are interested in
564 int fds[] = {
565 stdout, stderr, pidfd, log_INFO, log_ERROR, log_DEBUG,
566 };
567
568 // Setup epoll
569 epollfd = epoll_create1(0);
570 if (epollfd < 0) {
571 ERROR(jail->pakfire, "Could not initialize epoll(): %m\n");
572 r = 1;
573 goto ERROR;
574 }
575
576 ev.events = EPOLLIN;
577
578 // Turn file descriptors into non-blocking mode and add them to epoll()
579 for (unsigned int i = 0; i < sizeof(fds) / sizeof(*fds); i++) {
580 int fd = fds[i];
581
582 // Skip fds which were not initialized
583 if (fd <= 0)
584 continue;
585
586 ev.data.fd = fd;
587
588 if (epoll_ctl(epollfd, EPOLL_CTL_ADD, fd, &ev) < 0) {
589 ERROR(jail->pakfire, "Could not add file descriptor %d to epoll(): %m\n", fd);
590 r = 1;
591 goto ERROR;
592 }
593 }
594
595 int ended = 0;
596
597 // Loop for as long as the process is alive
598 while (!ended) {
599 int num = epoll_wait(epollfd, events, EPOLL_MAX_EVENTS, -1);
600 if (num < 1) {
601 // Ignore if epoll_wait() has been interrupted
602 if (errno == EINTR)
603 continue;
604
605 ERROR(jail->pakfire, "epoll_wait() failed: %m\n");
606 r = 1;
607
608 goto ERROR;
609 }
610
611 for (int i = 0; i < num; i++) {
612 int fd = events[i].data.fd;
613
614 struct pakfire_log_buffer* buffer = NULL;
615 pakfire_jail_log_callback callback = NULL;
616 void* data = NULL;
617 int priority;
618
619 // Handle any changes to the PIDFD
620 if (fd == pidfd) {
621 // Call waidid() and store the result
622 r = waitid(P_PIDFD, ctx->pidfd, &ctx->status, WEXITED);
623 if (r) {
624 ERROR(jail->pakfire, "waitid() failed: %m\n");
625 goto ERROR;
626 }
627
628 // Mark that we have ended so that we will process the remaining
629 // events from epoll() now, but won't restart the outer loop.
630 ended = 1;
631 continue;
632
633 // Handle logging messages
634 } else if (fd == log_INFO) {
635 buffer = &ctx->buffers.log_INFO;
636 priority = LOG_INFO;
637
638 callback = pakfire_jail_default_log_callback;
639
640 } else if (fd == log_ERROR) {
641 buffer = &ctx->buffers.log_ERROR;
642 priority = LOG_ERR;
643
644 callback = pakfire_jail_default_log_callback;
645
646 } else if (fd == log_DEBUG) {
647 buffer = &ctx->buffers.log_DEBUG;
648 priority = LOG_DEBUG;
649
650 callback = pakfire_jail_default_log_callback;
651
652 // Handle anything from the log pipes
653 } else if (fd == stdout) {
654 buffer = &ctx->buffers.stdout;
655 priority = LOG_INFO;
656
657 callback = jail->log_callback;
658 data = jail->log_data;
659
660 } else if (fd == stderr) {
661 buffer = &ctx->buffers.stderr;
662 priority = LOG_ERR;
663
664 callback = jail->log_callback;
665 data = jail->log_data;
666
667 } else {
668 DEBUG(jail->pakfire, "Received invalid file descriptor %d\n", fd);
669 continue;
670 }
671
672 // Handle log event
673 r = pakfire_jail_handle_log(jail, ctx, priority, fd, buffer, callback, data);
674 if (r)
675 goto ERROR;
676 }
677 }
678
679 ERROR:
680 if (epollfd > 0)
681 close(epollfd);
682
683 return r;
684 }
685
/*
	A log callback which collects the standard output of the child process.

	data is expected to be a pointer to a string (char**) which is either
	NULL or has been allocated on the heap. Every line of priority LOG_INFO
	is appended to it; the previous string is freed. Lines of any other
	priority are passed on to the default log callback.

	Returns zero on success and 1 if the new string could not be allocated,
	in which case the string collected so far is left untouched.
*/
static int pakfire_jail_capture_stdout(struct pakfire* pakfire, void* data, int priority,
		const char* line, size_t length) {
	char** output = (char**)data;

	// Send everything that is not stdout to the default logger
	if (priority != LOG_INFO)
		return pakfire_jail_default_log_callback(pakfire, NULL, priority, line, length);

	// Without a place to store the output, the line is dropped
	// NOTE(review): confirm that dropping is what callers passing NULL expect
	if (!output)
		return 0;

	// Append the line to what we have collected so far
	char* combined = NULL;

	int r = asprintf(&combined, "%s%s", (*output) ? *output : "", line);
	if (r < 0)
		return 1;

	// Replace the previous string which would be leaked otherwise
	free(*output);
	*output = combined;

	return 0;
}
702
703 // Capabilities
704
705 static int pakfire_jail_drop_capabilities(struct pakfire_jail* jail) {
706 const int capabilities[] = {
707 // Deny access to the kernel's audit system
708 CAP_AUDIT_CONTROL,
709 CAP_AUDIT_READ,
710 CAP_AUDIT_WRITE,
711
712 // Deny suspending block devices
713 CAP_BLOCK_SUSPEND,
714
715 // Deny any stuff with BPF
716 CAP_BPF,
717
718 // Deny checkpoint restore
719 CAP_CHECKPOINT_RESTORE,
720
721 // Deny opening files by inode number (open_by_handle_at)
722 CAP_DAC_READ_SEARCH,
723
724 // Deny setting SUID bits
725 CAP_FSETID,
726
727 // Deny locking more memory
728 CAP_IPC_LOCK,
729
730 // Deny modifying any Apparmor/SELinux/SMACK configuration
731 CAP_MAC_ADMIN,
732 CAP_MAC_OVERRIDE,
733
734 // Deny creating any special devices
735 CAP_MKNOD,
736
737 // Deny setting any capabilities
738 CAP_SETFCAP,
739
740 // Deny reading from syslog
741 CAP_SYSLOG,
742
743 // Deny any admin actions (mount, sethostname, ...)
744 CAP_SYS_ADMIN,
745
746 // Deny rebooting the system
747 CAP_SYS_BOOT,
748
749 // Deny loading kernel modules
750 CAP_SYS_MODULE,
751
752 // Deny setting nice level
753 CAP_SYS_NICE,
754
755 // Deny access to /proc/kcore, /dev/mem, /dev/kmem
756 CAP_SYS_RAWIO,
757
758 // Deny circumventing any resource limits
759 CAP_SYS_RESOURCE,
760
761 // Deny setting the system time
762 CAP_SYS_TIME,
763
764 // Deny playing with suspend
765 CAP_WAKE_ALARM,
766
767 0,
768 };
769
770 DEBUG(jail->pakfire, "Dropping capabilities...\n");
771
772 size_t num_caps = 0;
773 int r;
774
775 // Drop any capabilities
776 for (const int* cap = capabilities; *cap; cap++) {
777 r = prctl(PR_CAPBSET_DROP, *cap, 0, 0, 0);
778 if (r) {
779 ERROR(jail->pakfire, "Could not drop capability %d: %m\n", *cap);
780 return r;
781 }
782
783 num_caps++;
784 }
785
786 // Fetch any capabilities
787 cap_t caps = cap_get_proc();
788 if (!caps) {
789 ERROR(jail->pakfire, "Could not read capabilities: %m\n");
790 return 1;
791 }
792
793 /*
794 Set inheritable capabilities
795
796 This ensures that no processes will be able to gain any of the listed
797 capabilities again.
798 */
799 r = cap_set_flag(caps, CAP_INHERITABLE, num_caps, capabilities, CAP_CLEAR);
800 if (r) {
801 ERROR(jail->pakfire, "cap_set_flag() failed: %m\n");
802 goto ERROR;
803 }
804
805 // Restore capabilities
806 r = cap_set_proc(caps);
807 if (r) {
808 ERROR(jail->pakfire, "Could not restore capabilities: %m\n");
809 goto ERROR;
810 }
811
812 ERROR:
813 if (caps)
814 cap_free(caps);
815
816 return r;
817 }
818
819 // Syscall Filter
820
821 static int pakfire_jail_limit_syscalls(struct pakfire_jail* jail) {
822 const int syscalls[] = {
823 // The kernel's keyring isn't namespaced
824 SCMP_SYS(keyctl),
825 SCMP_SYS(add_key),
826 SCMP_SYS(request_key),
827
828 // Disable userfaultfd
829 SCMP_SYS(userfaultfd),
830
831 // Disable perf which could leak a lot of information about the host
832 SCMP_SYS(perf_event_open),
833
834 0,
835 };
836 int r = 1;
837
838 DEBUG(jail->pakfire, "Applying syscall filter...\n");
839
840 // Setup a syscall filter which allows everything by default
841 scmp_filter_ctx ctx = seccomp_init(SCMP_ACT_ALLOW);
842 if (!ctx) {
843 ERROR(jail->pakfire, "Could not setup seccomp filter: %m\n");
844 goto ERROR;
845 }
846
847 // All all syscalls
848 for (const int* syscall = syscalls; *syscall; syscall++) {
849 r = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), *syscall, 0);
850 if (r) {
851 ERROR(jail->pakfire, "Could not configure syscall %d: %m\n", *syscall);
852 goto ERROR;
853 }
854 }
855
856 // Load syscall filter into the kernel
857 r = seccomp_load(ctx);
858 if (r) {
859 ERROR(jail->pakfire, "Could not load syscall filter into the kernel: %m\n");
860 goto ERROR;
861 }
862
863 ERROR:
864 if (ctx)
865 seccomp_release(ctx);
866
867 return r;
868 }
869
870 // UID/GID Mapping
871
872 static int pakfire_jail_write_uidgid_mapping(struct pakfire_jail* jail,
873 const char* path, uid_t mapped_id, size_t length) {
874 int r = 1;
875
876 // Open file for writing
877 FILE* f = fopen(path, "w");
878 if (!f) {
879 ERROR(jail->pakfire, "Could not open %s for writing: %m\n", path);
880 goto ERROR;
881 }
882
883 // Write configuration
884 int bytes_written = fprintf(f, "%d %d %ld\n", 0, mapped_id, length);
885 if (bytes_written <= 0) {
886 ERROR(jail->pakfire, "Could not write UID/GID mapping: %m\n");
887 goto ERROR;
888 }
889
890 // Close the file
891 r = fclose(f);
892 f = NULL;
893 if (r) {
894 ERROR(jail->pakfire, "Could not write UID/GID mapping: %m\n");
895
896 goto ERROR;
897 }
898
899 // Success
900 r = 0;
901
902 ERROR:
903 if (f)
904 fclose(f);
905
906 return r;
907 }
908
909 static int pakfire_jail_setup_uid_mapping(struct pakfire_jail* jail, pid_t pid) {
910 char path[PATH_MAX];
911 int r;
912
913 uid_t mapped_uid = 0;
914 const size_t length = 1;
915
916 // Fetch the UID of the calling process
917 uid_t uid = getuid();
918
919 // Have we been called by root?
920 if (uid == 0) {
921 mapped_uid = 0;
922
923 // Have we been called by an unprivileged user?
924 } else {
925 // XXX fetch SUBUID
926 mapped_uid = uid;
927 }
928
929 // Make path
930 r = pakfire_string_format(path, "/proc/%d/uid_map", pid);
931 if (r < 0)
932 return 1;
933
934 DEBUG(jail->pakfire, "Mapping UID range (%u - %lu)\n", mapped_uid, mapped_uid + length);
935
936 return pakfire_jail_write_uidgid_mapping(jail, path, mapped_uid, length);
937 }
938
939 static int pakfire_jail_setup_gid_mapping(struct pakfire_jail* jail, pid_t pid) {
940 char path[PATH_MAX];
941 int r;
942
943 gid_t mapped_gid = 0;
944 const size_t length = 1;
945
946 // Fetch the GID of the calling process
947 gid_t gid = getgid();
948
949 // Have we been called from the root group?
950 if (gid == 0) {
951 mapped_gid = 0;
952
953 // Have we been called by an unprivileged group?
954 } else {
955 // XXX fetch SUBGID
956 mapped_gid = gid;
957 }
958
959 // Make path
960 r = pakfire_string_format(path, "/proc/%d/gid_map", pid);
961 if (r < 0)
962 return 1;
963
964 DEBUG(jail->pakfire, "Mapping GID range (%u - %lu)\n", mapped_gid, mapped_gid + length);
965
966 return pakfire_jail_write_uidgid_mapping(jail, path, mapped_gid, length);
967 }
968
969 static int pakfire_jail_setgroups(struct pakfire_jail* jail, pid_t pid) {
970 char path[PATH_MAX];
971 int r = 1;
972
973 // Make path
974 r = pakfire_string_format(path, "/proc/%d/setgroups", pid);
975 if (r < 0)
976 return 1;
977
978 // Open file for writing
979 FILE* f = fopen(path, "w");
980 if (!f) {
981 ERROR(jail->pakfire, "Could not open %s for writing: %m\n", path);
982 goto ERROR;
983 }
984
985 // Write content
986 int bytes_written = fprintf(f, "deny\n");
987 if (bytes_written <= 0) {
988 ERROR(jail->pakfire, "Could not write to %s: %m\n", path);
989 goto ERROR;
990 }
991
992 r = fclose(f);
993 f = NULL;
994 if (r) {
995 ERROR(jail->pakfire, "Could not close %s: %m\n", path);
996 goto ERROR;
997 }
998
999 ERROR:
1000 if (f)
1001 fclose(f);
1002
1003 return r;
1004 }
1005
1006 static int pakfire_jail_send_signal(struct pakfire_jail* jail, int fd) {
1007 const uint64_t val = 1;
1008 int r = 0;
1009
1010 DEBUG(jail->pakfire, "Sending signal...\n");
1011
1012 // Write to the file descriptor
1013 ssize_t bytes_written = write(fd, &val, sizeof(val));
1014 if (bytes_written < 0 || (size_t)bytes_written < sizeof(val)) {
1015 ERROR(jail->pakfire, "Could not send signal: %m\n");
1016 r = 1;
1017 }
1018
1019 // Close the file descriptor
1020 close(fd);
1021
1022 return r;
1023 }
1024
1025 static int pakfire_jail_wait_for_signal(struct pakfire_jail* jail, int fd) {
1026 uint64_t val = 0;
1027 int r = 0;
1028
1029 DEBUG(jail->pakfire, "Waiting for signal...\n");
1030
1031 ssize_t bytes_read = read(fd, &val, sizeof(val));
1032 if (bytes_read < 0 || (size_t)bytes_read < sizeof(val)) {
1033 ERROR(jail->pakfire, "Error waiting for signal: %m\n");
1034 r = 1;
1035 }
1036
1037 // Close the file descriptor
1038 close(fd);
1039
1040 return r;
1041 }
1042
1043 /*
1044 Performs the initialisation that needs to happen in the parent part
1045 */
1046 static int pakfire_jail_parent(struct pakfire_jail* jail, struct pakfire_jail_exec* ctx) {
1047 int r;
1048
1049 // Setup UID mapping
1050 r = pakfire_jail_setup_uid_mapping(jail, ctx->pid);
1051 if (r)
1052 return r;
1053
1054 // Write "deny" to /proc/PID/setgroups
1055 r = pakfire_jail_setgroups(jail, ctx->pid);
1056 if (r)
1057 return r;
1058
1059 // Setup GID mapping
1060 r = pakfire_jail_setup_gid_mapping(jail, ctx->pid);
1061 if (r)
1062 return r;
1063
1064 // Parent has finished initialisation
1065 DEBUG(jail->pakfire, "Parent has finished initialization\n");
1066
1067 // Send signal to client
1068 r = pakfire_jail_send_signal(jail, ctx->completed_fd);
1069 if (r)
1070 return r;
1071
1072 return 0;
1073 }
1074
1075 static int pakfire_jail_child(struct pakfire_jail* jail, struct pakfire_jail_exec* ctx,
1076 const char* argv[]) {
1077 int r;
1078
1079 // Redirect any logging to our log pipe
1080 pakfire_set_log_callback(jail->pakfire, pakfire_jail_log, &ctx->pipes);
1081
1082 // Fetch my own PID
1083 pid_t pid = getpid();
1084
1085 DEBUG(jail->pakfire, "Launched child process in jail with PID %d\n", pid);
1086
1087 // Log argv
1088 for (unsigned int i = 0; argv[i]; i++)
1089 DEBUG(jail->pakfire, " argv[%d] = %s\n", i, argv[i]);
1090
1091 // Wait for the parent to finish initialization
1092 r = pakfire_jail_wait_for_signal(jail, ctx->completed_fd);
1093 if (r)
1094 return r;
1095
1096 // Perform further initialization
1097
1098 // Fetch UID/GID
1099 uid_t uid = getuid();
1100 gid_t gid = getgid();
1101
1102 // Fetch EUID/EGID
1103 uid_t euid = geteuid();
1104 gid_t egid = getegid();
1105
1106 DEBUG(jail->pakfire, " UID: %d (effective %d)\n", uid, euid);
1107 DEBUG(jail->pakfire, " GID: %d (effective %d)\n", gid, egid);
1108
1109 // Check if we are (effectively running as root)
1110 if (uid != 0 || gid != 0) {
1111 ERROR(jail->pakfire, "Child process is not running as root\n");
1112 return 126;
1113 }
1114
1115 const char* root = pakfire_get_path(jail->pakfire);
1116 const char* arch = pakfire_get_arch(jail->pakfire);
1117
1118 // Change root (unless root is /)
1119 if (!pakfire_on_root(jail->pakfire)) {
1120 // Mount everything
1121 r = pakfire_mount_all(jail->pakfire);
1122 if (r)
1123 return r;
1124
1125 // Log all mountpoints
1126 pakfire_mount_list(jail->pakfire);
1127
1128 // Call chroot()
1129 r = chroot(root);
1130 if (r) {
1131 ERROR(jail->pakfire, "chroot() to %s failed: %m\n", root);
1132 return 1;
1133 }
1134
1135 // Change directory to /
1136 r = chdir("/");
1137 if (r) {
1138 ERROR(jail->pakfire, "chdir() after chroot() failed: %m\n");
1139 return 1;
1140 }
1141 }
1142
1143 // Set personality
1144 unsigned long persona = pakfire_arch_personality(arch);
1145 if (persona) {
1146 r = personality(persona);
1147 if (r < 0) {
1148 ERROR(jail->pakfire, "Could not set personality (%x)\n", (unsigned int)persona);
1149 return 1;
1150 }
1151 }
1152
1153 // Set nice level
1154 if (jail->nice) {
1155 DEBUG(jail->pakfire, "Setting nice level to %d\n", jail->nice);
1156
1157 r = setpriority(PRIO_PROCESS, pid, jail->nice);
1158 if (r) {
1159 ERROR(jail->pakfire, "Could not set nice level: %m\n");
1160 return 1;
1161 }
1162 }
1163
1164 // Close other end of log pipes
1165 close(ctx->pipes.log_INFO[0]);
1166 close(ctx->pipes.log_ERROR[0]);
1167 #ifdef ENABLE_DEBUG
1168 close(ctx->pipes.log_DEBUG[0]);
1169 #endif /* ENABLE_DEBUG */
1170
1171 // Connect standard output and error
1172 if (ctx->pipes.stdout[1] && ctx->pipes.stderr[1]) {
1173 r = dup2(ctx->pipes.stdout[1], STDOUT_FILENO);
1174 if (r < 0) {
1175 ERROR(jail->pakfire, "Could not connect fd %d to stdout: %m\n",
1176 ctx->pipes.stdout[1]);
1177
1178 return 1;
1179 }
1180
1181 r = dup2(ctx->pipes.stderr[1], STDERR_FILENO);
1182 if (r < 0) {
1183 ERROR(jail->pakfire, "Could not connect fd %d to stderr: %m\n",
1184 ctx->pipes.stderr[1]);
1185
1186 return 1;
1187 }
1188
1189 // Close the pipe (as we have moved the original file descriptors)
1190 pakfire_jail_close_pipe(jail, ctx->pipes.stdout);
1191 pakfire_jail_close_pipe(jail, ctx->pipes.stderr);
1192 }
1193
1194 // Reset open file limit (http://0pointer.net/blog/file-descriptor-limits.html)
1195 r = pakfire_rlimit_reset_nofile(jail->pakfire);
1196 if (r)
1197 return r;
1198
1199 // Drop capabilities
1200 r = pakfire_jail_drop_capabilities(jail);
1201 if (r)
1202 return r;
1203
1204 // Filter syscalls
1205 r = pakfire_jail_limit_syscalls(jail);
1206 if (r)
1207 return r;
1208
1209 // exec() command
1210 r = execvpe(argv[0], (char**)argv, jail->env);
1211 if (r < 0)
1212 ERROR(jail->pakfire, "Could not execve(): %m\n");
1213
1214 // Translate errno into regular exit code
1215 switch (errno) {
1216 case ENOENT:
1217 r = 127;
1218 break;
1219
1220 default:
1221 r = 1;
1222 }
1223
1224 // We should not get here
1225 return r;
1226 }
1227
1228 // Run a command in the jail
1229 static int __pakfire_jail_exec(struct pakfire_jail* jail, const char* argv[]) {
1230 int exit = -1;
1231 int r;
1232
1233 // Check if argv is valid
1234 if (!argv || !argv[0]) {
1235 errno = EINVAL;
1236 return -1;
1237 }
1238
1239 // Initialize context for this call
1240 struct pakfire_jail_exec ctx = {
1241 .pipes = {
1242 .stdout = { 0, 0 },
1243 .stderr = { 0, 0 },
1244 },
1245 };
1246
1247 DEBUG(jail->pakfire, "Executing jail...\n");
1248
1249 /*
1250 Setup a file descriptor which can be used to notify the client that the parent
1251 has completed configuration.
1252 */
1253 ctx.completed_fd = eventfd(0, EFD_CLOEXEC);
1254 if (ctx.completed_fd < 0) {
1255 ERROR(jail->pakfire, "eventfd() failed: %m\n");
1256 return -1;
1257 }
1258
1259 // Create pipes to communicate with child process if we are not running interactively
1260 if (!pakfire_jail_has_flag(jail, PAKFIRE_JAIL_INTERACTIVE)) {
1261 // stdout
1262 r = pakfire_jail_setup_pipe(jail, &ctx.pipes.stdout, 0);
1263 if (r)
1264 goto ERROR;
1265
1266 // stderr
1267 r = pakfire_jail_setup_pipe(jail, &ctx.pipes.stderr, 0);
1268 if (r)
1269 goto ERROR;
1270 }
1271
1272 // Setup pipes for logging
1273 // INFO
1274 r = pakfire_jail_setup_pipe(jail, &ctx.pipes.log_INFO, O_CLOEXEC);
1275 if (r)
1276 goto ERROR;
1277
1278 // ERROR
1279 r = pakfire_jail_setup_pipe(jail, &ctx.pipes.log_ERROR, O_CLOEXEC);
1280 if (r)
1281 goto ERROR;
1282
1283 #ifdef ENABLE_DEBUG
1284 // DEBUG
1285 r = pakfire_jail_setup_pipe(jail, &ctx.pipes.log_DEBUG, O_CLOEXEC);
1286 if (r)
1287 goto ERROR;
1288 #endif /* ENABLE_DEBUG */
1289
1290 // Configure child process
1291 struct clone_args args = {
1292 .flags =
1293 CLONE_NEWCGROUP |
1294 CLONE_NEWIPC |
1295 CLONE_NEWNS |
1296 CLONE_NEWPID |
1297 CLONE_NEWUSER |
1298 CLONE_NEWUTS |
1299 CLONE_PIDFD,
1300 .exit_signal = SIGCHLD,
1301 .pidfd = (long long unsigned int)&ctx.pidfd,
1302 };
1303
1304 // Launch the process in a cgroup that is a leaf of the configured cgroup
1305 if (jail->cgroup) {
1306 args.flags |= CLONE_INTO_CGROUP;
1307
1308 #warning TODO randomize the name
1309
1310 // Create a temporary cgroup
1311 r = pakfire_cgroup_child(&ctx.cgroup, jail->cgroup, "jail", 0);
1312 if (r) {
1313 ERROR(jail->pakfire, "Could not create cgroup for jail: %m\n");
1314 goto ERROR;
1315 }
1316
1317 // Clone into this cgroup
1318 args.cgroup = pakfire_cgroup_fd(ctx.cgroup);
1319 }
1320
1321 // Fork this process
1322 ctx.pid = clone3(&args, sizeof(args));
1323 if (ctx.pid < 0) {
1324 ERROR(jail->pakfire, "Could not clone: %m\n");
1325 return -1;
1326
1327 // Child process
1328 } else if (ctx.pid == 0) {
1329 r = pakfire_jail_child(jail, &ctx, argv);
1330 _exit(r);
1331 }
1332
1333 // Parent process
1334 r = pakfire_jail_parent(jail, &ctx);
1335 if (r)
1336 goto ERROR;
1337
1338 DEBUG(jail->pakfire, "Waiting for PID %d to finish its work\n", ctx.pid);
1339
1340 // Read output of the child process
1341 r = pakfire_jail_wait(jail, &ctx);
1342 if (r)
1343 goto ERROR;
1344
1345 // Handle exit status
1346 switch (ctx.status.si_code) {
1347 case CLD_EXITED:
1348 DEBUG(jail->pakfire, "The child process exited with code %d\n",
1349 ctx.status.si_status);
1350
1351 // Pass exit code
1352 exit = ctx.status.si_status;
1353 break;
1354
1355 case CLD_KILLED:
1356 case CLD_DUMPED:
1357 ERROR(jail->pakfire, "The child process was killed\n");
1358 break;
1359
1360 // Log anything else
1361 default:
1362 ERROR(jail->pakfire, "Unknown child exit code: %d\n", ctx.status.si_code);
1363 break;
1364 }
1365
1366 ERROR:
1367 // Destroy the temporary cgroup (if any)
1368 if (ctx.cgroup) {
1369 pakfire_cgroup_destroy(ctx.cgroup);
1370 pakfire_cgroup_unref(ctx.cgroup);
1371 }
1372
1373 // Close any file descriptors
1374 pakfire_jail_close_pipe(jail, ctx.pipes.stdout);
1375 pakfire_jail_close_pipe(jail, ctx.pipes.stderr);
1376 if (ctx.pidfd)
1377 close(ctx.pidfd);
1378 pakfire_jail_close_pipe(jail, ctx.pipes.log_INFO);
1379 pakfire_jail_close_pipe(jail, ctx.pipes.log_ERROR);
1380 pakfire_jail_close_pipe(jail, ctx.pipes.log_DEBUG);
1381
1382 // Umount everything
1383 if (!pakfire_on_root(jail->pakfire))
1384 pakfire_umount_all(jail->pakfire);
1385
1386 return exit;
1387 }
1388
1389 PAKFIRE_EXPORT int pakfire_jail_exec(struct pakfire_jail* jail,
1390 const char* argv[], char** output) {
1391 int r;
1392
1393 // Store logging callback
1394 pakfire_jail_log_callback log_callback = jail->log_callback;
1395 void* log_data = jail->log_data;
1396
1397 // Capture output if requested by user
1398 if (output)
1399 pakfire_jail_set_log_callback(jail, pakfire_jail_capture_stdout, output);
1400
1401 // Run exec()
1402 r = __pakfire_jail_exec(jail, argv);
1403
1404 // Restore log callback
1405 pakfire_jail_set_log_callback(jail, log_callback, log_data);
1406
1407 return r;
1408 }
1409
1410 PAKFIRE_EXPORT int pakfire_jail_exec_script(struct pakfire_jail* jail,
1411 const char* script, const size_t size, const char* args[], char** output) {
1412 char path[PATH_MAX];
1413 const char** argv = NULL;
1414 int r;
1415
1416 const char* root = pakfire_get_path(jail->pakfire);
1417
1418 // Write the scriptlet to disk
1419 r = pakfire_path_join(path, root, "pakfire-script.XXXXXX");
1420 if (r < 0)
1421 goto ERROR;
1422
1423 // Open a temporary file
1424 int fd = mkstemp(path);
1425 if (fd < 0) {
1426 ERROR(jail->pakfire, "Could not open a temporary file: %m\n");
1427 r = 1;
1428 goto ERROR;
1429 }
1430
1431 DEBUG(jail->pakfire, "Writing script to %s:\n%.*s\n", path, (int)size, script);
1432
1433 // Write data
1434 ssize_t bytes_written = write(fd, script, size);
1435 if (bytes_written < (ssize_t)size) {
1436 ERROR(jail->pakfire, "Could not write script to file %s: %m\n", path);
1437 r = 1;
1438 goto ERROR;
1439 }
1440
1441 // Make the script executable
1442 r = fchmod(fd, S_IRUSR|S_IWUSR|S_IXUSR);
1443 if (r) {
1444 ERROR(jail->pakfire, "Could not set executable permissions on %s: %m\n", path);
1445 goto ERROR;
1446 }
1447
1448 // Close file
1449 r = close(fd);
1450 if (r) {
1451 ERROR(jail->pakfire, "Could not close script file %s: %m\n", path);
1452 r = 1;
1453 goto ERROR;
1454 }
1455
1456 // Count how many arguments were passed
1457 unsigned int argc = 1;
1458 if (args) {
1459 for (const char** arg = args; *arg; arg++)
1460 argc++;
1461 }
1462
1463 argv = calloc(argc + 1, sizeof(*argv));
1464 if (!argv) {
1465 ERROR(jail->pakfire, "Could not allocate argv: %m\n");
1466 goto ERROR;
1467 }
1468
1469 // Set command
1470 argv[0] = (root) ? pakfire_path_relpath(root, path) : path;
1471
1472 // Copy args
1473 for (unsigned int i = 1; i < argc; i++)
1474 argv[i] = args[i-1];
1475
1476 // Run the script
1477 r = pakfire_jail_exec(jail, argv, output);
1478
1479 ERROR:
1480 if (argv)
1481 free(argv);
1482
1483 // Remove script from disk
1484 if (*path)
1485 unlink(path);
1486
1487 return r;
1488 }
1489
/*
	A convenience function that creates a new jail, runs the given command in it
	and drops the reference to the jail again.

	Returns the non-zero result of pakfire_jail_create() if the jail could not be
	created, and the result of pakfire_jail_exec() otherwise.
*/
int pakfire_jail_run(struct pakfire* pakfire, const char* argv[], int flags, char** output) {
	struct pakfire_jail* jail = NULL;

	// Only execute the command if the jail could be set up
	int r = pakfire_jail_create(&jail, pakfire, flags);
	if (r == 0)
		r = pakfire_jail_exec(jail, argv, output);

	// Release the jail if one was handed back to us
	if (jail != NULL)
		pakfire_jail_unref(jail);

	return r;
}
1512
/*
	A convenience function that creates a new jail, runs the given script in it
	and drops the reference to the jail again.

	Returns the non-zero result of pakfire_jail_create() if the jail could not be
	created, and the result of pakfire_jail_exec_script() otherwise.
*/
int pakfire_jail_run_script(struct pakfire* pakfire,
		const char* script, const size_t length, const char* argv[], int flags, char** output) {
	struct pakfire_jail* jail = NULL;

	// Only execute the script if the jail could be set up
	int r = pakfire_jail_create(&jail, pakfire, flags);
	if (r == 0)
		r = pakfire_jail_exec_script(jail, script, length, argv, output);

	// Release the jail if one was handed back to us
	if (jail != NULL)
		pakfire_jail_unref(jail);

	return r;
}
1532
1533
1534 int pakfire_jail_shell(struct pakfire* pakfire) {
1535 const char* argv[] = {
1536 "/bin/bash", "--login", NULL,
1537 };
1538
1539 // Execute /bin/bash
1540 return pakfire_jail_run(pakfire, argv, PAKFIRE_JAIL_INTERACTIVE, NULL);
1541 }
1542
1543 int pakfire_jail_ldconfig(struct pakfire* pakfire) {
1544 char path[PATH_MAX];
1545
1546 const char* ldconfig = "/sbin/ldconfig";
1547
1548 // Check if ldconfig exists before calling it to avoid overhead
1549 int r = pakfire_make_path(pakfire, path, ldconfig);
1550 if (r < 0)
1551 return 1;
1552
1553 // Check if ldconfig is executable
1554 r = access(path, X_OK);
1555 if (r) {
1556 DEBUG(pakfire, "%s is not executable. Skipping...\n", ldconfig);
1557 return 0;
1558 }
1559
1560 const char* argv[] = {
1561 ldconfig, NULL,
1562 };
1563
1564 // Run ldconfig
1565 return pakfire_jail_run(pakfire, argv, 0, NULL);
1566 }