src/libpakfire/jail.c

   1 /*#############################################################################
   2 #                                                                             #
   3 # Pakfire - The IPFire package management system                              #
   4 # Copyright (C) 2022 Pakfire development team                                 #
   5 #                                                                             #
   6 # This program is free software: you can redistribute it and/or modify        #
   7 # it under the terms of the GNU General Public License as published by        #
   8 # the Free Software Foundation, either version 3 of the License, or           #
   9 # (at your option) any later version.                                         #
  10 #                                                                             #
  11 # This program is distributed in the hope that it will be useful,             #
  12 # but WITHOUT ANY WARRANTY; without even the implied warranty of              #
  13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the               #
  14 # GNU General Public License for more details.                                #
  15 #                                                                             #
  16 # You should have received a copy of the GNU General Public License           #
  17 # along with this program.  If not, see <http://www.gnu.org/licenses/>.       #
  18 #                                                                             #
  19 #############################################################################*/
  20
  21 #include <errno.h>
  22 #include <fcntl.h>
  23 #include <linux/capability.h>
  24 #include <linux/sched.h>
  25 #include <sys/wait.h>
  26 #include <linux/wait.h>
  27 #include <sched.h>
  28 #include <signal.h>
  29 #include <stdlib.h>
  30 #include <syscall.h>
  31 #include <sys/capability.h>
  32 #include <sys/epoll.h>
  33 #include <sys/eventfd.h>
  34 #include <sys/mount.h>
  35 #include <sys/personality.h>
  36 #include <sys/prctl.h>
  37 #include <sys/resource.h>
  38 #include <sys/signalfd.h>
  39 #include <sys/timerfd.h>
  40 #include <sys/types.h>
  41 #include <sys/wait.h>
  42
  43 // libnl3
  44 #include <net/if.h>
  45 #include <netlink/route/link.h>
  46
  47 // libseccomp
  48 #include <seccomp.h>
  49
  50 // libuuid
  51 #include <uuid.h>
  52
  53 #include <pakfire/arch.h>
  54 #include <pakfire/cgroup.h>
  55 #include <pakfire/jail.h>
  56 #include <pakfire/logging.h>
  57 #include <pakfire/mount.h>
  58 #include <pakfire/os.h>
  59 #include <pakfire/pakfire.h>
  60 #include <pakfire/path.h>
  61 #include <pakfire/private.h>
  62 #include <pakfire/pwd.h>
  63 #include <pakfire/string.h>
  64 #include <pakfire/util.h>
  65
  66 #define BUFFER_SIZE      1024 * 64
  67 #define ENVIRON_SIZE     128
  68 #define EPOLL_MAX_EVENTS 2
  69 #define MAX_MOUNTPOINTS  8
  70
  71 // The default environment that will be set for every command
  72 static const struct environ {
  73         const char* key;
  74         const char* val;
  75 } ENV[] = {
  76         { "HOME", "/root" },
  77         { "LANG", "C.utf-8" },
  78         { "PATH", "/usr/local/sbin:/usr/sbin:/sbin:/usr/local/bin:/usr/bin:/bin", },
  79         { "TERM", "vt100" },
  80
  81         // Tell everything that it is running inside a Pakfire container
  82         { "container", "pakfire" },
  83         { NULL, NULL },
  84 };
  85
// One mountpoint entry as stored in pakfire_jail.mountpoints
struct pakfire_jail_mountpoint {
        // NOTE(review): presumably the path that is being mounted - confirm against the mount code
        char source[PATH_MAX];
        // NOTE(review): presumably the path where it is mounted to - confirm against the mount code
        char target[PATH_MAX];
        // NOTE(review): presumably mount(2) flags - confirm against the mount code
        int flags;
};
  91
// State of a jail. Objects are reference-counted and are created by
// pakfire_jail_create() and released by pakfire_jail_unref().
struct pakfire_jail {
        // Context and Pakfire instance (both are unreferenced when the jail is freed)
        struct pakfire_ctx* ctx;
        struct pakfire* pakfire;
        // Reference counter (see pakfire_jail_ref() and pakfire_jail_unref())
        int nrefs;

        // A unique ID for each jail
        uuid_t uuid;
        // String representation of the UUID which is filled in by pakfire_jail_uuid()
        // when it is first requested
        char __uuid[UUID_STR_LEN];

        // Resource Limits
        // The nice level as stored by pakfire_jail_nice()
        int nice;

        // Timeout
        // it_value.tv_sec holds the timeout in seconds; zero means no timeout
        struct itimerspec timeout;

        // CGroup
        // Optional, holds a reference if set (see pakfire_jail_set_cgroup())
        struct pakfire_cgroup* cgroup;

        // Environment
        // NULL-terminated array of heap-allocated "KEY=VALUE" strings
        char* env[ENVIRON_SIZE];

        // Mountpoints
        struct pakfire_jail_mountpoint mountpoints[MAX_MOUNTPOINTS];
        unsigned int num_mountpoints;

        // Callbacks
        struct pakfire_jail_callbacks {
                // Log
                // Set by pakfire_jail_set_log_callback(); log_data is stored alongside it
                pakfire_jail_log_callback log;
                void* log_data;
        } callbacks;
};
 124
// Buffer that collects output until a complete line can be passed on
// (see pakfire_jail_handle_log())
struct pakfire_log_buffer {
        // Raw bytes; this is not NUL-terminated
        char data[BUFFER_SIZE];
        // Number of bytes in data that are currently in use
        size_t used;
};
 129
// State of a single execution inside a jail
struct pakfire_jail_exec {
        // Bitmask of enum pakfire_jail_exec_flags (see pakfire_jail_exec_has_flag())
        int flags;

        // PIDs (of the children)
        // NOTE(review): going by the names, these are pidfds and not numeric PIDs - confirm
        int pidfd1;
        int pidfd2;

        // Socket to pass FDs
        // See pakfire_jail_send_fd() and pakfire_jail_recv_fd()
        int socket[2];

        // FD to notify the client that the parent has finished initialization
        int completed_fd;

        // Log pipes
        // Each pipe is a pair of file descriptors: [0] is the read end and
        // [1] is the write end. An end that has been closed is set to -1.
        struct pakfire_jail_pipes {
                int stdin[2];
                int stdout[2];
                int stderr[2];

                // Logging
                // One pipe for each log priority (see pakfire_jail_log_redirect())
                int log_INFO[2];
                int log_ERROR[2];
#ifdef ENABLE_DEBUG
                int log_DEBUG[2];
#endif /* ENABLE_DEBUG */
        } pipes;

        // Communicate
        // Optional callbacks; "in" is called by pakfire_jail_stream_stdin()
        // with "data" being passed as argument
        struct pakfire_jail_communicate {
                pakfire_jail_communicate_in  in;
                pakfire_jail_communicate_out out;
                void* data;
        } communicate;

        // Log buffers
        // There is no buffer for stdin since nothing is read from it
        struct pakfire_jail_buffers {
                struct pakfire_log_buffer stdout;
                struct pakfire_log_buffer stderr;

                // Logging
                struct pakfire_log_buffer log_INFO;
                struct pakfire_log_buffer log_ERROR;
#ifdef ENABLE_DEBUG
                struct pakfire_log_buffer log_DEBUG;
#endif /* ENABLE_DEBUG */
        } buffers;

        // NOTE(review): presumably the cgroup this execution runs in and its statistics - confirm
        struct pakfire_cgroup* cgroup;
        struct pakfire_cgroup_stats cgroup_stats;
};
 180
 181 static int clone3(struct clone_args* args, size_t size) {
 182         return syscall(__NR_clone3, args, size);
 183 }
 184
 185 static int pidfd_send_signal(int pidfd, int sig, siginfo_t* info, unsigned int flags) {
 186         return syscall(SYS_pidfd_send_signal, pidfd, sig, info, flags);
 187 }
 188
 189 static int pivot_root(const char* new_root, const char* old_root) {
 190         return syscall(SYS_pivot_root, new_root, old_root);
 191 }
 192
 193 static int pakfire_jail_exec_has_flag(
 194                 const struct pakfire_jail_exec* ctx, const enum pakfire_jail_exec_flags flag) {
 195         return ctx->flags & flag;
 196 }
 197
 198 static void pakfire_jail_free(struct pakfire_jail* jail) {
 199         DEBUG(jail->pakfire, "Freeing jail at %p\n", jail);
 200
 201         // Free environment
 202         for (unsigned int i = 0; jail->env[i]; i++)
 203                 free(jail->env[i]);
 204
 205         if (jail->cgroup)
 206                 pakfire_cgroup_unref(jail->cgroup);
 207         if (jail->pakfire)
 208                 pakfire_unref(jail->pakfire);
 209         if (jail->ctx)
 210                 pakfire_ctx_unref(jail->ctx);
 211         free(jail);
 212 }
 213
 214 /*
 215         Passes any log messages on to the default pakfire log callback
 216 */
 217 static int pakfire_jail_default_log_callback(struct pakfire* pakfire, void* data,
 218                 int priority, const char* line, size_t length) {
 219         switch (priority) {
 220                 case LOG_INFO:
 221                         INFO(pakfire, "%s", line);
 222                         break;
 223
 224                 case LOG_ERR:
 225                         ERROR(pakfire, "%s", line);
 226                         break;
 227
 228 #ifdef ENABLE_DEBUG
 229                 case LOG_DEBUG:
 230                         DEBUG(pakfire, "%s", line);
 231                         break;
 232 #endif
 233         }
 234
 235         return 0;
 236 }
 237
 238 static const char* pakfire_jail_uuid(struct pakfire_jail* jail) {
 239         if (!*jail->__uuid)
 240                 uuid_unparse_lower(jail->uuid, jail->__uuid);
 241
 242         return jail->__uuid;
 243 }
 244
 245 static int pakfire_jail_setup_interactive_env(struct pakfire_jail* jail) {
 246         // Set PS1
 247         int r = pakfire_jail_set_env(jail, "PS1", "pakfire-jail \\w> ");
 248         if (r)
 249                 return r;
 250
 251         // Copy TERM
 252         char* TERM = secure_getenv("TERM");
 253         if (TERM) {
 254                 r = pakfire_jail_set_env(jail, "TERM", TERM);
 255                 if (r)
 256                         return r;
 257         }
 258
 259         // Copy LANG
 260         char* LANG = secure_getenv("LANG");
 261         if (LANG) {
 262                 r = pakfire_jail_set_env(jail, "LANG", LANG);
 263                 if (r)
 264                         return r;
 265         }
 266
 267         return 0;
 268 }
 269
 270 PAKFIRE_EXPORT int pakfire_jail_create(struct pakfire_jail** jail, struct pakfire* pakfire) {
 271         int r;
 272
 273         const char* arch = pakfire_get_effective_arch(pakfire);
 274
 275         // Allocate a new jail
 276         struct pakfire_jail* j = calloc(1, sizeof(*j));
 277         if (!j)
 278                 return 1;
 279
 280         // Reference context
 281         j->ctx = pakfire_ctx(pakfire);
 282
 283         // Reference Pakfire
 284         j->pakfire = pakfire_ref(pakfire);
 285
 286         // Initialize reference counter
 287         j->nrefs = 1;
 288
 289         // Generate a random UUID
 290         uuid_generate_random(j->uuid);
 291
 292         DEBUG(j->pakfire, "Allocated new jail at %p\n", j);
 293
 294         // Set the default logging callback
 295         pakfire_jail_set_log_callback(j, pakfire_jail_default_log_callback, NULL);
 296
 297         // Set default environment
 298         for (const struct environ* e = ENV; e->key; e++) {
 299                 r = pakfire_jail_set_env(j, e->key, e->val);
 300                 if (r)
 301                         goto ERROR;
 302         }
 303
 304         // Enable all CPU features that CPU has to offer
 305         if (!pakfire_arch_is_supported_by_host(arch)) {
 306                 r = pakfire_jail_set_env(j, "QEMU_CPU", "max");
 307                 if (r)
 308                         goto ERROR;
 309         }
 310
 311         // Set container UUID
 312         r = pakfire_jail_set_env(j, "container_uuid", pakfire_jail_uuid(j));
 313         if (r)
 314                 goto ERROR;
 315
 316         // Disable systemctl to talk to systemd
 317         if (!pakfire_on_root(j->pakfire)) {
 318                 r = pakfire_jail_set_env(j, "SYSTEMD_OFFLINE", "1");
 319                 if (r)
 320                         goto ERROR;
 321         }
 322
 323         // Done
 324         *jail = j;
 325         return 0;
 326
 327 ERROR:
 328         pakfire_jail_free(j);
 329
 330         return r;
 331 }
 332
 333 PAKFIRE_EXPORT struct pakfire_jail* pakfire_jail_ref(struct pakfire_jail* jail) {
 334         ++jail->nrefs;
 335
 336         return jail;
 337 }
 338
 339 PAKFIRE_EXPORT struct pakfire_jail* pakfire_jail_unref(struct pakfire_jail* jail) {
 340         if (--jail->nrefs > 0)
 341                 return jail;
 342
 343         pakfire_jail_free(jail);
 344         return NULL;
 345 }
 346
 347 // Logging Callback
 348
 349 PAKFIRE_EXPORT void pakfire_jail_set_log_callback(struct pakfire_jail* jail,
 350                 pakfire_jail_log_callback callback, void* data) {
 351         jail->callbacks.log = callback;
 352         jail->callbacks.log_data = data;
 353 }
 354
 355 // Resource Limits
 356
 357 PAKFIRE_EXPORT int pakfire_jail_nice(struct pakfire_jail* jail, int nice) {
 358         // Check if nice level is in range
 359         if (nice < -19 || nice > 20) {
 360                 errno = EINVAL;
 361                 return 1;
 362         }
 363
 364         // Store nice level
 365         jail->nice = nice;
 366
 367         return 0;
 368 }
 369
 370 int pakfire_jail_set_cgroup(struct pakfire_jail* jail, struct pakfire_cgroup* cgroup) {
 371         // Free any previous cgroup
 372         if (jail->cgroup) {
 373                 pakfire_cgroup_unref(jail->cgroup);
 374                 jail->cgroup = NULL;
 375         }
 376
 377         // Set any new cgroup
 378         if (cgroup) {
 379                 DEBUG(jail->pakfire, "Setting cgroup %p\n", cgroup);
 380
 381                 jail->cgroup = pakfire_cgroup_ref(cgroup);
 382         }
 383
 384         // Done
 385         return 0;
 386 }
 387
 388 // Environment
 389
 390 // Returns the length of the environment
 391 static unsigned int pakfire_jail_env_length(struct pakfire_jail* jail) {
 392         unsigned int i = 0;
 393
 394         // Count everything in the environment
 395         for (char** e = jail->env; *e; e++)
 396                 i++;
 397
 398         return i;
 399 }
 400
 401 // Finds an existing environment variable and returns its index or -1 if not found
 402 static int pakfire_jail_find_env(struct pakfire_jail* jail, const char* key) {
 403         if (!key) {
 404                 errno = EINVAL;
 405                 return -1;
 406         }
 407
 408         const size_t length = strlen(key);
 409
 410         for (unsigned int i = 0; jail->env[i]; i++) {
 411                 if ((pakfire_string_startswith(jail->env[i], key)
 412                                 && *(jail->env[i] + length) == '=')) {
 413                         return i;
 414                 }
 415         }
 416
 417         // Nothing found
 418         return -1;
 419 }
 420
 421 // Returns the value of an environment variable or NULL
 422 PAKFIRE_EXPORT const char* pakfire_jail_get_env(struct pakfire_jail* jail,
 423                 const char* key) {
 424         int i = pakfire_jail_find_env(jail, key);
 425         if (i < 0)
 426                 return NULL;
 427
 428         return jail->env[i] + strlen(key) + 1;
 429 }
 430
 431 // Sets an environment variable
 432 PAKFIRE_EXPORT int pakfire_jail_set_env(struct pakfire_jail* jail,
 433                 const char* key, const char* value) {
 434         // Find the index where to write this value to
 435         int i = pakfire_jail_find_env(jail, key);
 436         if (i < 0)
 437                 i = pakfire_jail_env_length(jail);
 438
 439         // Return -ENOSPC when the environment is full
 440         if (i >= ENVIRON_SIZE) {
 441                 errno = ENOSPC;
 442                 return -1;
 443         }
 444
 445         // Free any previous value
 446         if (jail->env[i])
 447                 free(jail->env[i]);
 448
 449         // Format and set environment variable
 450         asprintf(&jail->env[i], "%s=%s", key, value);
 451
 452         DEBUG(jail->pakfire, "Set environment variable: %s\n", jail->env[i]);
 453
 454         return 0;
 455 }
 456
 457 // Imports an environment
 458 PAKFIRE_EXPORT int pakfire_jail_import_env(struct pakfire_jail* jail, const char* env[]) {
 459         if (!env)
 460                 return 0;
 461
 462         char* key;
 463         char* val;
 464         int r;
 465
 466         // Copy environment variables
 467         for (unsigned int i = 0; env[i]; i++) {
 468                 r = pakfire_string_partition(env[i], "=", &key, &val);
 469                 if (r)
 470                         continue;
 471
 472                 // Set value
 473                 r = pakfire_jail_set_env(jail, key, val);
 474
 475                 if (key)
 476                         free(key);
 477                 if (val)
 478                         free(val);
 479
 480                 // Break on error
 481                 if (r)
 482                         return r;
 483         }
 484
 485         return 0;
 486 }
 487
 488 // Timeout
 489
 490 PAKFIRE_EXPORT int pakfire_jail_set_timeout(
 491                 struct pakfire_jail* jail, unsigned int timeout) {
 492         // Store value
 493         jail->timeout.it_value.tv_sec = timeout;
 494
 495         if (timeout > 0)
 496                 DEBUG(jail->pakfire, "Timeout set to %u second(s)\n", timeout);
 497         else
 498                 DEBUG(jail->pakfire, "Timeout disabled\n");
 499
 500         return 0;
 501 }
 502
 503 static int pakfire_jail_create_timer(struct pakfire_jail* jail) {
 504         int r;
 505
 506         // Nothing to do if no timeout has been set
 507         if (!jail->timeout.it_value.tv_sec)
 508                 return -1;
 509
 510         // Create a new timer
 511         const int fd = timerfd_create(CLOCK_MONOTONIC, 0);
 512         if (fd < 0) {
 513                 ERROR(jail->pakfire, "Could not create timer: %m\n");
 514                 goto ERROR;
 515         }
 516
 517         // Arm timer
 518         r = timerfd_settime(fd, 0, &jail->timeout, NULL);
 519         if (r) {
 520                 ERROR(jail->pakfire, "Could not arm timer: %m\n");
 521                 goto ERROR;
 522         }
 523
 524         return fd;
 525
 526 ERROR:
 527         if (fd >= 0)
 528                 close(fd);
 529
 530         return -1;
 531 }
 532
 533 // Signals
 534
// NOTE: This function is currently disabled and not being compiled
#if 0
/*
        Blocks SIGINT for the calling process and creates a non-blocking,
        close-on-exec signalfd to receive it instead.

        Returns the file descriptor or a negative value on error.
*/
static int pakfire_jail_handle_signals(struct pakfire_jail* jail) {
        sigset_t mask;
        int r;

        // The only signal we are interested in is SIGINT
        sigemptyset(&mask);
        sigaddset(&mask, SIGINT);

        // Block signals
        r = sigprocmask(SIG_BLOCK, &mask, NULL);
        if (r < 0) {
                ERROR(jail->pakfire, "Failed to block signals: %m\n");
                return r;
        }

        // Create a file descriptor
        r = signalfd(-1, &mask, SFD_NONBLOCK|SFD_CLOEXEC);
        if (r < 0) {
                ERROR(jail->pakfire, "Failed to create signalfd: %m\n");
                return r;
        }

        // On success, r is the new file descriptor
        return r;
}
#endif
 560
 561 /*
 562         This function replaces any logging in the child process.
 563
 564         All log messages will be sent to the parent process through their respective pipes.
 565 */
 566 static void pakfire_jail_log_redirect(void* data, int priority, const char* file,
 567                 int line, const char* fn, const char* format, va_list args) {
 568         struct pakfire_jail_pipes* pipes = (struct pakfire_jail_pipes*)data;
 569         int fd;
 570
 571         switch (priority) {
 572                 case LOG_INFO:
 573                         fd = pipes->log_INFO[1];
 574                         break;
 575
 576                 case LOG_ERR:
 577                         fd = pipes->log_ERROR[1];
 578                         break;
 579
 580 #ifdef ENABLE_DEBUG
 581                 case LOG_DEBUG:
 582                         fd = pipes->log_DEBUG[1];
 583                         break;
 584 #endif /* ENABLE_DEBUG */
 585
 586                 // Ignore any messages of an unknown priority
 587                 default:
 588                         return;
 589         }
 590
 591         // Send the log message
 592         if (fd >= 0)
 593                 vdprintf(fd, format, args);
 594 }
 595
 596 static int pakfire_jail_log_buffer_is_full(const struct pakfire_log_buffer* buffer) {
 597         return (sizeof(buffer->data) == buffer->used);
 598 }
 599
 600 /*
 601         This function reads as much data as it can from the file descriptor.
 602         If it finds a whole line in it, it will send it to the logger and repeat the process.
 603         If not newline character is found, it will try to read more data until it finds one.
 604 */
 605 static int pakfire_jail_handle_log(struct pakfire_jail* jail,
 606                 struct pakfire_jail_exec* ctx, int priority, int fd,
 607                 struct pakfire_log_buffer* buffer, pakfire_jail_communicate_out callback, void* data) {
 608         char line[BUFFER_SIZE + 1];
 609
 610         // Fill up buffer from fd
 611         if (buffer->used < sizeof(buffer->data)) {
 612                 ssize_t bytes_read = read(fd, buffer->data + buffer->used,
 613                                 sizeof(buffer->data) - buffer->used);
 614
 615                 // Handle errors
 616                 if (bytes_read < 0) {
 617                         ERROR(jail->pakfire, "Could not read from fd %d: %m\n", fd);
 618                         return -1;
 619                 }
 620
 621                 // Update buffer size
 622                 buffer->used += bytes_read;
 623         }
 624
 625         // See if we have any lines that we can write
 626         while (buffer->used) {
 627                 // Search for the end of the first line
 628                 char* eol = memchr(buffer->data, '\n', buffer->used);
 629
 630                 // No newline found
 631                 if (!eol) {
 632                         // If the buffer is full, we send the content to the logger and try again
 633                         // This should not happen in practise
 634                         if (pakfire_jail_log_buffer_is_full(buffer)) {
 635                                 DEBUG(jail->pakfire, "Logging buffer is full. Sending all content\n");
 636
 637                                 eol = buffer->data + sizeof(buffer->data) - 1;
 638
 639                         // Otherwise we might have only read parts of the output
 640                         } else
 641                                 break;
 642                 }
 643
 644                 // Find the length of the string
 645                 size_t length = eol - buffer->data + 1;
 646
 647                 // Copy the line into the buffer
 648                 memcpy(line, buffer->data, length);
 649
 650                 // Terminate the string
 651                 line[length] = '\0';
 652
 653                 // Log the line
 654                 if (callback) {
 655                         int r = callback(jail->pakfire, data, priority, line, length);
 656                         if (r) {
 657                                 ERROR(jail->pakfire, "The logging callback returned an error: %d\n", r);
 658                                 return r;
 659                         }
 660                 }
 661
 662                 // Remove line from buffer
 663                 memmove(buffer->data, buffer->data + length, buffer->used - length);
 664                 buffer->used -= length;
 665         }
 666
 667         return 0;
 668 }
 669
 670 static int pakfire_jail_stream_stdin(struct pakfire_jail* jail,
 671                 struct pakfire_jail_exec* ctx, const int fd) {
 672         int r;
 673
 674         // Nothing to do if there is no stdin callback set
 675         if (!ctx->communicate.in) {
 676                 DEBUG(jail->pakfire, "Callback for standard input is not set\n");
 677                 return 0;
 678         }
 679
 680         // Skip if the writing pipe has already been closed
 681         if (!ctx->pipes.stdin[1])
 682                 return 0;
 683
 684         DEBUG(jail->pakfire, "Streaming standard input...\n");
 685
 686         // Calling the callback
 687         r = ctx->communicate.in(jail->pakfire, ctx->communicate.data, fd);
 688
 689         DEBUG(jail->pakfire, "Standard input callback finished: %d\n", r);
 690
 691         // The callback signaled that it has written everything
 692         if (r == EOF) {
 693                 DEBUG(jail->pakfire, "Closing standard input pipe\n");
 694
 695                 // Close the file-descriptor
 696                 close(fd);
 697
 698                 // Reset the file-descriptor so it won't be closed again later
 699                 ctx->pipes.stdin[1] = -1;
 700
 701                 // Report success
 702                 r = 0;
 703         }
 704
 705         return r;
 706 }
 707
 708 static int pakfire_jail_setup_pipe(struct pakfire_jail* jail, int (*fds)[2], const int flags) {
 709         int r = pipe2(*fds, flags);
 710         if (r < 0) {
 711                 ERROR(jail->pakfire, "Could not setup pipe: %m\n");
 712                 return 1;
 713         }
 714
 715         return 0;
 716 }
 717
 718 static void pakfire_jail_close_pipe(struct pakfire_jail* jail, int fds[2]) {
 719         for (unsigned int i = 0; i < 2; i++)
 720                 if (fds[i] >= 0)
 721                         close(fds[i]);
 722 }
 723
 724 /*
 725         This is a convenience function to fetch the reading end of a pipe and
 726         closes the write end.
 727 */
 728 static int pakfire_jail_get_pipe_to_read(struct pakfire_jail* jail, int (*fds)[2]) {
 729         // Give the variables easier names to avoid confusion
 730         int* fd_read  = &(*fds)[0];
 731         int* fd_write = &(*fds)[1];
 732
 733         // Close the write end of the pipe
 734         if (*fd_write >= 0) {
 735                 close(*fd_write);
 736                 *fd_write = -1;
 737         }
 738
 739         // Return the read end
 740         if (*fd_read >= 0)
 741                 return *fd_read;
 742
 743         return -1;
 744 }
 745
 746 static int pakfire_jail_get_pipe_to_write(struct pakfire_jail* jail, int (*fds)[2]) {
 747         // Give the variables easier names to avoid confusion
 748         int* fd_read  = &(*fds)[0];
 749         int* fd_write = &(*fds)[1];
 750
 751         // Close the read end of the pipe
 752         if (*fd_read >= 0) {
 753                 close(*fd_read);
 754                 *fd_read = -1;
 755         }
 756
 757         // Return the write end
 758         if (*fd_write >= 0)
 759                 return *fd_write;
 760
 761         return -1;
 762 }
 763
 764 static int pakfire_jail_recv_fd(struct pakfire_jail* jail, int socket, int* fd) {
 765         const size_t payload_length = sizeof(fd);
 766         char buffer[CMSG_SPACE(payload_length)];
 767         int r;
 768
 769         struct msghdr msg = {
 770                 .msg_control    = buffer,
 771                 .msg_controllen = sizeof(buffer),
 772         };
 773
 774         // Receive the message
 775         r = recvmsg(socket, &msg, 0);
 776         if (r) {
 777                 CTX_ERROR(jail->ctx, "Could not receive file descriptor: %s\n", strerror(errno));
 778                 return -errno;
 779         }
 780
 781         // Fetch the payload
 782         struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg);
 783         if (!cmsg)
 784                 return -EBADMSG;
 785
 786         *fd = *((int*)CMSG_DATA(cmsg));
 787
 788         CTX_DEBUG(jail->ctx, "Received fd %d from socket %d\n", *fd, socket);
 789
 790         return 0;
 791 }
 792
 793 static int pakfire_jail_send_fd(struct pakfire_jail* jail, int socket, int fd) {
 794         const size_t payload_length = sizeof(fd);
 795         char buffer[CMSG_SPACE(payload_length)];
 796         int r;
 797
 798         CTX_DEBUG(jail->ctx, "Sending fd %d to socket %d\n", fd, socket);
 799
 800         // Header
 801         struct msghdr msg = {
 802                 .msg_control    = buffer,
 803                 .msg_controllen = sizeof(buffer),
 804         };
 805
 806         // Payload
 807         struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg);
 808         cmsg->cmsg_level = SOL_SOCKET;
 809         cmsg->cmsg_type  = SCM_RIGHTS;
 810         cmsg->cmsg_len   = CMSG_LEN(payload_length);
 811
 812         // Set payload
 813         *((int*)CMSG_DATA(cmsg)) = fd;
 814
 815         // Send the message
 816         r = sendmsg(socket, &msg, 0);
 817         if (r) {
 818                 CTX_ERROR(jail->ctx, "Could not send file descriptor: %s\n", strerror(errno));
 819                 return -errno;
 820         }
 821
 822         return 0;
 823 }
 824
 825 static int pakfire_jail_log(struct pakfire* pakfire, void* data, int priority,
 826                 const char* line, const size_t length) {
 827         // Pass everything to the parent logger
 828         pakfire_log_condition(pakfire, priority, 0, "%.*s", (int)length, line);
 829
 830         return 0;
 831 }
 832
 833 static int pakfire_jail_epoll_add_fd(struct pakfire_jail* jail, int epollfd, int fd, int events) {
 834         struct epoll_event event = {
 835                 .events = events|EPOLLHUP,
 836                 .data   = {
 837                         .fd = fd,
 838                 },
 839         };
 840         int r;
 841
 842         // Read flags
 843         int flags = fcntl(fd, F_GETFL, 0);
 844
 845         // Set modified flags
 846         r  = fcntl(fd, F_SETFL, flags|O_NONBLOCK);
 847         if (r < 0) {
 848                 CTX_ERROR(jail->ctx, "Could not set file descriptor %d into non-blocking mode: %s\n",
 849                         fd, strerror(errno));
 850                 return -errno;
 851         }
 852
 853         // Add the file descriptor to the loop
 854         r = epoll_ctl(epollfd, EPOLL_CTL_ADD, fd, &event);
 855         if (r < 0) {
 856                 ERROR(jail->pakfire, "Could not add file descriptor %d to epoll(): %s\n",
 857                         fd, strerror(errno));
 858                 return -errno;
 859         }
 860
 861         return 0;
 862 }
 863
 864 static int pakfire_jail_setup_child2(struct pakfire_jail* jail, struct pakfire_jail_exec* ctx);
 865
 866 static int pakfire_jail_wait_on_child(struct pakfire_jail* jail, int pidfd) {
 867         siginfo_t status = {};
 868         int r;
 869
 870         // Call waitid() and store the result
 871         r = waitid(P_PIDFD, pidfd, &status, WEXITED);
 872         if (r) {
 873                 CTX_ERROR(jail->ctx, "waitid() failed: %s\n", strerror(errno));
 874                 return -errno;
 875         }
 876
 877         switch (status.si_code) {
 878                 // If the process exited normally, we return the exit code
 879                 case CLD_EXITED:
 880                         CTX_DEBUG(jail->ctx, "The child process exited with code %d\n", status.si_status);
 881                         return status.si_status;
 882
 883                 case CLD_KILLED:
 884                         CTX_ERROR(jail->ctx, "The child process was killed\n");
 885                         return 139;
 886
 887                 case CLD_DUMPED:
 888                         CTX_ERROR(jail->ctx, "The child process terminated abnormally\n");
 889                         return 139;
 890
 891                 // Log anything else
 892                 default:
 893                         CTX_ERROR(jail->ctx, "Unknown child exit code: %d\n", status.si_code);
 894                         break;
 895         }
 896
 897         return -EBADMSG;
 898 }
 899
 900 static int pakfire_jail_wait(struct pakfire_jail* jail, struct pakfire_jail_exec* ctx) {
 901         int epollfd = -1;
 902         struct epoll_event events[EPOLL_MAX_EVENTS];
 903         char garbage[8];
 904         int r = 0;
 905
 906         // Fetch the UNIX domain socket
 907         const int socket_recv = pakfire_jail_get_pipe_to_read(jail, &ctx->socket);
 908
 909         // Fetch file descriptors from context
 910         const int stdin = pakfire_jail_get_pipe_to_write(jail, &ctx->pipes.stdin);
 911         const int stdout = pakfire_jail_get_pipe_to_read(jail, &ctx->pipes.stdout);
 912         const int stderr = pakfire_jail_get_pipe_to_read(jail, &ctx->pipes.stderr);
 913
 914         // Timer
 915         const int timerfd = pakfire_jail_create_timer(jail);
 916
 917         // Logging
 918         const int log_INFO  = pakfire_jail_get_pipe_to_read(jail, &ctx->pipes.log_INFO);
 919         const int log_ERROR = pakfire_jail_get_pipe_to_read(jail, &ctx->pipes.log_ERROR);
 920 #ifdef ENABLE_DEBUG
 921         const int log_DEBUG = pakfire_jail_get_pipe_to_read(jail, &ctx->pipes.log_DEBUG);
 922 #endif /* ENABLE_DEBUG */
 923
 924 #if 0
 925         // Signals
 926         const int signalfd = pakfire_jail_handle_signals(jail);
 927 #endif
 928
 929         // Make a list of all file descriptors we are interested in
 930         const struct pakfire_wait_fds {
 931                 const int fd;
 932                 const int events;
 933         } fds[] = {
 934                 { socket_recv, EPOLLIN },
 935
 936                 // Standard input/output
 937                 { stdin,  EPOLLOUT },
 938                 { stdout, EPOLLIN },
 939                 { stderr, EPOLLIN },
 940
 941                 // Timer
 942                 { timerfd, EPOLLIN },
 943
 944                 // Child Processes
 945                 { ctx->pidfd1, EPOLLIN },
 946
 947 #if 0
 948                 // Signals
 949                 { signafd, EPOLLIN },
 950 #endif
 951
 952                 // Log Pipes
 953                 { log_INFO, EPOLLIN },
 954                 { log_ERROR, EPOLLIN },
 955 #ifdef ENABLE_DEBUG
 956                 { log_DEBUG, EPOLLIN },
 957 #endif /* ENABLE_DEBUG */
 958
 959                 // Sentinel
 960                 { -1, 0 },
 961         };
 962
 963         // Setup epoll
 964         epollfd = epoll_create1(0);
 965         if (epollfd < 0) {
 966                 ERROR(jail->pakfire, "Could not initialize epoll(): %m\n");
 967                 r = 1;
 968                 goto ERROR;
 969         }
 970
 971         // Turn file descriptors into non-blocking mode and add them to epoll()
 972         for (const struct pakfire_wait_fds* fd = fds; fd->events; fd++) {
 973                 // Skip fds which were not initialized
 974                 if (fd->fd < 0)
 975                         continue;
 976
 977                 // Add the FD to the event loop
 978                 r = pakfire_jail_epoll_add_fd(jail, epollfd, fd->fd, fd->events);
 979                 if (r)
 980                         goto ERROR;
 981         }
 982
 983         int ended = 0;
 984         int exit = 0;
 985
 986         CTX_DEBUG(jail->ctx, "Launching main loop...\n");
 987
 988         // Loop for as long as the process is alive
 989         while (!ended) {
 990                 int num = epoll_wait(epollfd, events, EPOLL_MAX_EVENTS, -1);
 991                 if (num < 1) {
 992                         // Ignore if epoll_wait() has been interrupted
 993                         if (errno == EINTR)
 994                                 continue;
 995
 996                         ERROR(jail->pakfire, "epoll_wait() failed: %m\n");
 997                         r = 1;
 998
 999                         goto ERROR;
1000                 }
1001
1002                 for (int i = 0; i < num; i++) {
1003                         int e  = events[i].events;
1004                         int fd = events[i].data.fd;
1005
1006                         struct pakfire_log_buffer* buffer = NULL;
1007                         pakfire_jail_communicate_out callback = NULL;
1008                         void* data = NULL;
1009                         int priority;
1010
1011                         // Check if there is any data to be read
1012                         if (e & EPOLLIN) {
1013                                 // Monitor the first child process
1014                                 if (fd == ctx->pidfd1) {
1015                                         r = pakfire_jail_wait_on_child(jail, ctx->pidfd1);
1016                                         if (r) {
1017                                                 CTX_ERROR(jail->ctx, "The first child exited with an error\n");
1018                                                 goto ERROR;
1019                                         }
1020
1021                                         close(ctx->pidfd1);
1022                                         ctx->pidfd1 = -1;
1023
1024                                         continue;
1025
1026                                 // Monitor the second child process
1027                                 } else if (fd == ctx->pidfd2) {
1028                                         exit = pakfire_jail_wait_on_child(jail, ctx->pidfd2);
1029                                         if (exit < 0) {
1030                                                 CTX_ERROR(jail->ctx, "The second child exited with an error\n");
1031                                                 goto ERROR;
1032                                         }
1033
1034                                         close(ctx->pidfd2);
1035                                         ctx->pidfd2 = -1;
1036
1037                                         // Mark that we have ended so that we will process the remaining
1038                                         // events from epoll() now, but won't restart the outer loop.
1039                                         ended = 1;
1040
1041                                         continue;
1042
1043                                 // Handle timer events
1044                                 } else if (fd == timerfd) {
1045                                         DEBUG(jail->pakfire, "Timer event received\n");
1046
1047                                         // Disarm the timer
1048                                         r = read(timerfd, garbage, sizeof(garbage));
1049                                         if (r < 1) {
1050                                                 ERROR(jail->pakfire, "Could not disarm timer: %m\n");
1051                                                 r = 1;
1052                                                 goto ERROR;
1053                                         }
1054
1055                                         // Terminate the process if it hasn't already ended
1056                                         if (!ended) {
1057                                                 DEBUG(jail->pakfire, "Terminating process...\n");
1058
1059                                                 // Send SIGTERM to the process
1060                                                 r = pidfd_send_signal(ctx->pidfd2, SIGKILL, NULL, 0);
1061                                                 if (r) {
1062                                                         ERROR(jail->pakfire, "Could not kill process: %m\n");
1063                                                         goto ERROR;
1064                                                 }
1065                                         }
1066
1067                                         // There is nothing else to do
1068                                         continue;
1069
1070 #if 0
1071                                 // Handle signals
1072                                 } else if (fd == signalfd) {
1073                                         // Read the signal
1074                                         r = read(signalfd, &siginfo, sizeof(siginfo));
1075                                         if (r < 1) {
1076                                                 ERROR(jail->pakfire, "Could not read signal: %m\n");
1077                                                 goto ERROR;
1078                                         }
1079
1080                                         DEBUG(jail->pakfire, "Received signal %u\n", siginfo.ssi_signo);
1081
1082                                         // Handle signals
1083                                         switch (siginfo.ssi_signo) {
1084                                                 // Pass SIGINT down to the child process
1085                                                 case SIGINT:
1086                                                         r = pidfd_send_signal(pidfd, siginfo.ssi_signo, NULL, 0);
1087                                                         if (r) {
1088                                                                 ERROR(jail->pakfire, "Could not send signal to process: %m\n");
1089                                                                 goto ERROR;
1090                                                         }
1091                                                         break;
1092
1093                                                 default:
1094                                                         ERROR(jail->pakfire, "Received unhandled signal %u\n",
1095                                                                 siginfo.ssi_signo);
1096                                                         break;
1097                                         }
1098
1099                                         // Don't fall through to log processing
1100                                         continue;
1101 #endif
1102
1103                                 // Handle socket messages
1104                                 } else if (fd == socket_recv) {
1105                                         // Receive the FD of the second child process
1106                                         r = pakfire_jail_recv_fd(jail, socket_recv, &ctx->pidfd2);
1107                                         if (r)
1108                                                 goto ERROR;
1109
1110                                         // Add it to the event loop
1111                                         r = pakfire_jail_epoll_add_fd(jail, epollfd, ctx->pidfd2, EPOLLIN);
1112                                         if (r)
1113                                                 goto ERROR;
1114
1115                                         // Setup the child process
1116                                         r = pakfire_jail_setup_child2(jail, ctx);
1117                                         if (r)
1118                                                 goto ERROR;
1119
1120                                         // Don't fall through to log processing
1121                                         continue;
1122
1123                                 // Handle logging messages
1124                                 } else if (fd == log_INFO) {
1125                                         buffer = &ctx->buffers.log_INFO;
1126                                         priority = LOG_INFO;
1127
1128                                         callback = pakfire_jail_log;
1129
1130                                 } else if (fd == log_ERROR) {
1131                                         buffer = &ctx->buffers.log_ERROR;
1132                                         priority = LOG_ERR;
1133
1134                                         callback = pakfire_jail_log;
1135
1136 #ifdef ENABLE_DEBUG
1137                                 } else if (fd == log_DEBUG) {
1138                                         buffer = &ctx->buffers.log_DEBUG;
1139                                         priority = LOG_DEBUG;
1140
1141                                         callback = pakfire_jail_log;
1142 #endif /* ENABLE_DEBUG */
1143
1144                                 // Handle anything from the log pipes
1145                                 } else if (fd == stdout) {
1146                                         buffer = &ctx->buffers.stdout;
1147                                         priority = LOG_INFO;
1148
1149                                         // Send any output to the default logger if no callback is set
1150                                         if (ctx->communicate.out) {
1151                                                 callback = ctx->communicate.out;
1152                                                 data     = ctx->communicate.data;
1153                                         } else {
1154                                                 callback = jail->callbacks.log;
1155                                                 data     = jail->callbacks.log_data;
1156                                         }
1157
1158                                 } else if (fd == stderr) {
1159                                         buffer = &ctx->buffers.stderr;
1160                                         priority = LOG_ERR;
1161
1162                                         // Send any output to the default logger if no callback is set
1163                                         if (ctx->communicate.out) {
1164                                                 callback = ctx->communicate.out;
1165                                                 data     = ctx->communicate.data;
1166                                         } else {
1167                                                 callback = jail->callbacks.log;
1168                                                 data     = jail->callbacks.log_data;
1169                                         }
1170
1171                                 } else {
1172                                         DEBUG(jail->pakfire, "Received invalid file descriptor %d\n", fd);
1173                                         continue;
1174                                 }
1175
1176                                 // Handle log event
1177                                 r = pakfire_jail_handle_log(jail, ctx, priority, fd, buffer, callback, data);
1178                                 if (r)
1179                                         goto ERROR;
1180                         }
1181
1182                         if (e & EPOLLOUT) {
1183                                 // Handle standard input
1184                                 if (fd == stdin) {
1185                                         r = pakfire_jail_stream_stdin(jail, ctx, fd);
1186                                         if (r) {
1187                                                 switch (errno) {
1188                                                         // Ignore if we filled up the buffer
1189                                                         case EAGAIN:
1190                                                                 break;
1191
1192                                                         default:
1193                                                                 ERROR(jail->pakfire, "Could not write to stdin: %m\n");
1194                                                                 goto ERROR;
1195                                                 }
1196                                         }
1197                                 }
1198                         }
1199
1200                         // Check if any file descriptors have been closed
1201                         if (e & EPOLLHUP) {
1202                                 // Remove the file descriptor
1203                                 r = epoll_ctl(epollfd, EPOLL_CTL_DEL, fd, NULL);
1204                                 if (r) {
1205                                         ERROR(jail->pakfire, "Could not remove closed file-descriptor %d: %m\n", fd);
1206                                         goto ERROR;
1207                                 }
1208                         }
1209                 }
1210         }
1211
1212         // Return the exit code
1213         r = exit;
1214
1215 ERROR:
1216         CTX_DEBUG(jail->ctx, "Main loop terminated\n");
1217
1218         if (epollfd >= 0)
1219                 close(epollfd);
1220         if (timerfd >= 0)
1221                 close(timerfd);
1222 #if 0
1223         if (signalfd >= 0)
1224                 close(signalfd);
1225 #endif
1226
1227         return r;
1228 }
1229
/*
	Output callback which collects everything the process writes to standard
	output (i.e. lines with priority LOG_INFO) in a heap-allocated string.

	data must be NULL or point to a char* which holds the output collected so
	far. That char* is replaced by a newly allocated string on every call, so it
	is expected to be NULL or to have come from the allocator.
	NOTE(review): confirm that all callers initialise it to NULL.

	Anything that is not collected is passed on to the default log callback.

	Returns 0 on success, 1 if memory could not be allocated, or whatever the
	default log callback returned.
*/
int pakfire_jail_capture_stdout(struct pakfire* pakfire, void* data,
		int priority, const char* line, size_t length) {
	char** output = (char**)data;
	char* buffer = NULL;
	int r;

	// Append everything from stdout to a buffer
	if (output && priority == LOG_INFO) {
		// Format into a separate buffer first. asprintf() overwrites the pointer
		// it is given, so writing straight into *output would leak the previous
		// buffer and leave *output in an unspecified state on failure.
		r = asprintf(&buffer, "%s%s", (*output) ? *output : "", line);
		if (r < 0)
			return 1;

		// Swap the old buffer for the extended one
		free(*output);
		*output = buffer;

		return 0;
	}

	// Send everything else to the default logger
	return pakfire_jail_default_log_callback(pakfire, NULL, priority, line, length);
}
1246
1247 // Capabilities
1248
1249 // Logs all capabilities of the current process
1250 static int pakfire_jail_show_capabilities(struct pakfire_jail* jail) {
1251         cap_t caps = NULL;
1252         char* name = NULL;
1253         cap_flag_value_t value_e;
1254         cap_flag_value_t value_i;
1255         cap_flag_value_t value_p;
1256         int r;
1257
1258         // Fetch PID
1259         pid_t pid = getpid();
1260
1261         // Fetch all capabilities
1262         caps = cap_get_proc();
1263         if (!caps) {
1264                 ERROR(jail->pakfire, "Could not fetch capabilities: %m\n");
1265                 r = 1;
1266                 goto ERROR;
1267         }
1268
1269         DEBUG(jail->pakfire, "Capabilities of PID %d:\n", pid);
1270
1271         // Iterate over all capabilities
1272         for (unsigned int cap = 0; cap_valid(cap); cap++) {
1273                 name = cap_to_name(cap);
1274
1275                 // Fetch effective value
1276                 r = cap_get_flag(caps, cap, CAP_EFFECTIVE, &value_e);
1277                 if (r)
1278                         goto ERROR;
1279
1280                 // Fetch inheritable value
1281                 r = cap_get_flag(caps, cap, CAP_INHERITABLE, &value_i);
1282                 if (r)
1283                         goto ERROR;
1284
1285                 // Fetch permitted value
1286                 r = cap_get_flag(caps, cap, CAP_PERMITTED, &value_p);
1287                 if (r)
1288                         goto ERROR;
1289
1290                 DEBUG(jail->pakfire,
1291                         "  %-24s : %c%c%c\n",
1292                         name,
1293                         (value_e == CAP_SET) ? 'e' : '-',
1294                         (value_i == CAP_SET) ? 'i' : '-',
1295                         (value_p == CAP_SET) ? 'p' : '-'
1296                 );
1297
1298                 // Free name
1299                 cap_free(name);
1300                 name = NULL;
1301         }
1302
1303         // Success
1304         r = 0;
1305
1306 ERROR:
1307         if (name)
1308                 cap_free(name);
1309         if (caps)
1310                 cap_free(caps);
1311
1312         return r;
1313 }
1314
1315 static int pakfire_jail_set_capabilities(struct pakfire_jail* jail) {
1316         cap_t caps = NULL;
1317         char* name = NULL;
1318         int r;
1319
1320         // Fetch capabilities
1321         caps = cap_get_proc();
1322         if (!caps) {
1323                 ERROR(jail->pakfire, "Could not read capabilities: %m\n");
1324                 r = 1;
1325                 goto ERROR;
1326         }
1327
1328         // Walk through all capabilities
1329         for (cap_value_t cap = 0; cap_valid(cap); cap++) {
1330                 cap_value_t _caps[] = { cap };
1331
1332                 // Fetch the name of the capability
1333                 name = cap_to_name(cap);
1334
1335                 r = cap_set_flag(caps, CAP_EFFECTIVE, 1, _caps, CAP_SET);
1336                 if (r) {
1337                         ERROR(jail->pakfire, "Could not set %s: %m\n", name);
1338                         goto ERROR;
1339                 }
1340
1341                 r = cap_set_flag(caps, CAP_INHERITABLE, 1, _caps, CAP_SET);
1342                 if (r) {
1343                         ERROR(jail->pakfire, "Could not set %s: %m\n", name);
1344                         goto ERROR;
1345                 }
1346
1347                 r = cap_set_flag(caps, CAP_PERMITTED, 1, _caps, CAP_SET);
1348                 if (r) {
1349                         ERROR(jail->pakfire, "Could not set %s: %m\n", name);
1350                         goto ERROR;
1351                 }
1352
1353                 // Free name
1354                 cap_free(name);
1355                 name = NULL;
1356         }
1357
1358         // Restore all capabilities
1359         r = cap_set_proc(caps);
1360         if (r) {
1361                 ERROR(jail->pakfire, "Restoring capabilities failed: %m\n");
1362                 goto ERROR;
1363         }
1364
1365         // Add all capabilities to the ambient set
1366         for (unsigned int cap = 0; cap_valid(cap); cap++) {
1367                 name = cap_to_name(cap);
1368
1369                 // Raise the capability
1370                 r = prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE, cap, 0, 0);
1371                 if (r) {
1372                         ERROR(jail->pakfire, "Could not set ambient capability %s: %m\n", name);
1373                         goto ERROR;
1374                 }
1375
1376                 // Free name
1377                 cap_free(name);
1378                 name = NULL;
1379         }
1380
1381         // Success
1382         r = 0;
1383
1384 ERROR:
1385         if (name)
1386                 cap_free(name);
1387         if (caps)
1388                 cap_free(caps);
1389
1390         return r;
1391 }
1392
1393 // Syscall Filter
1394
1395 static int pakfire_jail_limit_syscalls(struct pakfire_jail* jail) {
1396         const int syscalls[] = {
1397                 // The kernel's keyring isn't namespaced
1398                 SCMP_SYS(keyctl),
1399                 SCMP_SYS(add_key),
1400                 SCMP_SYS(request_key),
1401
1402                 // Disable userfaultfd
1403                 SCMP_SYS(userfaultfd),
1404
1405                 // Disable perf which could leak a lot of information about the host
1406                 SCMP_SYS(perf_event_open),
1407
1408                 0,
1409         };
1410         int r = 1;
1411
1412         DEBUG(jail->pakfire, "Applying syscall filter...\n");
1413
1414         // Setup a syscall filter which allows everything by default
1415         scmp_filter_ctx ctx = seccomp_init(SCMP_ACT_ALLOW);
1416         if (!ctx) {
1417                 ERROR(jail->pakfire, "Could not setup seccomp filter: %m\n");
1418                 goto ERROR;
1419         }
1420
1421         // All all syscalls
1422         for (const int* syscall = syscalls; *syscall; syscall++) {
1423                 r = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), *syscall, 0);
1424                 if (r) {
1425                         ERROR(jail->pakfire, "Could not configure syscall %d: %m\n", *syscall);
1426                         goto ERROR;
1427                 }
1428         }
1429
1430         // Load syscall filter into the kernel
1431         r = seccomp_load(ctx);
1432         if (r) {
1433                 ERROR(jail->pakfire, "Could not load syscall filter into the kernel: %m\n");
1434                 goto ERROR;
1435         }
1436
1437 ERROR:
1438         if (ctx)
1439                 seccomp_release(ctx);
1440
1441         return r;
1442 }
1443
1444 // Mountpoints
1445
1446 PAKFIRE_EXPORT int pakfire_jail_bind(struct pakfire_jail* jail,
1447                 const char* source, const char* target, int flags) {
1448         struct pakfire_jail_mountpoint* mp = NULL;
1449         int r;
1450
1451         // Check if there is any space left
1452         if (jail->num_mountpoints >= MAX_MOUNTPOINTS) {
1453                 errno = ENOSPC;
1454                 return 1;
1455         }
1456
1457         // Check for valid inputs
1458         if (!source || !target) {
1459                 errno = EINVAL;
1460                 return 1;
1461         }
1462
1463         // Select the next free slot
1464         mp = &jail->mountpoints[jail->num_mountpoints];
1465
1466         // Copy source
1467         r = pakfire_string_set(mp->source, source);
1468         if (r) {
1469                 ERROR(jail->pakfire, "Could not copy source: %m\n");
1470                 return r;
1471         }
1472
1473         // Copy target
1474         r = pakfire_string_set(mp->target, target);
1475         if (r) {
1476                 ERROR(jail->pakfire, "Could not copy target: %m\n");
1477                 return r;
1478         }
1479
1480         // Copy flags
1481         mp->flags = flags;
1482
1483         // Increment counter
1484         jail->num_mountpoints++;
1485
1486         return 0;
1487 }
1488
1489 static int pakfire_jail_mount_networking(struct pakfire_jail* jail) {
1490         int r;
1491
1492         const char* paths[] = {
1493                 "/etc/hosts",
1494                 "/etc/resolv.conf",
1495                 NULL,
1496         };
1497
1498         // Bind-mount all paths read-only
1499         for (const char** path = paths; *path; path++) {
1500                 r = pakfire_bind(jail->pakfire, *path, NULL, MS_RDONLY);
1501                 if (r) {
1502                         switch (errno) {
1503                                 // Ignore if we don't have permission
1504                                 case EPERM:
1505                                         continue;
1506
1507                                 default:
1508                                         break;
1509                         }
1510                         return r;
1511                 }
1512         }
1513
1514         return 0;
1515 }
1516
1517 /*
1518         Mounts everything that we require in the new namespace
1519 */
1520 static int pakfire_jail_mount(struct pakfire_jail* jail, struct pakfire_jail_exec* ctx) {
1521         struct pakfire_jail_mountpoint* mp = NULL;
1522         int flags = 0;
1523         int r;
1524
1525         // Enable loop devices
1526         if (pakfire_jail_exec_has_flag(ctx, PAKFIRE_JAIL_HAS_LOOP_DEVICES))
1527                 flags |= PAKFIRE_MOUNT_LOOP_DEVICES;
1528
1529         // Mount all default stuff
1530         r = pakfire_mount_all(jail->pakfire, flags);
1531         if (r)
1532                 return r;
1533
1534         // Mount networking stuff
1535         if (pakfire_jail_exec_has_flag(ctx, PAKFIRE_JAIL_HAS_NETWORKING)) {
1536                 r = pakfire_jail_mount_networking(jail);
1537                 if (r)
1538                         return r;
1539         }
1540
1541         // Mount all custom stuff
1542         for (unsigned int i = 0; i < jail->num_mountpoints; i++) {
1543                 // Fetch mountpoint
1544                 mp = &jail->mountpoints[i];
1545
1546                 // Mount it
1547                 r = pakfire_bind(jail->pakfire, mp->source, mp->target, mp->flags);
1548                 if (r)
1549                         return r;
1550         }
1551
1552         // Log all mountpoints
1553         pakfire_mount_list(jail->pakfire);
1554
1555         return 0;
1556 }
1557
1558 // Networking
1559
1560 static int pakfire_jail_setup_loopback(struct pakfire_jail* jail) {
1561         struct nl_sock* nl = NULL;
1562         struct nl_cache* cache = NULL;
1563         struct rtnl_link* link = NULL;
1564         struct rtnl_link* change = NULL;
1565         int r;
1566
1567         DEBUG(jail->pakfire, "Setting up loopback...\n");
1568
1569         // Allocate a netlink socket
1570         nl = nl_socket_alloc();
1571         if (!nl) {
1572                 ERROR(jail->pakfire, "Could not allocate a netlink socket: %m\n");
1573                 r = 1;
1574                 goto ERROR;
1575         }
1576
1577         // Connect the socket
1578         r = nl_connect(nl, NETLINK_ROUTE);
1579         if (r) {
1580                 ERROR(jail->pakfire, "Could not connect netlink socket: %s\n", nl_geterror(r));
1581                 goto ERROR;
1582         }
1583
1584         // Allocate the netlink cache
1585         r = rtnl_link_alloc_cache(nl, AF_UNSPEC, &cache);
1586         if (r < 0) {
1587                 ERROR(jail->pakfire, "Unable to allocate netlink cache: %s\n", nl_geterror(r));
1588                 goto ERROR;
1589         }
1590
1591         // Fetch loopback interface
1592         link = rtnl_link_get_by_name(cache, "lo");
1593         if (!link) {
1594                 ERROR(jail->pakfire, "Could not find lo interface. Ignoring.\n");
1595                 r = 0;
1596                 goto ERROR;
1597         }
1598
1599         // Allocate a new link
1600         change = rtnl_link_alloc();
1601         if (!change) {
1602                 ERROR(jail->pakfire, "Could not allocate change link\n");
1603                 r = 1;
1604                 goto ERROR;
1605         }
1606
1607         // Set the link to UP
1608         rtnl_link_set_flags(change, IFF_UP);
1609
1610         // Apply any changes
1611         r = rtnl_link_change(nl, link, change, 0);
1612         if (r) {
1613                 ERROR(jail->pakfire, "Unable to activate loopback: %s\n", nl_geterror(r));
1614                 goto ERROR;
1615         }
1616
1617         // Success
1618         r = 0;
1619
1620 ERROR:
1621         if (nl)
1622                 nl_socket_free(nl);
1623
1624         return r;
1625 }
1626
1627 // UID/GID Mapping
1628
1629 static int pakfire_jail_setup_uid_mapping(struct pakfire_jail* jail, pid_t pid) {
1630         char path[PATH_MAX];
1631         int r;
1632
1633         // Skip mapping anything when running on /
1634         if (pakfire_on_root(jail->pakfire))
1635                 return 0;
1636
1637         // Make path
1638         r = pakfire_string_format(path, "/proc/%d/uid_map", pid);
1639         if (r)
1640                 return r;
1641
1642         // Fetch UID
1643         const uid_t uid = pakfire_uid(jail->pakfire);
1644
1645         // Fetch SUBUID
1646         const struct pakfire_subid* subuid = pakfire_subuid(jail->pakfire);
1647         if (!subuid)
1648                 return 1;
1649
1650         /* When running as root, we will map the entire range.
1651
1652            When running as a non-privileged user, we will map the root user inside the jail
1653            to the user's UID outside of the jail, and we will map the rest starting from one.
1654         */
1655
1656         // Running as root
1657         if (uid == 0) {
1658                 r = pakfire_file_write(jail->pakfire, path, 0, 0, 0,
1659                         "0 %lu %lu\n", subuid->id, subuid->length);
1660         } else {
1661                 r = pakfire_file_write(jail->pakfire, path, 0, 0, 0,
1662                         "0 %lu 1\n1 %lu %lu\n", uid, subuid->id, subuid->length);
1663         }
1664
1665         if (r) {
1666                 ERROR(jail->pakfire, "Could not map UIDs: %m\n");
1667                 return r;
1668         }
1669
1670         return r;
1671 }
1672
1673 static int pakfire_jail_setup_gid_mapping(struct pakfire_jail* jail, pid_t pid) {
1674         char path[PATH_MAX];
1675         int r;
1676
1677         // Skip mapping anything when running on /
1678         if (pakfire_on_root(jail->pakfire))
1679                 return 0;
1680
1681         // Fetch GID
1682         const gid_t gid = pakfire_gid(jail->pakfire);
1683
1684         // Fetch SUBGID
1685         const struct pakfire_subid* subgid = pakfire_subgid(jail->pakfire);
1686         if (!subgid)
1687                 return 1;
1688
1689         // Make path
1690         r = pakfire_string_format(path, "/proc/%d/gid_map", pid);
1691         if (r)
1692                 return r;
1693
1694         // Running as root
1695         if (gid == 0) {
1696                 r = pakfire_file_write(jail->pakfire, path, 0, 0, 0,
1697                         "0 %lu %lu\n", subgid->id, subgid->length);
1698         } else {
1699                 r = pakfire_file_write(jail->pakfire, path, 0, 0, 0,
1700                         "0 %lu 1\n%1 %lu %lu\n", gid, subgid->id, subgid->length);
1701         }
1702
1703         if (r) {
1704                 ERROR(jail->pakfire, "Could not map GIDs: %m\n");
1705                 return r;
1706         }
1707
1708         return r;
1709 }
1710
1711 static int pakfire_jail_setgroups(struct pakfire_jail* jail, pid_t pid) {
1712         char path[PATH_MAX];
1713         int r = 1;
1714
1715         // Make path
1716         r = pakfire_string_format(path, "/proc/%d/setgroups", pid);
1717         if (r)
1718                 return r;
1719
1720         // Open file for writing
1721         FILE* f = fopen(path, "w");
1722         if (!f) {
1723                 ERROR(jail->pakfire, "Could not open %s for writing: %m\n", path);
1724                 goto ERROR;
1725         }
1726
1727         // Write content
1728         int bytes_written = fprintf(f, "deny\n");
1729         if (bytes_written <= 0) {
1730                 ERROR(jail->pakfire, "Could not write to %s: %m\n", path);
1731                 goto ERROR;
1732         }
1733
1734         r = fclose(f);
1735         f = NULL;
1736         if (r) {
1737                 ERROR(jail->pakfire, "Could not close %s: %m\n", path);
1738                 goto ERROR;
1739         }
1740
1741 ERROR:
1742         if (f)
1743                 fclose(f);
1744
1745         return r;
1746 }
1747
1748 static int pakfire_jail_send_signal(struct pakfire_jail* jail, int fd) {
1749         const uint64_t val = 1;
1750         int r = 0;
1751
1752         DEBUG(jail->pakfire, "Sending signal...\n");
1753
1754         // Write to the file descriptor
1755         r = eventfd_write(fd, val);
1756         if (r < 0) {
1757                 ERROR(jail->pakfire, "Could not send signal: %s\n", strerror(errno));
1758                 r = -errno;
1759         }
1760
1761         // Close the file descriptor
1762         close(fd);
1763
1764         return r;
1765 }
1766
1767 static int pakfire_jail_wait_for_signal(struct pakfire_jail* jail, int fd) {
1768         uint64_t val = 0;
1769         int r = 0;
1770
1771         DEBUG(jail->pakfire, "Waiting for signal...\n");
1772
1773         r = eventfd_read(fd, &val);
1774         if (r < 0) {
1775                 ERROR(jail->pakfire, "Error waiting for signal: %s\n", strerror(errno));
1776                 r = -errno;
1777         }
1778
1779         // Close the file descriptor
1780         close(fd);
1781
1782         return r;
1783 }
1784
1785 static int pakfire_jail_switch_root(struct pakfire_jail* jail, const char* root) {
1786         int r;
1787
1788         // Change to the new root
1789         r = chdir(root);
1790         if (r) {
1791                 ERROR(jail->pakfire, "chdir(%s) failed: %m\n", root);
1792                 return r;
1793         }
1794
1795         // Switch Root!
1796         r = pivot_root(".", ".");
1797         if (r) {
1798                 ERROR(jail->pakfire, "Failed changing into the new root directory %s: %m\n", root);
1799                 return r;
1800         }
1801
1802         // Umount the old root
1803         r = umount2(".", MNT_DETACH);
1804         if (r) {
1805                 ERROR(jail->pakfire, "Could not umount the old root filesystem: %m\n");
1806                 return r;
1807         }
1808
1809         return 0;
1810 }
1811
1812 /*
1813         Called by the parent that sets up the second child process...
1814 */
1815 static int pakfire_jail_setup_child2(
1816                 struct pakfire_jail* jail, struct pakfire_jail_exec* ctx) {
1817         pid_t pid = -1;
1818         int r;
1819
1820         // Fetch the PID
1821         r = pidfd_get_pid(ctx->pidfd2, &pid);
1822         if (r) {
1823                 CTX_ERROR(jail->ctx, "Could not fetch PID: %s\n", strerror(-r));
1824                 return r;
1825         }
1826
1827         // Setup UID mapping
1828         r = pakfire_jail_setup_uid_mapping(jail, pid);
1829         if (r)
1830                 return r;
1831
1832         // Write "deny" to /proc/PID/setgroups
1833         r = pakfire_jail_setgroups(jail, pid);
1834         if (r)
1835                 return r;
1836
1837         // Setup GID mapping
1838         r = pakfire_jail_setup_gid_mapping(jail, pid);
1839         if (r)
1840                 return r;
1841
1842         // Parent has finished initialisation
1843         DEBUG(jail->pakfire, "Parent has finished initialization\n");
1844
1845         // Send signal to client
1846         r = pakfire_jail_send_signal(jail, ctx->completed_fd);
1847         if (r)
1848                 return r;
1849
1850         return 0;
1851 }
1852
1853 /*
1854         Child 2 is launched in their own user/mount/etc. namespace.
1855 */
1856 static int pakfire_jail_child2(struct pakfire_jail* jail,
1857                 struct pakfire_jail_exec* ctx, const char* argv[]) {
1858         int r;
1859
1860         // Fetch my own PID
1861         pid_t pid = getpid();
1862
1863         CTX_DEBUG(jail->ctx, "Launched child process in jail with PID %d\n", pid);
1864
1865         // Die with parent
1866         r = prctl(PR_SET_PDEATHSIG, SIGKILL, 0, 0, 0);
1867         if (r) {
1868                 CTX_ERROR(jail->ctx, "Could not configure to die with parent: %m\n");
1869                 return 126;
1870         }
1871
1872         // Make this process dumpable
1873         r = prctl (PR_SET_DUMPABLE, 1, 0, 0, 0);
1874         if (r) {
1875                 CTX_ERROR(jail->ctx, "Could not make the process dumpable: %m\n");
1876                 return 126;
1877         }
1878
1879         // Don't drop any capabilities on setuid()
1880         r = prctl(PR_SET_KEEPCAPS, 1);
1881         if (r) {
1882                 CTX_ERROR(jail->ctx, "Could not set PR_SET_KEEPCAPS: %m\n");
1883                 return 126;
1884         }
1885
1886         // Wait for the parent to finish initialization
1887         r = pakfire_jail_wait_for_signal(jail, ctx->completed_fd);
1888         if (r)
1889                 return r;
1890
1891         // Fetch UID/GID
1892         uid_t uid = getuid();
1893         gid_t gid = getgid();
1894
1895         // Fetch EUID/EGID
1896         uid_t euid = geteuid();
1897         gid_t egid = getegid();
1898
1899         DEBUG(jail->pakfire, "  UID: %u (effective %u)\n", uid, euid);
1900         DEBUG(jail->pakfire, "  GID: %u (effective %u)\n", gid, egid);
1901
1902         // Fail if we are not PID 1
1903         if (pid != 1) {
1904                 CTX_ERROR(jail->ctx, "Child process is not PID 1\n");
1905                 //return 126;
1906         }
1907
1908         // Fail if we are not running as root
1909         if (uid || gid || euid || egid) {
1910                 ERROR(jail->pakfire, "Child process is not running as root\n");
1911                 //return 126;
1912         }
1913
1914         const char* arch = pakfire_get_effective_arch(jail->pakfire);
1915
1916         // Set personality
1917         unsigned long persona = pakfire_arch_personality(arch);
1918         if (persona) {
1919                 r = personality(persona);
1920                 if (r < 0) {
1921                         ERROR(jail->pakfire, "Could not set personality (%x)\n", (unsigned int)persona);
1922                         return 126;
1923                 }
1924         }
1925
1926         // Setup networking
1927         if (!pakfire_jail_exec_has_flag(ctx, PAKFIRE_JAIL_HAS_NETWORKING)) {
1928                 r = pakfire_jail_setup_loopback(jail);
1929                 if (r)
1930                         return 1;
1931         }
1932
1933         // Set nice level
1934         if (jail->nice) {
1935                 DEBUG(jail->pakfire, "Setting nice level to %d\n", jail->nice);
1936
1937                 r = setpriority(PRIO_PROCESS, pid, jail->nice);
1938                 if (r) {
1939                         ERROR(jail->pakfire, "Could not set nice level: %m\n");
1940                         return 1;
1941                 }
1942         }
1943
1944         // Close other end of log pipes
1945         close(ctx->pipes.log_INFO[0]);
1946         close(ctx->pipes.log_ERROR[0]);
1947 #ifdef ENABLE_DEBUG
1948         close(ctx->pipes.log_DEBUG[0]);
1949 #endif /* ENABLE_DEBUG */
1950
1951         // Connect standard input
1952         if (ctx->pipes.stdin[0] >= 0) {
1953                 r = dup2(ctx->pipes.stdin[0], STDIN_FILENO);
1954                 if (r < 0) {
1955                         ERROR(jail->pakfire, "Could not connect fd %d to stdin: %m\n",
1956                                 ctx->pipes.stdin[0]);
1957
1958                         return 1;
1959                 }
1960         }
1961
1962         // Connect standard output and error
1963         if (ctx->pipes.stdout[1] >= 0 && ctx->pipes.stderr[1] >= 0) {
1964                 r = dup2(ctx->pipes.stdout[1], STDOUT_FILENO);
1965                 if (r < 0) {
1966                         ERROR(jail->pakfire, "Could not connect fd %d to stdout: %m\n",
1967                                 ctx->pipes.stdout[1]);
1968
1969                         return 1;
1970                 }
1971
1972                 r = dup2(ctx->pipes.stderr[1], STDERR_FILENO);
1973                 if (r < 0) {
1974                         ERROR(jail->pakfire, "Could not connect fd %d to stderr: %m\n",
1975                                 ctx->pipes.stderr[1]);
1976
1977                         return 1;
1978                 }
1979
1980                 // Close the pipe (as we have moved the original file descriptors)
1981                 pakfire_jail_close_pipe(jail, ctx->pipes.stdin);
1982                 pakfire_jail_close_pipe(jail, ctx->pipes.stdout);
1983                 pakfire_jail_close_pipe(jail, ctx->pipes.stderr);
1984         }
1985
1986         // Reset open file limit (http://0pointer.net/blog/file-descriptor-limits.html)
1987         r = pakfire_rlimit_reset_nofile(jail->pakfire);
1988         if (r)
1989                 return r;
1990
1991         // Set capabilities
1992         r = pakfire_jail_set_capabilities(jail);
1993         if (r)
1994                 return r;
1995
1996         // Show capabilities
1997         r = pakfire_jail_show_capabilities(jail);
1998         if (r)
1999                 return r;
2000
2001         // Filter syscalls
2002         r = pakfire_jail_limit_syscalls(jail);
2003         if (r)
2004                 return r;
2005
2006         CTX_DEBUG(jail->ctx, "Child process initialization done\n");
2007         CTX_DEBUG(jail->ctx, "Launching command:\n");
2008
2009         // Log argv
2010         for (unsigned int i = 0; argv[i]; i++)
2011                 CTX_DEBUG(jail->ctx, "  argv[%u] = %s\n", i, argv[i]);
2012
2013         // exec() command
2014         r = execvpe(argv[0], (char**)argv, jail->env);
2015         if (r < 0) {
2016                 // Translate errno into regular exit code
2017                 switch (errno) {
2018                         case ENOENT:
2019                                 // Ignore if the command doesn't exist
2020                                 if (ctx->flags & PAKFIRE_JAIL_NOENT_OK)
2021                                         r = 0;
2022                                 else
2023                                         r = 127;
2024
2025                                 break;
2026
2027                         default:
2028                                 r = 1;
2029                 }
2030
2031                 CTX_ERROR(jail->ctx, "Could not execve(%s): %m\n", argv[0]);
2032         }
2033
2034         // We should not get here
2035         return r;
2036 }
2037
2038 /*
2039         Child 1 is launched in a new mount namespace...
2040 */
2041 static int pakfire_jail_child1(struct pakfire_jail* jail,
2042                 struct pakfire_jail_exec* ctx, const char* argv[]) {
2043         int r;
2044
2045         // Redirect any logging to our log pipe
2046         pakfire_ctx_set_log_callback(jail->ctx, pakfire_jail_log_redirect, &ctx->pipes);
2047
2048         CTX_DEBUG(jail->ctx, "First child process launched\n");
2049
2050         const int socket_send = pakfire_jail_get_pipe_to_write(jail, &ctx->socket);
2051
2052         const char* root = pakfire_get_path(jail->pakfire);
2053
2054         // Die with parent
2055         r = prctl(PR_SET_PDEATHSIG, SIGKILL, 0, 0, 0);
2056         if (r) {
2057                 CTX_ERROR(jail->ctx, "Could not configure to die with parent: %s\n", strerror(errno));
2058                 goto ERROR;
2059         }
2060
2061         // Change mount propagation so that we will receive, but don't propagate back
2062         r = pakfire_mount_change_propagation(jail->ctx, "/", MS_SLAVE);
2063         if (r) {
2064                 CTX_ERROR(jail->ctx, "Could not change mount propagation to SLAVE: %s\n", strerror(r));
2065                 goto ERROR;
2066         }
2067
2068         // Make root a mountpoint in the new mount namespace
2069         r = pakfire_mount_make_mounpoint(jail->pakfire, root);
2070         if (r)
2071                 goto ERROR;
2072
2073         // Make everything private
2074         r = pakfire_mount_change_propagation(jail->ctx, root, MS_PRIVATE);
2075         if (r) {
2076                 CTX_ERROR(jail->ctx, "Could not change mount propagation to PRIVATE: %s\n", strerror(r));
2077                 goto ERROR;
2078         }
2079
2080         // Mount everything
2081         r = pakfire_jail_mount(jail, ctx);
2082         if (r)
2083                 goto ERROR;
2084
2085         // chroot()
2086         r = pakfire_jail_switch_root(jail, root);
2087         if (r)
2088                 goto ERROR;
2089
2090         // Change mount propagation so that we will propagate everything down
2091         r = pakfire_mount_change_propagation(jail->ctx, "/", MS_SHARED);
2092         if (r) {
2093                 CTX_ERROR(jail->ctx, "Could not change mount propagation to SHARED: %s\n", strerror(r));
2094                 goto ERROR;
2095         }
2096
2097         // Configure child process
2098         struct clone_args args = {
2099                 .flags =
2100                         CLONE_NEWCGROUP |
2101                         CLONE_NEWIPC |
2102                         CLONE_NEWNS |
2103                         CLONE_NEWPID |
2104                         CLONE_NEWTIME |
2105                         CLONE_NEWUSER |
2106                         CLONE_NEWUTS |
2107                         CLONE_PIDFD,
2108                 .exit_signal = SIGCHLD,
2109                 .pidfd = (long long unsigned int)&ctx->pidfd2,
2110         };
2111
2112         // Launch the process into the configured cgroup
2113         if (ctx->cgroup) {
2114                 args.flags |= CLONE_INTO_CGROUP;
2115
2116                 // Clone into this cgroup
2117                 args.cgroup = pakfire_cgroup_fd(ctx->cgroup);
2118         }
2119
2120         // Setup networking
2121         if (!pakfire_jail_exec_has_flag(ctx, PAKFIRE_JAIL_HAS_NETWORKING))
2122                 args.flags |= CLONE_NEWNET;
2123
2124         // Fork the second child process
2125         pid_t pid = clone3(&args, sizeof(args));
2126         if (pid < 0) {
2127                 CTX_ERROR(jail->ctx, "Could not fork the first child process: %s\n", strerror(errno));
2128                 r = -errno;
2129                 goto ERROR;
2130
2131         // Child process
2132         } else if (pid == 0) {
2133                 r = pakfire_jail_child2(jail, ctx, argv);
2134                 _exit(r);
2135         }
2136
2137         // Send the pidfd of the child to the first parent
2138         r = pakfire_jail_send_fd(jail, socket_send, ctx->pidfd2);
2139         if (r)
2140                 goto ERROR;
2141
2142 ERROR:
2143         return r;
2144 }
2145
2146 // Run a command in the jail
2147 static int __pakfire_jail_exec(struct pakfire_jail* jail, const char* argv[],
2148                 const int interactive,
2149                 pakfire_jail_communicate_in  communicate_in,
2150                 pakfire_jail_communicate_out communicate_out,
2151                 void* data, int flags) {
2152         int r;
2153
2154         // Check if argv is valid
2155         if (!argv || !argv[0]) {
2156                 errno = EINVAL;
2157                 return -1;
2158         }
2159
2160         // Initialize context for this call
2161         struct pakfire_jail_exec ctx = {
2162                 .flags = flags,
2163
2164                 .socket = { -1, -1 },
2165
2166                 .pipes = {
2167                         .stdin     = { -1, -1 },
2168                         .stdout    = { -1, -1 },
2169                         .stderr    = { -1, -1 },
2170                         .log_INFO  = { -1, -1 },
2171                         .log_ERROR = { -1, -1 },
2172 #ifdef ENABLE_DEBUG
2173                         .log_DEBUG = { -1, -1 },
2174 #endif /* ENABLE_DEBUG */
2175                 },
2176
2177                 .communicate = {
2178                         .in   = communicate_in,
2179                         .out  = communicate_out,
2180                         .data = data,
2181                 },
2182
2183                 // PIDs
2184                 .pidfd1 = -1,
2185                 .pidfd2 = -1,
2186         };
2187
2188         DEBUG(jail->pakfire, "Executing jail...\n");
2189
2190         // Become the subreaper
2191         r = prctl(PR_SET_CHILD_SUBREAPER, 1, 0, 0, 0);
2192         if (r < 0) {
2193                 CTX_ERROR(jail->ctx, "Failed to become the sub-reaper: %s\n", strerror(errno));
2194                 r = -errno;
2195                 goto ERROR;
2196         }
2197
2198         // Enable networking in interactive mode
2199         if (interactive)
2200                 ctx.flags |= PAKFIRE_JAIL_HAS_NETWORKING;
2201
2202         // Create a UNIX domain socket
2203         r = socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, ctx.socket);
2204         if (r < 0) {
2205                 CTX_ERROR(jail->ctx, "Could not create UNIX socket: %s\n", strerror(errno));
2206                 r = -errno;
2207                 goto ERROR;
2208         }
2209
2210         /*
2211                 Setup a file descriptor which can be used to notify the client that the parent
2212                 has completed configuration.
2213         */
2214         ctx.completed_fd = eventfd(0, EFD_CLOEXEC);
2215         if (ctx.completed_fd < 0) {
2216                 ERROR(jail->pakfire, "eventfd() failed: %m\n");
2217                 return -1;
2218         }
2219
2220         // Create pipes to communicate with child process if we are not running interactively
2221         if (!interactive) {
2222                 // stdin (only if callback is set)
2223                 if (ctx.communicate.in) {
2224                         r = pakfire_jail_setup_pipe(jail, &ctx.pipes.stdin, 0);
2225                         if (r)
2226                                 goto ERROR;
2227                 }
2228
2229                 // stdout
2230                 r = pakfire_jail_setup_pipe(jail, &ctx.pipes.stdout, 0);
2231                 if (r)
2232                         goto ERROR;
2233
2234                 // stderr
2235                 r = pakfire_jail_setup_pipe(jail, &ctx.pipes.stderr, 0);
2236                 if (r)
2237                         goto ERROR;
2238         }
2239
2240         // Setup pipes for logging
2241         // INFO
2242         r = pakfire_jail_setup_pipe(jail, &ctx.pipes.log_INFO, O_CLOEXEC);
2243         if (r)
2244                 goto ERROR;
2245
2246         // ERROR
2247         r = pakfire_jail_setup_pipe(jail, &ctx.pipes.log_ERROR, O_CLOEXEC);
2248         if (r)
2249                 goto ERROR;
2250
2251 #ifdef ENABLE_DEBUG
2252         // DEBUG
2253         r = pakfire_jail_setup_pipe(jail, &ctx.pipes.log_DEBUG, O_CLOEXEC);
2254         if (r)
2255                 goto ERROR;
2256 #endif /* ENABLE_DEBUG */
2257
2258         // Launch the process in a cgroup that is a leaf of the configured cgroup
2259         if (jail->cgroup) {
2260                 // Fetch our UUID
2261                 const char* uuid = pakfire_jail_uuid(jail);
2262
2263                 // Create a temporary cgroup
2264                 r = pakfire_cgroup_child(&ctx.cgroup, jail->cgroup, uuid, 0);
2265                 if (r) {
2266                         ERROR(jail->pakfire, "Could not create cgroup for jail: %m\n");
2267                         goto ERROR;
2268                 }
2269         }
2270
2271         /*
2272                 Initially, we will set up a new mount namespace and launch a child process in it.
2273
2274                 This process remains in the user/ipc/time/etc. namespace and will set up
2275                 the mount namespace.
2276         */
2277
2278         // Configure child process
2279         struct clone_args args = {
2280                 .flags =
2281                         CLONE_NEWNS |
2282                         CLONE_PIDFD |
2283                         CLONE_CLEAR_SIGHAND,
2284                 .exit_signal = SIGCHLD,
2285                 .pidfd = (long long unsigned int)&ctx.pidfd1,
2286         };
2287
2288         // Fork the first child process
2289         pid_t pid = clone3(&args, sizeof(args));
2290         if (pid < 0) {
2291                 CTX_ERROR(jail->ctx, "Could not fork the first child process: %s\n", strerror(errno));
2292                 r = -errno;
2293                 goto ERROR;
2294
2295         // Child process
2296         } else if (pid == 0) {
2297                 r = pakfire_jail_child1(jail, &ctx, argv);
2298                 _exit(r);
2299         }
2300
2301         // Parent process
2302         r = pakfire_jail_wait(jail, &ctx);
2303         if (r)
2304                 goto ERROR;
2305
2306 ERROR:
2307         // Destroy the temporary cgroup (if any)
2308         if (ctx.cgroup) {
2309 #if 0
2310                 // XXX this is currently disabled because it overwrites r
2311                 // Read cgroup stats
2312                 r = pakfire_cgroup_stat(ctx.cgroup, &ctx.cgroup_stats);
2313                 if (r) {
2314                         ERROR(jail->pakfire, "Could not read cgroup stats: %m\n");
2315                 } else {
2316                         pakfire_cgroup_stat_dump(ctx.cgroup, &ctx.cgroup_stats);
2317                 }
2318 #endif
2319
2320                 pakfire_cgroup_destroy(ctx.cgroup);
2321                 pakfire_cgroup_unref(ctx.cgroup);
2322         }
2323
2324         // Close any file descriptors
2325         pakfire_jail_close_pipe(jail, ctx.pipes.stdin);
2326         pakfire_jail_close_pipe(jail, ctx.pipes.stdout);
2327         pakfire_jail_close_pipe(jail, ctx.pipes.stderr);
2328         pakfire_jail_close_pipe(jail, ctx.pipes.log_INFO);
2329         pakfire_jail_close_pipe(jail, ctx.pipes.log_ERROR);
2330 #ifdef ENABLE_DEBUG
2331         pakfire_jail_close_pipe(jail, ctx.pipes.log_DEBUG);
2332 #endif /* ENABLE_DEBUG */
2333         if (ctx.pidfd1 >= 0)
2334                 close(ctx.pidfd1);
2335         if (ctx.pidfd2 >= 0)
2336                 close(ctx.pidfd2);
2337
2338         // Close sockets
2339         pakfire_jail_close_pipe(jail, ctx.socket);
2340
2341         return r;
2342 }
2343
2344 PAKFIRE_EXPORT int pakfire_jail_exec(
2345                 struct pakfire_jail* jail,
2346                 const char* argv[],
2347                 pakfire_jail_communicate_in  callback_in,
2348                 pakfire_jail_communicate_out callback_out,
2349                 void* data, int flags) {
2350         return __pakfire_jail_exec(jail, argv, 0, callback_in, callback_out, data, flags);
2351 }
2352
/*
	Sets up the interactive environment and then runs argv in the jail in
	interactive mode without any callbacks.
*/
static int pakfire_jail_exec_interactive(
		struct pakfire_jail* jail, const char* argv[], int flags) {
	// Prepare the environment first and bail out if that fails
	const int rc = pakfire_jail_setup_interactive_env(jail);
	if (rc != 0)
		return rc;

	// Launch interactively
	return __pakfire_jail_exec(jail, argv, 1, NULL, NULL, NULL, flags);
}
2364
2365 int pakfire_jail_exec_script(struct pakfire_jail* jail,
2366                 const char* script,
2367                 const size_t size,
2368                 const char* args[],
2369                 pakfire_jail_communicate_in  callback_in,
2370                 pakfire_jail_communicate_out callback_out,
2371                 void* data) {
2372         char path[PATH_MAX];
2373         const char** argv = NULL;
2374         FILE* f = NULL;
2375         int r;
2376
2377         const char* root = pakfire_get_path(jail->pakfire);
2378
2379         // Write the scriptlet to disk
2380         r = pakfire_path_append(path, root, PAKFIRE_TMP_DIR "/pakfire-script.XXXXXX");
2381         if (r)
2382                 goto ERROR;
2383
2384         // Create a temporary file
2385         f = pakfire_mktemp(path, 0700);
2386         if (!f) {
2387                 ERROR(jail->pakfire, "Could not create temporary file: %m\n");
2388                 goto ERROR;
2389         }
2390
2391         DEBUG(jail->pakfire, "Writing script to %s:\n%.*s\n", path, (int)size, script);
2392
2393         // Write data
2394         r = fprintf(f, "%s", script);
2395         if (r < 0) {
2396                 ERROR(jail->pakfire, "Could not write script to file %s: %m\n", path);
2397                 goto ERROR;
2398         }
2399
2400         // Close file
2401         r = fclose(f);
2402         if (r) {
2403                 ERROR(jail->pakfire, "Could not close script file %s: %m\n", path);
2404                 goto ERROR;
2405         }
2406
2407         f = NULL;
2408
2409         // Count how many arguments were passed
2410         unsigned int argc = 1;
2411         if (args) {
2412                 for (const char** arg = args; *arg; arg++)
2413                         argc++;
2414         }
2415
2416         argv = calloc(argc + 1, sizeof(*argv));
2417         if (!argv) {
2418                 ERROR(jail->pakfire, "Could not allocate argv: %m\n");
2419                 goto ERROR;
2420         }
2421
2422         // Set command
2423         argv[0] = (root) ? pakfire_path_relpath(root, path) : path;
2424
2425         // Copy args
2426         for (unsigned int i = 1; i < argc; i++)
2427                 argv[i] = args[i-1];
2428
2429         // Run the script
2430         r = pakfire_jail_exec(jail, argv, callback_in, callback_out, data, 0);
2431
2432 ERROR:
2433         if (argv)
2434                 free(argv);
2435         if (f)
2436                 fclose(f);
2437
2438         // Remove script from disk
2439         if (*path)
2440                 unlink(path);
2441
2442         return r;
2443 }
2444
2445 /*
2446         A convenience function that creates a new jail, runs the given command and destroys
2447         the jail again.
2448 */
2449 int pakfire_jail_run(struct pakfire* pakfire, const char* argv[], int flags, char** output) {
2450         struct pakfire_jail* jail = NULL;
2451         int r;
2452
2453         // Create a new jail
2454         r = pakfire_jail_create(&jail, pakfire);
2455         if (r)
2456                 goto ERROR;
2457
2458         // Execute the command
2459         r = pakfire_jail_exec(jail, argv, NULL, pakfire_jail_capture_stdout, output, 0);
2460
2461 ERROR:
2462         if (jail)
2463                 pakfire_jail_unref(jail);
2464
2465         return r;
2466 }
2467
/*
	Convenience helper: creates a new jail, executes the script of the given
	length with the arguments in argv and without any callbacks, and drops
	the reference to the jail again.

	NOTE: flags is currently not used.
*/
int pakfire_jail_run_script(struct pakfire* pakfire,
		const char* script, const size_t length, const char* argv[], int flags) {
	struct pakfire_jail* jail = NULL;

	// Set up the jail and only execute the script if that worked
	int rc = pakfire_jail_create(&jail, pakfire);
	if (rc == 0)
		rc = pakfire_jail_exec_script(jail, script, length, argv, NULL, NULL, NULL);

	// Drop our reference
	if (jail)
		pakfire_jail_unref(jail);

	return rc;
}
2487
/*
	Launches an interactive login shell (/bin/bash) in the jail.

	Returns a negative value if launching failed and zero otherwise; the exit
	code of the shell is discarded.
*/
int pakfire_jail_shell(struct pakfire_jail* jail) {
	const char* argv[] = { "/bin/bash", "--login", NULL };

	// Run the shell interactively
	const int rc = pakfire_jail_exec_interactive(jail, argv, 0);

	// Only negative values are errors, anything else came from the shell
	return (rc < 0) ? rc : 0;
}
2505
2506 static int pakfire_jail_run_if_possible(struct pakfire* pakfire, const char** argv) {
2507         char path[PATH_MAX];
2508         int r;
2509
2510         r = pakfire_path(pakfire, path, "%s", *argv);
2511         if (r)
2512                 return r;
2513
2514         // Check if the file is executable
2515         r = access(path, X_OK);
2516         if (r) {
2517                 DEBUG(pakfire, "%s is not executable. Skipping...\n", *argv);
2518                 return 0;
2519         }
2520
2521         return pakfire_jail_run(pakfire, argv, 0, NULL);
2522 }
2523
/*
	Runs /sbin/ldconfig in a new jail if it is executable.
*/
int pakfire_jail_ldconfig(struct pakfire* pakfire) {
	const char* argv[] = { "/sbin/ldconfig", NULL };

	const int rc = pakfire_jail_run_if_possible(pakfire, argv);

	return rc;
}
2532
/*
	Runs "/usr/bin/systemd-tmpfiles --create" in a new jail if the binary is
	executable.
*/
int pakfire_jail_run_systemd_tmpfiles(struct pakfire* pakfire) {
	const char* argv[] = { "/usr/bin/systemd-tmpfiles", "--create", NULL };

	const int rc = pakfire_jail_run_if_possible(pakfire, argv);

	return rc;
}