Features:
+* Add an alias to systemd-run, maybe called "uid0" or so, which tries to mimic
+ the sudo/su command lines to some level, but is backed by transient services
+ with proper security isolation and tty forwarding. This would then allow us
+ to run systems with NNP (NoNewPrivileges) turned on reasonably nicely. To
+ make this extra nice and pretty, have ptyfwd rewrite every NL we pass through
+ so that it resets the bg color to some reddish tone and erases the whole
+ upcoming line first, so that the background color indicates when we are
+ operating with privileges and when not.
+
+* use udev rule networkd ownership property to take ownership of network
+ interfaces nspawn creates
+
* add a kernel cmdline switch (and cred?) for marking a system to be
"headless", in which case we never open /dev/console for reading, only for
writing. This would then mean: systemd-firstboot would process creds but not
-device scsi-hd,drive=hd,bootindex=1 \
-device vhost-vsock-pci,id=vhost-vsock-pci0,guest-cid=42 \
-smbios type=11,value=io.systemd.credential:vmm.notify_socket=vsock:2:1234 \
- -smbios type=11,value=io.systemd.credential.binary:tmpfiles.extra=$(echo "f~ /root/.ssh/authorized_keys 700 root root - $(ssh-add -L | base64 -w 0)" | base64 -w 0)
+ -smbios type=11,value=io.systemd.credential.binary:tmpfiles.extra=$(echo "f~ /root/.ssh/authorized_keys 600 root root - $(ssh-add -L | base64 -w 0)" | base64 -w 0)
```
A process on the host can listen for the notification, for example:
return r;
}
+int pidref_set_parent(PidRef *ret) {
+ _cleanup_(pidref_done) PidRef parent = PIDREF_NULL;
+ pid_t ppid;
+ int r;
+ /* On success returns 0 and hands the acquired reference over to *ret; if pidref_set_pid() fails,
+ * its negative return value is passed through unchanged and *ret is not written. */
+ assert(ret);
+
+ /* Acquires a pidref to our parent process. Deals with the fact that parent processes might exit, and
+ * we get reparented to other processes, with our old parent's PID already being recycled. */
+
+ ppid = getppid();
+ for (;;) {
+ r = pidref_set_pid(&parent, ppid);
+ if (r < 0)
+ return r;
+
+ if (parent.fd < 0) /* If pidfds are not available, then we are done */
+ break;
+ /* Ask for the parent PID a second time, now that the reference exists: only if both queries
+ * agree do we know the reference was taken on the process that is our parent. */
+ pid_t now_ppid = getppid();
+ if (now_ppid == ppid) /* If our ppid is still the same, then we are done */
+ break;
+
+ /* Otherwise let's try again with the new ppid */
+ ppid = now_ppid;
+ pidref_done(&parent);
+ }
+ /* Hand the reference to the caller. NOTE(review): TAKE_PIDREF() presumably resets 'parent' so the
+ * _cleanup_ handler has nothing left to release — confirm against its definition. */
+ *ret = TAKE_PIDREF(parent);
+ return 0;
+}
+
void pidref_done(PidRef *pidref) {
assert(pidref);
int pidref_set_pidfd(PidRef *pidref, int fd);
int pidref_set_pidfd_take(PidRef *pidref, int fd); /* takes ownership of the passed pidfd on success*/
int pidref_set_pidfd_consume(PidRef *pidref, int fd); /* takes ownership of the passed pidfd in both success and failure */
-
+int pidref_set_parent(PidRef *ret);
static inline int pidref_set_self(PidRef *pidref) {
return pidref_set_pid(pidref, 0);
}
#include "errno-util.h"
#include "extract-word.h"
#include "fd-util.h"
+#include "fileio.h"
#include "format-util.h"
#include "macro.h"
#include "missing_resource.h"
+#include "process-util.h"
#include "rlimit-util.h"
#include "string-table.h"
+#include "strv.h"
#include "time-util.h"
int setrlimit_closest(int resource, const struct rlimit *rlim) {
return 1;
}
+
+int pid_getrlimit(pid_t pid, int resource, struct rlimit *ret) {
+ /* Queries the soft and hard limit of 'resource' for the process 'pid' and stores them in *ret.
+ * For our own process (pid 0 or our cached PID) this is plain getrlimit(); for other processes
+ * prlimit() is tried first and, only if that fails with a privilege error, the textual limits
+ * file in procfs is parsed instead. On the parsing path: returns 0 on success, -EPERM if the file
+ * cannot be read, -ENOMEM on allocation failure, the safe_atou64() error if a value does not
+ * parse, and -ENOTRECOVERABLE if no line for 'resource' could be parsed. */
+ static const char * const prefix_table[_RLIMIT_MAX] = {
+ [RLIMIT_CPU] = "Max cpu time",
+ [RLIMIT_FSIZE] = "Max file size",
+ [RLIMIT_DATA] = "Max data size",
+ [RLIMIT_STACK] = "Max stack size",
+ [RLIMIT_CORE] = "Max core file size",
+ [RLIMIT_RSS] = "Max resident set",
+ [RLIMIT_NPROC] = "Max processes",
+ [RLIMIT_NOFILE] = "Max open files",
+ [RLIMIT_MEMLOCK] = "Max locked memory",
+ [RLIMIT_AS] = "Max address space",
+ [RLIMIT_LOCKS] = "Max file locks",
+ [RLIMIT_SIGPENDING] = "Max pending signals",
+ [RLIMIT_MSGQUEUE] = "Max msgqueue size",
+ [RLIMIT_NICE] = "Max nice priority",
+ [RLIMIT_RTPRIO] = "Max realtime priority",
+ [RLIMIT_RTTIME] = "Max realtime timeout",
+ };
+ /* prefix_table[] holds, per resource, the line prefix that is matched with startswith() below. */
+ int r;
+
+ assert(resource >= 0);
+ assert(resource < _RLIMIT_MAX);
+ assert(pid >= 0);
+ assert(ret);
+
+ if (pid == 0 || pid == getpid_cached())
+ return RET_NERRNO(getrlimit(resource, ret));
+ /* Some other process: ask via prlimit(). Any outcome other than a privilege error is returned
+ * as is, without consulting procfs. */
+ r = RET_NERRNO(prlimit(pid, resource, /* new_limit= */ NULL, ret));
+ if (!ERRNO_IS_NEG_PRIVILEGE(r))
+ return r;
+
+ /* We don't have access? Then try to go via /proc/$PID/limits. Weirdly that's world readable in
+ * contrast to querying the data via prlimit() */
+
+ const char *p = procfs_file_alloca(pid, "limits");
+ _cleanup_free_ char *limits = NULL;
+
+ r = read_full_virtual_file(p, &limits, NULL);
+ if (r < 0)
+ return -EPERM; /* propagate original permission error if we can't access the limits file */
+ /* Split the file into lines. The loop below starts at the second entry via strv_skip(l, 1);
+ * the first line is presumably the column header — TODO confirm. */
+ _cleanup_strv_free_ char **l = NULL;
+ l = strv_split(limits, "\n");
+ if (!l)
+ return -ENOMEM;
+
+ STRV_FOREACH(i, strv_skip(l, 1)) {
+ _cleanup_free_ char *soft = NULL, *hard = NULL;
+ uint64_t sv, hv;
+ const char *e;
+ /* Only a line that begins with this resource's prefix, directly followed by a space, is used. */
+ e = startswith(*i, prefix_table[resource]);
+ if (!e)
+ continue;
+
+ if (*e != ' ')
+ continue;
+
+ e += strspn(e, WHITESPACE);
+ /* First whitespace-delimited word after the prefix: the soft limit. */
+ size_t n;
+ n = strcspn(e, WHITESPACE);
+ if (n == 0)
+ continue;
+
+ soft = strndup(e, n);
+ if (!soft)
+ return -ENOMEM;
+
+ e += n;
+ if (*e != ' ')
+ continue;
+ /* Second whitespace-delimited word: the hard limit. */
+ e += strspn(e, WHITESPACE);
+ n = strcspn(e, WHITESPACE);
+ if (n == 0)
+ continue;
+
+ hard = strndup(e, n);
+ if (!hard)
+ return -ENOMEM;
+ /* The word "unlimited" maps to RLIM_INFINITY; anything else has to parse via safe_atou64(). */
+ if (streq(soft, "unlimited"))
+ sv = RLIM_INFINITY;
+ else {
+ r = safe_atou64(soft, &sv);
+ if (r < 0)
+ return r;
+ }
+
+ if (streq(hard, "unlimited"))
+ hv = RLIM_INFINITY;
+ else {
+ r = safe_atou64(hard, &hv);
+ if (r < 0)
+ return r;
+ }
+
+ *ret = (struct rlimit) {
+ .rlim_cur = sv,
+ .rlim_max = hv,
+ };
+
+ return 0;
+ }
+ /* We ran out of lines without finding a usable entry for this resource. */
+ return -ENOTRECOVERABLE;
+}
int rlimit_nofile_bump(int limit);
int rlimit_nofile_safe(void);
+
+int pid_getrlimit(pid_t pid, int resource, struct rlimit *ret);
* don't read the other limits from PID 1 but prefer the static table above. */
};
- int rl;
+ int rl, r;
for (rl = 0; rl < _RLIMIT_MAX; rl++) {
/* Let's only fill in what the user hasn't explicitly configured anyway */
if (IN_SET(rl, RLIMIT_NPROC, RLIMIT_SIGPENDING)) {
/* For these two let's read the limits off PID 1. See above for an explanation. */
- if (prlimit(1, rl, NULL, &buffer) < 0)
- return log_error_errno(errno, "Failed to read resource limit RLIMIT_%s of PID 1: %m", rlimit_to_string(rl));
+ r = pid_getrlimit(1, rl, &buffer);
+ if (r < 0)
+ return log_error_errno(r, "Failed to read resource limit RLIMIT_%s of PID 1: %m", rlimit_to_string(rl));
v = &buffer;
} else if (rl == RLIMIT_NOFILE) {
assert(fd_newroot >= 0);
assert(path);
- /* Change into the new rootfs. */
- if (fchdir(fd_newroot) < 0)
- return log_debug_errno(errno, "Failed to chdir into new rootfs '%s': %m", path);
-
/* Let the kernel tuck the new root under the old one. */
if (pivot_root(".", ".") < 0)
return log_debug_errno(errno, "Failed to pivot root to new rootfs '%s': %m", path);
assert(fd_newroot >= 0);
assert(path);
- /* Change into the new rootfs. */
- if (fchdir(fd_newroot) < 0)
- return log_debug_errno(errno, "Failed to chdir into new rootfs '%s': %m", path);
-
/* Move the new root fs */
if (mount(".", "/", NULL, MS_MOVE, NULL) < 0)
return log_debug_errno(errno, "Failed to move new rootfs '%s': %m", path);
int mount_switch_root_full(const char *path, unsigned long mount_propagation_flag, bool force_ms_move) {
_cleanup_close_ int fd_newroot = -EBADF;
- int r;
+ int r, is_current_root;
assert(path);
assert(mount_propagation_flag_is_valid(mount_propagation_flag));
if (fd_newroot < 0)
return log_debug_errno(errno, "Failed to open new rootfs '%s': %m", path);
- if (!force_ms_move) {
- r = mount_switch_root_pivot(fd_newroot, path);
- if (r < 0) {
- log_debug_errno(r, "Failed to pivot into new rootfs '%s', will try to use MS_MOVE instead: %m", path);
- force_ms_move = true;
+ is_current_root = path_is_root_at(fd_newroot, NULL);
+ if (is_current_root < 0)
+ return log_debug_errno(is_current_root, "Failed to determine if target dir is our root already: %m");
+
+ /* Change into the new rootfs. */
+ if (fchdir(fd_newroot) < 0)
+ return log_debug_errno(errno, "Failed to chdir into new rootfs '%s': %m", path);
+
+ /* Make this a NOP if we are supposed to switch to our current root fs. After all, both pivot_root()
+ * and MS_MOVE don't like that. */
+ if (!is_current_root) {
+ if (!force_ms_move) {
+ r = mount_switch_root_pivot(fd_newroot, path);
+ if (r < 0) {
+ log_debug_errno(r, "Failed to pivot into new rootfs '%s', will try to use MS_MOVE instead: %m", path);
+ force_ms_move = true;
+ }
+ }
+ if (force_ms_move) {
+ /* Failed to pivot_root() fallback to MS_MOVE. For example, this may happen if the rootfs is
+ * an initramfs in which case pivot_root() isn't supported. */
+ r = mount_switch_root_move(fd_newroot, path);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to switch to new rootfs '%s' with MS_MOVE: %m", path);
}
- }
- if (force_ms_move) {
- /* Failed to pivot_root() fallback to MS_MOVE. For example, this may happen if the rootfs is
- * an initramfs in which case pivot_root() isn't supported. */
- r = mount_switch_root_move(fd_newroot, path);
- if (r < 0)
- return log_debug_errno(r, "Failed to switch to new rootfs '%s' with MS_MOVE: %m", path);
}
/* Finally, let's establish the requested propagation flags. */
assert_se(s);
assert_se(touch(s) >= 0);
- for (int force_ms_move = 0; force_ms_move < 2; force_ms_move++) {
+ struct {
+ const char *path;
+ bool force_ms_move;
+ } table[] = {
+ { t, false },
+ { t, true },
+ { "/", false },
+ { "/", true },
+ };
+
+ FOREACH_ARRAY(i, table, ELEMENTSOF(table)) {
r = safe_fork("(switch-root)",
FORK_RESET_SIGNALS |
FORK_CLOSE_ALL_FDS |
assert_se(r >= 0);
if (r == 0) {
- assert_se(make_mount_point(t) >= 0);
- assert_se(mount_switch_root_full(t, /* mount_propagation_flag= */ 0, force_ms_move) >= 0);
+ assert_se(make_mount_point(i->path) >= 0);
+ assert_se(mount_switch_root_full(i->path, /* mount_propagation_flag= */ 0, i->force_ms_move) >= 0);
- assert_se(access(ASSERT_PTR(strrchr(s, '/')), F_OK) >= 0); /* absolute */
- assert_se(access(ASSERT_PTR(strrchr(s, '/')) + 1, F_OK) >= 0); /* relative */
- assert_se(access(s, F_OK) < 0 && errno == ENOENT); /* doesn't exist in our new environment */
+ if (!path_equal(i->path, "/")) {
+ assert_se(access(ASSERT_PTR(strrchr(s, '/')), F_OK) >= 0); /* absolute */
+ assert_se(access(ASSERT_PTR(strrchr(s, '/')) + 1, F_OK) >= 0); /* relative */
+ assert_se(access(s, F_OK) < 0 && errno == ENOENT); /* doesn't exist in our new environment */
+ }
_exit(EXIT_SUCCESS);
}
/* SPDX-License-Identifier: LGPL-2.1-or-later */
#include <sys/resource.h>
+#if HAVE_VALGRIND_VALGRIND_H
+#include <valgrind/valgrind.h>
+#endif
#include "alloc-util.h"
#include "capability-util.h"
#include "macro.h"
#include "missing_resource.h"
+#include "process-util.h"
#include "rlimit-util.h"
#include "string-util.h"
#include "tests.h"
#include "time-util.h"
+#include "user-util.h"
static void test_rlimit_parse_format_one(int resource, const char *string, rlim_t soft, rlim_t hard, int ret, const char *formatted) {
_cleanup_free_ char *f = NULL;
assert_se(old.rlim_max == new.rlim_max);
}
+TEST(pid_getrlimit) {
+        /* For every resource: record our own limit with getrlimit(), then fork off a child that queries
+         * the limit of its parent (i.e. us) through pid_getrlimit(). Both views have to agree. */
+
+        for (int rl = 0; rl < _RLIMIT_MAX; rl++) {
+                struct rlimit expected, queried;
+                int r;
+
+                assert_se(getrlimit(rl, &expected) >= 0);
+
+                /* Query from a different process, so that pid_getrlimit() cannot simply use getrlimit() */
+                r = safe_fork("(getrlimit)", FORK_RESET_SIGNALS|FORK_DEATHSIG_SIGKILL|FORK_LOG|FORK_WAIT, /* ret_pid= */ NULL);
+                assert_se(r >= 0);
+                if (r != 0)
+                        continue; /* parent: on to the next resource */
+
+                /* Child from here on. Drop privileges, so that prlimit() on the parent doesn't work anymore */
+                (void) setresgid(GID_NOBODY, GID_NOBODY, GID_NOBODY);
+                (void) setresuid(UID_NOBODY, UID_NOBODY, UID_NOBODY);
+
+                assert_se(pid_getrlimit(getppid(), rl, &queried) >= 0);
+
+#if HAVE_VALGRIND_VALGRIND_H
+                /* Valgrind fakes some changes in what getrlimit() returns for RLIMIT_NOFILE, hence skip
+                 * the comparison for that one resource */
+                if (RUNNING_ON_VALGRIND && rl == RLIMIT_NOFILE) {
+                        log_info("Skipping pid_getrlimit() check for RLIMIT_NOFILE, running in valgrind");
+                        _exit(EXIT_SUCCESS);
+                }
+#endif
+
+                assert_se(expected.rlim_cur == queried.rlim_cur);
+                assert_se(expected.rlim_max == queried.rlim_max);
+
+                _exit(EXIT_SUCCESS);
+        }
+}
+
DEFINE_TEST_MAIN(LOG_INFO);
#include "signal-util.h"
#include "socket-util.h"
#include "stdio-util.h"
+#include "strv.h"
#include "umask-util.h"
#include "userdbd-manager.h"
m->deferred_start_worker_event_source = sd_event_source_unref(m->deferred_start_worker_event_source);
+ safe_close(m->listen_fd);
+
sd_event_unref(m->event);
return mfree(m);
_exit(EXIT_FAILURE);
}
-
if (setenv("USERDB_FIXED_WORKER", one_zero(fixed), 1) < 0) {
log_error_errno(errno, "Failed to set $USERDB_FIXED_WORKER: %m");
_exit(EXIT_FAILURE);
return 0;
}
-int manager_startup(Manager *m) {
- int n, r;
+/* Sets up the userdb multiplexer listening socket, unless m->listen_fd already holds one.
+ *
+ * Creates /run/systemd/userdb, allocates an AF_UNIX stream socket, binds it to
+ * /run/systemd/userdb/io.systemd.Multiplexer (removing a stale socket node first), creates the
+ * io.systemd.NameServiceSwitch and io.systemd.DropIn alias symlinks next to it and starts listening.
+ *
+ * Returns 1 if a new socket was set up, 0 if m->listen_fd was already set and nothing was done. On
+ * failure the error is logged and the value of log_error_errno() is returned. The socket fd is stored
+ * in m->listen_fd as soon as it is allocated and is not closed here on later failures. */
+static int manager_make_listen_socket(Manager *m) {
+        static const union sockaddr_union sockaddr = {
+                .un.sun_family = AF_UNIX,
+                .un.sun_path = "/run/systemd/userdb/io.systemd.Multiplexer",
+        };
+        int r;
+
+        assert(m);
+
+        /* A listening fd is already in place, nothing to do */
+        if (m->listen_fd >= 0)
+                return 0;
+
+        r = mkdir_p("/run/systemd/userdb", 0755);
+        if (r < 0)
+                return log_error_errno(r, "Failed to create /run/systemd/userdb: %m");
+
+        /* Name the call that actually failed: at this point nothing has been bound yet */
+        m->listen_fd = socket(AF_UNIX, SOCK_STREAM|SOCK_CLOEXEC, 0);
+        if (m->listen_fd < 0)
+                return log_error_errno(errno, "Failed to allocate socket: %m");
+
+        /* Best effort: get rid of a socket node left over from an earlier run, failure is ignored */
+        (void) sockaddr_un_unlink(&sockaddr.un);
+
+        /* Bind with a umask of 0000 in effect */
+        WITH_UMASK(0000)
+                if (bind(m->listen_fd, &sockaddr.sa, SOCKADDR_UN_LEN(sockaddr.un)) < 0)
+                        return log_error_errno(errno, "Failed to bind socket: %m");
+
+        /* Make the multiplexer socket reachable under its alias names too */
+        FOREACH_STRING(alias,
+                       "/run/systemd/userdb/io.systemd.NameServiceSwitch",
+                       "/run/systemd/userdb/io.systemd.DropIn") {
+
+                r = symlink_idempotent("io.systemd.Multiplexer", alias, /* make_relative= */ false);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to symlink '%s': %m", alias);
+        }
+
+        if (listen(m->listen_fd, SOMAXCONN_DELUXE) < 0)
+                return log_error_errno(errno, "Failed to listen on socket: %m");
+
+        return 1;
+}
+
+static int manager_scan_listen_fds(Manager *m) {
+ int n;
assert(m);
- assert(m->listen_fd < 0);
- n = sd_listen_fds(false);
+ n = sd_listen_fds(/* unset_environment= */ true);
if (n < 0)
return log_error_errno(n, "Failed to determine number of passed file descriptors: %m");
if (n > 1)
return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Expected one listening fd, got %i.", n);
if (n == 1)
m->listen_fd = SD_LISTEN_FDS_START;
- else {
- static const union sockaddr_union sockaddr = {
- .un.sun_family = AF_UNIX,
- .un.sun_path = "/run/systemd/userdb/io.systemd.Multiplexer",
- };
-
- r = mkdir_p("/run/systemd/userdb", 0755);
- if (r < 0)
- return log_error_errno(r, "Failed to create /run/systemd/userdb: %m");
- m->listen_fd = socket(AF_UNIX, SOCK_STREAM|SOCK_CLOEXEC, 0);
- if (m->listen_fd < 0)
- return log_error_errno(errno, "Failed to bind on socket: %m");
-
- (void) sockaddr_un_unlink(&sockaddr.un);
+ return 0;
+}
- WITH_UMASK(0000)
- if (bind(m->listen_fd, &sockaddr.sa, SOCKADDR_UN_LEN(sockaddr.un)) < 0)
- return log_error_errno(errno, "Failed to bind socket: %m");
+int manager_startup(Manager *m) {
+ int r;
- r = symlink_idempotent("io.systemd.Multiplexer",
- "/run/systemd/userdb/io.systemd.NameServiceSwitch", false);
- if (r < 0)
- return log_error_errno(r, "Failed to bind io.systemd.Multiplexer: %m");
+ assert(m);
+ assert(m->listen_fd < 0);
- r = symlink_idempotent("io.systemd.Multiplexer",
- "/run/systemd/userdb/io.systemd.DropIn", false);
- if (r < 0)
- return log_error_errno(r, "Failed to bind io.systemd.Multiplexer: %m");
+ r = manager_scan_listen_fds(m);
+ if (r < 0)
+ return r;
- if (listen(m->listen_fd, SOMAXCONN_DELUXE) < 0)
- return log_error_errno(errno, "Failed to listen on socket: %m");
- }
+ r = manager_make_listen_socket(m);
+ if (r < 0)
+ return r;
/* Let's make sure every accept() call on this socket times out after 25s. This allows workers to be
* GC'ed on idle */
if (setsockopt(m->listen_fd, SOL_SOCKET, SO_RCVTIMEO, TIMEVAL_STORE(LISTEN_TIMEOUT_USEC), sizeof(struct timeval)) < 0)
return log_error_errno(errno, "Failed to se SO_RCVTIMEO: %m");
- return start_workers(m, /* explicit_request= */ false);
+ r = start_workers(m, /* explicit_request= */ false);
+ if (r < 0)
+ return r;
+
+ return 0;
}
static int vl_method_get_memberships(Varlink *link, JsonVariant *parameters, VarlinkMethodFlags flags, void *userdata) {
static const JsonDispatch dispatch_table[] = {
- { "userName", JSON_VARIANT_STRING, json_dispatch_const_string, offsetof(LookupParameters, user_name), 0 },
+ { "userName", JSON_VARIANT_STRING, json_dispatch_const_string, offsetof(LookupParameters, user_name), 0 },
{ "groupName", JSON_VARIANT_STRING, json_dispatch_const_string, offsetof(LookupParameters, group_name), 0 },
- { "service", JSON_VARIANT_STRING, json_dispatch_const_string, offsetof(LookupParameters, service), 0 },
+ { "service", JSON_VARIANT_STRING, json_dispatch_const_string, offsetof(LookupParameters, service), 0 },
{}
};
JSON_BUILD_PAIR("groupName", JSON_BUILD_STRING(last_group_name))));
}
-static int process_connection(VarlinkServer *server, int fd) {
+static int process_connection(VarlinkServer *server, int _fd) {
+ _cleanup_close_ int fd = TAKE_FD(_fd); /* always take possesion */
_cleanup_(varlink_close_unrefp) Varlink *vl = NULL;
int r;
r = varlink_server_add_connection(server, fd, &vl);
- if (r < 0) {
- fd = safe_close(fd);
+ if (r < 0)
return log_error_errno(r, "Failed to add connection: %m");
- }
+ TAKE_FD(fd);
vl = varlink_ref(vl);
for (;;) {
static int run(int argc, char *argv[]) {
usec_t start_time, listen_idle_usec, last_busy_usec = USEC_INFINITY;
_cleanup_(varlink_server_unrefp) VarlinkServer *server = NULL;
+ _cleanup_(pidref_done) PidRef parent = PIDREF_NULL;
unsigned n_iterations = 0;
int m, listen_fd, r;
if (r < 0)
return log_error_errno(r, "Failed to disable userdb NSS compatibility: %m");
+ r = pidref_set_parent(&parent);
+ if (r < 0)
+ return log_error_errno(r, "Failed to acquire pidfd of parent process: %m");
+ if (parent.pid == 1) /* We got reparented away from userdbd? */
+ return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Parent already died, exiting.");
+
start_time = now(CLOCK_MONOTONIC);
for (;;) {
return log_error_errno(r, "Failed to test for POLLIN on listening socket: %m");
if (FLAGS_SET(r, POLLIN)) {
- pid_t parent;
-
- parent = getppid();
- if (parent <= 1)
- return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Parent already died?");
-
- if (kill(parent, SIGUSR2) < 0)
- return log_error_errno(errno, "Failed to kill our own parent: %m");
+ r = pidref_kill(&parent, SIGUSR2);
+ if (r == -ESRCH)
+ return log_error_errno(r, "Parent already died?");
+ if (r < 0)
+ return log_error_errno(r, "Failed to send SIGUSR2 signal to parent: %m");
}
}