From: Luca Boccassi Date: Sun, 13 Apr 2025 21:10:36 +0000 (+0100) Subject: coredump: add support for new %F PIDFD specifier X-Git-Tag: v258-rc1~448^2~3 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=868d95577ec9f862580ad365726515459be582fc;p=thirdparty%2Fsystemd.git coredump: add support for new %F PIDFD specifier A new core_pattern specifier was added, %F, to provide a PIDFD to the usermode helper process referring to the crashed process. This removes all possible race conditions, ensuring only the crashed process gets inspected by systemd-coredump. --- diff --git a/man/systemd-coredump.xml b/man/systemd-coredump.xml index b62eb7b1296..d499600dc15 100644 --- a/man/systemd-coredump.xml +++ b/man/systemd-coredump.xml @@ -229,6 +229,17 @@ COREDUMP_FILENAME=/var/lib/systemd/coredump/core.Web….552351.….zst + + COREDUMP_BY_PIDFD= + If the crashed process was analyzed using a PIDFD provided by the kernel (requires + kernel v6.16) then this field will be present and set to 1. If this field is + not set, then the crashed process was analyzed via a PID, which is known to be subject to race + conditions. + + + + + COREDUMP_TIMESTAMP= The time of the crash as reported by the kernel (in μs since the epoch). diff --git a/src/coredump/coredump.c b/src/coredump/coredump.c index 442cc10a82f..c4a5532cea8 100644 --- a/src/coredump/coredump.c +++ b/src/coredump/coredump.c @@ -104,6 +104,7 @@ typedef enum { /* The fields below were added to kernel/core_pattern at later points, so they might be missing. */ META_ARGV_HOSTNAME = _META_ARGV_REQUIRED, /* %h: hostname */ META_ARGV_DUMPABLE, /* %d: as set by the kernel */ + META_ARGV_PIDFD, /* %F: pidfd of the process, since v6.16 */ /* If new fields are added, they should be added here, to maintain compatibility * with callers which don't know about the new fields. */ _META_ARGV_MAX, @@ -133,6 +134,7 @@ static const char * const meta_field_names[_META_MAX] = { [META_ARGV_RLIMIT] = "COREDUMP_RLIMIT=", [META_ARGV_HOSTNAME] = "COREDUMP_HOSTNAME=", [META_ARGV_DUMPABLE] = "COREDUMP_DUMPABLE=", + [META_ARGV_PIDFD] = "COREDUMP_BY_PIDFD=", [META_COMM] = "COREDUMP_COMM=", [META_EXE] = "COREDUMP_EXE=", [META_UNIT] = "COREDUMP_UNIT=", @@ -1345,7 +1347,8 @@ static int gather_pid_metadata_from_argv( Context *context, int argc, char **argv) { - int r; + _cleanup_(pidref_done) PidRef local_pidref = PIDREF_NULL; + int r, kernel_fd = -EBADF; assert(iovw); assert(context); @@ -1377,6 +1380,47 @@ static int gather_pid_metadata_from_argv( t = buf; } + if (i == META_ARGV_PID) { + /* Store this so that we can check whether the core will be forwarded to a container + * even when the kernel doesn't provide a pidfd. Can be dropped once baseline is + * >= v6.16. */ + r = pidref_set_pidstr(&local_pidref, t); + if (r < 0) + return log_error_errno(r, "Failed to initialize pidref from pid %s: %m", t); + } + + if (i == META_ARGV_PIDFD) { + /* If the current kernel doesn't support the %F specifier (which resolves to a + * pidfd), but we included it in the core_pattern expression, we'll receive an empty + * string here. Deal with that gracefully. */ + if (isempty(t)) + continue; + + assert(!pidref_is_set(&context->pidref)); + assert(kernel_fd < 0); + + kernel_fd = parse_fd(t); + if (kernel_fd < 0) + return log_error_errno(kernel_fd, "Failed to parse pidfd \"%s\": %m", t); + + r = pidref_set_pidfd(&context->pidref, kernel_fd); + if (r < 0) + return log_error_errno(r, "Failed to initialize pidref from pidfd %d: %m", kernel_fd); + + /* If there are containers involved with different versions of the code they might + * not be using pidfds, so it would be wrong to set the metadata, skip it. */ + r = pidref_in_same_namespace(/* pid1 = */ NULL, &context->pidref, NAMESPACE_PID); + if (r < 0) + log_debug_errno(r, "Failed to check pidns of crashing process, ignoring: %m"); + if (r <= 0) + continue; + + /* We don't print the fd number in the journal as it's meaningless, but we still + * record that the parsing was done with a kernel-provided fd as it means it's safe + * from races, which is valuable information to provide in the journal record. */ + t = "1"; + } + r = iovw_put_string_field(iovw, meta_field_names[i], t); if (r < 0) return r; @@ -1384,7 +1428,19 @@ static int gather_pid_metadata_from_argv( /* Cache some of the process metadata we collected so far and that we'll need to * access soon. */ - return context_parse_iovw(context, iovw); + r = context_parse_iovw(context, iovw); + if (r < 0) + return r; + + /* If the kernel didn't give us a PIDFD, then use the one derived from the + * PID immediately, given we have it. */ + if (!pidref_is_set(&context->pidref)) + context->pidref = TAKE_PIDREF(local_pidref); + + /* Close the kernel-provided FD as the last thing after everything else succeeded. */ + kernel_fd = safe_close(kernel_fd); + + return 0; } static int gather_pid_metadata_from_procfs(struct iovec_wrapper *iovw, Context *context) { diff --git a/sysctl.d/50-coredump.conf.in b/sysctl.d/50-coredump.conf.in index a550c87258f..fe8f7670b06 100644 --- a/sysctl.d/50-coredump.conf.in +++ b/sysctl.d/50-coredump.conf.in @@ -13,7 +13,7 @@ # the core dump. # # See systemd-coredump(8) and core(5). -kernel.core_pattern=|{{LIBEXECDIR}}/systemd-coredump %P %u %g %s %t %c %h %d +kernel.core_pattern=|{{LIBEXECDIR}}/systemd-coredump %P %u %g %s %t %c %h %d %F # Allow 16 coredumps to be dispatched in parallel by the kernel. # We collect metadata from /proc/%P/, and thus need to make sure the crashed