]>
Commit | Line | Data |
---|---|---|
db9ecf05 | 1 | /* SPDX-License-Identifier: LGPL-2.1-or-later */ |
f5e04665 | 2 | |
86b9a3e3 | 3 | #include <elf.h> |
803a3464 | 4 | #include <stdio.h> |
b8fe1b1d | 5 | #include <sys/mount.h> |
587f2a5e | 6 | #include <sys/statvfs.h> |
cacd6403 | 7 | #include <sys/xattr.h> |
4f5dd394 | 8 | #include <unistd.h> |
f5e04665 | 9 | |
b06adfb0 | 10 | #include "sd-bus.h" |
73a99163 | 11 | #include "sd-daemon.h" |
f11943c5 | 12 | #include "sd-journal.h" |
309a747f | 13 | #include "sd-json.h" |
f11943c5 | 14 | #include "sd-login.h" |
73a99163 | 15 | #include "sd-messages.h" |
4f5dd394 LP |
16 | |
17 | #include "acl-util.h" | |
b5efdb8a | 18 | #include "alloc-util.h" |
587f2a5e | 19 | #include "bus-error.h" |
430f0182 | 20 | #include "capability-util.h" |
ba1261bc | 21 | #include "cgroup-util.h" |
4f5dd394 | 22 | #include "compress.h" |
34c10968 LP |
23 | #include "conf-parser.h" |
24 | #include "copy.h" | |
c8715007 | 25 | #include "coredump-util.h" |
f11943c5 | 26 | #include "coredump-vacuum.h" |
a0956174 | 27 | #include "dirent-util.h" |
ea680f05 | 28 | #include "elf-util.h" |
b06adfb0 | 29 | #include "errno-util.h" |
4f5dd394 | 30 | #include "escape.h" |
3ffd4af2 | 31 | #include "fd-util.h" |
4f5dd394 | 32 | #include "fileio.h" |
f4f15635 | 33 | #include "fs-util.h" |
b06adfb0 | 34 | #include "io-util.h" |
bd1ae178 | 35 | #include "iovec-util.h" |
b18453ed | 36 | #include "journal-importer.h" |
5edf875b | 37 | #include "journal-send.h" |
309a747f | 38 | #include "json-util.h" |
4f5dd394 | 39 | #include "log.h" |
5e332028 | 40 | #include "main-func.h" |
0a970718 | 41 | #include "memory-util.h" |
2485b7e2 | 42 | #include "memstream-util.h" |
35cd0ba5 | 43 | #include "mkdir-label.h" |
a108c43e | 44 | #include "namespace-util.h" |
6bedfcbb | 45 | #include "parse-util.h" |
a108c43e | 46 | #include "path-util.h" |
b06adfb0 | 47 | #include "pidref.h" |
0b452006 | 48 | #include "process-util.h" |
d14bcb4e | 49 | #include "signal-util.h" |
3c171f0b | 50 | #include "socket-util.h" |
4f5dd394 | 51 | #include "special.h" |
587f2a5e | 52 | #include "stat-util.h" |
8b43440b | 53 | #include "string-table.h" |
07630cea | 54 | #include "string-util.h" |
e4de7287 | 55 | #include "tmpfile-util.h" |
8e1ac16b | 56 | #include "uid-classification.h" |
b1d4f8e1 | 57 | #include "user-util.h" |
34727273 | 58 | |
da890466 | 59 | /* The maximum size up to which we process coredumps. We use 1G on 32-bit systems, and 32G on 64-bit systems */ |
e677041e LP |
60 | #if __SIZEOF_POINTER__ == 4 |
61 | #define PROCESS_SIZE_MAX ((uint64_t) (1LLU*1024LLU*1024LLU*1024LLU)) | |
62 | #elif __SIZEOF_POINTER__ == 8 | |
63 | #define PROCESS_SIZE_MAX ((uint64_t) (32LLU*1024LLU*1024LLU*1024LLU)) | |
64 | #else | |
65 | #error "Unexpected pointer size" | |
66 | #endif | |
34c10968 | 67 | |
bdfd7b2c | 68 | /* The maximum size up to which we leave the coredump around on disk */ |
34c10968 LP |
69 | #define EXTERNAL_SIZE_MAX PROCESS_SIZE_MAX |
70 | ||
bdfd7b2c | 71 | /* The maximum size up to which we store the coredump in the journal */ |
25cad95c | 72 | #ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION |
34c10968 | 73 | #define JOURNAL_SIZE_MAX ((size_t) (767LU*1024LU*1024LU)) |
25cad95c YW |
74 | #else |
75 | /* oss-fuzz limits memory usage. */ | |
76 | #define JOURNAL_SIZE_MAX ((size_t) (10LU*1024LU*1024LU)) | |
77 | #endif | |
f5e04665 | 78 | |
587f2a5e LB |
79 | /* When checking for available memory and setting lower limits, don't |
80 | * go below 4MB for writing core files to storage. */ | |
81 | #define PROCESS_SIZE_MIN (4U*1024U*1024U) | |
82 | ||
c4aa09b0 | 83 | /* Make sure to not make this larger than the maximum journal entry |
27f931d1 | 84 | * size. See DATA_SIZE_MAX in journal-importer.h. */ |
874bc134 | 85 | assert_cc(JOURNAL_SIZE_MAX <= DATA_SIZE_MAX); |
f5e04665 | 86 | |
0aea6872 MS |
87 | #define MOUNT_TREE_ROOT "/run/systemd/mount-rootfs" |
88 | ||
typedef enum {
        /* These values double as array indexes into the process metadata cache.
         *
         * The leading entries of the cache hold the same metadata the kernel passes to us via argv[],
         * i.e. the expansions of the specifiers in our pattern in /proc/sys/kernel/core_pattern, see
         * core(5). */

        META_ARGV_PID,          /* %P: as seen in the initial pid namespace */
        META_ARGV_UID,          /* %u: as seen in the initial user namespace */
        META_ARGV_GID,          /* %g: as seen in the initial user namespace */
        META_ARGV_SIGNAL,       /* %s: number of signal causing dump */
        META_ARGV_TIMESTAMP,    /* %t: time of dump, expressed as seconds since the Epoch (we expand this to microsecond granularity) */
        META_ARGV_RLIMIT,       /* %c: core file size soft resource limit */
        _META_ARGV_REQUIRED,

        /* Everything from here on was added to kernel/core_pattern at a later point, and hence might
         * be missing. */
        META_ARGV_HOSTNAME = _META_ARGV_REQUIRED,  /* %h: hostname */
        META_ARGV_DUMPABLE,     /* %d: as set by the kernel */
        META_ARGV_PIDFD,        /* %F: pidfd of the process, since v6.16 */

        /* New fields must be appended right here, so that callers which do not know about them keep
         * working. */
        _META_ARGV_MAX,

        /* The indexes below are cached for a couple of special fields we need quick access to, for
         * naming coredump files and attaching xattrs. Unlike the ones above they are retrieved from
         * the runtime environment. */

        META_COMM = _META_ARGV_MAX,

        /* The remaining ones are like the previous one, except that we do not fail if one of them is
         * missing in a message sent over the socket. */

        META_EXE,
        META_UNIT,
        META_PROC_AUXV,
        _META_MAX
} meta_argv_t;
f5e04665 | 126 | |
f46c706b | 127 | static const char * const meta_field_names[_META_MAX] = { |
510a1466 ZJS |
128 | [META_ARGV_PID] = "COREDUMP_PID=", |
129 | [META_ARGV_UID] = "COREDUMP_UID=", | |
130 | [META_ARGV_GID] = "COREDUMP_GID=", | |
131 | [META_ARGV_SIGNAL] = "COREDUMP_SIGNAL=", | |
132 | [META_ARGV_TIMESTAMP] = "COREDUMP_TIMESTAMP=", | |
133 | [META_ARGV_RLIMIT] = "COREDUMP_RLIMIT=", | |
134 | [META_ARGV_HOSTNAME] = "COREDUMP_HOSTNAME=", | |
0c49e004 | 135 | [META_ARGV_DUMPABLE] = "COREDUMP_DUMPABLE=", |
868d9557 | 136 | [META_ARGV_PIDFD] = "COREDUMP_BY_PIDFD=", |
510a1466 ZJS |
137 | [META_COMM] = "COREDUMP_COMM=", |
138 | [META_EXE] = "COREDUMP_EXE=", | |
139 | [META_UNIT] = "COREDUMP_UNIT=", | |
3e4d0f6c | 140 | [META_PROC_AUXV] = "COREDUMP_PROC_AUXV=", |
f46c706b FB |
141 | }; |
142 | ||
143 | typedef struct Context { | |
313537da | 144 | PidRef pidref; |
9764bca9 NR |
145 | uid_t uid; |
146 | gid_t gid; | |
0c49e004 | 147 | unsigned dumpable; |
960b0458 | 148 | int signo; |
19455dd6 | 149 | uint64_t rlimit; |
f46c706b FB |
150 | bool is_pid1; |
151 | bool is_journald; | |
e6a8687b | 152 | bool got_pidfd; |
313537da LP |
153 | int mount_tree_fd; |
154 | ||
155 | /* These point into external memory, are not owned by this object */ | |
156 | const char *meta[_META_MAX]; | |
157 | size_t meta_size[_META_MAX]; | |
f46c706b FB |
158 | } Context; |
159 | ||
/* Initializer for an empty Context: no process reference, invalid UID/GID, no mount tree fd. All other
 * fields are zero-initialized. */
#define CONTEXT_NULL                            \
        (Context) {                             \
                .pidref = PIDREF_NULL,          \
                .uid = UID_INVALID,             \
                .gid = GID_INVALID,             \
                .mount_tree_fd = -EBADF,        \
        }
167 | ||
/* Where coredumps get stored, as selected by the Storage= setting. */
typedef enum CoredumpStorage {
        COREDUMP_STORAGE_NONE,
        COREDUMP_STORAGE_EXTERNAL,
        COREDUMP_STORAGE_JOURNAL,
        _COREDUMP_STORAGE_MAX,
        _COREDUMP_STORAGE_INVALID = -EINVAL,
} CoredumpStorage;
175 | ||
34c10968 | 176 | static const char* const coredump_storage_table[_COREDUMP_STORAGE_MAX] = { |
510a1466 | 177 | [COREDUMP_STORAGE_NONE] = "none", |
34c10968 | 178 | [COREDUMP_STORAGE_EXTERNAL] = "external", |
510a1466 | 179 | [COREDUMP_STORAGE_JOURNAL] = "journal", |
34c10968 LP |
180 | }; |
181 | ||
182 | DEFINE_PRIVATE_STRING_TABLE_LOOKUP(coredump_storage, CoredumpStorage); | |
42efe5be | 183 | static DEFINE_CONFIG_PARSE_ENUM(config_parse_coredump_storage, coredump_storage, CoredumpStorage); |
34727273 ZJS |
184 | |
185 | static CoredumpStorage arg_storage = COREDUMP_STORAGE_EXTERNAL; | |
8c9571d0 | 186 | static bool arg_compress = true; |
59f448cf LP |
187 | static uint64_t arg_process_size_max = PROCESS_SIZE_MAX; |
188 | static uint64_t arg_external_size_max = EXTERNAL_SIZE_MAX; | |
6e2b4a69 | 189 | static uint64_t arg_journal_size_max = JOURNAL_SIZE_MAX; |
f5fbe71d YW |
190 | static uint64_t arg_keep_free = UINT64_MAX; |
191 | static uint64_t arg_max_use = UINT64_MAX; | |
e26a7e08 MS |
192 | #if HAVE_DWFL_SET_SYSROOT |
193 | static bool arg_enter_namespace = false; | |
194 | #endif | |
34c10968 | 195 | |
313537da LP |
196 | static void context_done(Context *c) { |
197 | assert(c); | |
198 | ||
199 | pidref_done(&c->pidref); | |
200 | c->mount_tree_fd = safe_close(c->mount_tree_fd); | |
201 | } | |
202 | ||
34c10968 | 203 | static int parse_config(void) { |
34c10968 | 204 | static const ConfigTableItem items[] = { |
68511ceb MS |
205 | { "Coredump", "Storage", config_parse_coredump_storage, 0, &arg_storage }, |
206 | { "Coredump", "Compress", config_parse_bool, 0, &arg_compress }, | |
207 | { "Coredump", "ProcessSizeMax", config_parse_iec_uint64, 0, &arg_process_size_max }, | |
208 | { "Coredump", "ExternalSizeMax", config_parse_iec_uint64_infinity, 0, &arg_external_size_max }, | |
209 | { "Coredump", "JournalSizeMax", config_parse_iec_size, 0, &arg_journal_size_max }, | |
210 | { "Coredump", "KeepFree", config_parse_iec_uint64, 0, &arg_keep_free }, | |
211 | { "Coredump", "MaxUse", config_parse_iec_uint64, 0, &arg_max_use }, | |
212 | #if HAVE_DWFL_SET_SYSROOT | |
e26a7e08 | 213 | { "Coredump", "EnterNamespace", config_parse_bool, 0, &arg_enter_namespace }, |
68511ceb | 214 | #else |
8ec2e177 | 215 | { "Coredump", "EnterNamespace", config_parse_warn_compat, DISABLED_CONFIGURATION, NULL }, |
68511ceb | 216 | #endif |
34c10968 LP |
217 | {} |
218 | }; | |
219 | ||
4a78074f LP |
220 | int r; |
221 | ||
6378f257 | 222 | r = config_parse_standard_file_with_dropins( |
e5abff37 | 223 | "systemd/coredump.conf", |
4a78074f LP |
224 | "Coredump\0", |
225 | config_item_table_lookup, | |
226 | items, | |
227 | CONFIG_PARSE_WARN, | |
228 | /* userdata= */ NULL); | |
229 | if (r < 0) | |
230 | return r; | |
231 | ||
232 | /* Let's make sure we fix up the maximum size we send to the journal here on the client side, for | |
233 | * efficiency reasons. journald wouldn't accept anything larger anyway. */ | |
234 | if (arg_journal_size_max > JOURNAL_SIZE_MAX) { | |
235 | log_warning("JournalSizeMax= set to larger value (%s) than journald would accept (%s), lowering automatically.", | |
236 | FORMAT_BYTES(arg_journal_size_max), FORMAT_BYTES(JOURNAL_SIZE_MAX)); | |
237 | arg_journal_size_max = JOURNAL_SIZE_MAX; | |
238 | } | |
239 | ||
240 | return 0; | |
34c10968 LP |
241 | } |
242 | ||
a1e92eee | 243 | static uint64_t storage_size_max(void) { |
ee0449fd ZJS |
244 | if (arg_storage == COREDUMP_STORAGE_EXTERNAL) |
245 | return arg_external_size_max; | |
246 | if (arg_storage == COREDUMP_STORAGE_JOURNAL) | |
247 | return arg_journal_size_max; | |
248 | assert(arg_storage == COREDUMP_STORAGE_NONE); | |
249 | return 0; | |
73a99163 ZJS |
250 | } |
251 | ||
/* Grants the owning user read access to the coredump via an ACL entry, if that is permitted.
 *
 * Nothing is done when allow_user is false, when the UID is a system, dynamic, greeter or the "nobody"
 * UID, or when built without ACL support. Returns 0 on success or if nothing was done, negative on
 * failure to adjust the ACL. */
static int fix_acl(int fd, uid_t uid, bool allow_user) {
        assert(fd >= 0);
        assert(uid_is_valid(uid));

#if HAVE_ACL
        /* Users may not read coredumps if the uid or capabilities were changed (!allow_user), and only
         * normal users get to read (but not write or delete) their own coredumps. */
        if (allow_user &&
            !uid_is_system(uid) &&
            !uid_is_dynamic(uid) &&
            !uid_is_greeter(uid) &&
            uid != UID_NOBODY) {
                int r;

                r = fd_add_uid_acl_permission(fd, uid, ACL_READ);
                if (r < 0)
                        return log_error_errno(r, "Failed to adjust ACL of the coredump: %m");
        }
#endif

        return 0;
}
274 | ||
f46c706b | 275 | static int fix_xattr(int fd, const Context *context) { |
f46c706b | 276 | static const char * const xattrs[_META_MAX] = { |
510a1466 ZJS |
277 | [META_ARGV_PID] = "user.coredump.pid", |
278 | [META_ARGV_UID] = "user.coredump.uid", | |
279 | [META_ARGV_GID] = "user.coredump.gid", | |
280 | [META_ARGV_SIGNAL] = "user.coredump.signal", | |
281 | [META_ARGV_TIMESTAMP] = "user.coredump.timestamp", | |
282 | [META_ARGV_RLIMIT] = "user.coredump.rlimit", | |
283 | [META_ARGV_HOSTNAME] = "user.coredump.hostname", | |
284 | [META_COMM] = "user.coredump.comm", | |
285 | [META_EXE] = "user.coredump.exe", | |
0cd77f97 LP |
286 | }; |
287 | ||
34c10968 LP |
288 | int r = 0; |
289 | ||
b59233e6 LP |
290 | assert(fd >= 0); |
291 | ||
60ecc386 | 292 | /* Attach some metadata to coredumps via extended attributes. Just because we can. */ |
34c10968 | 293 | |
fe96c0f8 | 294 | for (unsigned i = 0; i < _META_MAX; i++) { |
1eef15b1 ZJS |
295 | int k; |
296 | ||
f46c706b | 297 | if (isempty(context->meta[i]) || !xattrs[i]) |
0cd77f97 | 298 | continue; |
34c10968 | 299 | |
60ecc386 ZJS |
300 | k = RET_NERRNO(fsetxattr(fd, xattrs[i], context->meta[i], strlen(context->meta[i]), XATTR_CREATE)); |
301 | RET_GATHER(r, k); | |
0cd77f97 | 302 | } |
34c10968 LP |
303 | |
304 | return r; | |
305 | } | |
306 | ||
/* Escapes ".", "/" and " " in a string via xescape() before it is used as a file name component. */
#define filename_escape(s) xescape((s), "./ ")
34c10968 | 308 | |
/* Returns a printable name for a temporary file path, substituting a placeholder if the path is NULL. */
static const char *coredump_tmpfile_name(const char *s) {
        if (s)
                return s;

        return "(unnamed temporary file)";
}
312 | ||
2d0bcf1e | 313 | static int fix_permissions_and_link( |
b59233e6 LP |
314 | int fd, |
315 | const char *filename, | |
316 | const char *target, | |
f46c706b | 317 | const Context *context, |
3e4d0f6c | 318 | bool allow_user) { |
b59233e6 | 319 | |
03532f0a LP |
320 | int r; |
321 | ||
b59233e6 | 322 | assert(fd >= 0); |
b59233e6 | 323 | assert(target); |
3c171f0b | 324 | assert(context); |
cfd652ed ZJS |
325 | |
326 | /* Ignore errors on these */ | |
3c171f0b | 327 | (void) fchmod(fd, 0640); |
9764bca9 | 328 | (void) fix_acl(fd, context->uid, allow_user); |
3c171f0b | 329 | (void) fix_xattr(fd, context); |
cfd652ed | 330 | |
74402bf0 | 331 | r = link_tmpfile(fd, filename, target, LINK_TMPFILE_SYNC); |
03532f0a LP |
332 | if (r < 0) |
333 | return log_error_errno(r, "Failed to move coredump %s into place: %m", target); | |
cfd652ed ZJS |
334 | |
335 | return 0; | |
336 | } | |
337 | ||
5125a0b8 LP |
338 | static int maybe_remove_external_coredump( |
339 | const Context *c, | |
340 | const char *filename, | |
341 | uint64_t size) { | |
342 | ||
343 | assert(c); | |
cfd652ed | 344 | |
5125a0b8 LP |
345 | /* Returns true if might remove, false if will not remove, < 0 on error. */ |
346 | ||
347 | if (arg_storage != COREDUMP_STORAGE_NONE && | |
348 | (c->is_pid1 || c->is_journald)) /* Always keep around in case of journald/pid1, since we cannot rely on the journal to accept them */ | |
349 | return false; | |
cfd652ed | 350 | |
fc6cec86 | 351 | if (arg_storage == COREDUMP_STORAGE_EXTERNAL && |
cfd652ed | 352 | size <= arg_external_size_max) |
5125a0b8 | 353 | return false; |
cfd652ed ZJS |
354 | |
355 | if (!filename) | |
5125a0b8 | 356 | return true; |
cfd652ed | 357 | |
4a62c710 MS |
358 | if (unlink(filename) < 0 && errno != ENOENT) |
359 | return log_error_errno(errno, "Failed to unlink %s: %m", filename); | |
cfd652ed | 360 | |
5125a0b8 | 361 | return true; |
cfd652ed ZJS |
362 | } |
363 | ||
f46c706b | 364 | static int make_filename(const Context *context, char **ret) { |
b59233e6 | 365 | _cleanup_free_ char *c = NULL, *u = NULL, *p = NULL, *t = NULL; |
a7f7d1bd | 366 | sd_id128_t boot = {}; |
34c10968 LP |
367 | int r; |
368 | ||
3c171f0b | 369 | assert(context); |
34c10968 | 370 | |
f46c706b | 371 | c = filename_escape(context->meta[META_COMM]); |
34c10968 | 372 | if (!c) |
b59233e6 | 373 | return -ENOMEM; |
34c10968 | 374 | |
f46c706b | 375 | u = filename_escape(context->meta[META_ARGV_UID]); |
0dc5d23c | 376 | if (!u) |
b59233e6 | 377 | return -ENOMEM; |
34c10968 LP |
378 | |
379 | r = sd_id128_get_boot(&boot); | |
b59233e6 | 380 | if (r < 0) |
34c10968 | 381 | return r; |
34c10968 | 382 | |
f46c706b | 383 | p = filename_escape(context->meta[META_ARGV_PID]); |
b59233e6 LP |
384 | if (!p) |
385 | return -ENOMEM; | |
386 | ||
f46c706b | 387 | t = filename_escape(context->meta[META_ARGV_TIMESTAMP]); |
b59233e6 LP |
388 | if (!t) |
389 | return -ENOMEM; | |
390 | ||
391 | if (asprintf(ret, | |
64a5384f | 392 | "/var/lib/systemd/coredump/core.%s.%s." SD_ID128_FORMAT_STR ".%s.%s", |
34c10968 | 393 | c, |
0dc5d23c | 394 | u, |
34c10968 LP |
395 | SD_ID128_FORMAT_VAL(boot), |
396 | p, | |
b59233e6 LP |
397 | t) < 0) |
398 | return -ENOMEM; | |
399 | ||
400 | return 0; | |
401 | } | |
402 | ||
3e4d0f6c ZJS |
403 | static int grant_user_access(int core_fd, const Context *context) { |
404 | int at_secure = -1; | |
405 | uid_t uid = UID_INVALID, euid = UID_INVALID; | |
406 | uid_t gid = GID_INVALID, egid = GID_INVALID; | |
407 | int r; | |
408 | ||
409 | assert(core_fd >= 0); | |
410 | assert(context); | |
411 | ||
412 | if (!context->meta[META_PROC_AUXV]) | |
413 | return log_warning_errno(SYNTHETIC_ERRNO(ENODATA), "No auxv data, not adjusting permissions."); | |
414 | ||
415 | uint8_t elf[EI_NIDENT]; | |
416 | errno = 0; | |
417 | if (pread(core_fd, &elf, sizeof(elf), 0) != sizeof(elf)) | |
418 | return log_warning_errno(errno_or_else(EIO), | |
419 | "Failed to pread from coredump fd: %s", STRERROR_OR_EOF(errno)); | |
420 | ||
421 | if (elf[EI_MAG0] != ELFMAG0 || | |
422 | elf[EI_MAG1] != ELFMAG1 || | |
423 | elf[EI_MAG2] != ELFMAG2 || | |
424 | elf[EI_MAG3] != ELFMAG3 || | |
425 | elf[EI_VERSION] != EV_CURRENT) | |
426 | return log_info_errno(SYNTHETIC_ERRNO(EUCLEAN), | |
427 | "Core file does not have ELF header, not adjusting permissions."); | |
428 | if (!IN_SET(elf[EI_CLASS], ELFCLASS32, ELFCLASS64) || | |
429 | !IN_SET(elf[EI_DATA], ELFDATA2LSB, ELFDATA2MSB)) | |
430 | return log_info_errno(SYNTHETIC_ERRNO(EUCLEAN), | |
431 | "Core file has strange ELF class, not adjusting permissions."); | |
432 | ||
433 | if ((elf[EI_DATA] == ELFDATA2LSB) != (__BYTE_ORDER == __LITTLE_ENDIAN)) | |
434 | return log_info_errno(SYNTHETIC_ERRNO(EUCLEAN), | |
435 | "Core file has non-native endianness, not adjusting permissions."); | |
436 | ||
cb38fdbe ZJS |
437 | r = parse_auxv(LOG_WARNING, |
438 | /* elf_class= */ elf[EI_CLASS], | |
439 | context->meta[META_PROC_AUXV], | |
440 | context->meta_size[META_PROC_AUXV], | |
441 | &at_secure, &uid, &euid, &gid, &egid); | |
3e4d0f6c ZJS |
442 | if (r < 0) |
443 | return r; | |
444 | ||
0c49e004 ZJS |
445 | /* We allow access if %d/dumpable on the command line was exactly 1, we got all the data, |
446 | * at_secure is not set, and the uid/gid match euid/egid. */ | |
3e4d0f6c | 447 | bool ret = |
76e0ab49 | 448 | context->dumpable == SUID_DUMP_USER && |
3e4d0f6c ZJS |
449 | at_secure == 0 && |
450 | uid != UID_INVALID && euid != UID_INVALID && uid == euid && | |
451 | gid != GID_INVALID && egid != GID_INVALID && gid == egid; | |
0c49e004 | 452 | log_debug("Will %s access (dumpable=%u uid="UID_FMT " euid="UID_FMT " gid="GID_FMT " egid="GID_FMT " at_secure=%s)", |
3e4d0f6c | 453 | ret ? "permit" : "restrict", |
0c49e004 | 454 | context->dumpable, |
3e4d0f6c ZJS |
455 | uid, euid, gid, egid, yes_no(at_secure)); |
456 | return ret; | |
457 | } | |
458 | ||
b59233e6 | 459 | static int save_external_coredump( |
f46c706b | 460 | const Context *context, |
3c171f0b | 461 | int input_fd, |
b59233e6 | 462 | char **ret_filename, |
5f3e0a74 HW |
463 | int *ret_node_fd, |
464 | int *ret_data_fd, | |
0cd4e913 | 465 | uint64_t *ret_size, |
587f2a5e | 466 | uint64_t *ret_compressed_size, |
cc4419ed | 467 | bool *ret_truncated) { |
b59233e6 | 468 | |
587f2a5e LB |
469 | _cleanup_(unlink_and_freep) char *tmp = NULL; |
470 | _cleanup_free_ char *fn = NULL; | |
254d1313 | 471 | _cleanup_close_ int fd = -EBADF; |
19455dd6 | 472 | uint64_t process_limit, max_size; |
587f2a5e | 473 | bool truncated, storage_on_tmpfs; |
b59233e6 LP |
474 | struct stat st; |
475 | int r; | |
476 | ||
3c171f0b | 477 | assert(context); |
b59233e6 | 478 | assert(ret_filename); |
5f3e0a74 HW |
479 | assert(ret_node_fd); |
480 | assert(ret_data_fd); | |
b59233e6 | 481 | assert(ret_size); |
587f2a5e LB |
482 | assert(ret_compressed_size); |
483 | assert(ret_truncated); | |
b59233e6 | 484 | |
19455dd6 | 485 | if (context->rlimit < page_size()) |
f46c706b | 486 | /* Is coredumping disabled? Then don't bother saving/processing the |
3a559f22 | 487 | * coredump. Anything below PAGE_SIZE cannot give a readable coredump |
f46c706b FB |
488 | * (the kernel uses ELF_EXEC_PAGESIZE which is not easily accessible, but |
489 | * is usually the same as PAGE_SIZE. */ | |
baaa35ad ZJS |
490 | return log_info_errno(SYNTHETIC_ERRNO(EBADSLT), |
491 | "Resource limits disable core dumping for process %s (%s).", | |
f46c706b | 492 | context->meta[META_ARGV_PID], context->meta[META_COMM]); |
bdfd7b2c | 493 | |
ee0449fd | 494 | process_limit = MAX(arg_process_size_max, storage_size_max()); |
baaa35ad ZJS |
495 | if (process_limit == 0) |
496 | return log_debug_errno(SYNTHETIC_ERRNO(EBADSLT), | |
497 | "Limits for coredump processing and storage are both 0, not dumping core."); | |
ee0449fd | 498 | |
bdfd7b2c | 499 | /* Never store more than the process configured, or than we actually shall keep or process */ |
19455dd6 | 500 | max_size = MIN(context->rlimit, process_limit); |
bdfd7b2c | 501 | |
3c171f0b | 502 | r = make_filename(context, &fn); |
23bbb0de MS |
503 | if (r < 0) |
504 | return log_error_errno(r, "Failed to determine coredump file name: %m"); | |
34c10968 | 505 | |
1fbe8d0c | 506 | (void) mkdir_parents_label(fn, 0755); |
803a3464 | 507 | |
03532f0a | 508 | fd = open_tmpfile_linkable(fn, O_RDWR|O_CLOEXEC, &tmp); |
4a62c710 | 509 | if (fd < 0) |
03532f0a | 510 | return log_error_errno(fd, "Failed to create temporary file for coredump %s: %m", fn); |
803a3464 | 511 | |
587f2a5e LB |
512 | /* If storage is on tmpfs, the kernel oomd might kill us if there's MemoryMax set on |
513 | * the service or the slice it belongs to. This is common on low-resources systems, | |
514 | * to avoid crashing processes to take away too many system resources. | |
515 | * Check the cgroup settings, and set max_size to a bit less than half of the | |
516 | * available memory left to the process. | |
517 | * Then, attempt to write the core file uncompressed first - if the write gets | |
518 | * interrupted, we know we won't be able to write it all, so instead compress what | |
519 | * was written so far, delete the uncompressed truncated core, and then continue | |
520 | * compressing from STDIN. Given the compressed core cannot be larger than the | |
521 | * uncompressed one, and 1KB for metadata is accounted for in the calculation, we | |
522 | * should be able to at least store the full compressed core file. */ | |
523 | ||
524 | storage_on_tmpfs = fd_is_temporary_fs(fd) > 0; | |
525 | if (storage_on_tmpfs && arg_compress) { | |
526 | _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL; | |
527 | uint64_t cgroup_limit = UINT64_MAX; | |
528 | struct statvfs sv; | |
529 | ||
530 | /* If we can't get the cgroup limit, just ignore it, but don't fail, | |
531 | * try anyway with the config settings. */ | |
532 | r = sd_bus_default_system(&bus); | |
533 | if (r < 0) | |
534 | log_info_errno(r, "Failed to connect to system bus, skipping MemoryAvailable check: %m"); | |
535 | else { | |
536 | _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; | |
537 | ||
538 | r = sd_bus_get_property_trivial( | |
539 | bus, | |
540 | "org.freedesktop.systemd1", | |
541 | "/org/freedesktop/systemd1/unit/self", | |
542 | "org.freedesktop.systemd1.Service", | |
543 | "MemoryAvailable", | |
544 | &error, | |
545 | 't', &cgroup_limit); | |
546 | if (r < 0) | |
547 | log_warning_errno(r, | |
548 | "Failed to query MemoryAvailable for current unit, " | |
549 | "falling back to static config settings: %s", | |
550 | bus_error_message(&error, r)); | |
551 | } | |
803a3464 | 552 | |
e6b25082 | 553 | /* First, ensure we are not going to go over the cgroup limit */ |
587f2a5e | 554 | max_size = MIN(cgroup_limit, max_size); |
e6b25082 LB |
555 | /* tmpfs might get full quickly, so check the available space too. But don't worry about |
556 | * errors here, failing to access the storage location will be better logged when writing to | |
557 | * it. */ | |
8facac5f | 558 | if (fstatvfs(fd, &sv) >= 0) |
587f2a5e | 559 | max_size = MIN((uint64_t)sv.f_frsize * (uint64_t)sv.f_bfree, max_size); |
e6b25082 LB |
560 | /* Impose a lower minimum, otherwise we will miss the basic headers. */ |
561 | max_size = MAX(PROCESS_SIZE_MIN, max_size); | |
562 | /* Ensure we can always switch to compressing on the fly in case we are running out of space | |
563 | * by keeping half of the space/memory available, plus 1KB metadata overhead from the | |
564 | * compression algorithm. */ | |
565 | max_size = LESS_BY(max_size, 1024U) / 2; | |
566 | ||
567 | log_debug("Limiting core file size to %" PRIu64 " bytes due to cgroup and/or filesystem limits.", max_size); | |
7849c2ac TA |
568 | } |
569 | ||
587f2a5e LB |
570 | r = copy_bytes(input_fd, fd, max_size, 0); |
571 | if (r < 0) | |
572 | return log_error_errno(r, "Cannot store coredump of %s (%s): %m", | |
573 | context->meta[META_ARGV_PID], context->meta[META_COMM]); | |
574 | truncated = r == 1; | |
cfd652ed | 575 | |
3e4d0f6c ZJS |
576 | bool allow_user = grant_user_access(fd, context) > 0; |
577 | ||
587f2a5e LB |
578 | #if HAVE_COMPRESSION |
579 | if (arg_compress) { | |
580 | _cleanup_(unlink_and_freep) char *tmp_compressed = NULL; | |
581 | _cleanup_free_ char *fn_compressed = NULL; | |
254d1313 | 582 | _cleanup_close_ int fd_compressed = -EBADF; |
587f2a5e LB |
583 | uint64_t uncompressed_size = 0; |
584 | ||
86cbbc6d | 585 | if (lseek(fd, 0, SEEK_SET) < 0) |
587f2a5e | 586 | return log_error_errno(errno, "Failed to seek on coredump %s: %m", fn); |
cfd652ed | 587 | |
ee00684c | 588 | fn_compressed = strjoin(fn, default_compression_extension()); |
587f2a5e LB |
589 | if (!fn_compressed) |
590 | return log_oom(); | |
cfd652ed | 591 | |
03532f0a | 592 | fd_compressed = open_tmpfile_linkable(fn_compressed, O_RDWR|O_CLOEXEC, &tmp_compressed); |
587f2a5e LB |
593 | if (fd_compressed < 0) |
594 | return log_error_errno(fd_compressed, "Failed to create temporary file for coredump %s: %m", fn_compressed); | |
cfd652ed | 595 | |
587f2a5e LB |
596 | r = compress_stream(fd, fd_compressed, max_size, &uncompressed_size); |
597 | if (r < 0) | |
598 | return log_error_errno(r, "Failed to compress %s: %m", coredump_tmpfile_name(tmp_compressed)); | |
599 | ||
600 | if (truncated && storage_on_tmpfs) { | |
601 | uint64_t partial_uncompressed_size = 0; | |
602 | ||
603 | /* Uncompressed write was truncated and we are writing to tmpfs: delete | |
604 | * the uncompressed core, and compress the remaining part from STDIN. */ | |
605 | ||
606 | tmp = unlink_and_free(tmp); | |
607 | fd = safe_close(fd); | |
608 | ||
609 | r = compress_stream(input_fd, fd_compressed, max_size, &partial_uncompressed_size); | |
610 | if (r < 0) | |
611 | return log_error_errno(r, "Failed to compress %s: %m", coredump_tmpfile_name(tmp_compressed)); | |
612 | uncompressed_size += partial_uncompressed_size; | |
b59233e6 LP |
613 | } |
614 | ||
2d0bcf1e | 615 | r = fix_permissions_and_link(fd_compressed, tmp_compressed, fn_compressed, context, allow_user); |
cfd652ed | 616 | if (r < 0) |
587f2a5e | 617 | return r; |
b59233e6 | 618 | |
587f2a5e LB |
619 | if (fstat(fd_compressed, &st) < 0) |
620 | return log_error_errno(errno, | |
621 | "Failed to fstat core file %s: %m", | |
622 | coredump_tmpfile_name(tmp_compressed)); | |
cfd652ed | 623 | |
587f2a5e LB |
624 | *ret_filename = TAKE_PTR(fn_compressed); /* compressed */ |
625 | *ret_node_fd = TAKE_FD(fd_compressed); /* compressed */ | |
587f2a5e LB |
626 | *ret_data_fd = TAKE_FD(fd); |
627 | *ret_size = uncompressed_size; | |
dc8e3118 | 628 | *ret_compressed_size = (uint64_t) st.st_size; /* compressed */ |
587f2a5e | 629 | *ret_truncated = truncated; |
cfd652ed | 630 | |
cfd652ed | 631 | return 0; |
34c10968 | 632 | } |
3b1a55e1 | 633 | #endif |
5f3e0a74 | 634 | |
587f2a5e LB |
635 | if (truncated) |
636 | log_struct(LOG_INFO, | |
08e86b15 | 637 | LOG_MESSAGE("Core file was truncated to %"PRIu64" bytes.", max_size), |
3cf6a3a3 YW |
638 | LOG_ITEM("SIZE_LIMIT=%"PRIu64, max_size), |
639 | LOG_MESSAGE_ID(SD_MESSAGE_TRUNCATED_CORE_STR)); | |
587f2a5e | 640 | |
2d0bcf1e | 641 | r = fix_permissions_and_link(fd, tmp, fn, context, allow_user); |
cfd652ed | 642 | if (r < 0) |
587f2a5e LB |
643 | return log_error_errno(r, "Failed to fix permissions and finalize coredump %s into %s: %m", coredump_tmpfile_name(tmp), fn); |
644 | ||
645 | if (fstat(fd, &st) < 0) | |
646 | return log_error_errno(errno, "Failed to fstat core file %s: %m", coredump_tmpfile_name(tmp)); | |
647 | ||
86cbbc6d | 648 | if (lseek(fd, 0, SEEK_SET) < 0) |
587f2a5e | 649 | return log_error_errno(errno, "Failed to seek on coredump %s: %m", fn); |
34c10968 | 650 | |
0cfb0971 | 651 | *ret_filename = TAKE_PTR(fn); |
dc8e3118 | 652 | *ret_node_fd = -EBADF; |
1cc6c93a | 653 | *ret_data_fd = TAKE_FD(fd); |
59f448cf | 654 | *ret_size = (uint64_t) st.st_size; |
dc8e3118 | 655 | *ret_compressed_size = UINT64_MAX; |
587f2a5e | 656 | *ret_truncated = truncated; |
34c10968 | 657 | |
34c10968 | 658 | return 0; |
34c10968 LP |
659 | } |
660 | ||
661 | static int allocate_journal_field(int fd, size_t size, char **ret, size_t *ret_size) { | |
662 | _cleanup_free_ char *field = NULL; | |
663 | ssize_t n; | |
664 | ||
8d4e028f | 665 | assert(fd >= 0); |
34c10968 LP |
666 | assert(ret); |
667 | assert(ret_size); | |
668 | ||
86cbbc6d | 669 | if (lseek(fd, 0, SEEK_SET) < 0) |
4a62c710 | 670 | return log_warning_errno(errno, "Failed to seek: %m"); |
803a3464 | 671 | |
34c10968 | 672 | field = malloc(9 + size); |
a73c74db LP |
673 | if (!field) |
674 | return log_warning_errno(SYNTHETIC_ERRNO(ENOMEM), | |
675 | "Failed to allocate memory for coredump, coredump will not be stored."); | |
34c10968 LP |
676 | |
677 | memcpy(field, "COREDUMP=", 9); | |
678 | ||
a73c74db LP |
679 | /* NB: simple read() would fail for overly large coredumps, since read() on Linux can only deal with |
680 | * 0x7ffff000 bytes max. Hence call things in a loop. */ | |
681 | n = loop_read(fd, field + 9, size, /* do_poll= */ false); | |
23bbb0de MS |
682 | if (n < 0) |
683 | return log_error_errno((int) n, "Failed to read core data: %m"); | |
baaa35ad | 684 | if ((size_t) n < size) |
4e494e6a | 685 | return log_error_errno(SYNTHETIC_ERRNO(EIO), "Core data too short."); |
34c10968 | 686 | |
1cc6c93a | 687 | *ret = TAKE_PTR(field); |
34c10968 LP |
688 | *ret_size = size + 9; |
689 | ||
34c10968 LP |
690 | return 0; |
691 | } | |
803a3464 | 692 | |
3f132692 JF |
693 | /* Joins /proc/[pid]/fd/ and /proc/[pid]/fdinfo/ into the following lines: |
694 | * 0:/dev/pts/23 | |
695 | * pos: 0 | |
696 | * flags: 0100002 | |
697 | * | |
698 | * 1:/dev/pts/23 | |
699 | * pos: 0 | |
700 | * flags: 0100002 | |
701 | * | |
702 | * 2:/dev/pts/23 | |
703 | * pos: 0 | |
704 | * flags: 0100002 | |
705 | * EOF | |
706 | */ | |
2485b7e2 YW |
707 | static int compose_open_fds(pid_t pid, char **ret) { |
708 | _cleanup_(memstream_done) MemStream m = {}; | |
4d84bc2f | 709 | _cleanup_closedir_ DIR *proc_fd_dir = NULL; |
254d1313 | 710 | _cleanup_close_ int proc_fdinfo_fd = -EBADF; |
59059b4a | 711 | const char *fddelim = "", *path; |
2485b7e2 | 712 | FILE *stream; |
7b26ea6f | 713 | int r; |
3f132692 JF |
714 | |
715 | assert(pid >= 0); | |
2485b7e2 | 716 | assert(ret); |
3f132692 | 717 | |
59059b4a | 718 | path = procfs_file_alloca(pid, "fd"); |
3f132692 | 719 | proc_fd_dir = opendir(path); |
59059b4a ZJS |
720 | if (!proc_fd_dir) |
721 | return -errno; | |
3f132692 | 722 | |
4d84bc2f | 723 | proc_fdinfo_fd = openat(dirfd(proc_fd_dir), "../fdinfo", O_DIRECTORY|O_NOFOLLOW|O_CLOEXEC|O_PATH); |
59059b4a ZJS |
724 | if (proc_fdinfo_fd < 0) |
725 | return -errno; | |
3f132692 | 726 | |
2485b7e2 | 727 | stream = memstream_init(&m); |
3f132692 JF |
728 | if (!stream) |
729 | return -ENOMEM; | |
730 | ||
af3b864d | 731 | FOREACH_DIRENT(de, proc_fd_dir, return -errno) { |
3f132692 | 732 | _cleanup_fclose_ FILE *fdinfo = NULL; |
4d84bc2f | 733 | _cleanup_free_ char *fdname = NULL; |
254d1313 | 734 | _cleanup_close_ int fd = -EBADF; |
3f132692 | 735 | |
af3b864d | 736 | r = readlinkat_malloc(dirfd(proc_fd_dir), de->d_name, &fdname); |
3f132692 JF |
737 | if (r < 0) |
738 | return r; | |
739 | ||
af3b864d | 740 | fprintf(stream, "%s%s:%s\n", fddelim, de->d_name, fdname); |
3f132692 JF |
741 | fddelim = "\n"; |
742 | ||
743 | /* Use the directory entry from /proc/[pid]/fd with /proc/[pid]/fdinfo */ | |
af3b864d | 744 | fd = openat(proc_fdinfo_fd, de->d_name, O_NOFOLLOW|O_CLOEXEC|O_RDONLY); |
59059b4a | 745 | if (fd < 0) |
3f132692 JF |
746 | continue; |
747 | ||
b46c3e49 VC |
748 | fdinfo = take_fdopen(&fd, "r"); |
749 | if (!fdinfo) | |
3f132692 JF |
750 | continue; |
751 | ||
7b26ea6f LP |
752 | for (;;) { |
753 | _cleanup_free_ char *line = NULL; | |
754 | ||
755 | r = read_line(fdinfo, LONG_LINE_MAX, &line); | |
756 | if (r < 0) | |
757 | return r; | |
758 | if (r == 0) | |
759 | break; | |
760 | ||
0d536673 | 761 | fputs(line, stream); |
7b26ea6f | 762 | fputc('\n', stream); |
4d84bc2f | 763 | } |
3f132692 JF |
764 | } |
765 | ||
2485b7e2 | 766 | return memstream_finalize(&m, ret, NULL); |
3f132692 JF |
767 | } |
768 | ||
7ed03ce6 JF |
769 | /* Returns 1 if the parent was found. |
770 | * Returns 0 if there is not a process we can call the pid's | |
771 | * container parent (the pid's process isn't 'containerized'). | |
772 | * Returns a negative number on errors. | |
773 | */ | |
0b8b1332 | 774 | static int get_process_container_parent_cmdline(PidRef *pid, char** ret_cmdline) { |
83844031 | 775 | int r; |
7ed03ce6 | 776 | |
0b8b1332 LP |
777 | assert(pidref_is_set(pid)); |
778 | assert(!pidref_is_remote(pid)); | |
779 | ||
d6267b9b LP |
780 | r = pidref_from_same_root_fs(pid, &PIDREF_MAKE_FROM_PID(1)); |
781 | if (r < 0) | |
782 | return r; | |
783 | if (r > 0) { | |
784 | /* The process uses system root. */ | |
0b8b1332 | 785 | *ret_cmdline = NULL; |
7ed03ce6 JF |
786 | return 0; |
787 | } | |
788 | ||
0b8b1332 | 789 | _cleanup_(pidref_done) PidRef container_pid = PIDREF_NULL; |
ade39d9a | 790 | r = namespace_get_leader(pid, NAMESPACE_MOUNT, &container_pid); |
7ed03ce6 JF |
791 | if (r < 0) |
792 | return r; | |
793 | ||
0b8b1332 | 794 | r = pidref_get_cmdline(&container_pid, SIZE_MAX, PROCESS_CMDLINE_QUOTE_POSIX, ret_cmdline); |
d3cba4ea EV |
795 | if (r < 0) |
796 | return r; | |
797 | ||
798 | return 1; | |
7ed03ce6 JF |
799 | } |
800 | ||
f46c706b | 801 | static int change_uid_gid(const Context *context) { |
ea8eb370 LP |
802 | int r; |
803 | ||
804 | assert(context); | |
805 | ||
9764bca9 NR |
806 | uid_t uid = context->uid; |
807 | gid_t gid = context->gid; | |
34c10968 | 808 | |
28add648 | 809 | if (uid_is_system(uid)) { |
888e378d LP |
810 | const char *user = "systemd-coredump"; |
811 | ||
fafff8f1 | 812 | r = get_user_creds(&user, &uid, &gid, NULL, NULL, 0); |
888e378d LP |
813 | if (r < 0) { |
814 | log_warning_errno(r, "Cannot resolve %s user. Proceeding to dump core as root: %m", user); | |
815 | uid = gid = 0; | |
816 | } | |
888e378d | 817 | } |
3c171f0b LP |
818 | |
819 | return drop_privileges(uid, gid, 0); | |
820 | } | |
8c8549db | 821 | |
0aea6872 | 822 | static int attach_mount_tree(int mount_tree_fd) { |
68511ceb MS |
823 | int r; |
824 | ||
825 | assert(mount_tree_fd >= 0); | |
68511ceb | 826 | |
0aea6872 | 827 | r = detach_mount_namespace(); |
68511ceb | 828 | if (r < 0) |
0aea6872 | 829 | return log_warning_errno(r, "Failed to detach mount namespace: %m"); |
68511ceb | 830 | |
0aea6872 | 831 | r = mkdir_p_label(MOUNT_TREE_ROOT, 0555); |
68511ceb | 832 | if (r < 0) |
0aea6872 | 833 | return log_warning_errno(r, "Failed to create directory: %m"); |
68511ceb | 834 | |
0aea6872 | 835 | r = mount_setattr(mount_tree_fd, "", AT_EMPTY_PATH, |
8f8148cb MS |
836 | &(struct mount_attr) { |
837 | /* MOUNT_ATTR_NOSYMFOLLOW is left out on purpose to allow libdwfl to resolve symlinks. | |
838 | * libdwfl will use openat2() with RESOLVE_IN_ROOT so there is no risk of symlink escape. | |
839 | * https://sourceware.org/git/?p=elfutils.git;a=patch;h=06f0520f9a78b07c11c343181d552791dd630346 */ | |
840 | .attr_set = MOUNT_ATTR_RDONLY|MOUNT_ATTR_NOSUID|MOUNT_ATTR_NODEV|MOUNT_ATTR_NOEXEC, | |
841 | .propagation = MS_SLAVE, | |
842 | }, sizeof(struct mount_attr)); | |
68511ceb | 843 | if (r < 0) |
13cd1db0 | 844 | return log_warning_errno(errno, "Failed to change properties of mount tree: %m"); |
68511ceb | 845 | |
0aea6872 | 846 | r = move_mount(mount_tree_fd, "", -EBADF, MOUNT_TREE_ROOT, MOVE_MOUNT_F_EMPTY_PATH); |
68511ceb | 847 | if (r < 0) |
0aea6872 | 848 | return log_warning_errno(errno, "Failed to attach mount tree: %m"); |
68511ceb | 849 | |
68511ceb MS |
850 | return 0; |
851 | } | |
852 | ||
/* Processes one coredump and submits it to the journal.
 *
 * The core is read from 'input_fd' and (if possible) stored externally, privileges are dropped via
 * change_uid_gid(), a stack trace and ELF package metadata are extracted on a best-effort basis, and
 * finally all fields collected in 'iovw' (plus the ones added here: MESSAGE=, COREDUMP_FILENAME=,
 * COREDUMP_TRUNCATED=, COREDUMP_PACKAGE_*=, and optionally COREDUMP= itself) are sent to the journal
 * with sd_journal_sendv().
 *
 * Returns 0 on success (including the case where sending to the journal failed with -EAGAIN while
 * journald itself is the crashing process), or a negative errno-style value on failure. */
static int submit_coredump(
                const Context *context,
                struct iovec_wrapper *iovw,
                int input_fd) {

        _cleanup_(sd_json_variant_unrefp) sd_json_variant *json_metadata = NULL;
        _cleanup_close_ int coredump_fd = -EBADF, coredump_node_fd = -EBADF;
        _cleanup_free_ char *filename = NULL, *coredump_data = NULL, *stacktrace = NULL;
        const char *module_name, *root = NULL;
        uint64_t coredump_size = UINT64_MAX, coredump_compressed_size = UINT64_MAX;
        bool truncated = false, written = false;
        sd_json_variant *module_json;
        int r;

        assert(context);
        assert(iovw);
        assert(input_fd >= 0);

        /* Vacuum before we write anything again */
        (void) coredump_vacuum(-1, arg_keep_free, arg_max_use);

        /* Always stream the coredump to disk, if that's possible. A failure here is not fatal: we
         * merely remember it in 'written' and still log the crash further down. */
        written = save_external_coredump(
                        context, input_fd,
                        &filename, &coredump_node_fd, &coredump_fd,
                        &coredump_size, &coredump_compressed_size, &truncated) >= 0;
        if (written) {
                /* If we could write it to disk we can now process it. */
                /* If we don't want to keep the coredump on disk, remove it now, as later on we
                 * will lack the privileges for it. However, we keep the fd to it, so that we can
                 * still process it and log it. */
                r = maybe_remove_external_coredump(
                                context,
                                filename,
                                coredump_node_fd >= 0 ? coredump_compressed_size : coredump_size);
                if (r < 0)
                        return r;
                if (r == 0)
                        /* Zero return: record the file name in the journal entry */
                        (void) iovw_put_string_field(iovw, "COREDUMP_FILENAME=", filename);
                else if (arg_storage == COREDUMP_STORAGE_EXTERNAL)
                        log_info("The core will not be stored: size %"PRIu64" is greater than %"PRIu64" (the configured maximum)",
                                 coredump_node_fd >= 0 ? coredump_compressed_size : coredump_size, arg_external_size_max);

                /* Vacuum again, but exclude the coredump we just created */
                (void) coredump_vacuum(coredump_node_fd >= 0 ? coredump_node_fd : coredump_fd, arg_keep_free, arg_max_use);
        }

        /* If a mount tree fd was passed and could be attached, use it as root for ELF parsing below.
         * This is done before privileges are dropped. */
        if (context->mount_tree_fd >= 0 && attach_mount_tree(context->mount_tree_fd) >= 0)
                root = MOUNT_TREE_ROOT;

        /* Now, let's drop privileges to become the user who owns the segfaulted process and allocate the
         * coredump memory under the user's uid. This also ensures that the credentials journald will see are
         * the ones of the coredumping user, thus making sure the user gets access to the core dump. Let's
         * also get rid of all capabilities, if we run as root, we won't need them anymore. */
        r = change_uid_gid(context);
        if (r < 0)
                return log_error_errno(r, "Failed to drop privileges: %m");

        if (written) {
                /* Try to get a stack trace if we can */
                if (coredump_size > arg_process_size_max)
                        log_debug("Not generating stack trace: core size %"PRIu64" is greater "
                                  "than %"PRIu64" (the configured maximum)",
                                  coredump_size, arg_process_size_max);
                else if (coredump_fd >= 0) {
                        bool skip = startswith(context->meta[META_COMM], "systemd-coredum"); /* COMM is 16 bytes usually */

                        /* Best effort: the result is ignored, 'stacktrace' and 'json_metadata' simply
                         * stay NULL if nothing could be extracted. */
                        (void) parse_elf_object(coredump_fd,
                                                context->meta[META_EXE],
                                                root,
                                                /* fork_disable_dump= */ skip, /* avoid loops */
                                                &stacktrace,
                                                &json_metadata);
                }
        }

        /* Compose the human-readable MESSAGE= text */
        _cleanup_free_ char *core_message = NULL;
        core_message = strjoin(
                        "Process ", context->meta[META_ARGV_PID],
                        " (", context->meta[META_COMM],
                        ") of user ", context->meta[META_ARGV_UID],
                        written ? " dumped core." : " terminated abnormally without generating a coredump.");
        if (!core_message)
                return log_oom();

        if (context->is_journald && filename)
                if (!strextend(&core_message, "\nCoredump diverted to ", filename))
                        return log_oom();

        if (stacktrace)
                if (!strextend(&core_message, "\n\n", stacktrace))
                        return log_oom();

        if (context->is_journald)
                /* We might not be able to log to the journal, so let's always print the message to another
                 * log target. The target was set previously to something safe. */
                log_dispatch(LOG_ERR, 0, core_message);

        (void) iovw_put_string_field(iovw, "MESSAGE=", core_message);

        if (truncated)
                (void) iovw_put_string_field(iovw, "COREDUMP_TRUNCATED=", "1");

        /* If we managed to parse any ELF metadata (build-id, ELF package meta),
         * attach it as journal metadata. */
        if (json_metadata) {
                _cleanup_free_ char *formatted_json = NULL;

                r = sd_json_variant_format(json_metadata, 0, &formatted_json);
                if (r < 0)
                        return log_error_errno(r, "Failed to format JSON package metadata: %m");

                (void) iovw_put_string_field(iovw, "COREDUMP_PACKAGE_JSON=", formatted_json);
        }

        /* In the unlikely scenario that context->meta[META_EXE] is not available,
         * let's avoid guessing the module name and skip the loop. */
        if (context->meta[META_EXE])
                JSON_VARIANT_OBJECT_FOREACH(module_name, module_json, json_metadata) {
                        sd_json_variant *t;

                        /* We only add structured fields for the 'main' ELF module, and only if we can identify it. */
                        if (!path_equal_filename(module_name, context->meta[META_EXE]))
                                continue;

                        t = sd_json_variant_by_key(module_json, "name");
                        if (t)
                                (void) iovw_put_string_field(iovw, "COREDUMP_PACKAGE_NAME=", sd_json_variant_string(t));

                        t = sd_json_variant_by_key(module_json, "version");
                        if (t)
                                (void) iovw_put_string_field(iovw, "COREDUMP_PACKAGE_VERSION=", sd_json_variant_string(t));
                }

        /* Optionally store the entire coredump in the journal */
        if (arg_storage == COREDUMP_STORAGE_JOURNAL && coredump_fd >= 0) {
                if (coredump_size <= arg_journal_size_max) {
                        size_t sz = 0;

                        /* Store the coredump itself in the journal */

                        r = allocate_journal_field(coredump_fd, (size_t) coredump_size, &coredump_data, &sz);
                        if (r >= 0) {
                                /* Once the buffer was added to iovw, give up our reference so that it
                                 * is not freed by our own cleanup handler. */
                                if (iovw_put(iovw, coredump_data, sz) >= 0)
                                        TAKE_PTR(coredump_data);
                        } else
                                log_warning_errno(r, "Failed to attach the core to the journal entry: %m");
                } else
                        log_info("The core will not be stored: size %"PRIu64" is greater than %"PRIu64" (the configured maximum)",
                                 coredump_size, arg_journal_size_max);
        }

        /* If journald is coredumping, we have to be careful that we don't deadlock when trying to write the
         * coredump to the journal, so we put the journal socket in nonblocking mode before trying to write
         * the coredump to the socket. */

        if (context->is_journald) {
                r = journal_fd_nonblock(true);
                if (r < 0)
                        return log_error_errno(r, "Failed to make journal socket non-blocking: %m");
        }

        r = sd_journal_sendv(iovw->iovec, iovw->count);

        if (context->is_journald) {
                int k;

                /* Use a separate variable here, so that the result of sd_journal_sendv() in 'r' is
                 * preserved for the check below. */
                k = journal_fd_nonblock(false);
                if (k < 0)
                        return log_error_errno(k, "Failed to make journal socket blocking: %m");
        }

        /* -EAGAIN is tolerated only when journald itself crashed (socket was non-blocking) */
        if (r == -EAGAIN && context->is_journald)
                log_warning_errno(r, "Failed to log journal coredump, ignoring: %m");
        else if (r < 0)
                return log_error_errno(r, "Failed to log coredump: %m");

        return 0;
}
1032 | ||
960b0458 | 1033 | static int context_parse_iovw(Context *context, struct iovec_wrapper *iovw) { |
f46c706b FB |
1034 | const char *unit; |
1035 | int r; | |
3c171f0b | 1036 | |
3c171f0b | 1037 | assert(context); |
f46c706b | 1038 | assert(iovw); |
3c171f0b | 1039 | |
313537da LP |
1040 | /* Converts the data in the iovec array iovw into separate fields. Fills in context->meta[] (for |
1041 | * which no memory is allocated, it just contains direct pointers into the iovec array memory). */ | |
3c171f0b | 1042 | |
960b0458 LP |
1043 | bool have_signal_name = false; |
1044 | FOREACH_ARRAY(iovec, iovw->iovec, iovw->count) { | |
fe96c0f8 | 1045 | for (size_t i = 0; i < ELEMENTSOF(meta_field_names); i++) { |
c673f1f6 | 1046 | /* Note that these strings are NUL-terminated, because we made sure that a |
f46c706b | 1047 | * trailing NUL byte is in the buffer, though not included in the iov_len |
c673f1f6 | 1048 | * count (see process_socket() and gather_pid_metadata_*()). */ |
f46c706b | 1049 | assert(((char*) iovec->iov_base)[iovec->iov_len] == 0); |
3c171f0b | 1050 | |
b1694040 | 1051 | const char *p = memory_startswith(iovec->iov_base, iovec->iov_len, meta_field_names[i]); |
f46c706b FB |
1052 | if (p) { |
1053 | context->meta[i] = p; | |
3e4d0f6c | 1054 | context->meta_size[i] = iovec->iov_len - strlen(meta_field_names[i]); |
f46c706b FB |
1055 | break; |
1056 | } | |
1057 | } | |
960b0458 LP |
1058 | |
1059 | have_signal_name = have_signal_name || | |
1060 | memory_startswith(iovec->iov_base, iovec->iov_len, "COREDUMP_SIGNAL_NAME="); | |
3c171f0b | 1061 | } |
f46c706b | 1062 | |
c673f1f6 | 1063 | /* The basic fields from argv[] should always be there, refuse early if not. */ |
ded0aac3 | 1064 | for (int i = 0; i < _META_ARGV_REQUIRED; i++) |
098c3975 | 1065 | if (!context->meta[i]) |
c673f1f6 ZJS |
1066 | return log_error_errno(SYNTHETIC_ERRNO(EINVAL), |
1067 | "A required (%s) has not been sent, aborting.", meta_field_names[i]); | |
f46c706b | 1068 | |
313537da LP |
1069 | pid_t parsed_pid; |
1070 | r = parse_pid(context->meta[META_ARGV_PID], &parsed_pid); | |
f46c706b FB |
1071 | if (r < 0) |
1072 | return log_error_errno(r, "Failed to parse PID \"%s\": %m", context->meta[META_ARGV_PID]); | |
313537da LP |
1073 | if (pidref_is_set(&context->pidref)) { |
1074 | if (context->pidref.pid != parsed_pid) | |
c673f1f6 ZJS |
1075 | return log_error_errno(r, "Passed PID " PID_FMT " does not match passed " PID_FMT ": %m", |
1076 | parsed_pid, context->pidref.pid); | |
313537da LP |
1077 | } else { |
1078 | r = pidref_set_pid(&context->pidref, parsed_pid); | |
1079 | if (r < 0) | |
1080 | return log_error_errno(r, "Failed to initialize pidref from pid " PID_FMT ": %m", parsed_pid); | |
1081 | } | |
f46c706b | 1082 | |
9764bca9 NR |
1083 | r = parse_uid(context->meta[META_ARGV_UID], &context->uid); |
1084 | if (r < 0) | |
1085 | return log_error_errno(r, "Failed to parse UID \"%s\": %m", context->meta[META_ARGV_UID]); | |
1086 | ||
1087 | r = parse_gid(context->meta[META_ARGV_GID], &context->gid); | |
1088 | if (r < 0) | |
1089 | return log_error_errno(r, "Failed to parse GID \"%s\": %m", context->meta[META_ARGV_GID]); | |
1090 | ||
960b0458 LP |
1091 | r = parse_signo(context->meta[META_ARGV_SIGNAL], &context->signo); |
1092 | if (r < 0) | |
1093 | log_warning_errno(r, "Failed to parse signal number \"%s\", ignoring: %m", context->meta[META_ARGV_SIGNAL]); | |
1094 | ||
19455dd6 LP |
1095 | r = safe_atou64(context->meta[META_ARGV_RLIMIT], &context->rlimit); |
1096 | if (r < 0) | |
1097 | log_warning_errno(r, "Failed to parse resource limit \"%s\", ignoring: %m", context->meta[META_ARGV_RLIMIT]); | |
1098 | ||
76e0ab49 | 1099 | /* The value is set to contents of /proc/sys/fs/suid_dumpable, which we set to SUID_DUMP_SAFE (2), |
0c49e004 ZJS |
1100 | * if the process is marked as not dumpable, see PR_SET_DUMPABLE(2const). */ |
1101 | if (context->meta[META_ARGV_DUMPABLE]) { | |
1102 | r = safe_atou(context->meta[META_ARGV_DUMPABLE], &context->dumpable); | |
1103 | if (r < 0) | |
1104 | return log_error_errno(r, "Failed to parse dumpable field \"%s\": %m", context->meta[META_ARGV_DUMPABLE]); | |
76e0ab49 | 1105 | if (context->dumpable > SUID_DUMP_SAFE) |
0c49e004 ZJS |
1106 | log_notice("Got unexpected %%d/dumpable value %u.", context->dumpable); |
1107 | } | |
1108 | ||
f46c706b FB |
1109 | unit = context->meta[META_UNIT]; |
1110 | context->is_pid1 = streq(context->meta[META_ARGV_PID], "1") || streq_ptr(unit, SPECIAL_INIT_SCOPE); | |
1111 | context->is_journald = streq_ptr(unit, SPECIAL_JOURNALD_SERVICE); | |
1112 | ||
960b0458 LP |
1113 | /* After parsing everything, let's also synthesize a new iovw field for the textual signal name if it |
1114 | * isn't already set. */ | |
1115 | if (SIGNAL_VALID(context->signo) && !have_signal_name) | |
1116 | (void) iovw_put_string_field(iovw, "COREDUMP_SIGNAL_NAME=SIG", signal_to_string(context->signo)); | |
1117 | ||
f46c706b | 1118 | return 0; |
3c171f0b LP |
1119 | } |
1120 | ||
/* Receives one coredump submission over the socket 'fd' and processes it.
 *
 * The peer first sends any number of non-empty datagrams, each carrying one metadata field; these are
 * collected in an iovec_wrapper. Afterwards it sends up to three zero-length datagrams, each carrying
 * exactly one file descriptor: the coredump input fd, then optionally a pidfd, then optionally a mount
 * tree fd. Once the data is complete, the fields are parsed with context_parse_iovw() and the coredump
 * is handed to submit_coredump().
 *
 * Returns the result of submit_coredump(), or a negative errno-style value on receive/protocol/parse
 * errors. */
static int process_socket(int fd) {
        _cleanup_(iovw_done_free) struct iovec_wrapper iovw = {};
        _cleanup_(context_done) Context context = CONTEXT_NULL;
        _cleanup_close_ int input_fd = -EBADF;
        /* Tracks which of the fd-carrying sentinel datagrams were seen so far */
        enum {
                STATE_PAYLOAD,
                STATE_INPUT_FD_DONE,
                STATE_PID_FD_DONE,
        } state = STATE_PAYLOAD;
        int r;

        assert(fd >= 0);

        log_setup();

        log_debug("Processing coredump received via socket...");

        for (;;) {
                /* Control buffer with room for a single file descriptor */
                CMSG_BUFFER_TYPE(CMSG_SPACE(sizeof(int))) control;
                struct msghdr mh = {
                        .msg_control = &control,
                        .msg_controllen = sizeof(control),
                        .msg_iovlen = 1,
                };
                ssize_t n, l;

                l = next_datagram_size_fd(fd);
                if (l < 0)
                        return log_error_errno(l, "Failed to determine datagram size to read: %m");

                /* Allocate one byte more than needed, for the trailing NUL byte added below */
                _cleanup_(iovec_done) struct iovec iovec = {
                        .iov_len = l,
                        .iov_base = malloc(l + 1),
                };
                if (!iovec.iov_base)
                        return log_oom();

                mh.msg_iov = &iovec;

                n = recvmsg_safe(fd, &mh, MSG_CMSG_CLOEXEC);
                if (n < 0)
                        return log_error_errno(n, "Failed to receive datagram: %m");

                /* The final zero-length datagrams ("sentinels") carry file descriptors and tell us that
                 * we're done. There are three sentinels: one with just the coredump fd, followed by one with
                 * the pidfd, and finally one with the mount tree fd. The latter two or the last one may be
                 * omitted (which is supported for compatibility with older systemd versions, in particular to
                 * facilitate cross-container coredumping). */
                if (n == 0) {
                        struct cmsghdr *found;

                        found = cmsg_find(&mh, SOL_SOCKET, SCM_RIGHTS, CMSG_LEN(sizeof(int)));
                        if (!found) {
                                /* This is zero length message but it either doesn't carry a single
                                 * descriptor, or it has more than one. This is a protocol violation so let's
                                 * bail out.
                                 *
                                 * Well, not quite! In practice there's one more complication: EOF on
                                 * SOCK_SEQPACKET is not distinguishable from a zero length datagram. Hence
                                 * if we get a zero length datagram without fds we consider it EOF, and
                                 * that's permissible for the final two fds. Hence let's be strict on the
                                 * first fd, but lenient on the other two. */

                                if (!cmsg_find(&mh, SOL_SOCKET, SCM_RIGHTS, (socklen_t) -1) && state != STATE_PAYLOAD)
                                        /* No fds, and already got the first fd → we are done. */
                                        break;

                                cmsg_close_all(&mh);
                                return log_error_errno(SYNTHETIC_ERRNO(EBADMSG),
                                                       "Received zero length message with zero or more than one file descriptor(s), expected one.");
                        }

                        /* Note that 'continue' below resumes the outer for(;;) loop, i.e. waits for the
                         * next datagram, while 'break' only leaves the switch statement. */
                        switch (state) {

                        case STATE_PAYLOAD:
                                /* First sentinel: the coredump fd */
                                assert(input_fd < 0);
                                input_fd = *CMSG_TYPED_DATA(found, int);
                                state = STATE_INPUT_FD_DONE;
                                continue;

                        case STATE_INPUT_FD_DONE:
                                /* Second sentinel: the pidfd */
                                assert(!pidref_is_set(&context.pidref));

                                r = pidref_set_pidfd_consume(&context.pidref, *CMSG_TYPED_DATA(found, int));
                                if (r < 0)
                                        return log_error_errno(r, "Failed to initialize pidref: %m");

                                state = STATE_PID_FD_DONE;
                                continue;

                        case STATE_PID_FD_DONE:
                                /* Third sentinel: the mount tree fd */
                                assert(context.mount_tree_fd < 0);
                                context.mount_tree_fd = *CMSG_TYPED_DATA(found, int);
                                /* We have all FDs we need so we are done. */
                                break;
                        }

                        /* Only reached after the third sentinel: leave the receive loop */
                        break;
                }

                /* Non-empty datagram: any fds attached to it are not wanted, close them right away */
                cmsg_close_all(&mh);

                /* Only zero length messages are allowed after the first message that carried a file descriptor. */
                if (state != STATE_PAYLOAD)
                        return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "Received unexpected message with non-zero length.");

                /* Payload messages should not carry fds */
                if (cmsg_find(&mh, SOL_SOCKET, SCM_RIGHTS, (socklen_t) -1))
                        return log_error_errno(SYNTHETIC_ERRNO(EBADMSG),
                                               "Received payload message with file descriptor(s), expected none.");

                /* Add trailing NUL byte, in case these are strings */
                ((char*) iovec.iov_base)[n] = 0;
                iovec.iov_len = (size_t) n;

                if (iovw_put(&iovw, iovec.iov_base, iovec.iov_len) < 0)
                        return log_oom();

                /* The buffer was added to iovw, hence disarm our own cleanup handler */
                TAKE_STRUCT(iovec);
        }

        /* Make sure we got all data we really need */
        assert(input_fd >= 0);

        r = context_parse_iovw(&context, &iovw);
        if (r < 0)
                return r;

        /* Make sure we received all the expected fields. We support being called by an *older*
         * systemd-coredump from the outside, so we require only the basic set of fields that
         * was being sent when the support for sending to containers over a socket was added
         * in a108c43e36d3ceb6e34efe37c014fc2cda856000. */
        meta_argv_t i;
        FOREACH_ARGUMENT(i,
                         META_ARGV_PID,
                         META_ARGV_UID,
                         META_ARGV_GID,
                         META_ARGV_SIGNAL,
                         META_ARGV_TIMESTAMP,
                         META_ARGV_RLIMIT,
                         META_ARGV_HOSTNAME,
                         META_COMM)
                if (!context.meta[i])
                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
                                               "Mandatory argument %s not received on socket, aborting.",
                                               meta_field_names[i]);

        return submit_coredump(&context, &iovw, input_fd);
}
1270 | ||
313537da | 1271 | static int send_iovec(const struct iovec_wrapper *iovw, int input_fd, PidRef *pidref, int mount_tree_fd) { |
254d1313 | 1272 | _cleanup_close_ int fd = -EBADF; |
3c171f0b LP |
1273 | int r; |
1274 | ||
9a435388 | 1275 | assert(iovw); |
3c171f0b LP |
1276 | assert(input_fd >= 0); |
1277 | ||
1278 | fd = socket(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0); | |
1279 | if (fd < 0) | |
1280 | return log_error_errno(errno, "Failed to create coredump socket: %m"); | |
1281 | ||
1861986a LP |
1282 | r = connect_unix_path(fd, AT_FDCWD, "/run/systemd/coredump"); |
1283 | if (r < 0) | |
1284 | return log_error_errno(r, "Failed to connect to coredump service: %m"); | |
3c171f0b | 1285 | |
fe96c0f8 | 1286 | for (size_t i = 0; i < iovw->count; i++) { |
fec603eb | 1287 | struct msghdr mh = { |
9a435388 | 1288 | .msg_iov = iovw->iovec + i, |
fec603eb LP |
1289 | .msg_iovlen = 1, |
1290 | }; | |
1291 | struct iovec copy[2]; | |
1292 | ||
1293 | for (;;) { | |
1294 | if (sendmsg(fd, &mh, MSG_NOSIGNAL) >= 0) | |
1295 | break; | |
1296 | ||
1297 | if (errno == EMSGSIZE && mh.msg_iov[0].iov_len > 0) { | |
f46c706b FB |
1298 | /* This field didn't fit? That's a pity. Given that this is |
1299 | * just metadata, let's truncate the field at half, and try | |
1300 | * again. We append three dots, in order to show that this is | |
1301 | * truncated. */ | |
fec603eb LP |
1302 | |
1303 | if (mh.msg_iov != copy) { | |
f46c706b FB |
1304 | /* We don't want to modify the caller's iovec, hence |
1305 | * let's create our own array, consisting of two new | |
1306 | * iovecs, where the first is a (truncated) copy of | |
1307 | * what we want to send, and the second one contains | |
1308 | * the trailing dots. */ | |
9a435388 | 1309 | copy[0] = iovw->iovec[i]; |
ea8eb370 | 1310 | copy[1] = IOVEC_MAKE(((const char[]){'.', '.', '.'}), 3); |
fec603eb LP |
1311 | |
1312 | mh.msg_iov = copy; | |
1313 | mh.msg_iovlen = 2; | |
1314 | } | |
1315 | ||
1316 | copy[0].iov_len /= 2; /* halve it, and try again */ | |
1317 | continue; | |
1318 | } | |
3c171f0b | 1319 | |
3c171f0b | 1320 | return log_error_errno(errno, "Failed to send coredump datagram: %m"); |
fec603eb | 1321 | } |
1eef15b1 ZJS |
1322 | } |
1323 | ||
313537da | 1324 | /* First sentinel: the coredump fd */ |
3c171f0b LP |
1325 | r = send_one_fd(fd, input_fd, 0); |
1326 | if (r < 0) | |
1327 | return log_error_errno(r, "Failed to send coredump fd: %m"); | |
1eef15b1 | 1328 | |
313537da LP |
1329 | /* The optional second sentinel: the pidfd */ |
1330 | if (!pidref_is_set(pidref) || pidref->fd < 0) /* If we have no pidfd, stop now */ | |
1331 | return 0; | |
1332 | ||
1333 | r = send_one_fd(fd, pidref->fd, 0); | |
1334 | if (r < 0) | |
1335 | return log_error_errno(r, "Failed to send pidfd: %m"); | |
1336 | ||
1337 | /* The optional third sentinel: the mount tree fd */ | |
1338 | if (mount_tree_fd < 0) /* If we have no mount tree, stop now */ | |
1339 | return 0; | |
1340 | ||
1341 | r = send_one_fd(fd, mount_tree_fd, 0); | |
1342 | if (r < 0) | |
1343 | return log_error_errno(r, "Failed to send mount tree fd: %m"); | |
68511ceb | 1344 | |
3c171f0b LP |
1345 | return 0; |
1346 | } | |
1eef15b1 | 1347 | |
64a5384f LP |
1348 | static int gather_pid_metadata_from_argv( |
1349 | struct iovec_wrapper *iovw, | |
1350 | Context *context, | |
1351 | int argc, char **argv) { | |
1352 | ||
868d9557 LB |
1353 | _cleanup_(pidref_done) PidRef local_pidref = PIDREF_NULL; |
1354 | int r, kernel_fd = -EBADF; | |
3c171f0b | 1355 | |
e6aa443f LP |
1356 | assert(iovw); |
1357 | assert(context); | |
1358 | ||
f46c706b | 1359 | /* We gather all metadata that were passed via argv[] into an array of iovecs that |
ded0aac3 ZJS |
1360 | * we'll forward to the socket unit. |
1361 | * | |
1362 | * We require at least _META_ARGV_REQUIRED args, but will accept more. | |
1363 | * We know how to parse _META_ARGV_MAX args. The rest will be ignored. */ | |
3c171f0b | 1364 | |
ded0aac3 | 1365 | if (argc < _META_ARGV_REQUIRED) |
f46c706b | 1366 | return log_error_errno(SYNTHETIC_ERRNO(EINVAL), |
ded0aac3 ZJS |
1367 | "Not enough arguments passed by the kernel (%i, expected between %i and %i).", |
1368 | argc, _META_ARGV_REQUIRED, _META_ARGV_MAX); | |
3c171f0b | 1369 | |
ded0aac3 | 1370 | for (int i = 0; i < MIN(argc, _META_ARGV_MAX); i++) { |
ea8eb370 LP |
1371 | _cleanup_free_ char *buf = NULL; |
1372 | const char *t = argv[i]; | |
3c171f0b | 1373 | |
960b0458 | 1374 | if (i == META_ARGV_TIMESTAMP) { |
f46c706b FB |
1375 | /* The journal fields contain the timestamp padded with six |
1376 | * zeroes, so that the kernel-supplied 1s granularity timestamps | |
e503019b | 1377 | * becomes 1μs granularity, i.e. the granularity systemd usually |
f46c706b | 1378 | * operates in. */ |
ea8eb370 LP |
1379 | buf = strjoin(argv[i], "000000"); |
1380 | if (!buf) | |
f46c706b | 1381 | return log_oom(); |
ea8eb370 LP |
1382 | |
1383 | t = buf; | |
c8091d92 LP |
1384 | } |
1385 | ||
868d9557 LB |
1386 | if (i == META_ARGV_PID) { |
1387 | /* Store this so that we can check whether the core will be forwarded to a container | |
1388 | * even when the kernel doesn't provide a pidfd. Can be dropped once baseline is | |
1389 | * >= v6.16. */ | |
1390 | r = pidref_set_pidstr(&local_pidref, t); | |
1391 | if (r < 0) | |
1392 | return log_error_errno(r, "Failed to initialize pidref from pid %s: %m", t); | |
1393 | } | |
1394 | ||
1395 | if (i == META_ARGV_PIDFD) { | |
1396 | /* If the current kernel doesn't support the %F specifier (which resolves to a | |
1397 | * pidfd), but we included it in the core_pattern expression, we'll receive an empty | |
1398 | * string here. Deal with that gracefully. */ | |
1399 | if (isempty(t)) | |
1400 | continue; | |
1401 | ||
1402 | assert(!pidref_is_set(&context->pidref)); | |
1403 | assert(kernel_fd < 0); | |
1404 | ||
1405 | kernel_fd = parse_fd(t); | |
1406 | if (kernel_fd < 0) | |
1407 | return log_error_errno(kernel_fd, "Failed to parse pidfd \"%s\": %m", t); | |
1408 | ||
1409 | r = pidref_set_pidfd(&context->pidref, kernel_fd); | |
1410 | if (r < 0) | |
1411 | return log_error_errno(r, "Failed to initialize pidref from pidfd %d: %m", kernel_fd); | |
1412 | ||
e6a8687b ZJS |
1413 | context->got_pidfd = 1; |
1414 | ||
868d9557 LB |
1415 | /* If there are containers involved with different versions of the code they might |
1416 | * not be using pidfds, so it would be wrong to set the metadata, skip it. */ | |
1417 | r = pidref_in_same_namespace(/* pid1 = */ NULL, &context->pidref, NAMESPACE_PID); | |
1418 | if (r < 0) | |
1419 | log_debug_errno(r, "Failed to check pidns of crashing process, ignoring: %m"); | |
1420 | if (r <= 0) | |
1421 | continue; | |
1422 | ||
1423 | /* We don't print the fd number in the journal as it's meaningless, but we still | |
1424 | * record that the parsing was done with a kernel-provided fd as it means it's safe | |
1425 | * from races, which is valuable information to provide in the journal record. */ | |
1426 | t = "1"; | |
1427 | } | |
1428 | ||
f46c706b FB |
1429 | r = iovw_put_string_field(iovw, meta_field_names[i], t); |
1430 | if (r < 0) | |
1431 | return r; | |
8c8549db | 1432 | } |
803a3464 | 1433 | |
f46c706b | 1434 | /* Cache some of the process metadata we collected so far and that we'll need to |
c673f1f6 | 1435 | * access soon. */ |
868d9557 LB |
1436 | r = context_parse_iovw(context, iovw); |
1437 | if (r < 0) | |
1438 | return r; | |
1439 | ||
1440 | /* If the kernel didn't give us a PIDFD, then use the one derived from the | |
1441 | * PID immediately, given we have it. */ | |
1442 | if (!pidref_is_set(&context->pidref)) | |
1443 | context->pidref = TAKE_PIDREF(local_pidref); | |
1444 | ||
1445 | /* Close the kernel-provided FD as the last thing after everything else succeeded. */ | |
1446 | kernel_fd = safe_close(kernel_fd); | |
1447 | ||
1448 | return 0; | |
f46c706b | 1449 | } |
3c171f0b | 1450 | |
db9ac801 | 1451 | static int gather_pid_metadata_from_procfs(struct iovec_wrapper *iovw, Context *context) { |
f46c706b FB |
1452 | uid_t owner_uid; |
1453 | pid_t pid; | |
1454 | char *t; | |
3e4d0f6c | 1455 | size_t size; |
f46c706b FB |
1456 | const char *p; |
1457 | int r; | |
f5e04665 | 1458 | |
e6aa443f LP |
1459 | assert(iovw); |
1460 | assert(context); | |
1461 | ||
f46c706b FB |
1462 | /* Note that if we fail on oom later on, we do not roll-back changes to the iovec |
1463 | * structure. (It remains valid, with the first iovec fields initialized.) */ | |
f5e04665 | 1464 | |
313537da | 1465 | pid = context->pidref.pid; |
f5e04665 | 1466 | |
f46c706b | 1467 | /* The following is mandatory */ |
1f485bc7 | 1468 | r = pidref_get_comm(&context->pidref, &t); |
9a435388 | 1469 | if (r < 0) |
f46c706b | 1470 | return log_error_errno(r, "Failed to get COMM: %m"); |
f5e04665 | 1471 | |
f46c706b | 1472 | r = iovw_put_string_field_free(iovw, "COREDUMP_COMM=", t); |
9a435388 FB |
1473 | if (r < 0) |
1474 | return r; | |
f45b8015 | 1475 | |
c790632c | 1476 | /* The following are optional, but we use them if present. */ |
2a3bebd0 FB |
1477 | r = get_process_exe(pid, &t); |
1478 | if (r >= 0) | |
1479 | r = iovw_put_string_field_free(iovw, "COREDUMP_EXE=", t); | |
1480 | if (r < 0) | |
f46c706b | 1481 | log_warning_errno(r, "Failed to get EXE, ignoring: %m"); |
bdfd7b2c | 1482 | |
1f485bc7 | 1483 | if (cg_pidref_get_unit(&context->pidref, &t) >= 0) |
2a3bebd0 | 1484 | (void) iovw_put_string_field_free(iovw, "COREDUMP_UNIT=", t); |
f5e04665 | 1485 | |
f46c706b | 1486 | if (cg_pid_get_user_unit(pid, &t) >= 0) |
2a3bebd0 | 1487 | (void) iovw_put_string_field_free(iovw, "COREDUMP_USER_UNIT=", t); |
f46c706b | 1488 | |
8703a508 | 1489 | if (cg_pidref_get_session(&context->pidref, &t) >= 0) |
9a435388 | 1490 | (void) iovw_put_string_field_free(iovw, "COREDUMP_SESSION=", t); |
f5e04665 | 1491 | |
8703a508 | 1492 | if (cg_pidref_get_owner_uid(&context->pidref, &owner_uid) >= 0) { |
9a435388 | 1493 | r = asprintf(&t, UID_FMT, owner_uid); |
7de80bfe | 1494 | if (r > 0) |
9a435388 | 1495 | (void) iovw_put_string_field_free(iovw, "COREDUMP_OWNER_UID=", t); |
f5e04665 LP |
1496 | } |
1497 | ||
9aa82023 | 1498 | if (sd_pid_get_slice(pid, &t) >= 0) |
2a3bebd0 | 1499 | (void) iovw_put_string_field_free(iovw, "COREDUMP_SLICE=", t); |
f5e04665 | 1500 | |
1f485bc7 | 1501 | if (pidref_get_cmdline(&context->pidref, SIZE_MAX, PROCESS_CMDLINE_QUOTE_POSIX, &t) >= 0) |
2a3bebd0 | 1502 | (void) iovw_put_string_field_free(iovw, "COREDUMP_CMDLINE=", t); |
a035f819 | 1503 | |
9aa82023 | 1504 | if (cg_pid_get_path_shifted(pid, NULL, &t) >= 0) |
2a3bebd0 | 1505 | (void) iovw_put_string_field_free(iovw, "COREDUMP_CGROUP=", t); |
a035f819 | 1506 | |
9aa82023 | 1507 | if (compose_open_fds(pid, &t) >= 0) |
2a3bebd0 | 1508 | (void) iovw_put_string_field_free(iovw, "COREDUMP_OPEN_FDS=", t); |
3f132692 JF |
1509 | |
1510 | p = procfs_file_alloca(pid, "status"); | |
da65941c | 1511 | if (read_full_file(p, &t, /* ret_size= */ NULL) >= 0) |
2a3bebd0 | 1512 | (void) iovw_put_string_field_free(iovw, "COREDUMP_PROC_STATUS=", t); |
3f132692 JF |
1513 | |
1514 | p = procfs_file_alloca(pid, "maps"); | |
da65941c | 1515 | if (read_full_file(p, &t, /* ret_size= */ NULL) >= 0) |
2a3bebd0 | 1516 | (void) iovw_put_string_field_free(iovw, "COREDUMP_PROC_MAPS=", t); |
3f132692 | 1517 | |
da65941c LP |
1518 | p = procfs_file_alloca(pid, "limits"); /* this uses 'seq_file' in kernel, use read_full_file_at() */ |
1519 | if (read_full_file(p, &t, /* ret_size= */ NULL) >= 0) | |
2a3bebd0 | 1520 | (void) iovw_put_string_field_free(iovw, "COREDUMP_PROC_LIMITS=", t); |
3f132692 JF |
1521 | |
1522 | p = procfs_file_alloca(pid, "cgroup"); | |
da65941c | 1523 | if (read_full_file(p, &t, /* ret_size= */ NULL) >= 0) |
2a3bebd0 | 1524 | (void) iovw_put_string_field_free(iovw, "COREDUMP_PROC_CGROUP=", t); |
3f132692 | 1525 | |
d7032b1f | 1526 | p = procfs_file_alloca(pid, "mountinfo"); |
da65941c | 1527 | if (read_full_file(p, &t, /* ret_size= */ NULL) >= 0) |
2a3bebd0 | 1528 | (void) iovw_put_string_field_free(iovw, "COREDUMP_PROC_MOUNTINFO=", t); |
d7032b1f | 1529 | |
3e4d0f6c ZJS |
1530 | /* We attach /proc/auxv here. ELF coredumps also contain a note for this (NT_AUXV), see elf(5). */ |
1531 | p = procfs_file_alloca(pid, "auxv"); | |
da65941c | 1532 | if (read_full_file(p, &t, &size) >= 0) { |
3e4d0f6c ZJS |
1533 | char *buf = malloc(strlen("COREDUMP_PROC_AUXV=") + size + 1); |
1534 | if (buf) { | |
32756e57 | 1535 | /* Add a dummy terminator to make context_parse_iovw() happy. */ |
eda62239 | 1536 | *mempcpy_typesafe(stpcpy(buf, "COREDUMP_PROC_AUXV="), t, size) = '\0'; |
3e4d0f6c ZJS |
1537 | (void) iovw_consume(iovw, buf, size + strlen("COREDUMP_PROC_AUXV=")); |
1538 | } | |
1539 | ||
1540 | free(t); | |
1541 | } | |
1542 | ||
9aa82023 | 1543 | if (get_process_cwd(pid, &t) >= 0) |
2a3bebd0 | 1544 | (void) iovw_put_string_field_free(iovw, "COREDUMP_CWD=", t); |
3f132692 JF |
1545 | |
1546 | if (get_process_root(pid, &t) >= 0) { | |
9aa82023 ZJS |
1547 | bool proc_self_root_is_slash; |
1548 | ||
1549 | proc_self_root_is_slash = strcmp(t, "/") == 0; | |
3f132692 | 1550 | |
2a3bebd0 | 1551 | (void) iovw_put_string_field_free(iovw, "COREDUMP_ROOT=", t); |
7ed03ce6 JF |
1552 | |
1553 | /* If the process' root is "/", then there is a chance it has | |
1554 | * mounted own root and hence being containerized. */ | |
0b8b1332 | 1555 | if (proc_self_root_is_slash && get_process_container_parent_cmdline(&context->pidref, &t) > 0) |
2a3bebd0 | 1556 | (void) iovw_put_string_field_free(iovw, "COREDUMP_CONTAINER_CMDLINE=", t); |
3f132692 JF |
1557 | } |
1558 | ||
9aa82023 | 1559 | if (get_process_environ(pid, &t) >= 0) |
2a3bebd0 | 1560 | (void) iovw_put_string_field_free(iovw, "COREDUMP_ENVIRON=", t); |
9aa82023 | 1561 | |
c673f1f6 | 1562 | /* Now that we have parsed info from /proc/ ensure the pidfd is still valid before continuing. */ |
ba6c955f LB |
1563 | r = pidref_verify(&context->pidref); |
1564 | if (r < 0) | |
1565 | return log_error_errno(r, "PIDFD validation failed: %m"); | |
1566 | ||
c673f1f6 | 1567 | /* We successfully acquired all metadata. */ |
32756e57 | 1568 | return context_parse_iovw(context, iovw); |
9aa82023 | 1569 | } |
3f132692 | 1570 | |
ea8eb370 | 1571 | static int send_ucred(int transport_fd, const struct ucred *ucred) { |
a108c43e NR |
1572 | CMSG_BUFFER_TYPE(CMSG_SPACE(sizeof(struct ucred))) control = {}; |
1573 | struct msghdr mh = { | |
1574 | .msg_control = &control, | |
1575 | .msg_controllen = sizeof(control), | |
1576 | }; | |
1577 | struct cmsghdr *cmsg; | |
1578 | ||
1579 | assert(transport_fd >= 0); | |
ea8eb370 | 1580 | assert(ucred); |
a108c43e NR |
1581 | |
1582 | cmsg = CMSG_FIRSTHDR(&mh); | |
1583 | *cmsg = (struct cmsghdr) { | |
1584 | .cmsg_level = SOL_SOCKET, | |
1585 | .cmsg_type = SCM_CREDENTIALS, | |
1586 | .cmsg_len = CMSG_LEN(sizeof(struct ucred)), | |
1587 | }; | |
1588 | memcpy(CMSG_DATA(cmsg), ucred, sizeof(struct ucred)); | |
1589 | ||
1590 | return RET_NERRNO(sendmsg(transport_fd, &mh, MSG_NOSIGNAL)); | |
1591 | } | |
1592 | ||
1593 | static int receive_ucred(int transport_fd, struct ucred *ret_ucred) { | |
1594 | CMSG_BUFFER_TYPE(CMSG_SPACE(sizeof(struct ucred))) control = {}; | |
1595 | struct msghdr mh = { | |
1596 | .msg_control = &control, | |
1597 | .msg_controllen = sizeof(control), | |
1598 | }; | |
1599 | struct cmsghdr *cmsg = NULL; | |
1600 | struct ucred *ucred = NULL; | |
1601 | ssize_t n; | |
1602 | ||
ea8eb370 | 1603 | assert(transport_fd >= 0); |
a108c43e NR |
1604 | assert(ret_ucred); |
1605 | ||
1606 | n = recvmsg_safe(transport_fd, &mh, 0); | |
1607 | if (n < 0) | |
1608 | return n; | |
1609 | ||
1610 | CMSG_FOREACH(cmsg, &mh) | |
1611 | if (cmsg->cmsg_level == SOL_SOCKET && | |
1612 | cmsg->cmsg_type == SCM_CREDENTIALS && | |
1613 | cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred))) { | |
1614 | ||
1615 | assert(!ucred); | |
1616 | ucred = CMSG_TYPED_DATA(cmsg, struct ucred); | |
1617 | } | |
1618 | ||
1619 | if (!ucred) | |
1620 | return -EIO; | |
1621 | ||
1622 | *ret_ucred = *ucred; | |
1623 | ||
1624 | return 0; | |
1625 | } | |
1626 | ||
8fc7b2a2 | 1627 | static int can_forward_coredump(Context *context, const PidRef *pid) { |
a108c43e NR |
1628 | _cleanup_free_ char *cgroup = NULL, *path = NULL, *unit = NULL; |
1629 | int r; | |
1630 | ||
8fc7b2a2 | 1631 | assert(context); |
0b8b1332 LP |
1632 | assert(pidref_is_set(pid)); |
1633 | assert(!pidref_is_remote(pid)); | |
1634 | ||
e6a8687b ZJS |
1635 | /* We need to avoid a situation where the attacker crashes a SUID process or a root daemon and |
1636 | * quickly replaces it with a namespaced process and we forward the coredump to the attacker, into | |
1637 | * the namespace. With %F/pidfd we can reliably check the namespace of the original process, hence we | |
1638 | * can allow forwarding. */ | |
76e0ab49 | 1639 | if (!context->got_pidfd && context->dumpable != SUID_DUMP_USER) |
8fc7b2a2 ZJS |
1640 | return false; |
1641 | ||
0b8b1332 | 1642 | r = cg_pidref_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup); |
a108c43e NR |
1643 | if (r < 0) |
1644 | return r; | |
1645 | ||
1646 | r = path_extract_directory(cgroup, &path); | |
1647 | if (r < 0) | |
1648 | return r; | |
1649 | ||
1650 | r = cg_path_get_unit_path(path, &unit); | |
1651 | if (r == -ENOMEM) | |
1652 | return log_oom(); | |
1653 | if (r == -ENXIO) | |
1654 | /* No valid units in this path. */ | |
1655 | return false; | |
1656 | if (r < 0) | |
1657 | return r; | |
1658 | ||
1659 | /* We require that this process belongs to a delegated cgroup | |
1660 | * (i.e. Delegate=yes), with CoredumpReceive=yes also. */ | |
1661 | r = cg_is_delegated(unit); | |
1662 | if (r <= 0) | |
1663 | return r; | |
1664 | ||
1665 | return cg_has_coredump_receive(unit); | |
1666 | } | |
1667 | ||
1668 | static int forward_coredump_to_container(Context *context) { | |
1669 | _cleanup_close_ int pidnsfd = -EBADF, mntnsfd = -EBADF, netnsfd = -EBADF, usernsfd = -EBADF, rootfd = -EBADF; | |
71136404 | 1670 | _cleanup_close_pair_ int pair[2] = EBADF_PAIR; |
0b8b1332 | 1671 | pid_t child; |
a108c43e | 1672 | struct ucred ucred = { |
313537da | 1673 | .pid = context->pidref.pid, |
a108c43e NR |
1674 | .uid = context->uid, |
1675 | .gid = context->gid, | |
1676 | }; | |
1677 | int r; | |
1678 | ||
313537da LP |
1679 | assert(context); |
1680 | ||
0b8b1332 LP |
1681 | _cleanup_(pidref_done) PidRef leader_pid = PIDREF_NULL; |
1682 | r = namespace_get_leader(&context->pidref, NAMESPACE_PID, &leader_pid); | |
a108c43e NR |
1683 | if (r < 0) |
1684 | return log_debug_errno(r, "Failed to get namespace leader: %m"); | |
1685 | ||
8fc7b2a2 | 1686 | r = can_forward_coredump(context, &leader_pid); |
a108c43e NR |
1687 | if (r < 0) |
1688 | return log_debug_errno(r, "Failed to check if coredump can be forwarded: %m"); | |
1689 | if (r == 0) | |
1690 | return log_debug_errno(SYNTHETIC_ERRNO(ENOENT), | |
1691 | "Coredump will not be forwarded because no target cgroup was found."); | |
1692 | ||
1693 | r = RET_NERRNO(socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, pair)); | |
1694 | if (r < 0) | |
1695 | return log_debug_errno(r, "Failed to create socket pair: %m"); | |
1696 | ||
1697 | r = setsockopt_int(pair[1], SOL_SOCKET, SO_PASSCRED, true); | |
1698 | if (r < 0) | |
1699 | return log_debug_errno(r, "Failed to set SO_PASSCRED: %m"); | |
1700 | ||
0b8b1332 | 1701 | r = pidref_namespace_open(&leader_pid, &pidnsfd, &mntnsfd, &netnsfd, &usernsfd, &rootfd); |
a108c43e | 1702 | if (r < 0) |
0b8b1332 | 1703 | return log_debug_errno(r, "Failed to open namespaces of PID " PID_FMT ": %m", leader_pid.pid); |
a108c43e NR |
1704 | |
1705 | r = namespace_fork("(sd-coredumpns)", "(sd-coredump)", NULL, 0, | |
e9ccae31 | 1706 | FORK_RESET_SIGNALS|FORK_DEATHSIG_SIGTERM, |
a108c43e NR |
1707 | pidnsfd, mntnsfd, netnsfd, usernsfd, rootfd, &child); |
1708 | if (r < 0) | |
0b8b1332 | 1709 | return log_debug_errno(r, "Failed to fork into namespaces of PID " PID_FMT ": %m", leader_pid.pid); |
a108c43e | 1710 | if (r == 0) { |
a108c43e NR |
1711 | pair[0] = safe_close(pair[0]); |
1712 | ||
3f8999a7 | 1713 | r = access_nofollow("/run/systemd/coredump", W_OK); |
7c1dd9e2 MY |
1714 | if (r < 0) { |
1715 | log_debug_errno(r, "Cannot find coredump socket, exiting: %m"); | |
a108c43e NR |
1716 | _exit(EXIT_FAILURE); |
1717 | } | |
1718 | ||
1719 | r = receive_ucred(pair[1], &ucred); | |
1720 | if (r < 0) { | |
1721 | log_debug_errno(r, "Failed to receive ucred and fd: %m"); | |
1722 | _exit(EXIT_FAILURE); | |
1723 | } | |
1724 | ||
313537da | 1725 | _cleanup_(iovw_free_freep) struct iovec_wrapper *iovw = iovw_new(); |
a108c43e NR |
1726 | if (!iovw) { |
1727 | log_oom(); | |
1728 | _exit(EXIT_FAILURE); | |
1729 | } | |
1730 | ||
1731 | (void) iovw_put_string_field(iovw, "MESSAGE_ID=", SD_MESSAGE_COREDUMP_STR); | |
1732 | (void) iovw_put_string_field(iovw, "PRIORITY=", STRINGIFY(LOG_CRIT)); | |
1733 | (void) iovw_put_string_field(iovw, "COREDUMP_FORWARDED=", "1"); | |
1734 | ||
1735 | for (int i = 0; i < _META_ARGV_MAX; i++) { | |
a108c43e NR |
1736 | char buf[DECIMAL_STR_MAX(pid_t)]; |
1737 | const char *t = context->meta[i]; | |
1738 | ||
ea8eb370 | 1739 | /* Patch some of the fields with the translated ucred data */ |
1d03d970 | 1740 | switch (i) { |
a108c43e NR |
1741 | |
1742 | case META_ARGV_PID: | |
1743 | xsprintf(buf, PID_FMT, ucred.pid); | |
1744 | t = buf; | |
a108c43e NR |
1745 | break; |
1746 | ||
1747 | case META_ARGV_UID: | |
1748 | xsprintf(buf, UID_FMT, ucred.uid); | |
1749 | t = buf; | |
1750 | break; | |
1751 | ||
1752 | case META_ARGV_GID: | |
1753 | xsprintf(buf, GID_FMT, ucred.gid); | |
1754 | t = buf; | |
1755 | break; | |
1756 | ||
a108c43e | 1757 | default: |
5c9feb2d | 1758 | ; |
a108c43e NR |
1759 | } |
1760 | ||
1761 | r = iovw_put_string_field(iovw, meta_field_names[i], t); | |
1762 | if (r < 0) { | |
1763 | log_debug_errno(r, "Failed to construct iovec: %m"); | |
1764 | _exit(EXIT_FAILURE); | |
1765 | } | |
1766 | } | |
1767 | ||
313537da | 1768 | _cleanup_(context_done) Context child_context = CONTEXT_NULL; |
32756e57 | 1769 | r = context_parse_iovw(&child_context, iovw); |
a108c43e NR |
1770 | if (r < 0) { |
1771 | log_debug_errno(r, "Failed to save context: %m"); | |
1772 | _exit(EXIT_FAILURE); | |
1773 | } | |
1774 | ||
1775 | r = gather_pid_metadata_from_procfs(iovw, &child_context); | |
1776 | if (r < 0) { | |
1777 | log_debug_errno(r, "Failed to gather metadata from procfs: %m"); | |
1778 | _exit(EXIT_FAILURE); | |
1779 | } | |
1780 | ||
313537da | 1781 | r = send_iovec(iovw, STDIN_FILENO, &context->pidref, /* mount_tree_fd= */ -EBADF); |
a108c43e NR |
1782 | if (r < 0) { |
1783 | log_debug_errno(r, "Failed to send iovec to coredump socket: %m"); | |
1784 | _exit(EXIT_FAILURE); | |
1785 | } | |
1786 | ||
1787 | _exit(EXIT_SUCCESS); | |
1788 | } | |
1789 | ||
1790 | pair[1] = safe_close(pair[1]); | |
1791 | ||
1792 | /* We need to translate the PID, UID, and GID of the crashing process | |
1793 | * to the container's namespaces. Do this by sending an SCM_CREDENTIALS | |
1794 | * message on a socket pair, and read the result when we join the | |
1795 | * container. The kernel will perform the translation for us. */ | |
1796 | r = send_ucred(pair[0], &ucred); | |
1797 | if (r < 0) | |
1798 | return log_debug_errno(r, "Failed to send metadata to container: %m"); | |
1799 | ||
1800 | r = wait_for_terminate_and_check("(sd-coredumpns)", child, 0); | |
1801 | if (r < 0) | |
1802 | return log_debug_errno(r, "Failed to wait for child to terminate: %m"); | |
1803 | if (r != EXIT_SUCCESS) | |
4e494e6a | 1804 | return log_debug_errno(SYNTHETIC_ERRNO(EPROTO), "Failed to process coredump in container."); |
a108c43e NR |
1805 | |
1806 | return 0; | |
1807 | } | |
1808 | ||
00f73980 | 1809 | static int acquire_pid_mount_tree_fd(const Context *context, int *ret_fd) { |
b8fe1b1d MS |
1810 | /* Don't bother preparing environment if we can't pass it to libdwfl. */ |
1811 | #if !HAVE_DWFL_SET_SYSROOT | |
1812 | *ret_fd = -EOPNOTSUPP; | |
1813 | log_debug("dwfl_set_sysroot() is not supported."); | |
1814 | #else | |
1815 | _cleanup_close_ int mntns_fd = -EBADF, root_fd = -EBADF, fd = -EBADF; | |
68511ceb | 1816 | _cleanup_close_pair_ int pair[2] = EBADF_PAIR; |
b8fe1b1d | 1817 | int r; |
68511ceb MS |
1818 | |
1819 | assert(context); | |
b8fe1b1d | 1820 | assert(ret_fd); |
68511ceb | 1821 | |
e26a7e08 | 1822 | if (!arg_enter_namespace) { |
b8fe1b1d MS |
1823 | *ret_fd = -EHOSTDOWN; |
1824 | log_debug("EnterNamespace=no so we won't use mount tree of the crashed process for generating backtrace."); | |
1825 | return 0; | |
1826 | } | |
68511ceb MS |
1827 | |
1828 | if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, pair) < 0) | |
1829 | return log_error_errno(errno, "Failed to create socket pair: %m"); | |
1830 | ||
36812cb6 LP |
1831 | r = pidref_namespace_open( |
1832 | &context->pidref, | |
1833 | /* ret_pidns_fd= */ NULL, | |
1834 | &mntns_fd, | |
1835 | /* ret_netns_fd= */ NULL, | |
1836 | /* ret_userns_fd= */ NULL, | |
1837 | &root_fd); | |
68511ceb MS |
1838 | if (r < 0) |
1839 | return log_error_errno(r, "Failed to open mount namespace of crashing process: %m"); | |
1840 | ||
a88e72be MS |
1841 | r = namespace_fork("(sd-mount-tree-ns)", |
1842 | "(sd-mount-tree)", | |
1843 | /* except_fds= */ NULL, | |
1844 | /* n_except_fds= */ 0, | |
c287f0f7 | 1845 | FORK_RESET_SIGNALS|FORK_DEATHSIG_SIGKILL|FORK_LOG|FORK_WAIT, |
a88e72be MS |
1846 | /* pidns_fd= */ -EBADF, |
1847 | mntns_fd, | |
1848 | /* netns_fd= */ -EBADF, | |
1849 | /* userns_fd= */ -EBADF, | |
1850 | root_fd, | |
c287f0f7 | 1851 | NULL); |
68511ceb | 1852 | if (r < 0) |
e5bad3a7 | 1853 | return r; |
68511ceb MS |
1854 | if (r == 0) { |
1855 | pair[0] = safe_close(pair[0]); | |
1856 | ||
84289ab9 MS |
1857 | fd = open_tree(-EBADF, "/", AT_NO_AUTOMOUNT | AT_RECURSIVE | AT_SYMLINK_NOFOLLOW | OPEN_TREE_CLOEXEC | OPEN_TREE_CLONE); |
1858 | if (fd < 0) { | |
68511ceb MS |
1859 | log_error_errno(errno, "Failed to clone mount tree: %m"); |
1860 | _exit(EXIT_FAILURE); | |
1861 | } | |
1862 | ||
84289ab9 | 1863 | r = send_one_fd(pair[1], fd, 0); |
68511ceb MS |
1864 | if (r < 0) { |
1865 | log_error_errno(r, "Failed to send mount tree to parent: %m"); | |
1866 | _exit(EXIT_FAILURE); | |
1867 | } | |
1868 | ||
1869 | _exit(EXIT_SUCCESS); | |
1870 | } | |
1871 | ||
1872 | pair[1] = safe_close(pair[1]); | |
1873 | ||
68511ceb MS |
1874 | fd = receive_one_fd(pair[0], MSG_DONTWAIT); |
1875 | if (fd < 0) | |
1876 | return log_error_errno(fd, "Failed to receive mount tree: %m"); | |
1877 | ||
b8fe1b1d MS |
1878 | *ret_fd = TAKE_FD(fd); |
1879 | #endif | |
1880 | return 0; | |
68511ceb MS |
1881 | } |
1882 | ||
92b8e5e7 | 1883 | static int process_kernel(int argc, char *argv[]) { |
6257e2fb | 1884 | _cleanup_(iovw_free_freep) struct iovec_wrapper *iovw = NULL; |
313537da | 1885 | _cleanup_(context_done) Context context = CONTEXT_NULL; |
960b0458 | 1886 | int r; |
9aa82023 | 1887 | |
1f9d2a81 DDM |
1888 | /* When we're invoked by the kernel, stdout/stderr are closed which is dangerous because the fds |
1889 | * could get reallocated. To avoid hard to debug issues, let's instead bind stdout/stderr to | |
1890 | * /dev/null. */ | |
5bb1d7fb | 1891 | r = rearrange_stdio(STDIN_FILENO, -EBADF, -EBADF); |
1f9d2a81 DDM |
1892 | if (r < 0) |
1893 | return log_error_errno(r, "Failed to connect stdout/stderr to /dev/null: %m"); | |
1894 | ||
988e89ee ZJS |
1895 | log_debug("Processing coredump received from the kernel..."); |
1896 | ||
9a435388 FB |
1897 | iovw = iovw_new(); |
1898 | if (!iovw) | |
1899 | return log_oom(); | |
1900 | ||
f46c706b FB |
1901 | /* Collect all process metadata passed by the kernel through argv[] */ |
1902 | r = gather_pid_metadata_from_argv(iovw, &context, argc - 1, argv + 1); | |
92e92d71 | 1903 | if (r < 0) |
6257e2fb | 1904 | return r; |
86562420 | 1905 | |
f46c706b | 1906 | /* Collect the rest of the process metadata retrieved from the runtime */ |
db9ac801 | 1907 | r = gather_pid_metadata_from_procfs(iovw, &context); |
f46c706b | 1908 | if (r < 0) |
6257e2fb | 1909 | return r; |
f46c706b | 1910 | |
1e344c1d | 1911 | if (!context.is_journald) |
f46c706b | 1912 | /* OK, now we know it's not the journal, hence we can make use of it now. */ |
1e344c1d | 1913 | log_set_target_and_open(LOG_TARGET_JOURNAL_OR_KMSG); |
f46c706b | 1914 | |
2a9b1a76 HB |
1915 | /* Log minimal metadata now, so it is not lost if the system is about to shut down. */ |
1916 | log_info("Process %s (%s) of user %s terminated abnormally with signal %s/%s, processing...", | |
960b0458 LP |
1917 | context.meta[META_ARGV_PID], context.meta[META_COMM], |
1918 | context.meta[META_ARGV_UID], context.meta[META_ARGV_SIGNAL], | |
1919 | signal_to_string(context.signo)); | |
2a9b1a76 | 1920 | |
92b8e5e7 | 1921 | r = pidref_in_same_namespace(/* pid1 = */ NULL, &context.pidref, NAMESPACE_PID); |
a108c43e NR |
1922 | if (r < 0) |
1923 | log_debug_errno(r, "Failed to check pidns of crashing process, ignoring: %m"); | |
1924 | if (r == 0) { | |
1925 | /* If this fails, fallback to the old behavior so that | |
1926 | * there is still some record of the crash. */ | |
1927 | r = forward_coredump_to_container(&context); | |
1928 | if (r >= 0) | |
1929 | return 0; | |
68511ceb | 1930 | |
00f73980 | 1931 | r = acquire_pid_mount_tree_fd(&context, &context.mount_tree_fd); |
b8fe1b1d | 1932 | if (r < 0) |
68511ceb | 1933 | log_warning_errno(r, "Failed to access the mount tree of a container, ignoring: %m"); |
a108c43e NR |
1934 | } |
1935 | ||
c673f1f6 | 1936 | /* If this is PID 1, disable coredump collection, we'll unlikely be able to process |
f46c706b FB |
1937 | * it later on. |
1938 | * | |
1939 | * FIXME: maybe we should disable coredumps generation from the beginning and | |
c673f1f6 ZJS |
1940 | * re-enable it only when we know it's either safe (i.e. we're not running OOM) or |
1941 | * it's not PID 1 ? */ | |
f46c706b FB |
1942 | if (context.is_pid1) { |
1943 | log_notice("Due to PID 1 having crashed coredump collection will now be turned off."); | |
1944 | disable_coredumps(); | |
1945 | } | |
34c10968 | 1946 | |
a108c43e NR |
1947 | (void) iovw_put_string_field(iovw, "MESSAGE_ID=", SD_MESSAGE_COREDUMP_STR); |
1948 | (void) iovw_put_string_field(iovw, "PRIORITY=", STRINGIFY(LOG_CRIT)); | |
1949 | ||
f46c706b | 1950 | if (context.is_journald || context.is_pid1) |
313537da | 1951 | return submit_coredump(&context, iovw, STDIN_FILENO); |
9aa82023 | 1952 | |
313537da | 1953 | return send_iovec(iovw, STDIN_FILENO, &context.pidref, context.mount_tree_fd); |
3c171f0b | 1954 | } |
34c10968 | 1955 | |
988e89ee | 1956 | static int process_backtrace(int argc, char *argv[]) { |
3a19fe46 YW |
1957 | _cleanup_(journal_importer_cleanup) JournalImporter importer = JOURNAL_IMPORTER_INIT(STDIN_FILENO); |
1958 | _cleanup_(iovw_free_freep) struct iovec_wrapper *iovw = NULL; | |
313537da | 1959 | _cleanup_(context_done) Context context = CONTEXT_NULL; |
9a435388 | 1960 | char *message; |
988e89ee ZJS |
1961 | int r; |
1962 | ||
ea8eb370 LP |
1963 | assert(argc >= 2); |
1964 | ||
988e89ee ZJS |
1965 | log_debug("Processing backtrace on stdin..."); |
1966 | ||
9a435388 FB |
1967 | iovw = iovw_new(); |
1968 | if (!iovw) | |
5b45a160 ZJS |
1969 | return log_oom(); |
1970 | ||
2a3bebd0 FB |
1971 | (void) iovw_put_string_field(iovw, "MESSAGE_ID=", SD_MESSAGE_BACKTRACE_STR); |
1972 | (void) iovw_put_string_field(iovw, "PRIORITY=", STRINGIFY(LOG_CRIT)); | |
f46c706b FB |
1973 | |
1974 | /* Collect all process metadata from argv[] by making sure to skip the | |
1975 | * '--backtrace' option */ | |
1976 | r = gather_pid_metadata_from_argv(iovw, &context, argc - 2, argv + 2); | |
988e89ee | 1977 | if (r < 0) |
3a19fe46 | 1978 | return r; |
aaeb2522 | 1979 | |
f46c706b | 1980 | /* Collect the rest of the process metadata retrieved from the runtime */ |
db9ac801 | 1981 | r = gather_pid_metadata_from_procfs(iovw, &context); |
f46c706b | 1982 | if (r < 0) |
3a19fe46 | 1983 | return r; |
988e89ee | 1984 | |
86562420 | 1985 | for (;;) { |
5b45a160 | 1986 | r = journal_importer_process_data(&importer); |
3a19fe46 YW |
1987 | if (r < 0) |
1988 | return log_error_errno(r, "Failed to parse journal entry on stdin: %m"); | |
d74dc4f2 ZJS |
1989 | if (r == 1 || /* complete entry */ |
1990 | journal_importer_eof(&importer)) /* end of data */ | |
5b45a160 | 1991 | break; |
988e89ee | 1992 | } |
988e89ee | 1993 | |
5b45a160 ZJS |
1994 | if (journal_importer_eof(&importer)) { |
1995 | log_warning("Did not receive a full journal entry on stdin, ignoring message sent by reporter"); | |
988e89ee | 1996 | |
f46c706b FB |
1997 | message = strjoina("Process ", context.meta[META_ARGV_PID], |
1998 | " (", context.meta[META_COMM], ")" | |
1999 | " of user ", context.meta[META_ARGV_UID], | |
2000 | " failed with ", context.meta[META_ARGV_SIGNAL]); | |
9a435388 FB |
2001 | |
2002 | r = iovw_put_string_field(iovw, "MESSAGE=", message); | |
2003 | if (r < 0) | |
3a19fe46 | 2004 | return r; |
5b45a160 | 2005 | } else { |
3a19fe46 YW |
2006 | /* The imported iovecs are not supposed to be freed by us so let's copy and merge them at the |
2007 | * end of the array. */ | |
2008 | r = iovw_append(iovw, &importer.iovw); | |
2009 | if (r < 0) | |
2010 | return r; | |
9a435388 | 2011 | } |
988e89ee | 2012 | |
9a435388 | 2013 | r = sd_journal_sendv(iovw->iovec, iovw->count); |
988e89ee | 2014 | if (r < 0) |
3a19fe46 | 2015 | return log_error_errno(r, "Failed to log backtrace: %m"); |
988e89ee | 2016 | |
3a19fe46 | 2017 | return 0; |
988e89ee ZJS |
2018 | } |
2019 | ||
4515a95e | 2020 | static int run(int argc, char *argv[]) { |
3c171f0b | 2021 | int r; |
fee80f69 | 2022 | |
9aa82023 ZJS |
2023 | /* First, log to a safe place, since we don't know what crashed and it might |
2024 | * be journald which we'd rather not log to then. */ | |
8d4e028f | 2025 | |
1e344c1d | 2026 | log_set_target_and_open(LOG_TARGET_KMSG); |
8d4e028f | 2027 | |
3c171f0b | 2028 | /* Make sure we never enter a loop */ |
9ce8e3e4 | 2029 | (void) set_dumpable(SUID_DUMP_DISABLE); |
8d4e028f | 2030 | |
3c171f0b LP |
2031 | /* Ignore all parse errors */ |
2032 | (void) parse_config(); | |
fee80f69 | 2033 | |
3c171f0b LP |
2034 | log_debug("Selected storage '%s'.", coredump_storage_to_string(arg_storage)); |
2035 | log_debug("Selected compression %s.", yes_no(arg_compress)); | |
fee80f69 | 2036 | |
3c171f0b | 2037 | r = sd_listen_fds(false); |
4515a95e ZJS |
2038 | if (r < 0) |
2039 | return log_error_errno(r, "Failed to determine the number of file descriptors: %m"); | |
fee80f69 | 2040 | |
9aa82023 ZJS |
2041 | /* If we got an fd passed, we are running in coredumpd mode. Otherwise we |
2042 | * are invoked from the kernel as coredump handler. */ | |
988e89ee ZJS |
2043 | if (r == 0) { |
2044 | if (streq_ptr(argv[1], "--backtrace")) | |
4515a95e | 2045 | return process_backtrace(argc, argv); |
988e89ee | 2046 | else |
4515a95e | 2047 | return process_kernel(argc, argv); |
988e89ee | 2048 | } else if (r == 1) |
4515a95e | 2049 | return process_socket(SD_LISTEN_FDS_START); |
f5e04665 | 2050 | |
baaa35ad ZJS |
2051 | return log_error_errno(SYNTHETIC_ERRNO(EINVAL), |
2052 | "Received unexpected number of file descriptors."); | |
f5e04665 | 2053 | } |
4515a95e ZJS |
2054 | |
2055 | DEFINE_MAIN_FUNCTION(run); |