]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/nspawn/nspawn-oci.c
476dad08eb8491452729aa9c51adb297a7b2a76c
[thirdparty/systemd.git] / src / nspawn / nspawn-oci.c
1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
2
3 #include <linux/oom.h>
4 #include <sys/stat.h>
5
6 #include "sd-bus.h"
7 #include "sd-json.h"
8
9 #include "alloc-util.h"
10 #include "bus-util.h"
11 #include "capability-list.h"
12 #include "cgroup-util.h"
13 #include "cpu-set-util.h"
14 #include "device-util.h"
15 #include "devnum-util.h"
16 #include "env-util.h"
17 #include "hostname-util.h"
18 #include "json-util.h"
19 #include "nspawn-mount.h"
20 #include "nspawn-oci.h"
21 #include "path-util.h"
22 #include "rlimit-util.h"
23 #include "string-util.h"
24 #include "strv.h"
25 #include "time-util.h"
26
27 /* TODO:
28 * OCI runtime tool implementation
29 * hooks
30 *
31 * Spec issues:
32 *
33 * How is RLIM_INFINITY supposed to be encoded?
34 * configured effective caps is bullshit, as execv() corrupts it anyway
35 * pipes bind mounted is *very* different from pipes newly created, comments regarding bind mount or not are bogus
36 * annotation values structured? or string?
37 * configurable file system namespace path, but then also root path? wtf?
38 * apply sysctl inside of the container? or outside?
39 * how is unlimited pids tasks limit to be encoded?
40 * what are the defaults for caps if not specified?
41 * what are the default uid/gid mappings if one is missing but the other set, or when user ns is on but no namespace configured
42 * the source field of "mounts" is really weird, as it cannot realistically be relative to the bundle, since we never know if that's what the fs wants
43 * spec contradicts itself on the mount "type" field, as the example uses "bind" as type, but it's not listed in /proc/filesystem, and is something made up by /bin/mount
44 * if type of mount is left out, what shall be assumed? "bind"?
45 * readonly mounts is entirely redundant?
46 * should escaping be applied when joining mount options with ","?
47 * devices cgroup support is bogus, "allow" and "deny" on the kernel level is about adding/removing entries, not about access
48 * spec needs to say that "rwm" devices cgroup combination can't be the empty string
49 * cgroupsv1 crap: kernel, kernelTCP, swappiness, disableOOMKiller, swap, devices, leafWeight
50 * general: it shouldn't leak lower level abstractions this obviously
51 * unmanageable cgroups stuff: realtimeRuntime/realtimePeriod
52 * needs to say what happens when some option is not specified, i.e. which defaults apply
53 * no architecture? no personality?
54 * seccomp example and logic is simply broken: there's no constant "SCMP_ACT_ERRNO".
55 * spec should say what to do with unknown props
56 * /bin/mount regarding NFS and FUSE required?
57 * what does terminal=false mean?
58 * sysctl inside or outside? allow-listing?
59 * swapiness typo -> swappiness
60 *
61 * Unsupported:
62 *
63 * apparmorProfile
64 * selinuxLabel + mountLabel
65 * hugepageLimits
66 * network
67 * rdma
68 * intelRdt
69 * swappiness, disableOOMKiller, kernel, kernelTCP, leafWeight (because it's dead, cgroupsv2 can't do it and hence systemd neither)
70 *
71 * Non-slice cgroup paths
72 * Propagation that is not slave + shared
73 * more than one uid/gid mapping, mappings with a container base != 0, or non-matching uid/gid mappings
74 * device cgroups access = false items that are not catchall
75 * device cgroups matches where minor is specified, but major isn't. similar where major is specified but char/block is not. also, any match that only has a type set that has less than "rwm" set. also, any entry that has none of rwm set.
76 *
77 */
78
79 /* Special values for the cpu.shares attribute: an *_INVALID marker (UINT64_MAX, presumably meaning "not configured" — confirm at the use sites), followed by the minimum, maximum and default values. */
80 #define CGROUP_CPU_SHARES_INVALID UINT64_MAX
81 #define CGROUP_CPU_SHARES_MIN UINT64_C(2)
82 #define CGROUP_CPU_SHARES_MAX UINT64_C(262144)
83 #define CGROUP_CPU_SHARES_DEFAULT UINT64_C(1024)
84
85 /* Special values for the blkio.weight attribute, laid out the same way: *_INVALID marker (UINT64_MAX), minimum, maximum and default. */
86 #define CGROUP_BLKIO_WEIGHT_INVALID UINT64_MAX
87 #define CGROUP_BLKIO_WEIGHT_MIN UINT64_C(10)
88 #define CGROUP_BLKIO_WEIGHT_MAX UINT64_C(1000)
89 #define CGROUP_BLKIO_WEIGHT_DEFAULT UINT64_C(500)
90
91 static int oci_unexpected(const char *name, sd_json_variant *v, sd_json_dispatch_flags_t flags, void *userdata) {
92 return json_log(v, flags, SYNTHETIC_ERRNO(EINVAL),
93 "Unexpected OCI element '%s' of type '%s'.", name, sd_json_variant_type_to_string(sd_json_variant_type(v)));
94 }
95
96 static int oci_dispatch(sd_json_variant *v, const sd_json_dispatch_field table[], sd_json_dispatch_flags_t flags, void *userdata) {
97 return sd_json_dispatch_full(v, table, oci_unexpected, flags, userdata, /* reterr_bad_field= */ NULL);
98 }
99
100 static int oci_unsupported(const char *name, sd_json_variant *v, sd_json_dispatch_flags_t flags, void *userdata) {
101 return json_log(v, flags, SYNTHETIC_ERRNO(EOPNOTSUPP),
102 "Unsupported OCI element '%s' of type '%s'.", name, sd_json_variant_type_to_string(sd_json_variant_type(v)));
103 }
104
105 static int oci_terminal(const char *name, sd_json_variant *v, sd_json_dispatch_flags_t flags, void *userdata) {
106 Settings *s = ASSERT_PTR(userdata);
107
108 /* If not specified, or set to true, we'll default to either an interactive or a read-only
109 * console. If specified as false, we'll forcibly move to "pipe" mode though. */
110 s->console_mode = sd_json_variant_boolean(v) ? _CONSOLE_MODE_INVALID : CONSOLE_PIPE;
111 return 0;
112 }
113
114 static int oci_console_dimension(const char *name, sd_json_variant *variant, sd_json_dispatch_flags_t flags, void *userdata) {
115 unsigned *u = ASSERT_PTR(userdata);
116 uint64_t k;
117
118 k = sd_json_variant_unsigned(variant);
119 if (k == 0)
120 return json_log(variant, flags, SYNTHETIC_ERRNO(ERANGE),
121 "Console size field '%s' is too small.", strna(name));
122 if (k > USHRT_MAX) /* TIOCSWINSZ's struct winsize uses "unsigned short" for width and height */
123 return json_log(variant, flags, SYNTHETIC_ERRNO(ERANGE),
124 "Console size field '%s' is too large.", strna(name));
125
126 *u = (unsigned) k;
127 return 0;
128 }
129
130 static int oci_console_size(const char *name, sd_json_variant *v, sd_json_dispatch_flags_t flags, void *userdata) {
131 Settings *s = ASSERT_PTR(userdata);
132
133 static const sd_json_dispatch_field table[] = {
134 { "height", SD_JSON_VARIANT_UNSIGNED, oci_console_dimension, offsetof(Settings, console_height), SD_JSON_MANDATORY },
135 { "width", SD_JSON_VARIANT_UNSIGNED, oci_console_dimension, offsetof(Settings, console_width), SD_JSON_MANDATORY },
136 {}
137 };
138
139 return oci_dispatch(v, table, flags, s);
140 }
141
142 static int oci_env(const char *name, sd_json_variant *v, sd_json_dispatch_flags_t flags, void *userdata) {
143 char ***l = ASSERT_PTR(userdata);
144 sd_json_variant *e;
145 int r;
146
147 JSON_VARIANT_ARRAY_FOREACH(e, v) {
148 const char *n;
149
150 if (!sd_json_variant_is_string(e))
151 return json_log(e, flags, SYNTHETIC_ERRNO(EINVAL),
152 "Environment array contains non-string.");
153
154 assert_se(n = sd_json_variant_string(e));
155
156 if (!env_assignment_is_valid(n))
157 return json_log(e, flags, SYNTHETIC_ERRNO(EINVAL),
158 "Environment assignment not valid: %s", n);
159
160 r = strv_extend(l, n);
161 if (r < 0)
162 return log_oom();
163 }
164
165 return 0;
166 }
167
168 static int oci_args(const char *name, sd_json_variant *v, sd_json_dispatch_flags_t flags, void *userdata) {
169 _cleanup_strv_free_ char **l = NULL;
170 char ***value = ASSERT_PTR(userdata);
171 int r;
172
173 r = sd_json_variant_strv(v, &l);
174 if (r < 0)
175 return json_log(v, flags, r, "Cannot parse arguments as list of strings: %m");
176
177 if (strv_isempty(l))
178 return json_log(v, flags, SYNTHETIC_ERRNO(EINVAL),
179 "Argument list empty, refusing.");
180
181 if (isempty(l[0]))
182 return json_log(v, flags, SYNTHETIC_ERRNO(EINVAL),
183 "Executable name is empty, refusing.");
184
185 return strv_free_and_replace(*value, l);
186 }
187
188 static int oci_rlimit_type(const char *name, sd_json_variant *v, sd_json_dispatch_flags_t flags, void *userdata) {
189 const char *z;
190 int *type = ASSERT_PTR(userdata);
191 int t;
192
193 z = startswith(sd_json_variant_string(v), "RLIMIT_");
194 if (!z)
195 return json_log(v, flags, SYNTHETIC_ERRNO(EINVAL),
196 "rlimit entry's name does not begin with 'RLIMIT_', refusing: %s",
197 sd_json_variant_string(v));
198
199 t = rlimit_from_string(z);
200 if (t < 0)
201 return json_log(v, flags, t,
202 "rlimit name unknown: %s", sd_json_variant_string(v));
203
204 *type = t;
205 return 0;
206 }
207
208 static int oci_rlimit_value(const char *name, sd_json_variant *v, sd_json_dispatch_flags_t flags, void *userdata) {
209 rlim_t *value = ASSERT_PTR(userdata);
210 rlim_t z;
211
212 if (sd_json_variant_is_negative(v))
213 z = RLIM_INFINITY;
214 else {
215 if (!sd_json_variant_is_unsigned(v))
216 return json_log(v, flags, SYNTHETIC_ERRNO(ERANGE),
217 "rlimits limit not unsigned, refusing.");
218
219 z = (rlim_t) sd_json_variant_unsigned(v);
220
221 if ((uint64_t) z != sd_json_variant_unsigned(v))
222 return json_log(v, flags, SYNTHETIC_ERRNO(EINVAL),
223 "rlimits limit out of range, refusing.");
224 }
225
226 *value = z;
227 return 0;
228 }
229
230 static int oci_rlimits(const char *name, sd_json_variant *v, sd_json_dispatch_flags_t flags, void *userdata) {
231 Settings *s = ASSERT_PTR(userdata);
232 sd_json_variant *e;
233 int r;
234
235 JSON_VARIANT_ARRAY_FOREACH(e, v) {
236
237 struct rlimit_data {
238 int type;
239 rlim_t soft;
240 rlim_t hard;
241 } data = {
242 .type = -1,
243 .soft = RLIM_INFINITY,
244 .hard = RLIM_INFINITY,
245 };
246
247 static const sd_json_dispatch_field table[] = {
248 { "soft", SD_JSON_VARIANT_NUMBER, oci_rlimit_value, offsetof(struct rlimit_data, soft), SD_JSON_MANDATORY },
249 { "hard", SD_JSON_VARIANT_NUMBER, oci_rlimit_value, offsetof(struct rlimit_data, hard), SD_JSON_MANDATORY },
250 { "type", SD_JSON_VARIANT_STRING, oci_rlimit_type, offsetof(struct rlimit_data, type), SD_JSON_MANDATORY },
251 {}
252 };
253
254 r = oci_dispatch(e, table, flags, &data);
255 if (r < 0)
256 return r;
257
258 assert(data.type >= 0);
259 assert(data.type < _RLIMIT_MAX);
260
261 if (s->rlimit[data.type])
262 return json_log(v, flags, SYNTHETIC_ERRNO(EINVAL),
263 "rlimits array contains duplicate entry, refusing.");
264
265 s->rlimit[data.type] = new(struct rlimit, 1);
266 if (!s->rlimit[data.type])
267 return log_oom();
268
269 *s->rlimit[data.type] = (struct rlimit) {
270 .rlim_cur = data.soft,
271 .rlim_max = data.hard,
272 };
273
274 }
275 return 0;
276 }
277
278 static int oci_capability_array(const char *name, sd_json_variant *v, sd_json_dispatch_flags_t flags, void *userdata) {
279 uint64_t *mask = ASSERT_PTR(userdata);
280 uint64_t m = 0;
281 sd_json_variant *e;
282
283 JSON_VARIANT_ARRAY_FOREACH(e, v) {
284 const char *n;
285 int cap;
286
287 if (!sd_json_variant_is_string(e))
288 return json_log(v, flags, SYNTHETIC_ERRNO(EINVAL),
289 "Entry in capabilities array is not a string.");
290
291 assert_se(n = sd_json_variant_string(e));
292
293 cap = capability_from_name(n);
294 if (cap < 0)
295 return json_log(v, flags, SYNTHETIC_ERRNO(EINVAL),
296 "Unknown capability: %s", n);
297
298 m |= UINT64_C(1) << cap;
299 }
300
301 if (*mask == UINT64_MAX)
302 *mask = m;
303 else
304 *mask |= m;
305
306 return 0;
307 }
308
309 static int oci_capabilities(const char *name, sd_json_variant *v, sd_json_dispatch_flags_t flags, void *userdata) {
310
311 static const sd_json_dispatch_field table[] = {
312 { "effective", SD_JSON_VARIANT_ARRAY, oci_capability_array, offsetof(CapabilityQuintet, effective) },
313 { "bounding", SD_JSON_VARIANT_ARRAY, oci_capability_array, offsetof(CapabilityQuintet, bounding) },
314 { "inheritable", SD_JSON_VARIANT_ARRAY, oci_capability_array, offsetof(CapabilityQuintet, inheritable) },
315 { "permitted", SD_JSON_VARIANT_ARRAY, oci_capability_array, offsetof(CapabilityQuintet, permitted) },
316 { "ambient", SD_JSON_VARIANT_ARRAY, oci_capability_array, offsetof(CapabilityQuintet, ambient) },
317 {}
318 };
319
320 Settings *s = ASSERT_PTR(userdata);
321 int r;
322
323 r = oci_dispatch(v, table, flags, &s->full_capabilities);
324 if (r < 0)
325 return r;
326
327 if (s->full_capabilities.bounding != UINT64_MAX) {
328 s->capability = s->full_capabilities.bounding;
329 s->drop_capability = ~s->full_capabilities.bounding;
330 }
331
332 return 0;
333 }
334
335 static int oci_oom_score_adj(const char *name, sd_json_variant *v, sd_json_dispatch_flags_t flags, void *userdata) {
336 Settings *s = ASSERT_PTR(userdata);
337 int64_t k;
338
339 k = sd_json_variant_integer(v);
340 if (k < OOM_SCORE_ADJ_MIN || k > OOM_SCORE_ADJ_MAX)
341 return json_log(v, flags, SYNTHETIC_ERRNO(EINVAL),
342 "oomScoreAdj value out of range: %" PRIi64, k);
343
344 s->oom_score_adjust = (int) k;
345 s->oom_score_adjust_set = true;
346
347 return 0;
348 }
349
350 static int oci_supplementary_gids(const char *name, sd_json_variant *v, sd_json_dispatch_flags_t flags, void *userdata) {
351 Settings *s = ASSERT_PTR(userdata);
352 sd_json_variant *e;
353 int r;
354
355 JSON_VARIANT_ARRAY_FOREACH(e, v) {
356 gid_t gid;
357
358 if (!sd_json_variant_is_unsigned(e))
359 return json_log(v, flags, SYNTHETIC_ERRNO(EINVAL),
360 "Supplementary GID entry is not a UID.");
361
362 r = sd_json_dispatch_uid_gid(name, e, flags, &gid);
363 if (r < 0)
364 return r;
365
366 if (!GREEDY_REALLOC(s->supplementary_gids, s->n_supplementary_gids + 1))
367 return log_oom();
368
369 s->supplementary_gids[s->n_supplementary_gids++] = gid;
370 }
371
372 return 0;
373 }
374
375 static int oci_user(const char *name, sd_json_variant *v, sd_json_dispatch_flags_t flags, void *userdata) {
376
377 static const sd_json_dispatch_field table[] = {
378 { "uid", SD_JSON_VARIANT_UNSIGNED, sd_json_dispatch_uid_gid, offsetof(Settings, uid), SD_JSON_MANDATORY },
379 { "gid", SD_JSON_VARIANT_UNSIGNED, sd_json_dispatch_uid_gid, offsetof(Settings, gid), SD_JSON_MANDATORY },
380 { "additionalGids", SD_JSON_VARIANT_ARRAY, oci_supplementary_gids, 0, 0 },
381 {}
382 };
383
384 return oci_dispatch(v, table, flags, userdata);
385 }
386
387 static int oci_process(const char *name, sd_json_variant *v, sd_json_dispatch_flags_t flags, void *userdata) {
388
389 static const sd_json_dispatch_field table[] = {
390 { "terminal", SD_JSON_VARIANT_BOOLEAN, oci_terminal, 0, 0 },
391 { "consoleSize", SD_JSON_VARIANT_OBJECT, oci_console_size, 0, 0 },
392 { "cwd", SD_JSON_VARIANT_STRING, json_dispatch_path, offsetof(Settings, working_directory), 0 },
393 { "env", SD_JSON_VARIANT_ARRAY, oci_env, offsetof(Settings, environment), 0 },
394 { "args", SD_JSON_VARIANT_ARRAY, oci_args, offsetof(Settings, parameters), 0 },
395 { "rlimits", SD_JSON_VARIANT_ARRAY, oci_rlimits, 0, 0 },
396 { "apparmorProfile", SD_JSON_VARIANT_STRING, oci_unsupported, 0, SD_JSON_PERMISSIVE },
397 { "capabilities", SD_JSON_VARIANT_OBJECT, oci_capabilities, 0, 0 },
398 { "noNewPrivileges", SD_JSON_VARIANT_BOOLEAN, sd_json_dispatch_tristate, offsetof(Settings, no_new_privileges), 0 },
399 { "oomScoreAdj", SD_JSON_VARIANT_INTEGER, oci_oom_score_adj, 0, 0 },
400 { "selinuxLabel", SD_JSON_VARIANT_STRING, oci_unsupported, 0, SD_JSON_PERMISSIVE },
401 { "user", SD_JSON_VARIANT_OBJECT, oci_user, 0, 0 },
402 {}
403 };
404
405 return oci_dispatch(v, table, flags, userdata);
406 }
407
408 static int oci_root(const char *name, sd_json_variant *v, sd_json_dispatch_flags_t flags, void *userdata) {
409 Settings *s = ASSERT_PTR(userdata);
410 int r;
411
412 static const sd_json_dispatch_field table[] = {
413 { "path", SD_JSON_VARIANT_STRING, sd_json_dispatch_string, offsetof(Settings, root) },
414 { "readonly", SD_JSON_VARIANT_BOOLEAN, sd_json_dispatch_tristate, offsetof(Settings, read_only) },
415 {}
416 };
417
418 r = oci_dispatch(v, table, flags, s);
419 if (r < 0)
420 return r;
421
422 if (s->root && !path_is_absolute(s->root)) {
423 char *joined;
424
425 joined = path_join(s->bundle, s->root);
426 if (!joined)
427 return log_oom();
428
429 free_and_replace(s->root, joined);
430 }
431
432 return 0;
433 }
434
435 static int oci_hostname(const char *name, sd_json_variant *v, sd_json_dispatch_flags_t flags, void *userdata) {
436 Settings *s = ASSERT_PTR(userdata);
437 const char *n;
438
439 assert_se(n = sd_json_variant_string(v));
440
441 if (!hostname_is_valid(n, 0))
442 return json_log(v, flags, SYNTHETIC_ERRNO(EINVAL),
443 "Hostname string is not a valid hostname: %s", n);
444
445 return free_and_strdup_warn(&s->hostname, n);
446 }
447
/* Tells whether the OCI mount with destination 'path' shall be skipped.
 *
 * Returns true for a fixed set of paths below /dev, /proc, /sys plus /run and /tmp, and for everything
 * that path_startswith() matches against /sys/fs/cgroup; returns false for all other paths. */
static bool oci_exclude_mount(const char *path) {

        /* Returns "true" for all mounts we insist to mount on our own, and hence ignore the OCI data. */

        if (PATH_IN_SET(path,
                        "/dev",
                        "/dev/mqueue",
                        "/dev/pts",
                        "/dev/shm",
                        "/proc",
                        "/proc/acpi",
                        "/proc/apm",
                        "/proc/asound",
                        "/proc/bus",
                        "/proc/fs",
                        "/proc/irq",
                        "/proc/kallsyms",
                        "/proc/kcore",
                        "/proc/keys",
                        "/proc/scsi",
                        "/proc/sys",
                        "/proc/sys/net",
                        "/proc/sysrq-trigger",
                        "/proc/timer_list",
                        "/run",
                        "/sys",
                        "/sys/fs/selinux",
                        "/tmp"))
                return true;

        /* Similar, skip the whole /sys/fs/cgroup subtree */
        if (path_startswith(path, "/sys/fs/cgroup"))
                return true;

        return false;
}
485
/* Scratch record for one element of the OCI "mounts" array, filled in by the dispatch table of
 * oci_mounts() and released by oci_mount_data_done(). All members are heap-allocated or NULL. */
typedef struct oci_mount_data {
        char *destination, *source, *type; /* the "destination", "source" and "type" strings */
        char **options;                    /* the "options" string list */
} oci_mount_data;
492
493 static void oci_mount_data_done(oci_mount_data *data) {
494 assert(data);
495
496 free(data->destination);
497 free(data->source);
498 free(data->type);
499 strv_free(data->options);
500 }
501
502 static int oci_mounts(const char *name, sd_json_variant *v, sd_json_dispatch_flags_t flags, void *userdata) {
503 Settings *s = ASSERT_PTR(userdata);
504 sd_json_variant *e;
505 int r;
506
507 JSON_VARIANT_ARRAY_FOREACH(e, v) {
508 static const sd_json_dispatch_field table[] = {
509 { "destination", SD_JSON_VARIANT_STRING, json_dispatch_path, offsetof(oci_mount_data, destination), SD_JSON_MANDATORY },
510 { "source", SD_JSON_VARIANT_STRING, sd_json_dispatch_string, offsetof(oci_mount_data, source), 0 },
511 { "options", SD_JSON_VARIANT_ARRAY, sd_json_dispatch_strv, offsetof(oci_mount_data, options), 0, },
512 { "type", SD_JSON_VARIANT_STRING, sd_json_dispatch_string, offsetof(oci_mount_data, type), 0 },
513 {}
514 };
515
516 _cleanup_free_ char *joined_options = NULL;
517 _cleanup_(oci_mount_data_done) oci_mount_data data = {};
518 CustomMount *m;
519
520 r = oci_dispatch(e, table, flags, &data);
521 if (r < 0)
522 return r;
523
524 if (!path_is_absolute(data.destination))
525 return json_log(e, flags, SYNTHETIC_ERRNO(EINVAL),
526 "Mount destination not an absolute path: %s", data.destination);
527
528 if (oci_exclude_mount(data.destination))
529 continue;
530
531 if (data.options) {
532 joined_options = strv_join(data.options, ",");
533 if (!joined_options)
534 return log_oom();
535 }
536
537 if (!data.type || streq(data.type, "bind")) {
538 if (data.source && !path_is_absolute(data.source)) {
539 char *joined;
540
541 joined = path_join(s->bundle, data.source);
542 if (!joined)
543 return log_oom();
544
545 free_and_replace(data.source, joined);
546 }
547
548 data.type = mfree(data.type);
549
550 m = custom_mount_add(&s->custom_mounts, &s->n_custom_mounts, CUSTOM_MOUNT_BIND);
551 } else
552 m = custom_mount_add(&s->custom_mounts, &s->n_custom_mounts, CUSTOM_MOUNT_ARBITRARY);
553 if (!m)
554 return log_oom();
555
556 m->destination = TAKE_PTR(data.destination);
557 m->source = TAKE_PTR(data.source);
558 m->options = TAKE_PTR(joined_options);
559 m->type_argument = TAKE_PTR(data.type);
560 }
561
562 return 0;
563 }
564
565 static int oci_namespace_type(const char *name, sd_json_variant *v, sd_json_dispatch_flags_t flags, void *userdata) {
566 unsigned long *nsflags = ASSERT_PTR(userdata);
567 const char *n;
568
569 assert_se(n = sd_json_variant_string(v));
570
571 /* We don't use namespace_flags_from_string() here, as the OCI spec uses slightly different names than the
572 * kernel here. */
573 if (streq(n, "pid"))
574 *nsflags = CLONE_NEWPID;
575 else if (streq(n, "network"))
576 *nsflags = CLONE_NEWNET;
577 else if (streq(n, "mount"))
578 *nsflags = CLONE_NEWNS;
579 else if (streq(n, "ipc"))
580 *nsflags = CLONE_NEWIPC;
581 else if (streq(n, "uts"))
582 *nsflags = CLONE_NEWUTS;
583 else if (streq(n, "user"))
584 *nsflags = CLONE_NEWUSER;
585 else if (streq(n, "cgroup"))
586 *nsflags = CLONE_NEWCGROUP;
587 else
588 return json_log(v, flags, SYNTHETIC_ERRNO(EINVAL),
589 "Unknown namespace type, refusing: %s", n);
590
591 return 0;
592 }
593
/* One parsed element of the OCI "namespaces" array, filled in by the dispatch table of oci_namespaces(). */
594 struct namespace_data {
595 unsigned long type; /* CLONE_NEW* flag, as stored by oci_namespace_type() */
596 char *path; /* optional "path" string, heap-allocated or NULL; freed by namespace_data_done() */
597 };
598
/* Frees the "path" member of a struct namespace_data (the struct itself is not freed). 'data' must not
 * be NULL. Used as the _cleanup_ handler in oci_namespaces(). */
599 static void namespace_data_done(struct namespace_data *data) {
600 assert(data);
601
602 free(data->path);
603 }
604
605 static int oci_namespaces(const char *name, sd_json_variant *v, sd_json_dispatch_flags_t flags, void *userdata) {
606 Settings *s = ASSERT_PTR(userdata);
607 unsigned long n = 0;
608 sd_json_variant *e;
609 int r;
610
611 JSON_VARIANT_ARRAY_FOREACH(e, v) {
612 _cleanup_(namespace_data_done) struct namespace_data data = {};
613
614 static const sd_json_dispatch_field table[] = {
615 { "type", SD_JSON_VARIANT_STRING, oci_namespace_type, offsetof(struct namespace_data, type), SD_JSON_MANDATORY },
616 { "path", SD_JSON_VARIANT_STRING, json_dispatch_path, offsetof(struct namespace_data, path), 0 },
617 {}
618 };
619
620 r = oci_dispatch(e, table, flags, &data);
621 if (r < 0)
622 return r;
623
624 if (data.path) {
625 if (data.type != CLONE_NEWNET)
626 return json_log(e, flags, SYNTHETIC_ERRNO(EOPNOTSUPP),
627 "Specifying namespace path for non-network namespace is not supported.");
628
629 if (s->network_namespace_path)
630 return json_log(e, flags, SYNTHETIC_ERRNO(EINVAL),
631 "Network namespace path specified more than once, refusing.");
632
633 free_and_replace(s->network_namespace_path, data.path);
634 }
635
636 if (FLAGS_SET(n, data.type))
637 return json_log(e, flags, SYNTHETIC_ERRNO(EINVAL),
638 "Duplicate namespace specification, refusing.");
639
640 n |= data.type;
641 }
642
643 if (!FLAGS_SET(n, CLONE_NEWNS))
644 return json_log(v, flags, SYNTHETIC_ERRNO(EOPNOTSUPP),
645 "Containers without a mount namespace aren't supported.");
646
647 s->private_network = FLAGS_SET(n, CLONE_NEWNET);
648 s->userns_mode = FLAGS_SET(n, CLONE_NEWUSER) ? USER_NAMESPACE_FIXED : USER_NAMESPACE_NO;
649 s->use_cgns = FLAGS_SET(n, CLONE_NEWCGROUP);
650
651 s->clone_ns_flags = n & (CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS);
652
653 return 0;
654 }
655
656 static int oci_uid_gid_range(const char *name, sd_json_variant *v, sd_json_dispatch_flags_t flags, void *userdata) {
657 uid_t *uid = ASSERT_PTR(userdata);
658 uid_t u;
659 uint64_t k;
660
661 assert_cc(sizeof(uid_t) == sizeof(gid_t));
662
663 /* This is very much like oci_uid_gid(), except the checks are a bit different, as this is a UID range rather
664 * than a specific UID, and hence UID_INVALID has no special significance. OTOH a range of zero makes no
665 * sense. */
666
667 k = sd_json_variant_unsigned(v);
668 u = (uid_t) k;
669 if ((uint64_t) u != k)
670 return json_log(v, flags, SYNTHETIC_ERRNO(ERANGE),
671 "UID/GID out of range: %" PRIu64, k);
672 if (u == 0)
673 return json_log(v, flags, SYNTHETIC_ERRNO(ERANGE),
674 "UID/GID range can't be zero.");
675
676 *uid = u;
677 return 0;
678 }
679
680 static int oci_uid_gid_mappings(const char *name, sd_json_variant *v, sd_json_dispatch_flags_t flags, void *userdata) {
681 struct mapping_data {
682 uid_t host_id;
683 uid_t container_id;
684 uid_t range;
685 } data = {
686 .host_id = UID_INVALID,
687 .container_id = UID_INVALID,
688 .range = 0,
689 };
690
691 static const sd_json_dispatch_field table[] = {
692 { "containerID", SD_JSON_VARIANT_UNSIGNED, sd_json_dispatch_uid_gid, offsetof(struct mapping_data, container_id), SD_JSON_MANDATORY },
693 { "hostID", SD_JSON_VARIANT_UNSIGNED, sd_json_dispatch_uid_gid, offsetof(struct mapping_data, host_id), SD_JSON_MANDATORY },
694 { "size", SD_JSON_VARIANT_UNSIGNED, oci_uid_gid_range, offsetof(struct mapping_data, range), SD_JSON_MANDATORY },
695 {}
696 };
697
698 Settings *s = ASSERT_PTR(userdata);
699 sd_json_variant *e;
700 int r;
701
702 if (sd_json_variant_elements(v) == 0)
703 return 0;
704
705 if (sd_json_variant_elements(v) > 1)
706 return json_log(v, flags, SYNTHETIC_ERRNO(EOPNOTSUPP),
707 "UID/GID mappings with more than one entry are not supported.");
708
709 assert_se(e = sd_json_variant_by_index(v, 0));
710
711 r = oci_dispatch(e, table, flags, &data);
712 if (r < 0)
713 return r;
714
715 if (data.range > UINT32_MAX - data.host_id ||
716 data.range > UINT32_MAX - data.container_id)
717 return json_log(v, flags, SYNTHETIC_ERRNO(EINVAL),
718 "UID/GID range goes beyond UID/GID validity range, refusing.");
719
720 if (data.container_id != 0)
721 return json_log(v, flags, SYNTHETIC_ERRNO(EOPNOTSUPP),
722 "UID/GID mappings with a non-zero container base are not supported.");
723
724 if (data.range < 0x10000)
725 json_log(v, flags|SD_JSON_WARNING, 0,
726 "UID/GID mapping with less than 65536 UID/GIDS set up, you are looking for trouble.");
727
728 if (s->uid_range != UID_INVALID &&
729 (s->uid_shift != data.host_id || s->uid_range != data.range))
730 return json_log(v, flags, SYNTHETIC_ERRNO(EOPNOTSUPP),
731 "Non-matching UID and GID mappings are not supported.");
732
733 s->uid_shift = data.host_id;
734 s->uid_range = data.range;
735
736 return 0;
737 }
738
739 static int oci_device_type(const char *name, sd_json_variant *v, sd_json_dispatch_flags_t flags, void *userdata) {
740 mode_t *mode = ASSERT_PTR(userdata);
741 const char *t;
742
743 assert_se(t = sd_json_variant_string(v));
744
745 if (STR_IN_SET(t, "c", "u"))
746 *mode = (*mode & ~S_IFMT) | S_IFCHR;
747 else if (streq(t, "b"))
748 *mode = (*mode & ~S_IFMT) | S_IFBLK;
749 else if (streq(t, "p"))
750 *mode = (*mode & ~S_IFMT) | S_IFIFO;
751 else
752 return json_log(v, flags, SYNTHETIC_ERRNO(EINVAL),
753 "Unknown device type: %s", t);
754
755 return 0;
756 }
757
758 static int oci_device_major(const char *name, sd_json_variant *v, sd_json_dispatch_flags_t flags, void *userdata) {
759 unsigned *u = ASSERT_PTR(userdata);
760 uint64_t k;
761
762 k = sd_json_variant_unsigned(v);
763 if (!DEVICE_MAJOR_VALID(k))
764 return json_log(v, flags, SYNTHETIC_ERRNO(ERANGE),
765 "Device major %" PRIu64 " out of range.", k);
766
767 *u = (unsigned) k;
768 return 0;
769 }
770
771 static int oci_device_minor(const char *name, sd_json_variant *v, sd_json_dispatch_flags_t flags, void *userdata) {
772 unsigned *u = ASSERT_PTR(userdata);
773 uint64_t k;
774
775 k = sd_json_variant_unsigned(v);
776 if (!DEVICE_MINOR_VALID(k))
777 return json_log(v, flags, SYNTHETIC_ERRNO(ERANGE),
778 "Device minor %" PRIu64 " out of range.", k);
779
780 *u = (unsigned) k;
781 return 0;
782 }
783
784 static int oci_device_file_mode(const char *name, sd_json_variant *v, sd_json_dispatch_flags_t flags, void *userdata) {
785 mode_t *mode = ASSERT_PTR(userdata);
786 mode_t m;
787 uint64_t k;
788
789 k = sd_json_variant_unsigned(v);
790 m = (mode_t) k;
791
792 if ((m & ~07777) != 0 || (uint64_t) m != k)
793 return json_log(v, flags, SYNTHETIC_ERRNO(ERANGE),
794 "fileMode out of range, refusing.");
795
796 *mode = (*mode & ~07777) | m;
797 return 0;
798 }
799
800 static int oci_devices(const char *name, sd_json_variant *v, sd_json_dispatch_flags_t flags, void *userdata) {
801 Settings *s = ASSERT_PTR(userdata);
802 sd_json_variant *e;
803 int r;
804
805 JSON_VARIANT_ARRAY_FOREACH(e, v) {
806
807 static const sd_json_dispatch_field table[] = {
808 { "type", SD_JSON_VARIANT_STRING, oci_device_type, offsetof(DeviceNode, mode), SD_JSON_MANDATORY },
809 { "path", SD_JSON_VARIANT_STRING, json_dispatch_path, offsetof(DeviceNode, path), SD_JSON_MANDATORY },
810 { "major", SD_JSON_VARIANT_UNSIGNED, oci_device_major, offsetof(DeviceNode, major), 0 },
811 { "minor", SD_JSON_VARIANT_UNSIGNED, oci_device_minor, offsetof(DeviceNode, minor), 0 },
812 { "fileMode", SD_JSON_VARIANT_UNSIGNED, oci_device_file_mode, offsetof(DeviceNode, mode), 0 },
813 { "uid", SD_JSON_VARIANT_UNSIGNED, sd_json_dispatch_uid_gid, offsetof(DeviceNode, uid), 0 },
814 { "gid", SD_JSON_VARIANT_UNSIGNED, sd_json_dispatch_uid_gid, offsetof(DeviceNode, gid), 0 },
815 {}
816 };
817
818 DeviceNode *node;
819
820 if (!GREEDY_REALLOC(s->extra_nodes, s->n_extra_nodes + 1))
821 return log_oom();
822
823 node = s->extra_nodes + s->n_extra_nodes;
824 *node = (DeviceNode) {
825 .uid = UID_INVALID,
826 .gid = GID_INVALID,
827 .major = UINT_MAX,
828 .minor = UINT_MAX,
829 .mode = 0644,
830 };
831
832 r = oci_dispatch(e, table, flags, node);
833 if (r < 0)
834 goto fail_element;
835
836 if (S_ISCHR(node->mode) || S_ISBLK(node->mode)) {
837 _cleanup_free_ char *path = NULL;
838
839 if (node->major == UINT_MAX || node->minor == UINT_MAX) {
840 r = json_log(e, flags, SYNTHETIC_ERRNO(EINVAL),
841 "Major/minor required when device node is device node.");
842 goto fail_element;
843 }
844
845 /* Suppress a couple of implicit device nodes */
846 r = devname_from_devnum(node->mode, makedev(node->major, node->minor), &path);
847 if (r < 0)
848 json_log(e, flags|SD_JSON_DEBUG, r, "Failed to resolve device node %u:%u, ignoring: %m", node->major, node->minor);
849 else {
850 if (PATH_IN_SET(path,
851 "/dev/null",
852 "/dev/zero",
853 "/dev/full",
854 "/dev/random",
855 "/dev/urandom",
856 "/dev/tty",
857 "/dev/net/tun",
858 "/dev/ptmx",
859 "/dev/pts/ptmx",
860 "/dev/console")) {
861
862 json_log(e, flags|SD_JSON_DEBUG, 0, "Ignoring devices item for device '%s', as it is implicitly created anyway.", path);
863 free(node->path);
864 continue;
865 }
866 }
867 }
868
869 s->n_extra_nodes++;
870 continue;
871
872 fail_element:
873 free(node->path);
874 return r;
875 }
876
877 return 0;
878 }
879
880 static int oci_cgroups_path(const char *name, sd_json_variant *v, sd_json_dispatch_flags_t flags, void *userdata) {
881 _cleanup_free_ char *slice = NULL, *backwards = NULL;
882 Settings *s = ASSERT_PTR(userdata);
883 const char *p;
884 int r;
885
886 assert_se(p = sd_json_variant_string(v));
887
888 r = cg_path_get_slice(p, &slice);
889 if (r < 0)
890 return json_log(v, flags, r, "Couldn't derive slice unit name from path '%s': %m", p);
891
892 r = cg_slice_to_path(slice, &backwards);
893 if (r < 0)
894 return json_log(v, flags, r, "Couldn't convert slice unit name '%s' back to path: %m", slice);
895
896 if (!path_equal(backwards, p))
897 return json_log(v, flags, SYNTHETIC_ERRNO(EINVAL),
898 "Control group path '%s' does not refer to slice unit, refusing.", p);
899
900 free_and_replace(s->slice, slice);
901 return 0;
902 }
903
904 static int oci_cgroup_device_type(const char *name, sd_json_variant *v, sd_json_dispatch_flags_t flags, void *userdata) {
905 mode_t *mode = ASSERT_PTR(userdata);
906 const char *n;
907
908 assert_se(n = sd_json_variant_string(v));
909
910 if (streq(n, "c"))
911 *mode = S_IFCHR;
912 else if (streq(n, "b"))
913 *mode = S_IFBLK;
914 else
915 return json_log(v, flags, SYNTHETIC_ERRNO(EINVAL),
916 "Control group device type unknown: %s", n);
917
918 return 0;
919 }
920
/* One parsed entry of the device cgroup list handled by oci_cgroup_devices(). */
struct device_data {
        bool allow;            /* the "allow" boolean of the entry */
        bool r, w, m;          /* which of the characters 'r', 'w', 'm' the "access" string contained */
        mode_t type;           /* S_IFCHR or S_IFBLK as stored by oci_cgroup_device_type(); 0 if no type was given */
        unsigned major, minor; /* device numbers; oci_cgroup_devices() initializes both to UINT_MAX */
};
930
931 static int oci_cgroup_device_access(const char *name, sd_json_variant *v, sd_json_dispatch_flags_t flags, void *userdata) {
932 struct device_data *d = ASSERT_PTR(userdata);
933 bool r = false, w = false, m = false;
934
935 for (const char *s = ASSERT_PTR(sd_json_variant_string(v)); *s; s++)
936 if (*s == 'r')
937 r = true;
938 else if (*s == 'w')
939 w = true;
940 else if (*s == 'm')
941 m = true;
942 else
943 return json_log(v, flags, SYNTHETIC_ERRNO(EINVAL),
944 "Unknown device access character '%c'.", *s);
945
946 d->r = r;
947 d->w = w;
948 d->m = m;
949
950 return 0;
951 }
952
953 static int oci_cgroup_devices(const char *name, sd_json_variant *v, sd_json_dispatch_flags_t flags, void *userdata) {
954 _cleanup_free_ struct device_data *list = NULL;
955 Settings *s = ASSERT_PTR(userdata);
956 size_t n_list = 0;
957 bool noop = false;
958 sd_json_variant *e;
959 int r;
960
961 JSON_VARIANT_ARRAY_FOREACH(e, v) {
962
963 struct device_data data = {
964 .major = UINT_MAX,
965 .minor = UINT_MAX,
966 };
967
968 static const sd_json_dispatch_field table[] = {
969 { "allow", SD_JSON_VARIANT_BOOLEAN, sd_json_dispatch_stdbool, offsetof(struct device_data, allow), SD_JSON_MANDATORY },
970 { "type", SD_JSON_VARIANT_STRING, oci_cgroup_device_type, offsetof(struct device_data, type), 0 },
971 { "major", SD_JSON_VARIANT_UNSIGNED, oci_device_major, offsetof(struct device_data, major), 0 },
972 { "minor", SD_JSON_VARIANT_UNSIGNED, oci_device_minor, offsetof(struct device_data, minor), 0 },
973 { "access", SD_JSON_VARIANT_STRING, oci_cgroup_device_access, 0, 0 },
974 {}
975 };
976
977 r = oci_dispatch(e, table, flags, &data);
978 if (r < 0)
979 return r;
980
981 if (!data.allow) {
982 /* The fact that OCI allows 'deny' entries makes really no sense, as 'allow'
983 * vs. 'deny' for the devices cgroup controller is really not about allow-listing and
984 * deny-listing but about adding and removing entries from the allow list. Since we
985 * always start out with an empty allow list we hence ignore the whole thing, as
986 * removing entries which don't exist make no sense. We'll log about this, since this
987 * is really borked in the spec, with one exception: the entry that's supposed to
988 * drop the kernel's default we ignore silently */
989
990 if (!data.r || !data.w || !data.m || data.type != 0 || data.major != UINT_MAX || data.minor != UINT_MAX)
991 json_log(v, flags|SD_JSON_WARNING, 0, "Devices cgroup allow list with arbitrary 'allow' entries not supported, ignoring.");
992
993 /* We ignore the 'deny' entry as for us that's implied */
994 continue;
995 }
996
997 if (!data.r && !data.w && !data.m) {
998 json_log(v, flags|LOG_WARNING, 0, "Device cgroup allow list entry with no effect found, ignoring.");
999 continue;
1000 }
1001
1002 if (data.minor != UINT_MAX && data.major == UINT_MAX)
1003 return json_log(v, flags, SYNTHETIC_ERRNO(EOPNOTSUPP),
1004 "Device cgroup allow list entries with minors but no majors not supported.");
1005
1006 if (data.major != UINT_MAX && data.type == 0)
1007 return json_log(v, flags, SYNTHETIC_ERRNO(EOPNOTSUPP),
1008 "Device cgroup allow list entries with majors but no device node type not supported.");
1009
1010 if (data.type == 0) {
1011 if (data.r && data.w && data.m) /* a catchall allow list entry means we are looking at a noop */
1012 noop = true;
1013 else
1014 return json_log(v, flags, SYNTHETIC_ERRNO(EOPNOTSUPP),
1015 "Device cgroup allow list entries with no type not supported.");
1016 }
1017
1018 if (!GREEDY_REALLOC(list, n_list + 1))
1019 return log_oom();
1020
1021 list[n_list++] = data;
1022 }
1023
1024 if (noop)
1025 return 0;
1026
1027 r = settings_allocate_properties(s);
1028 if (r < 0)
1029 return r;
1030
1031 r = sd_bus_message_open_container(s->properties, 'r', "sv");
1032 if (r < 0)
1033 return bus_log_create_error(r);
1034
1035 r = sd_bus_message_append(s->properties, "s", "DeviceAllow");
1036 if (r < 0)
1037 return bus_log_create_error(r);
1038
1039 r = sd_bus_message_open_container(s->properties, 'v', "a(ss)");
1040 if (r < 0)
1041 return bus_log_create_error(r);
1042
1043 r = sd_bus_message_open_container(s->properties, 'a', "(ss)");
1044 if (r < 0)
1045 return bus_log_create_error(r);
1046
1047 FOREACH_ARRAY(d, list, n_list) {
1048 _cleanup_free_ char *pattern = NULL;
1049 char access[4];
1050 size_t n = 0;
1051
1052 if (d->minor == UINT_MAX) {
1053 const char *t;
1054
1055 if (d->type == S_IFBLK)
1056 t = "block";
1057 else {
1058 assert(d->type == S_IFCHR);
1059 t = "char";
1060 }
1061
1062 if (d->major == UINT_MAX) {
1063 pattern = strjoin(t, "-*");
1064 if (!pattern)
1065 return log_oom();
1066 } else {
1067 if (asprintf(&pattern, "%s-%u", t, d->major) < 0)
1068 return log_oom();
1069 }
1070
1071 } else {
1072 assert(d->major != UINT_MAX); /* If a minor is specified, then a major also needs to be specified */
1073
1074 r = device_path_make_major_minor(d->type, makedev(d->major, d->minor), &pattern);
1075 if (r < 0)
1076 return log_oom();
1077 }
1078
1079 if (d->r)
1080 access[n++] = 'r';
1081 if (d->w)
1082 access[n++] = 'w';
1083 if (d->m)
1084 access[n++] = 'm';
1085 access[n] = 0;
1086
1087 assert(n > 0);
1088
1089 r = sd_bus_message_append(s->properties, "(ss)", pattern, access);
1090 if (r < 0)
1091 return bus_log_create_error(r);
1092 }
1093
1094 r = sd_bus_message_close_container(s->properties);
1095 if (r < 0)
1096 return bus_log_create_error(r);
1097
1098 r = sd_bus_message_close_container(s->properties);
1099 if (r < 0)
1100 return bus_log_create_error(r);
1101
1102 r = sd_bus_message_close_container(s->properties);
1103 if (r < 0)
1104 return bus_log_create_error(r);
1105
1106 return 0;
1107 }
1108
1109 static int oci_cgroup_memory_limit(const char *name, sd_json_variant *v, sd_json_dispatch_flags_t flags, void *userdata) {
1110 uint64_t *m = ASSERT_PTR(userdata);
1111 uint64_t k;
1112
1113 if (sd_json_variant_is_negative(v)) {
1114 *m = UINT64_MAX;
1115 return 0;
1116 }
1117
1118 if (!sd_json_variant_is_unsigned(v))
1119 return json_log(v, flags, SYNTHETIC_ERRNO(EINVAL),
1120 "Memory limit is not an unsigned integer.");
1121
1122 k = sd_json_variant_unsigned(v);
1123 if (k >= UINT64_MAX)
1124 return json_log(v, flags, SYNTHETIC_ERRNO(ERANGE),
1125 "Memory limit too large: %" PRIu64, k);
1126
1127 *m = (uint64_t) k;
1128 return 0;
1129 }
1130
1131 static int oci_cgroup_memory(const char *name, sd_json_variant *v, sd_json_dispatch_flags_t flags, void *userdata) {
1132
1133 struct memory_data {
1134 uint64_t limit;
1135 uint64_t reservation;
1136 uint64_t swap;
1137 } data = {
1138 .limit = UINT64_MAX,
1139 .reservation = UINT64_MAX,
1140 .swap = UINT64_MAX,
1141 };
1142
1143 static const sd_json_dispatch_field table[] = {
1144 { "limit", SD_JSON_VARIANT_NUMBER, oci_cgroup_memory_limit, offsetof(struct memory_data, limit), 0 },
1145 { "reservation", SD_JSON_VARIANT_NUMBER, oci_cgroup_memory_limit, offsetof(struct memory_data, reservation), 0 },
1146 { "swap", SD_JSON_VARIANT_NUMBER, oci_cgroup_memory_limit, offsetof(struct memory_data, swap), 0 },
1147 { "kernel", SD_JSON_VARIANT_NUMBER, oci_unsupported, 0, SD_JSON_PERMISSIVE },
1148 { "kernelTCP", SD_JSON_VARIANT_NUMBER, oci_unsupported, 0, SD_JSON_PERMISSIVE },
1149 { "swapiness", SD_JSON_VARIANT_NUMBER, oci_unsupported, 0, SD_JSON_PERMISSIVE },
1150 { "disableOOMKiller", SD_JSON_VARIANT_BOOLEAN, oci_unsupported, 0, SD_JSON_PERMISSIVE },
1151 {}
1152 };
1153
1154 Settings *s = ASSERT_PTR(userdata);
1155 int r;
1156
1157 r = oci_dispatch(v, table, flags, &data);
1158 if (r < 0)
1159 return r;
1160
1161 if (data.swap != UINT64_MAX) {
1162 if (data.limit == UINT64_MAX)
1163 json_log(v, flags|LOG_WARNING, 0, "swap limit without memory limit is not supported, ignoring.");
1164 else if (data.swap < data.limit)
1165 json_log(v, flags|LOG_WARNING, 0, "swap limit is below memory limit, ignoring.");
1166 else {
1167 r = settings_allocate_properties(s);
1168 if (r < 0)
1169 return r;
1170
1171 r = sd_bus_message_append(s->properties, "(sv)", "MemorySwapMax", "t", data.swap - data.limit);
1172 if (r < 0)
1173 return bus_log_create_error(r);
1174 }
1175 }
1176
1177 if (data.limit != UINT64_MAX) {
1178 r = settings_allocate_properties(s);
1179 if (r < 0)
1180 return r;
1181
1182 r = sd_bus_message_append(s->properties, "(sv)", "MemoryMax", "t", data.limit);
1183 if (r < 0)
1184 return bus_log_create_error(r);
1185 }
1186
1187 if (data.reservation != UINT64_MAX) {
1188 r = settings_allocate_properties(s);
1189 if (r < 0)
1190 return r;
1191
1192 r = sd_bus_message_append(s->properties, "(sv)", "MemoryLow", "t", data.reservation);
1193 if (r < 0)
1194 return bus_log_create_error(r);
1195 }
1196
1197 return 0;
1198 }
1199
1200 struct cpu_data {
1201 uint64_t weight;
1202 uint64_t quota;
1203 uint64_t period;
1204 CPUSet cpu_set;
1205 };
1206
1207 static int oci_cgroup_cpu_shares(const char *name, sd_json_variant *v, sd_json_dispatch_flags_t flags, void *userdata) {
1208 uint64_t k, *u = ASSERT_PTR(userdata);
1209
1210 k = sd_json_variant_unsigned(v);
1211 if (k < CGROUP_CPU_SHARES_MIN || k > CGROUP_CPU_SHARES_MAX)
1212 return json_log(v, flags, SYNTHETIC_ERRNO(ERANGE), "shares value out of range.");
1213
1214 /* convert from cgroup v1 cpu.shares to v2 cpu.weight */
1215 assert_cc(CGROUP_CPU_SHARES_MAX <= UINT64_MAX / CGROUP_WEIGHT_DEFAULT);
1216 *u = CLAMP(k * CGROUP_WEIGHT_DEFAULT / CGROUP_CPU_SHARES_DEFAULT, CGROUP_WEIGHT_MIN, CGROUP_WEIGHT_MAX);
1217 return 0;
1218 }
1219
1220 static int oci_cgroup_cpu_quota(const char *name, sd_json_variant *v, sd_json_dispatch_flags_t flags, void *userdata) {
1221 uint64_t k, *u = ASSERT_PTR(userdata);
1222
1223 k = sd_json_variant_unsigned(v);
1224 if (k <= 0 || k >= UINT64_MAX)
1225 return json_log(v, flags, SYNTHETIC_ERRNO(ERANGE), "period/quota value out of range.");
1226
1227 *u = k;
1228 return 0;
1229 }
1230
1231 static int oci_cgroup_cpu_cpus(const char *name, sd_json_variant *v, sd_json_dispatch_flags_t flags, void *userdata) {
1232 struct cpu_data *data = ASSERT_PTR(userdata);
1233 CPUSet set;
1234 const char *n;
1235 int r;
1236
1237 assert_se(n = sd_json_variant_string(v));
1238
1239 r = parse_cpu_set(n, &set);
1240 if (r < 0)
1241 return json_log(v, flags, r, "Failed to parse CPU set specification: %s", n);
1242
1243 return cpu_set_done_and_replace(data->cpu_set, set);
1244 }
1245
1246 static int oci_cgroup_cpu(const char *name, sd_json_variant *v, sd_json_dispatch_flags_t flags, void *userdata) {
1247
1248 static const sd_json_dispatch_field table[] = {
1249 { "shares", SD_JSON_VARIANT_UNSIGNED, oci_cgroup_cpu_shares, offsetof(struct cpu_data, weight), 0 },
1250 { "quota", SD_JSON_VARIANT_UNSIGNED, oci_cgroup_cpu_quota, offsetof(struct cpu_data, quota), 0 },
1251 { "period", SD_JSON_VARIANT_UNSIGNED, oci_cgroup_cpu_quota, offsetof(struct cpu_data, period), 0 },
1252 { "realtimeRuntime", SD_JSON_VARIANT_UNSIGNED, oci_unsupported, 0, 0 },
1253 { "realtimePeriod", SD_JSON_VARIANT_UNSIGNED, oci_unsupported, 0, 0 },
1254 { "cpus", SD_JSON_VARIANT_STRING, oci_cgroup_cpu_cpus, 0, 0 },
1255 { "mems", SD_JSON_VARIANT_STRING, oci_unsupported, 0, 0 },
1256 {}
1257 };
1258
1259 struct cpu_data data = {
1260 .weight = UINT64_MAX,
1261 .quota = UINT64_MAX,
1262 .period = UINT64_MAX,
1263 };
1264
1265 Settings *s = ASSERT_PTR(userdata);
1266 int r;
1267
1268 r = oci_dispatch(v, table, flags, &data);
1269 if (r < 0) {
1270 cpu_set_done(&data.cpu_set);
1271 return r;
1272 }
1273
1274 cpu_set_done_and_replace(s->cpu_set, data.cpu_set);
1275
1276 if (data.weight != UINT64_MAX) {
1277 r = settings_allocate_properties(s);
1278 if (r < 0)
1279 return r;
1280
1281 r = sd_bus_message_append(s->properties, "(sv)", "CPUWeight", "t", data.weight);
1282 if (r < 0)
1283 return bus_log_create_error(r);
1284 }
1285
1286 if (data.quota != UINT64_MAX && data.period != UINT64_MAX) {
1287 r = settings_allocate_properties(s);
1288 if (r < 0)
1289 return r;
1290
1291 r = sd_bus_message_append(s->properties, "(sv)", "CPUQuotaPerSecUSec", "t", data.quota * USEC_PER_SEC / data.period);
1292 if (r < 0)
1293 return bus_log_create_error(r);
1294
1295 r = sd_bus_message_append(s->properties, "(sv)", "CPUQuotaPeriodUSec", "t", data.period);
1296 if (r < 0)
1297 return bus_log_create_error(r);
1298
1299 } else if ((data.quota != UINT64_MAX) != (data.period != UINT64_MAX))
1300 return json_log(v, flags, SYNTHETIC_ERRNO(EINVAL),
1301 "CPU quota and period not used together.");
1302
1303 return 0;
1304 }
1305
1306 static uint64_t cgroup_weight_blkio_to_io(uint64_t blkio_weight) {
1307 /* convert from cgroup v1 blkio.weight to v2 io.weight */
1308 assert_cc(CGROUP_BLKIO_WEIGHT_MAX <= UINT64_MAX / CGROUP_WEIGHT_DEFAULT);
1309 return CLAMP(blkio_weight * CGROUP_WEIGHT_DEFAULT / CGROUP_BLKIO_WEIGHT_DEFAULT,
1310 CGROUP_WEIGHT_MIN, CGROUP_WEIGHT_MAX);
1311 }
1312
1313 static int oci_cgroup_block_io_weight(const char *name, sd_json_variant *v, sd_json_dispatch_flags_t flags, void *userdata) {
1314 Settings *s = ASSERT_PTR(userdata);
1315 uint64_t k;
1316 int r;
1317
1318 k = sd_json_variant_unsigned(v);
1319 if (k < CGROUP_BLKIO_WEIGHT_MIN || k > CGROUP_BLKIO_WEIGHT_MAX)
1320 return json_log(v, flags, SYNTHETIC_ERRNO(ERANGE),
1321 "Block I/O weight out of range.");
1322
1323 r = settings_allocate_properties(s);
1324 if (r < 0)
1325 return r;
1326
1327 r = sd_bus_message_append(s->properties, "(sv)", "IOWeight", "t", cgroup_weight_blkio_to_io(k));
1328 if (r < 0)
1329 return bus_log_create_error(r);
1330
1331 return 0;
1332 }
1333
1334 static int oci_cgroup_block_io_weight_device(const char *name, sd_json_variant *v, sd_json_dispatch_flags_t flags, void *userdata) {
1335 Settings *s = ASSERT_PTR(userdata);
1336 sd_json_variant *e;
1337 int r;
1338
1339 JSON_VARIANT_ARRAY_FOREACH(e, v) {
1340 struct device_data {
1341 unsigned major;
1342 unsigned minor;
1343 uint64_t weight;
1344 } data = {
1345 .major = UINT_MAX,
1346 .minor = UINT_MAX,
1347 .weight = UINT64_MAX,
1348 };
1349
1350 static const sd_json_dispatch_field table[] = {
1351 { "major", SD_JSON_VARIANT_UNSIGNED, oci_device_major, offsetof(struct device_data, major), SD_JSON_MANDATORY },
1352 { "minor", SD_JSON_VARIANT_UNSIGNED, oci_device_minor, offsetof(struct device_data, minor), SD_JSON_MANDATORY },
1353 { "weight", SD_JSON_VARIANT_UNSIGNED, sd_json_dispatch_uint64, offsetof(struct device_data, weight), 0 },
1354 { "leafWeight", SD_JSON_VARIANT_INTEGER, oci_unsupported, 0, SD_JSON_PERMISSIVE },
1355 {}
1356 };
1357
1358 _cleanup_free_ char *path = NULL;
1359
1360 r = oci_dispatch(e, table, flags, &data);
1361 if (r < 0)
1362 return r;
1363
1364 if (data.weight == UINT64_MAX)
1365 continue;
1366
1367 if (data.weight < CGROUP_BLKIO_WEIGHT_MIN || data.weight > CGROUP_BLKIO_WEIGHT_MAX)
1368 return json_log(v, flags, SYNTHETIC_ERRNO(ERANGE),
1369 "Block I/O device weight out of range.");
1370
1371 r = device_path_make_major_minor(S_IFBLK, makedev(data.major, data.minor), &path);
1372 if (r < 0)
1373 return json_log(v, flags, r, "Failed to build device path: %m");
1374
1375 r = settings_allocate_properties(s);
1376 if (r < 0)
1377 return r;
1378
1379 r = sd_bus_message_append(s->properties, "(sv)", "IODeviceWeight", "a(st)", 1,
1380 path, cgroup_weight_blkio_to_io(data.weight));
1381 if (r < 0)
1382 return bus_log_create_error(r);
1383 }
1384
1385 return 0;
1386 }
1387
1388 static int oci_cgroup_block_io_throttle(const char *name, sd_json_variant *v, sd_json_dispatch_flags_t flags, void *userdata) {
1389 Settings *s = ASSERT_PTR(userdata);
1390 const char *pname;
1391 sd_json_variant *e;
1392 int r;
1393
1394 pname = streq(name, "throttleReadBpsDevice") ? "IOReadBandwidthMax" :
1395 streq(name, "throttleWriteBpsDevice") ? "IOWriteBandwidthMax" :
1396 streq(name, "throttleReadIOPSDevice") ? "IOReadIOPSMax" :
1397 "IOWriteIOPSMax";
1398
1399 JSON_VARIANT_ARRAY_FOREACH(e, v) {
1400 struct device_data {
1401 unsigned major;
1402 unsigned minor;
1403 uint64_t rate;
1404 } data = {
1405 .major = UINT_MAX,
1406 .minor = UINT_MAX,
1407 };
1408
1409 static const sd_json_dispatch_field table[] = {
1410 { "major", SD_JSON_VARIANT_UNSIGNED, oci_device_major, offsetof(struct device_data, major), SD_JSON_MANDATORY },
1411 { "minor", SD_JSON_VARIANT_UNSIGNED, oci_device_minor, offsetof(struct device_data, minor), SD_JSON_MANDATORY },
1412 { "rate", SD_JSON_VARIANT_UNSIGNED, sd_json_dispatch_uint64, offsetof(struct device_data, rate), SD_JSON_MANDATORY },
1413 {}
1414 };
1415
1416 _cleanup_free_ char *path = NULL;
1417
1418 r = oci_dispatch(e, table, flags, &data);
1419 if (r < 0)
1420 return r;
1421
1422 if (data.rate >= UINT64_MAX)
1423 return json_log(v, flags, SYNTHETIC_ERRNO(ERANGE),
1424 "Block I/O device rate out of range.");
1425
1426 r = device_path_make_major_minor(S_IFBLK, makedev(data.major, data.minor), &path);
1427 if (r < 0)
1428 return json_log(v, flags, r, "Failed to build device path: %m");
1429
1430 r = settings_allocate_properties(s);
1431 if (r < 0)
1432 return r;
1433
1434 r = sd_bus_message_append(s->properties, "(sv)", pname, "a(st)", 1, path, (uint64_t) data.rate);
1435 if (r < 0)
1436 return bus_log_create_error(r);
1437 }
1438
1439 return 0;
1440 }
1441
1442 static int oci_cgroup_block_io(const char *name, sd_json_variant *v, sd_json_dispatch_flags_t flags, void *userdata) {
1443
1444 static const sd_json_dispatch_field table[] = {
1445 { "weight", SD_JSON_VARIANT_UNSIGNED, oci_cgroup_block_io_weight, 0, 0 },
1446 { "leafWeight", SD_JSON_VARIANT_UNSIGNED, oci_unsupported, 0, SD_JSON_PERMISSIVE },
1447 { "weightDevice", SD_JSON_VARIANT_ARRAY, oci_cgroup_block_io_weight_device, 0, 0 },
1448 { "throttleReadBpsDevice", SD_JSON_VARIANT_ARRAY, oci_cgroup_block_io_throttle, 0, 0 },
1449 { "throttleWriteBpsDevice", SD_JSON_VARIANT_ARRAY, oci_cgroup_block_io_throttle, 0, 0 },
1450 { "throttleReadIOPSDevice", SD_JSON_VARIANT_ARRAY, oci_cgroup_block_io_throttle, 0, 0 },
1451 { "throttleWriteIOPSDevice", SD_JSON_VARIANT_ARRAY, oci_cgroup_block_io_throttle, 0, 0 },
1452 {}
1453 };
1454
1455 return oci_dispatch(v, table, flags, userdata);
1456 }
1457
1458 static int oci_cgroup_pids(const char *name, sd_json_variant *v, sd_json_dispatch_flags_t flags, void *userdata) {
1459
1460 static const sd_json_dispatch_field table[] = {
1461 { "limit", SD_JSON_VARIANT_NUMBER, sd_json_dispatch_variant, 0, SD_JSON_MANDATORY },
1462 {}
1463 };
1464
1465 _cleanup_(sd_json_variant_unrefp) sd_json_variant *k = NULL;
1466 Settings *s = ASSERT_PTR(userdata);
1467 uint64_t m;
1468 int r;
1469
1470 r = oci_dispatch(v, table, flags, &k);
1471 if (r < 0)
1472 return r;
1473
1474 if (sd_json_variant_is_negative(k))
1475 m = UINT64_MAX;
1476 else {
1477 if (!sd_json_variant_is_unsigned(k))
1478 return json_log(k, flags, SYNTHETIC_ERRNO(EINVAL),
1479 "pids limit not unsigned integer, refusing.");
1480
1481 m = (uint64_t) sd_json_variant_unsigned(k);
1482
1483 if ((uint64_t) m != sd_json_variant_unsigned(k))
1484 return json_log(v, flags, SYNTHETIC_ERRNO(EINVAL),
1485 "pids limit out of range, refusing.");
1486 }
1487
1488 r = settings_allocate_properties(s);
1489 if (r < 0)
1490 return r;
1491
1492 r = sd_bus_message_append(s->properties, "(sv)", "TasksMax", "t", m);
1493 if (r < 0)
1494 return bus_log_create_error(r);
1495
1496 return 0;
1497 }
1498
1499 static int oci_resources(const char *name, sd_json_variant *v, sd_json_dispatch_flags_t flags, void *userdata) {
1500
1501 static const sd_json_dispatch_field table[] = {
1502 { "devices", SD_JSON_VARIANT_ARRAY, oci_cgroup_devices, 0, 0 },
1503 { "memory", SD_JSON_VARIANT_OBJECT, oci_cgroup_memory, 0, 0 },
1504 { "cpu", SD_JSON_VARIANT_OBJECT, oci_cgroup_cpu, 0, 0 },
1505 { "blockIO", SD_JSON_VARIANT_OBJECT, oci_cgroup_block_io, 0, 0 },
1506 { "hugepageLimits", SD_JSON_VARIANT_ARRAY, oci_unsupported, 0, 0 },
1507 { "network", SD_JSON_VARIANT_OBJECT, oci_unsupported, 0, 0 },
1508 { "pids", SD_JSON_VARIANT_OBJECT, oci_cgroup_pids, 0, 0 },
1509 { "rdma", SD_JSON_VARIANT_OBJECT, oci_unsupported, 0, 0 },
1510 {}
1511 };
1512
1513 return oci_dispatch(v, table, flags, userdata);
1514 }
1515
/* Checks whether the specified string is acceptable as a sysctl key: non-empty, consisting only of
 * characters above ' ' and below 127, containing no '/', and with dots only in between other
 * characters (never first, last, or doubled). */
static bool sysctl_key_valid(const char *s) {
        /* Starts out true so that a leading dot is refused the same way as a doubled one */
        bool prev_was_dot = true;

        /* Note that we are a bit stricter here than in systemd-sysctl, as that inherited semantics from the old sysctl
         * tool, which were really weird (as it swaps / and . in both ways) */

        if (!s || s[0] == '\0')
                return false;

        for (const char *p = s; *p != '\0'; p++) {
                char c = *p;

                if (c <= ' ' || c >= 127 || c == '/')
                        return false;

                if (c == '.' && prev_was_dot) /* Don't allow two dots next to each other (or at the beginning) */
                        return false;

                prev_was_dot = c == '.';
        }

        /* don't allow a dot at the end */
        return !prev_was_dot;
}
1546
1547 static int oci_sysctl(const char *name, sd_json_variant *v, sd_json_dispatch_flags_t flags, void *userdata) {
1548 Settings *s = ASSERT_PTR(userdata);
1549 sd_json_variant *w;
1550 const char *k;
1551 int r;
1552
1553 JSON_VARIANT_OBJECT_FOREACH(k, w, v) {
1554 const char *m;
1555
1556 if (!sd_json_variant_is_string(w))
1557 return json_log(v, flags, SYNTHETIC_ERRNO(EINVAL),
1558 "sysctl parameter is not a string, refusing.");
1559
1560 assert_se(m = sd_json_variant_string(w));
1561
1562 if (!sysctl_key_valid(k))
1563 return json_log(v, flags, SYNTHETIC_ERRNO(EINVAL),
1564 "sysctl key invalid, refusing: %s", k);
1565
1566 r = strv_extend_many(&s->sysctl, k, m);
1567 if (r < 0)
1568 return log_oom();
1569 }
1570
1571 return 0;
1572 }
1573
1574 #if HAVE_SECCOMP
1575 static int oci_seccomp_action_from_string(const char *name, uint32_t *ret) {
1576
1577 static const struct {
1578 const char *name;
1579 uint32_t action;
1580 } table[] = {
1581 { "SCMP_ACT_ALLOW", SCMP_ACT_ALLOW },
1582 { "SCMP_ACT_ERRNO", SCMP_ACT_ERRNO(EPERM) }, /* the OCI spec doesn't document the error, but it appears EPERM is supposed to be used */
1583 { "SCMP_ACT_KILL", SCMP_ACT_KILL },
1584 #ifdef SCMP_ACT_KILL_PROCESS
1585 { "SCMP_ACT_KILL_PROCESS", SCMP_ACT_KILL_PROCESS },
1586 #endif
1587 #ifdef SCMP_ACT_KILL_THREAD
1588 { "SCMP_ACT_KILL_THREAD", SCMP_ACT_KILL_THREAD },
1589 #endif
1590 #ifdef SCMP_ACT_LOG
1591 { "SCMP_ACT_LOG", SCMP_ACT_LOG },
1592 #endif
1593 { "SCMP_ACT_TRAP", SCMP_ACT_TRAP },
1594
1595 /* We don't support SCMP_ACT_TRACE because that requires a tracer, and that doesn't really make sense
1596 * here */
1597 };
1598
1599 FOREACH_ELEMENT(i, table)
1600 if (streq_ptr(name, i->name)) {
1601 *ret = i->action;
1602 return 0;
1603 }
1604
1605 return -EINVAL;
1606 }
1607
1608 static int oci_seccomp_arch_from_string(const char *name, uint32_t *ret) {
1609
1610 static const struct {
1611 const char *name;
1612 uint32_t arch;
1613 } table[] = {
1614 { "SCMP_ARCH_AARCH64", SCMP_ARCH_AARCH64 },
1615 { "SCMP_ARCH_ARM", SCMP_ARCH_ARM },
1616 #ifdef SCMP_ARCH_LOONGARCH64
1617 { "SCMP_ARCH_LOONGARCH64", SCMP_ARCH_LOONGARCH64 },
1618 #endif
1619 { "SCMP_ARCH_MIPS", SCMP_ARCH_MIPS },
1620 { "SCMP_ARCH_MIPS64", SCMP_ARCH_MIPS64 },
1621 { "SCMP_ARCH_MIPS64N32", SCMP_ARCH_MIPS64N32 },
1622 { "SCMP_ARCH_MIPSEL", SCMP_ARCH_MIPSEL },
1623 { "SCMP_ARCH_MIPSEL64", SCMP_ARCH_MIPSEL64 },
1624 { "SCMP_ARCH_MIPSEL64N32", SCMP_ARCH_MIPSEL64N32 },
1625 { "SCMP_ARCH_NATIVE", SCMP_ARCH_NATIVE },
1626 #ifdef SCMP_ARCH_PARISC
1627 { "SCMP_ARCH_PARISC", SCMP_ARCH_PARISC },
1628 #endif
1629 #ifdef SCMP_ARCH_PARISC64
1630 { "SCMP_ARCH_PARISC64", SCMP_ARCH_PARISC64 },
1631 #endif
1632 { "SCMP_ARCH_PPC", SCMP_ARCH_PPC },
1633 { "SCMP_ARCH_PPC64", SCMP_ARCH_PPC64 },
1634 { "SCMP_ARCH_PPC64LE", SCMP_ARCH_PPC64LE },
1635 #ifdef SCMP_ARCH_RISCV64
1636 { "SCMP_ARCH_RISCV64", SCMP_ARCH_RISCV64 },
1637 #endif
1638 { "SCMP_ARCH_S390", SCMP_ARCH_S390 },
1639 { "SCMP_ARCH_S390X", SCMP_ARCH_S390X },
1640 { "SCMP_ARCH_X32", SCMP_ARCH_X32 },
1641 { "SCMP_ARCH_X86", SCMP_ARCH_X86 },
1642 { "SCMP_ARCH_X86_64", SCMP_ARCH_X86_64 },
1643 };
1644
1645 FOREACH_ELEMENT(i, table)
1646 if (streq_ptr(i->name, name)) {
1647 *ret = i->arch;
1648 return 0;
1649 }
1650
1651 return -EINVAL;
1652 }
1653
1654 static int oci_seccomp_compare_from_string(const char *name, enum scmp_compare *ret) {
1655
1656 static const struct {
1657 const char *name;
1658 enum scmp_compare op;
1659 } table[] = {
1660 { "SCMP_CMP_NE", SCMP_CMP_NE },
1661 { "SCMP_CMP_LT", SCMP_CMP_LT },
1662 { "SCMP_CMP_LE", SCMP_CMP_LE },
1663 { "SCMP_CMP_EQ", SCMP_CMP_EQ },
1664 { "SCMP_CMP_GE", SCMP_CMP_GE },
1665 { "SCMP_CMP_GT", SCMP_CMP_GT },
1666 { "SCMP_CMP_MASKED_EQ", SCMP_CMP_MASKED_EQ },
1667 };
1668
1669 FOREACH_ELEMENT(i, table)
1670 if (streq_ptr(i->name, name)) {
1671 *ret = i->op;
1672 return 0;
1673 }
1674
1675 return -EINVAL;
1676 }
1677
1678 static int oci_seccomp_archs(const char *name, sd_json_variant *v, sd_json_dispatch_flags_t flags, void *userdata) {
1679 scmp_filter_ctx *sc = ASSERT_PTR(userdata);
1680 sd_json_variant *e;
1681 int r;
1682
1683 JSON_VARIANT_ARRAY_FOREACH(e, v) {
1684 uint32_t a;
1685
1686 if (!sd_json_variant_is_string(e))
1687 return json_log(e, flags, SYNTHETIC_ERRNO(EINVAL),
1688 "Architecture entry is not a string.");
1689
1690 r = oci_seccomp_arch_from_string(sd_json_variant_string(e), &a);
1691 if (r < 0)
1692 return json_log(e, flags, r, "Unknown architecture: %s", sd_json_variant_string(e));
1693
1694 r = seccomp_arch_add(sc, a);
1695 if (r == -EEXIST)
1696 continue;
1697 if (r < 0)
1698 return json_log(e, flags, r, "Failed to add architecture to seccomp filter: %m");
1699 }
1700
1701 return 0;
1702 }
1703
/* One parsed element of the seccomp "syscalls" array, see oci_seccomp_syscalls(). Release with
 * syscall_rule_done(). */
struct syscall_rule {
        char **names;                   /* string list from the "names" field */
        uint32_t action;                /* from the "action" field; oci_seccomp_syscalls() presets UINT32_MAX */
        struct scmp_arg_cmp *arguments; /* heap array filled in by oci_seccomp_args() */
        size_t n_arguments;             /* number of valid entries in 'arguments' */
};
1710
1711 static void syscall_rule_done(struct syscall_rule *rule) {
1712 assert(rule);
1713
1714 strv_free(rule->names);
1715 free(rule->arguments);
1716 };
1717
1718 static int oci_seccomp_action(const char *name, sd_json_variant *v, sd_json_dispatch_flags_t flags, void *userdata) {
1719 uint32_t *action = ASSERT_PTR(userdata);
1720 int r;
1721
1722 r = oci_seccomp_action_from_string(sd_json_variant_string(v), action);
1723 if (r < 0)
1724 return json_log(v, flags, r, "Unknown system call action '%s': %m", sd_json_variant_string(v));
1725
1726 return 0;
1727 }
1728
1729 static int oci_seccomp_op(const char *name, sd_json_variant *v, sd_json_dispatch_flags_t flags, void *userdata) {
1730 enum scmp_compare *op = ASSERT_PTR(userdata);
1731 int r;
1732
1733 r = oci_seccomp_compare_from_string(sd_json_variant_string(v), op);
1734 if (r < 0)
1735 return json_log(v, flags, r, "Unknown seccomp operator '%s': %m", sd_json_variant_string(v));
1736
1737 return 0;
1738 }
1739
1740 static int oci_seccomp_args(const char *name, sd_json_variant *v, sd_json_dispatch_flags_t flags, void *userdata) {
1741 struct syscall_rule *rule = ASSERT_PTR(userdata);
1742 sd_json_variant *e;
1743 int r;
1744
1745 JSON_VARIANT_ARRAY_FOREACH(e, v) {
1746 static const sd_json_dispatch_field table[] = {
1747 { "index", SD_JSON_VARIANT_UNSIGNED, sd_json_dispatch_uint32, offsetof(struct scmp_arg_cmp, arg), SD_JSON_MANDATORY },
1748 { "value", SD_JSON_VARIANT_UNSIGNED, sd_json_dispatch_uint64, offsetof(struct scmp_arg_cmp, datum_a), SD_JSON_MANDATORY },
1749 { "valueTwo", SD_JSON_VARIANT_UNSIGNED, sd_json_dispatch_uint64, offsetof(struct scmp_arg_cmp, datum_b), 0 },
1750 { "op", SD_JSON_VARIANT_STRING, oci_seccomp_op, offsetof(struct scmp_arg_cmp, op), SD_JSON_MANDATORY },
1751 {},
1752 };
1753
1754 struct scmp_arg_cmp *p;
1755 int expected;
1756
1757 if (!GREEDY_REALLOC(rule->arguments, rule->n_arguments + 1))
1758 return log_oom();
1759
1760 p = rule->arguments + rule->n_arguments;
1761
1762 *p = (struct scmp_arg_cmp) {
1763 .arg = 0,
1764 .datum_a = 0,
1765 .datum_b = 0,
1766 .op = 0,
1767 };
1768
1769 r = oci_dispatch(e, table, flags, p);
1770 if (r < 0)
1771 return r;
1772
1773 expected = p->op == SCMP_CMP_MASKED_EQ ? 4 : 3;
1774 if (r != expected)
1775 json_log(e, flags|SD_JSON_WARNING, 0, "Wrong number of system call arguments for JSON data, ignoring.");
1776
1777 /* Note that we are a bit sloppy here and do not insist that SCMP_CMP_MASKED_EQ gets two datum values,
1778 * and the other only one. That's because buildah for example by default calls things with
1779 * SCMP_CMP_MASKED_EQ but only one argument. We use 0 when the value is not specified. */
1780
1781 rule->n_arguments++;
1782 }
1783
1784 return 0;
1785 }
1786
1787 static int oci_seccomp_syscalls(const char *name, sd_json_variant *v, sd_json_dispatch_flags_t flags, void *userdata) {
1788 scmp_filter_ctx *sc = ASSERT_PTR(userdata);
1789 sd_json_variant *e;
1790 int r;
1791
1792 JSON_VARIANT_ARRAY_FOREACH(e, v) {
1793 static const sd_json_dispatch_field table[] = {
1794 { "names", SD_JSON_VARIANT_ARRAY, sd_json_dispatch_strv, offsetof(struct syscall_rule, names), SD_JSON_MANDATORY },
1795 { "action", SD_JSON_VARIANT_STRING, oci_seccomp_action, offsetof(struct syscall_rule, action), SD_JSON_MANDATORY },
1796 { "args", SD_JSON_VARIANT_ARRAY, oci_seccomp_args, 0, 0 },
1797 {}
1798 };
1799 _cleanup_(syscall_rule_done) struct syscall_rule rule = {
1800 .action = UINT32_MAX,
1801 };
1802
1803 r = oci_dispatch(e, table, flags, &rule);
1804 if (r < 0)
1805 return r;
1806
1807 if (strv_isempty(rule.names))
1808 return json_log(e, flags, SYNTHETIC_ERRNO(EINVAL), "System call name list is empty.");
1809
1810 STRV_FOREACH(i, rule.names) {
1811 int nr;
1812
1813 nr = seccomp_syscall_resolve_name(*i);
1814 if (nr == __NR_SCMP_ERROR) {
1815 log_debug("Unknown syscall %s, skipping.", *i);
1816 continue;
1817 }
1818
1819 r = seccomp_rule_add_array(sc, rule.action, nr, rule.n_arguments, rule.arguments);
1820 if (r < 0)
1821 return r;
1822 }
1823 }
1824
1825 return 0;
1826 }
1827 #endif
1828
1829 static int oci_seccomp(const char *name, sd_json_variant *v, sd_json_dispatch_flags_t flags, void *userdata) {
1830
1831 #if HAVE_SECCOMP
1832 static const sd_json_dispatch_field table[] = {
1833 { "defaultAction", SD_JSON_VARIANT_STRING, NULL, 0, SD_JSON_MANDATORY },
1834 { "architectures", SD_JSON_VARIANT_ARRAY, oci_seccomp_archs, 0, 0 },
1835 { "syscalls", SD_JSON_VARIANT_ARRAY, oci_seccomp_syscalls, 0, 0 },
1836 {}
1837 };
1838
1839 _cleanup_(seccomp_releasep) scmp_filter_ctx sc = NULL;
1840 Settings *s = ASSERT_PTR(userdata);
1841 sd_json_variant *def;
1842 uint32_t d;
1843 int r;
1844
1845 def = sd_json_variant_by_key(v, "defaultAction");
1846 if (!def)
1847 return json_log(v, flags, SYNTHETIC_ERRNO(EINVAL), "defaultAction element missing.");
1848
1849 if (!sd_json_variant_is_string(def))
1850 return json_log(def, flags, SYNTHETIC_ERRNO(EINVAL), "defaultAction is not a string.");
1851
1852 r = oci_seccomp_action_from_string(sd_json_variant_string(def), &d);
1853 if (r < 0)
1854 return json_log(def, flags, r, "Unknown default action: %s", sd_json_variant_string(def));
1855
1856 sc = seccomp_init(d);
1857 if (!sc)
1858 return json_log(v, flags, SYNTHETIC_ERRNO(ENOMEM), "Couldn't allocate seccomp object.");
1859
1860 r = oci_dispatch(v, table, flags, sc);
1861 if (r < 0)
1862 return r;
1863
1864 seccomp_release(s->seccomp);
1865 s->seccomp = TAKE_PTR(sc);
1866 return 0;
1867 #else
1868 return json_log(v, flags, SYNTHETIC_ERRNO(EOPNOTSUPP), "libseccomp support not enabled, can't parse seccomp object.");
1869 #endif
1870 }
1871
1872 static int oci_rootfs_propagation(const char *name, sd_json_variant *v, sd_json_dispatch_flags_t flags, void *userdata) {
1873 const char *s;
1874
1875 s = sd_json_variant_string(v);
1876
1877 if (streq(s, "shared"))
1878 return 0;
1879
1880 json_log(v, flags|SD_JSON_DEBUG, 0, "Ignoring rootfsPropagation setting '%s'.", s);
1881 return 0;
1882 }
1883
1884 static int oci_masked_paths(const char *name, sd_json_variant *v, sd_json_dispatch_flags_t flags, void *userdata) {
1885 Settings *s = ASSERT_PTR(userdata);
1886 sd_json_variant *e;
1887
1888 JSON_VARIANT_ARRAY_FOREACH(e, v) {
1889 _cleanup_free_ char *destination = NULL;
1890 CustomMount *m;
1891 const char *p;
1892
1893 if (!sd_json_variant_is_string(e))
1894 return json_log(v, flags, SYNTHETIC_ERRNO(EINVAL),
1895 "Path is not a string, refusing.");
1896
1897 assert_se(p = sd_json_variant_string(e));
1898
1899 if (!path_is_absolute(p))
1900 return json_log(v, flags, SYNTHETIC_ERRNO(EINVAL),
1901 "Path is not absolute, refusing: %s", p);
1902
1903 if (oci_exclude_mount(p))
1904 continue;
1905
1906 destination = strdup(p);
1907 if (!destination)
1908 return log_oom();
1909
1910 m = custom_mount_add(&s->custom_mounts, &s->n_custom_mounts, CUSTOM_MOUNT_INACCESSIBLE);
1911 if (!m)
1912 return log_oom();
1913
1914 m->destination = TAKE_PTR(destination);
1915
1916 /* The spec doesn't say this, but apparently pre-existing implementations are lenient towards
1917 * non-existing paths to mask. Let's hence be too. */
1918 m->graceful = true;
1919 }
1920
1921 return 0;
1922 }
1923
1924 static int oci_readonly_paths(const char *name, sd_json_variant *v, sd_json_dispatch_flags_t flags, void *userdata) {
1925 Settings *s = ASSERT_PTR(userdata);
1926 sd_json_variant *e;
1927
1928 JSON_VARIANT_ARRAY_FOREACH(e, v) {
1929 _cleanup_free_ char *source = NULL, *destination = NULL;
1930 CustomMount *m;
1931 const char *p;
1932
1933 if (!sd_json_variant_is_string(e))
1934 return json_log(v, flags, SYNTHETIC_ERRNO(EINVAL),
1935 "Path is not a string, refusing.");
1936
1937 assert_se(p = sd_json_variant_string(e));
1938
1939 if (!path_is_absolute(p))
1940 return json_log(v, flags, SYNTHETIC_ERRNO(EINVAL),
1941 "Path is not absolute, refusing: %s", p);
1942
1943 if (oci_exclude_mount(p))
1944 continue;
1945
1946 source = strjoin("+", p);
1947 if (!source)
1948 return log_oom();
1949
1950 destination = strdup(p);
1951 if (!destination)
1952 return log_oom();
1953
1954 m = custom_mount_add(&s->custom_mounts, &s->n_custom_mounts, CUSTOM_MOUNT_BIND);
1955 if (!m)
1956 return log_oom();
1957
1958 m->source = TAKE_PTR(source);
1959 m->destination = TAKE_PTR(destination);
1960 m->read_only = true;
1961 }
1962
1963 return 0;
1964 }
1965
1966 static int oci_linux(const char *name, sd_json_variant *v, sd_json_dispatch_flags_t flags, void *userdata) {
1967
1968 static const sd_json_dispatch_field table[] = {
1969 { "namespaces", SD_JSON_VARIANT_ARRAY, oci_namespaces, 0, 0 },
1970 { "uidMappings", SD_JSON_VARIANT_ARRAY, oci_uid_gid_mappings, 0, 0 },
1971 { "gidMappings", SD_JSON_VARIANT_ARRAY, oci_uid_gid_mappings, 0, 0 },
1972 { "devices", SD_JSON_VARIANT_ARRAY, oci_devices, 0, 0 },
1973 { "cgroupsPath", SD_JSON_VARIANT_STRING, oci_cgroups_path, 0, 0 },
1974 { "resources", SD_JSON_VARIANT_OBJECT, oci_resources, 0, 0 },
1975 { "intelRdt", SD_JSON_VARIANT_OBJECT, oci_unsupported, 0, SD_JSON_PERMISSIVE },
1976 { "sysctl", SD_JSON_VARIANT_OBJECT, oci_sysctl, 0, 0 },
1977 { "seccomp", SD_JSON_VARIANT_OBJECT, oci_seccomp, 0, 0 },
1978 { "rootfsPropagation", SD_JSON_VARIANT_STRING, oci_rootfs_propagation, 0, 0 },
1979 { "maskedPaths", SD_JSON_VARIANT_ARRAY, oci_masked_paths, 0, 0 },
1980 { "readonlyPaths", SD_JSON_VARIANT_ARRAY, oci_readonly_paths, 0, 0 },
1981 { "mountLabel", SD_JSON_VARIANT_STRING, oci_unsupported, 0, SD_JSON_PERMISSIVE },
1982 {}
1983 };
1984
1985 return oci_dispatch(v, table, flags, userdata);
1986 }
1987
1988 static int oci_hook_timeout(const char *name, sd_json_variant *v, sd_json_dispatch_flags_t flags, void *userdata) {
1989 usec_t *u = ASSERT_PTR(userdata);
1990 uint64_t k;
1991
1992 k = sd_json_variant_unsigned(v);
1993 if (k == 0 || k > (UINT64_MAX-1)/USEC_PER_SEC)
1994 return json_log(v, flags, SYNTHETIC_ERRNO(ERANGE),
1995 "Hook timeout value out of range.");
1996
1997 *u = k * USEC_PER_SEC;
1998 return 0;
1999 }
2000
2001 static int oci_hooks_array(const char *name, sd_json_variant *v, sd_json_dispatch_flags_t flags, void *userdata) {
2002 Settings *s = ASSERT_PTR(userdata);
2003 sd_json_variant *e;
2004 int r;
2005
2006 JSON_VARIANT_ARRAY_FOREACH(e, v) {
2007
2008 static const sd_json_dispatch_field table[] = {
2009 { "path", SD_JSON_VARIANT_STRING, json_dispatch_path, offsetof(OciHook, path), SD_JSON_MANDATORY },
2010 { "args", SD_JSON_VARIANT_ARRAY, oci_args, offsetof(OciHook, args), 0, },
2011 { "env", SD_JSON_VARIANT_ARRAY, oci_env, offsetof(OciHook, env), 0 },
2012 { "timeout", SD_JSON_VARIANT_UNSIGNED, oci_hook_timeout, offsetof(OciHook, timeout), 0 },
2013 {}
2014 };
2015
2016 OciHook **array, *new_item;
2017 size_t *n_array;
2018
2019 if (streq(name, "prestart")) {
2020 array = &s->oci_hooks_prestart;
2021 n_array = &s->n_oci_hooks_prestart;
2022 } else if (streq(name, "poststart")) {
2023 array = &s->oci_hooks_poststart;
2024 n_array = &s->n_oci_hooks_poststart;
2025 } else {
2026 assert(streq(name, "poststop"));
2027 array = &s->oci_hooks_poststop;
2028 n_array = &s->n_oci_hooks_poststop;
2029 }
2030
2031 if (!GREEDY_REALLOC(*array, *n_array + 1))
2032 return log_oom();
2033
2034 new_item = *array + *n_array;
2035
2036 *new_item = (OciHook) {
2037 .timeout = USEC_INFINITY,
2038 };
2039
2040 r = oci_dispatch(e, table, flags, new_item);
2041 if (r < 0) {
2042 free(new_item->path);
2043 strv_free(new_item->args);
2044 strv_free(new_item->env);
2045 return r;
2046 }
2047
2048 (*n_array)++;
2049 }
2050
2051 return 0;
2052 }
2053
2054 static int oci_hooks(const char *name, sd_json_variant *v, sd_json_dispatch_flags_t flags, void *userdata) {
2055
2056 static const sd_json_dispatch_field table[] = {
2057 { "prestart", SD_JSON_VARIANT_ARRAY, oci_hooks_array, 0, 0 },
2058 { "poststart", SD_JSON_VARIANT_ARRAY, oci_hooks_array, 0, 0 },
2059 { "poststop", SD_JSON_VARIANT_ARRAY, oci_hooks_array, 0, 0 },
2060 {}
2061 };
2062
2063 return oci_dispatch(v, table, flags, userdata);
2064 }
2065
2066 static int oci_annotations(const char *name, sd_json_variant *v, sd_json_dispatch_flags_t flags, void *userdata) {
2067 sd_json_variant *w;
2068 const char *k;
2069
2070 JSON_VARIANT_OBJECT_FOREACH(k, w, v) {
2071
2072 if (isempty(k))
2073 return json_log(v, flags, SYNTHETIC_ERRNO(EINVAL),
2074 "Annotation with empty key, refusing.");
2075
2076 if (!sd_json_variant_is_string(w))
2077 return json_log(w, flags, SYNTHETIC_ERRNO(EINVAL),
2078 "Annotation has non-string value, refusing.");
2079
2080 json_log(w, flags|SD_JSON_DEBUG, 0, "Ignoring annotation '%s' with value '%s'.", k, sd_json_variant_string(w));
2081 }
2082
2083 return 0;
2084 }
2085
2086 int oci_load(FILE *f, const char *bundle, Settings **ret) {
2087
2088 static const sd_json_dispatch_field table[] = {
2089 { "ociVersion", SD_JSON_VARIANT_STRING, NULL, 0, SD_JSON_MANDATORY },
2090 { "process", SD_JSON_VARIANT_OBJECT, oci_process, 0, 0 },
2091 { "root", SD_JSON_VARIANT_OBJECT, oci_root, 0, 0 },
2092 { "hostname", SD_JSON_VARIANT_STRING, oci_hostname, 0, 0 },
2093 { "mounts", SD_JSON_VARIANT_ARRAY, oci_mounts, 0, 0 },
2094 { "linux", SD_JSON_VARIANT_OBJECT, oci_linux, 0, 0 },
2095 { "hooks", SD_JSON_VARIANT_OBJECT, oci_hooks, 0, 0 },
2096 { "annotations", SD_JSON_VARIANT_OBJECT, oci_annotations, 0, 0 },
2097 {}
2098 };
2099
2100 _cleanup_(sd_json_variant_unrefp) sd_json_variant *oci = NULL;
2101 _cleanup_(settings_freep) Settings *s = NULL;
2102 unsigned line = 0, column = 0;
2103 sd_json_variant *v;
2104 const char *path;
2105 int r;
2106
2107 assert_se(bundle);
2108
2109 path = strjoina(bundle, "/config.json");
2110
2111 r = sd_json_parse_file(f, path, 0, &oci, &line, &column);
2112 if (r < 0) {
2113 if (line != 0 && column != 0)
2114 return log_error_errno(r, "Failed to parse '%s' at %u:%u: %m", path, line, column);
2115 else
2116 return log_error_errno(r, "Failed to parse '%s': %m", path);
2117 }
2118
2119 v = sd_json_variant_by_key(oci, "ociVersion");
2120 if (!v)
2121 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
2122 "JSON file '%s' is not an OCI bundle configuration file. Refusing.",
2123 path);
2124 if (!streq_ptr(sd_json_variant_string(v), "1.0.0"))
2125 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
2126 "OCI bundle version not supported: %s",
2127 strna(sd_json_variant_string(v)));
2128
2129 // {
2130 // _cleanup_free_ char *formatted = NULL;
2131 // assert_se(json_variant_format(oci, SD_JSON_FORMAT_PRETTY|JSON_FORMAT_COLOR, &formatted) >= 0);
2132 // fputs(formatted, stdout);
2133 // }
2134
2135 s = settings_new();
2136 if (!s)
2137 return log_oom();
2138
2139 s->start_mode = START_PID1;
2140 s->resolv_conf = RESOLV_CONF_OFF;
2141 s->link_journal = LINK_NO;
2142 s->timezone = TIMEZONE_OFF;
2143
2144 s->bundle = strdup(bundle);
2145 if (!s->bundle)
2146 return log_oom();
2147
2148 r = oci_dispatch(oci, table, 0, s);
2149 if (r < 0)
2150 return r;
2151
2152 if (s->properties) {
2153 r = sd_bus_message_seal(s->properties, 0, 0);
2154 if (r < 0)
2155 return log_error_errno(r, "Cannot seal properties bus message: %m");
2156 }
2157
2158 *ret = TAKE_PTR(s);
2159 return 0;
2160 }