]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/shared/seccomp-util.c
core: add new RestrictNamespaces= unit file setting
[thirdparty/systemd.git] / src / shared / seccomp-util.c
CommitLineData
57183d11
LP
1/***
2 This file is part of systemd.
3
4 Copyright 2014 Lennart Poettering
5
6 systemd is free software; you can redistribute it and/or modify it
7 under the terms of the GNU Lesser General Public License as published by
8 the Free Software Foundation; either version 2.1 of the License, or
9 (at your option) any later version.
10
11 systemd is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public License
17 along with systemd; If not, see <http://www.gnu.org/licenses/>.
18***/
19
a8fbdf54 20#include <errno.h>
57183d11 21#include <seccomp.h>
a8fbdf54 22#include <stddef.h>
d347d902
FS
23#include <sys/prctl.h>
24#include <linux/seccomp.h>
57183d11 25
add00535 26#include "alloc-util.h"
a8fbdf54 27#include "macro.h"
add00535 28#include "nsflags.h"
cf0fbc49 29#include "seccomp-util.h"
07630cea 30#include "string-util.h"
8130926d 31#include "util.h"
57183d11
LP
32
33const char* seccomp_arch_to_string(uint32_t c) {
aa34055f
ZJS
34 /* Maintain order used in <seccomp.h>.
35 *
36 * Names used here should be the same as those used for ConditionArchitecture=,
37 * except for "subarchitectures" like x32. */
57183d11 38
aa34055f
ZJS
39 switch(c) {
40 case SCMP_ARCH_NATIVE:
57183d11 41 return "native";
aa34055f 42 case SCMP_ARCH_X86:
57183d11 43 return "x86";
aa34055f 44 case SCMP_ARCH_X86_64:
57183d11 45 return "x86-64";
aa34055f 46 case SCMP_ARCH_X32:
57183d11 47 return "x32";
aa34055f 48 case SCMP_ARCH_ARM:
57183d11 49 return "arm";
aa34055f
ZJS
50 case SCMP_ARCH_AARCH64:
51 return "arm64";
52 case SCMP_ARCH_MIPS:
53 return "mips";
54 case SCMP_ARCH_MIPS64:
55 return "mips64";
56 case SCMP_ARCH_MIPS64N32:
57 return "mips64-n32";
58 case SCMP_ARCH_MIPSEL:
59 return "mips-le";
60 case SCMP_ARCH_MIPSEL64:
61 return "mips64-le";
62 case SCMP_ARCH_MIPSEL64N32:
63 return "mips64-le-n32";
64 case SCMP_ARCH_PPC:
65 return "ppc";
66 case SCMP_ARCH_PPC64:
67 return "ppc64";
68 case SCMP_ARCH_PPC64LE:
69 return "ppc64-le";
70 case SCMP_ARCH_S390:
6abfd303 71 return "s390";
aa34055f 72 case SCMP_ARCH_S390X:
6abfd303 73 return "s390x";
aa34055f
ZJS
74 default:
75 return NULL;
76 }
57183d11
LP
77}
78
79int seccomp_arch_from_string(const char *n, uint32_t *ret) {
80 if (!n)
81 return -EINVAL;
82
83 assert(ret);
84
85 if (streq(n, "native"))
86 *ret = SCMP_ARCH_NATIVE;
87 else if (streq(n, "x86"))
88 *ret = SCMP_ARCH_X86;
89 else if (streq(n, "x86-64"))
90 *ret = SCMP_ARCH_X86_64;
91 else if (streq(n, "x32"))
92 *ret = SCMP_ARCH_X32;
93 else if (streq(n, "arm"))
94 *ret = SCMP_ARCH_ARM;
aa34055f
ZJS
95 else if (streq(n, "arm64"))
96 *ret = SCMP_ARCH_AARCH64;
97 else if (streq(n, "mips"))
98 *ret = SCMP_ARCH_MIPS;
99 else if (streq(n, "mips64"))
100 *ret = SCMP_ARCH_MIPS64;
101 else if (streq(n, "mips64-n32"))
102 *ret = SCMP_ARCH_MIPS64N32;
103 else if (streq(n, "mips-le"))
104 *ret = SCMP_ARCH_MIPSEL;
105 else if (streq(n, "mips64-le"))
106 *ret = SCMP_ARCH_MIPSEL64;
107 else if (streq(n, "mips64-le-n32"))
108 *ret = SCMP_ARCH_MIPSEL64N32;
109 else if (streq(n, "ppc"))
110 *ret = SCMP_ARCH_PPC;
111 else if (streq(n, "ppc64"))
112 *ret = SCMP_ARCH_PPC64;
113 else if (streq(n, "ppc64-le"))
114 *ret = SCMP_ARCH_PPC64LE;
6abfd303
HB
115 else if (streq(n, "s390"))
116 *ret = SCMP_ARCH_S390;
117 else if (streq(n, "s390x"))
118 *ret = SCMP_ARCH_S390X;
57183d11
LP
119 else
120 return -EINVAL;
121
122 return 0;
123}
e9642be2 124
8d7b0c8f
LP
125int seccomp_init_conservative(scmp_filter_ctx *ret, uint32_t default_action) {
126 scmp_filter_ctx seccomp;
127 int r;
128
129 /* Much like seccomp_init(), but tries to be a bit more conservative in its defaults: all secondary archs are
130 * added by default, and NNP is turned off. */
131
132 seccomp = seccomp_init(default_action);
133 if (!seccomp)
134 return -ENOMEM;
135
136 r = seccomp_add_secondary_archs(seccomp);
137 if (r < 0)
138 goto finish;
139
140 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
141 if (r < 0)
142 goto finish;
143
144 *ret = seccomp;
145 return 0;
146
147finish:
148 seccomp_release(seccomp);
149 return r;
150}
151
aa34055f 152int seccomp_add_secondary_archs(scmp_filter_ctx ctx) {
e9642be2
LP
153
154 /* Add in all possible secondary archs we are aware of that
155 * this kernel might support. */
156
aa34055f
ZJS
157 static const int seccomp_arches[] = {
158#if defined(__i386__) || defined(__x86_64__)
159 SCMP_ARCH_X86,
160 SCMP_ARCH_X86_64,
161 SCMP_ARCH_X32,
162
163#elif defined(__arm__) || defined(__aarch64__)
164 SCMP_ARCH_ARM,
165 SCMP_ARCH_AARCH64,
166
167#elif defined(__arm__) || defined(__aarch64__)
168 SCMP_ARCH_ARM,
169 SCMP_ARCH_AARCH64,
170
171#elif defined(__mips__) || defined(__mips64__)
172 SCMP_ARCH_MIPS,
173 SCMP_ARCH_MIPS64,
174 SCMP_ARCH_MIPS64N32,
175 SCMP_ARCH_MIPSEL,
176 SCMP_ARCH_MIPSEL64,
177 SCMP_ARCH_MIPSEL64N32,
178
179#elif defined(__powerpc__) || defined(__powerpc64__)
180 SCMP_ARCH_PPC,
181 SCMP_ARCH_PPC64,
182 SCMP_ARCH_PPC64LE,
e9642be2 183
6abfd303 184#elif defined(__s390__) || defined(__s390x__)
aa34055f
ZJS
185 SCMP_ARCH_S390,
186 SCMP_ARCH_S390X,
187#endif
188 };
6abfd303 189
aa34055f
ZJS
190 unsigned i;
191 int r;
6abfd303 192
aa34055f
ZJS
193 for (i = 0; i < ELEMENTSOF(seccomp_arches); i++) {
194 r = seccomp_arch_add(ctx, seccomp_arches[i]);
195 if (r < 0 && r != -EEXIST)
196 return r;
197 }
e9642be2
LP
198
199 return 0;
e9642be2 200}
201c1cc2 201
d347d902
FS
static bool is_basic_seccomp_available(void) {
        /* Probe for seccomp support by querying the current seccomp mode: any non-negative result of
         * PR_GET_SECCOMP counts as "available", a failing call as "not available". */
        return prctl(PR_GET_SECCOMP, 0, 0, 0, 0) >= 0;
}
207
static bool is_seccomp_filter_available(void) {
        /* Probe for seccomp filter mode by requesting it with a NULL filter program. The request is not
         * expected to succeed; filter mode is reported as available only if the call fails with EFAULT.
         * Success or any other errno counts as "not available". */
        if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL, 0, 0) >= 0)
                return false;

        return errno == EFAULT;
}
213
bool is_seccomp_available(void) {
        /* Tells whether both basic seccomp and seccomp filter mode pass their probes. The probes are
         * run on the first call only; the outcome is remembered in a function-local static.
         *
         * -1: not probed yet, 0/1: remembered probe outcome */
        static int probe_result = -1;

        if (probe_result >= 0)
                return probe_result;

        probe_result = is_basic_seccomp_available() && is_seccomp_filter_available();
        return probe_result;
}
220
8130926d 221const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = {
40eb6a80 222 [SYSCALL_FILTER_SET_DEFAULT] = {
40eb6a80 223 .name = "@default",
d5efc18b 224 .help = "System calls that are always permitted",
40eb6a80
ZJS
225 .value =
226 "clock_getres\0"
227 "clock_gettime\0"
228 "clock_nanosleep\0"
229 "execve\0"
230 "exit\0"
231 "exit_group\0"
232 "getrlimit\0" /* make sure processes can query stack size and such */
233 "gettimeofday\0"
234 "nanosleep\0"
235 "pause\0"
236 "rt_sigreturn\0"
237 "sigreturn\0"
238 "time\0"
239 },
133ddbbe 240 [SYSCALL_FILTER_SET_BASIC_IO] = {
133ddbbe 241 .name = "@basic-io",
d5efc18b 242 .help = "Basic IO",
133ddbbe
LP
243 .value =
244 "close\0"
245 "dup2\0"
246 "dup3\0"
247 "dup\0"
248 "lseek\0"
249 "pread64\0"
250 "preadv\0"
251 "pwrite64\0"
252 "pwritev\0"
253 "read\0"
254 "readv\0"
255 "write\0"
256 "writev\0"
257 },
8130926d 258 [SYSCALL_FILTER_SET_CLOCK] = {
8130926d 259 .name = "@clock",
d5efc18b 260 .help = "Change the system time",
201c1cc2
TM
261 .value =
262 "adjtimex\0"
1f9ac68b
LP
263 "clock_adjtime\0"
264 "clock_settime\0"
201c1cc2 265 "settimeofday\0"
1f9ac68b 266 "stime\0"
8130926d
LP
267 },
268 [SYSCALL_FILTER_SET_CPU_EMULATION] = {
8130926d 269 .name = "@cpu-emulation",
d5efc18b 270 .help = "System calls for CPU emulation functionality",
1f9ac68b
LP
271 .value =
272 "modify_ldt\0"
273 "subpage_prot\0"
274 "switch_endian\0"
275 "vm86\0"
276 "vm86old\0"
8130926d
LP
277 },
278 [SYSCALL_FILTER_SET_DEBUG] = {
8130926d 279 .name = "@debug",
d5efc18b 280 .help = "Debugging, performance monitoring and tracing functionality",
1f9ac68b
LP
281 .value =
282 "lookup_dcookie\0"
283 "perf_event_open\0"
284 "process_vm_readv\0"
285 "process_vm_writev\0"
286 "ptrace\0"
287 "rtas\0"
8130926d 288#ifdef __NR_s390_runtime_instr
1f9ac68b 289 "s390_runtime_instr\0"
8130926d 290#endif
1f9ac68b 291 "sys_debug_setcontext\0"
8130926d 292 },
8130926d 293 [SYSCALL_FILTER_SET_IO_EVENT] = {
8130926d 294 .name = "@io-event",
d5efc18b 295 .help = "Event loop system calls",
201c1cc2
TM
296 .value =
297 "_newselect\0"
298 "epoll_create1\0"
299 "epoll_create\0"
300 "epoll_ctl\0"
301 "epoll_ctl_old\0"
302 "epoll_pwait\0"
303 "epoll_wait\0"
304 "epoll_wait_old\0"
305 "eventfd2\0"
306 "eventfd\0"
307 "poll\0"
308 "ppoll\0"
309 "pselect6\0"
310 "select\0"
8130926d
LP
311 },
312 [SYSCALL_FILTER_SET_IPC] = {
8130926d 313 .name = "@ipc",
d5efc18b
ZJS
314 .help = "SysV IPC, POSIX Message Queues or other IPC",
315 .value =
316 "ipc\0"
cd5bfd7e 317 "memfd_create\0"
201c1cc2
TM
318 "mq_getsetattr\0"
319 "mq_notify\0"
320 "mq_open\0"
321 "mq_timedreceive\0"
322 "mq_timedsend\0"
323 "mq_unlink\0"
324 "msgctl\0"
325 "msgget\0"
326 "msgrcv\0"
327 "msgsnd\0"
cd5bfd7e
LP
328 "pipe2\0"
329 "pipe\0"
201c1cc2
TM
330 "process_vm_readv\0"
331 "process_vm_writev\0"
332 "semctl\0"
333 "semget\0"
334 "semop\0"
335 "semtimedop\0"
336 "shmat\0"
337 "shmctl\0"
338 "shmdt\0"
339 "shmget\0"
8130926d
LP
340 },
341 [SYSCALL_FILTER_SET_KEYRING] = {
8130926d 342 .name = "@keyring",
d5efc18b 343 .help = "Kernel keyring access",
1f9ac68b
LP
344 .value =
345 "add_key\0"
346 "keyctl\0"
347 "request_key\0"
8130926d
LP
348 },
349 [SYSCALL_FILTER_SET_MODULE] = {
8130926d 350 .name = "@module",
d5efc18b 351 .help = "Loading and unloading of kernel modules",
201c1cc2 352 .value =
201c1cc2
TM
353 "delete_module\0"
354 "finit_module\0"
355 "init_module\0"
8130926d
LP
356 },
357 [SYSCALL_FILTER_SET_MOUNT] = {
8130926d 358 .name = "@mount",
d5efc18b 359 .help = "Mounting and unmounting of file systems",
201c1cc2
TM
360 .value =
361 "chroot\0"
362 "mount\0"
201c1cc2
TM
363 "pivot_root\0"
364 "umount2\0"
365 "umount\0"
8130926d
LP
366 },
367 [SYSCALL_FILTER_SET_NETWORK_IO] = {
8130926d 368 .name = "@network-io",
d5efc18b 369 .help = "Network or Unix socket IO, should not be needed if not network facing",
201c1cc2
TM
370 .value =
371 "accept4\0"
372 "accept\0"
373 "bind\0"
374 "connect\0"
375 "getpeername\0"
376 "getsockname\0"
377 "getsockopt\0"
378 "listen\0"
379 "recv\0"
380 "recvfrom\0"
381 "recvmmsg\0"
382 "recvmsg\0"
383 "send\0"
384 "sendmmsg\0"
385 "sendmsg\0"
386 "sendto\0"
387 "setsockopt\0"
388 "shutdown\0"
389 "socket\0"
390 "socketcall\0"
391 "socketpair\0"
8130926d
LP
392 },
393 [SYSCALL_FILTER_SET_OBSOLETE] = {
d5efc18b 394 /* some unknown even to libseccomp */
8130926d 395 .name = "@obsolete",
d5efc18b 396 .help = "Unusual, obsolete or unimplemented system calls",
201c1cc2
TM
397 .value =
398 "_sysctl\0"
399 "afs_syscall\0"
400 "break\0"
1f9ac68b 401 "create_module\0"
201c1cc2
TM
402 "ftime\0"
403 "get_kernel_syms\0"
201c1cc2
TM
404 "getpmsg\0"
405 "gtty\0"
201c1cc2 406 "lock\0"
201c1cc2 407 "mpx\0"
201c1cc2
TM
408 "prof\0"
409 "profil\0"
201c1cc2
TM
410 "putpmsg\0"
411 "query_module\0"
201c1cc2
TM
412 "security\0"
413 "sgetmask\0"
414 "ssetmask\0"
415 "stty\0"
1f9ac68b 416 "sysfs\0"
201c1cc2
TM
417 "tuxcall\0"
418 "ulimit\0"
419 "uselib\0"
1f9ac68b 420 "ustat\0"
201c1cc2 421 "vserver\0"
8130926d
LP
422 },
423 [SYSCALL_FILTER_SET_PRIVILEGED] = {
8130926d 424 .name = "@privileged",
d5efc18b 425 .help = "All system calls which need super-user capabilities",
201c1cc2
TM
426 .value =
427 "@clock\0"
428 "@module\0"
429 "@raw-io\0"
430 "acct\0"
431 "bdflush\0"
432 "bpf\0"
1f9ac68b 433 "capset\0"
201c1cc2
TM
434 "chown32\0"
435 "chown\0"
436 "chroot\0"
437 "fchown32\0"
438 "fchown\0"
439 "fchownat\0"
440 "kexec_file_load\0"
441 "kexec_load\0"
442 "lchown32\0"
443 "lchown\0"
444 "nfsservctl\0"
445 "pivot_root\0"
446 "quotactl\0"
447 "reboot\0"
448 "setdomainname\0"
449 "setfsuid32\0"
450 "setfsuid\0"
451 "setgroups32\0"
452 "setgroups\0"
453 "sethostname\0"
454 "setresuid32\0"
455 "setresuid\0"
456 "setreuid32\0"
457 "setreuid\0"
458 "setuid32\0"
459 "setuid\0"
201c1cc2
TM
460 "swapoff\0"
461 "swapon\0"
60f547cf 462 "_sysctl\0"
201c1cc2 463 "vhangup\0"
8130926d
LP
464 },
465 [SYSCALL_FILTER_SET_PROCESS] = {
8130926d 466 .name = "@process",
d5efc18b 467 .help = "Process control, execution, namespaceing operations",
201c1cc2
TM
468 .value =
469 "arch_prctl\0"
470 "clone\0"
201c1cc2
TM
471 "execveat\0"
472 "fork\0"
473 "kill\0"
474 "prctl\0"
475 "setns\0"
476 "tgkill\0"
477 "tkill\0"
478 "unshare\0"
479 "vfork\0"
8130926d
LP
480 },
481 [SYSCALL_FILTER_SET_RAW_IO] = {
8130926d 482 .name = "@raw-io",
d5efc18b 483 .help = "Raw I/O port access",
201c1cc2
TM
484 .value =
485 "ioperm\0"
486 "iopl\0"
1f9ac68b 487 "pciconfig_iobase\0"
201c1cc2
TM
488 "pciconfig_read\0"
489 "pciconfig_write\0"
8130926d 490#ifdef __NR_s390_pci_mmio_read
201c1cc2 491 "s390_pci_mmio_read\0"
8130926d
LP
492#endif
493#ifdef __NR_s390_pci_mmio_write
201c1cc2 494 "s390_pci_mmio_write\0"
8130926d
LP
495#endif
496 },
133ddbbe
LP
497 [SYSCALL_FILTER_SET_RESOURCES] = {
498 /* Alter resource settings */
499 .name = "@resources",
500 .value =
501 "sched_setparam\0"
502 "sched_setscheduler\0"
503 "sched_setaffinity\0"
504 "setpriority\0"
505 "setrlimit\0"
506 "set_mempolicy\0"
507 "migrate_pages\0"
508 "move_pages\0"
509 "mbind\0"
510 "sched_setattr\0"
511 "prlimit64\0"
512 },
201c1cc2 513};
8130926d
LP
514
515const SyscallFilterSet *syscall_filter_set_find(const char *name) {
516 unsigned i;
517
518 if (isempty(name) || name[0] != '@')
519 return NULL;
520
521 for (i = 0; i < _SYSCALL_FILTER_SET_MAX; i++)
522 if (streq(syscall_filter_sets[i].name, name))
523 return syscall_filter_sets + i;
524
525 return NULL;
526}
527
528int seccomp_add_syscall_filter_set(scmp_filter_ctx seccomp, const SyscallFilterSet *set, uint32_t action) {
529 const char *sys;
530 int r;
531
532 assert(seccomp);
533 assert(set);
534
535 NULSTR_FOREACH(sys, set->value) {
536 int id;
537
538 if (sys[0] == '@') {
539 const SyscallFilterSet *other;
540
541 other = syscall_filter_set_find(sys);
542 if (!other)
543 return -EINVAL;
544
545 r = seccomp_add_syscall_filter_set(seccomp, other, action);
546 } else {
547 id = seccomp_syscall_resolve_name(sys);
548 if (id == __NR_SCMP_ERROR)
549 return -EINVAL;
550
551 r = seccomp_rule_add(seccomp, action, id, 0);
552 }
553 if (r < 0)
554 return r;
555 }
556
557 return 0;
558}
a3be2849
LP
559
560int seccomp_load_filter_set(uint32_t default_action, const SyscallFilterSet *set, uint32_t action) {
561 scmp_filter_ctx seccomp;
562 int r;
563
564 assert(set);
565
566 /* The one-stop solution: allocate a seccomp object, add a filter to it, and apply it */
567
568 r = seccomp_init_conservative(&seccomp, default_action);
569 if (r < 0)
570 return r;
571
572 r = seccomp_add_syscall_filter_set(seccomp, set, action);
573 if (r < 0)
574 goto finish;
575
576 r = seccomp_load(seccomp);
577
578finish:
579 seccomp_release(seccomp);
580 return r;
add00535
LP
581}
582
583int seccomp_restrict_namespaces(unsigned long retain) {
584 scmp_filter_ctx seccomp;
585 unsigned i;
586 int r;
587
588 if (log_get_max_level() >= LOG_DEBUG) {
589 _cleanup_free_ char *s = NULL;
590
591 (void) namespace_flag_to_string_many(retain, &s);
592 log_debug("Restricting namespace to: %s.", strna(s));
593 }
594
595 /* NOOP? */
596 if ((retain & NAMESPACE_FLAGS_ALL) == NAMESPACE_FLAGS_ALL)
597 return 0;
598
599 r = seccomp_init_conservative(&seccomp, SCMP_ACT_ALLOW);
600 if (r < 0)
601 return r;
602
603 if ((retain & NAMESPACE_FLAGS_ALL) == 0)
604 /* If every single kind of namespace shall be prohibited, then let's block the whole setns() syscall
605 * altogether. */
606 r = seccomp_rule_add(
607 seccomp,
608 SCMP_ACT_ERRNO(EPERM),
609 SCMP_SYS(setns),
610 0);
611 else
612 /* Otherwise, block only the invocations with the appropriate flags in the loop below, but also the
613 * special invocation with a zero flags argument, right here. */
614 r = seccomp_rule_add(
615 seccomp,
616 SCMP_ACT_ERRNO(EPERM),
617 SCMP_SYS(setns),
618 1,
619 SCMP_A1(SCMP_CMP_EQ, 0));
620 if (r < 0)
621 goto finish;
622
623 for (i = 0; namespace_flag_map[i].name; i++) {
624 unsigned long f;
625
626 f = namespace_flag_map[i].flag;
627 if ((retain & f) == f) {
628 log_debug("Permitting %s.", namespace_flag_map[i].name);
629 continue;
630 }
a3be2849 631
add00535
LP
632 log_debug("Blocking %s.", namespace_flag_map[i].name);
633
634 r = seccomp_rule_add(
635 seccomp,
636 SCMP_ACT_ERRNO(EPERM),
637 SCMP_SYS(unshare),
638 1,
639 SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
640 if (r < 0)
641 goto finish;
642
643 r = seccomp_rule_add(
644 seccomp,
645 SCMP_ACT_ERRNO(EPERM),
646 SCMP_SYS(clone),
647 1,
648 SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
649 if (r < 0)
650 goto finish;
651
652 if ((retain & NAMESPACE_FLAGS_ALL) != 0) {
653 r = seccomp_rule_add(
654 seccomp,
655 SCMP_ACT_ERRNO(EPERM),
656 SCMP_SYS(setns),
657 1,
658 SCMP_A1(SCMP_CMP_MASKED_EQ, f, f));
659 if (r < 0)
660 goto finish;
661 }
662 }
663
664 r = seccomp_load(seccomp);
665
666finish:
667 seccomp_release(seccomp);
668 return r;
a3be2849 669}