git.ipfire.org Git - thirdparty/systemd.git/blobdiff - src/nspawn/nspawn-seccomp.c
tree-wide: drop 'This file is part of systemd' blurb
[thirdparty/systemd.git] / src / nspawn / nspawn-seccomp.c
index 54db1b47f8352be61479761fa15821bccffde9c1..008d013af6b7f6f8fa33eac9d510fe0946275c5d 100644 (file)
@@ -1,20 +1,6 @@
+/* SPDX-License-Identifier: LGPL-2.1+ */
 /***
-  This file is part of systemd.
-
   Copyright 2016 Lennart Poettering
-
-  systemd is free software; you can redistribute it and/or modify it
-  under the terms of the GNU Lesser General Public License as published by
-  the Free Software Foundation; either version 2.1 of the License, or
-  (at your option) any later version.
-
-  systemd is distributed in the hope that it will be useful, but
-  WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-  Lesser General Public License for more details.
-
-  You should have received a copy of the GNU Lesser General Public License
-  along with systemd; If not, see <http://www.gnu.org/licenses/>.
 ***/
 
 #include <errno.h>
 #include <sys/capability.h>
 #include <sys/types.h>
 
-#ifdef HAVE_SECCOMP
+#if HAVE_SECCOMP
 #include <seccomp.h>
 #endif
 
+#include "alloc-util.h"
 #include "log.h"
-
-#ifdef HAVE_SECCOMP
+#include "nspawn-seccomp.h"
+#if HAVE_SECCOMP
 #include "seccomp-util.h"
 #endif
+#include "string-util.h"
+#include "strv.h"
 
-#include "nspawn-seccomp.h"
+#if HAVE_SECCOMP
 
-#ifdef HAVE_SECCOMP
+static int seccomp_add_default_syscall_filter(
+                scmp_filter_ctx ctx,
+                uint32_t arch,
+                uint64_t cap_list_retain,
+                char **syscall_whitelist,
+                char **syscall_blacklist) {
 
-static int seccomp_add_default_syscall_filter(scmp_filter_ctx ctx,
-                                              uint64_t cap_list_retain) {
-        unsigned i;
-        int r;
         static const struct {
                 uint64_t capability;
-                int syscall_num;
-        } blacklist[] = {
-                { 0,              SCMP_SYS(_sysctl)             }, /* obsolete syscall */
-                { 0,              SCMP_SYS(add_key)             }, /* keyring is not namespaced */
-                { 0,              SCMP_SYS(afs_syscall)         }, /* obsolete syscall */
-                { 0,              SCMP_SYS(bdflush)             },
-#ifdef __NR_bpf
-                { 0,              SCMP_SYS(bpf)                 },
-#endif
-                { 0,              SCMP_SYS(break)               }, /* obsolete syscall */
-                { 0,              SCMP_SYS(create_module)       }, /* obsolete syscall */
-                { 0,              SCMP_SYS(ftime)               }, /* obsolete syscall */
-                { 0,              SCMP_SYS(get_kernel_syms)     }, /* obsolete syscall */
-                { 0,              SCMP_SYS(getpmsg)             }, /* obsolete syscall */
-                { 0,              SCMP_SYS(gtty)                }, /* obsolete syscall */
-#ifdef __NR_kexec_file_load
-                { 0,              SCMP_SYS(kexec_file_load)     },
-#endif
-                { 0,              SCMP_SYS(kexec_load)          },
-                { 0,              SCMP_SYS(keyctl)              }, /* keyring is not namespaced */
-                { 0,              SCMP_SYS(lock)                }, /* obsolete syscall */
-                { 0,              SCMP_SYS(lookup_dcookie)      },
-                { 0,              SCMP_SYS(mpx)                 }, /* obsolete syscall */
-                { 0,              SCMP_SYS(nfsservctl)          }, /* obsolete syscall */
-                { 0,              SCMP_SYS(open_by_handle_at)   },
-                { 0,              SCMP_SYS(perf_event_open)     },
-                { 0,              SCMP_SYS(prof)                }, /* obsolete syscall */
-                { 0,              SCMP_SYS(profil)              }, /* obsolete syscall */
-                { 0,              SCMP_SYS(putpmsg)             }, /* obsolete syscall */
-                { 0,              SCMP_SYS(query_module)        }, /* obsolete syscall */
-                { 0,              SCMP_SYS(quotactl)            },
-                { 0,              SCMP_SYS(request_key)         }, /* keyring is not namespaced */
-                { 0,              SCMP_SYS(security)            }, /* obsolete syscall */
-                { 0,              SCMP_SYS(sgetmask)            }, /* obsolete syscall */
-                { 0,              SCMP_SYS(ssetmask)            }, /* obsolete syscall */
-                { 0,              SCMP_SYS(stty)                }, /* obsolete syscall */
-                { 0,              SCMP_SYS(swapoff)             },
-                { 0,              SCMP_SYS(swapon)              },
-                { 0,              SCMP_SYS(sysfs)               }, /* obsolete syscall */
-                { 0,              SCMP_SYS(tuxcall)             }, /* obsolete syscall */
-                { 0,              SCMP_SYS(ulimit)              }, /* obsolete syscall */
-                { 0,              SCMP_SYS(uselib)              }, /* obsolete syscall */
-                { 0,              SCMP_SYS(ustat)               }, /* obsolete syscall */
-                { 0,              SCMP_SYS(vserver)             }, /* obsolete syscall */
-                { CAP_SYSLOG,     SCMP_SYS(syslog)              },
-                { CAP_SYS_MODULE, SCMP_SYS(delete_module)       },
-                { CAP_SYS_MODULE, SCMP_SYS(finit_module)        },
-                { CAP_SYS_MODULE, SCMP_SYS(init_module)         },
-                { CAP_SYS_PACCT,  SCMP_SYS(acct)                },
-                { CAP_SYS_PTRACE, SCMP_SYS(process_vm_readv)    },
-                { CAP_SYS_PTRACE, SCMP_SYS(process_vm_writev)   },
-                { CAP_SYS_PTRACE, SCMP_SYS(ptrace)              },
-                { CAP_SYS_RAWIO,  SCMP_SYS(ioperm)              },
-                { CAP_SYS_RAWIO,  SCMP_SYS(iopl)                },
-                { CAP_SYS_RAWIO,  SCMP_SYS(pciconfig_iobase)    },
-                { CAP_SYS_RAWIO,  SCMP_SYS(pciconfig_read)      },
-                { CAP_SYS_RAWIO,  SCMP_SYS(pciconfig_write)     },
-#ifdef __NR_s390_pci_mmio_read
-                { CAP_SYS_RAWIO,  SCMP_SYS(s390_pci_mmio_read)  },
-#endif
-#ifdef __NR_s390_pci_mmio_write
-                { CAP_SYS_RAWIO,  SCMP_SYS(s390_pci_mmio_write) },
-#endif
-                { CAP_SYS_TIME,   SCMP_SYS(adjtimex)            },
-                { CAP_SYS_TIME,   SCMP_SYS(clock_adjtime)       },
-                { CAP_SYS_TIME,   SCMP_SYS(clock_settime)       },
-                { CAP_SYS_TIME,   SCMP_SYS(settimeofday)        },
-                { CAP_SYS_TIME,   SCMP_SYS(stime)               },
+                const char* name;
+        } whitelist[] = {
+                /* Let's use set names where we can */
+                { 0,                  "@aio"                   },
+                { 0,                  "@basic-io"              },
+                { 0,                  "@chown"                 },
+                { 0,                  "@default"               },
+                { 0,                  "@file-system"           },
+                { 0,                  "@io-event"              },
+                { 0,                  "@ipc"                   },
+                { 0,                  "@mount"                 },
+                { 0,                  "@network-io"            },
+                { 0,                  "@process"               },
+                { 0,                  "@resources"             },
+                { 0,                  "@setuid"                },
+                { 0,                  "@signal"                },
+                { 0,                  "@sync"                  },
+                { 0,                  "@timer"                 },
+
+                /* The following four are sets we optionally enable, in case the caps have been configured for it */
+                { CAP_SYS_TIME,       "@clock"                 },
+                { CAP_SYS_MODULE,     "@module"                },
+                { CAP_SYS_RAWIO,      "@raw-io"                },
+                { CAP_IPC_LOCK,       "@memlock"               },
+
+                /* Plus a good set of additional syscalls which are not part of any of the groups above */
+                { 0,                  "brk"                    },
+                { 0,                  "capget"                 },
+                { 0,                  "capset"                 },
+                { 0,                  "copy_file_range"        },
+                { 0,                  "fadvise64"              },
+                { 0,                  "fadvise64_64"           },
+                { 0,                  "flock"                  },
+                { 0,                  "get_mempolicy"          },
+                { 0,                  "getcpu"                 },
+                { 0,                  "getpriority"            },
+                { 0,                  "getrandom"              },
+                { 0,                  "ioctl"                  },
+                { 0,                  "ioprio_get"             },
+                { 0,                  "kcmp"                   },
+                { 0,                  "madvise"                },
+                { 0,                  "mincore"                },
+                { 0,                  "mprotect"               },
+                { 0,                  "mremap"                 },
+                { 0,                  "name_to_handle_at"      },
+                { 0,                  "oldolduname"            },
+                { 0,                  "olduname"               },
+                { 0,                  "personality"            },
+                { 0,                  "readahead"              },
+                { 0,                  "readdir"                },
+                { 0,                  "remap_file_pages"       },
+                { 0,                  "sched_get_priority_max" },
+                { 0,                  "sched_get_priority_min" },
+                { 0,                  "sched_getaffinity"      },
+                { 0,                  "sched_getattr"          },
+                { 0,                  "sched_getparam"         },
+                { 0,                  "sched_getscheduler"     },
+                { 0,                  "sched_rr_get_interval"  },
+                { 0,                  "sched_yield"            },
+                { 0,                  "seccomp"                },
+                { 0,                  "sendfile"               },
+                { 0,                  "sendfile64"             },
+                { 0,                  "setdomainname"          },
+                { 0,                  "setfsgid"               },
+                { 0,                  "setfsgid32"             },
+                { 0,                  "setfsuid"               },
+                { 0,                  "setfsuid32"             },
+                { 0,                  "sethostname"            },
+                { 0,                  "setpgid"                },
+                { 0,                  "setsid"                 },
+                { 0,                  "splice"                 },
+                { 0,                  "sysinfo"                },
+                { 0,                  "tee"                    },
+                { 0,                  "umask"                  },
+                { 0,                  "uname"                  },
+                { 0,                  "userfaultfd"            },
+                { 0,                  "vmsplice"               },
+
+                /* The following individual syscalls are added depending on specified caps */
+                { CAP_SYS_PACCT,      "acct"                   },
+                { CAP_SYS_PTRACE,     "process_vm_readv"       },
+                { CAP_SYS_PTRACE,     "process_vm_writev"      },
+                { CAP_SYS_PTRACE,     "ptrace"                 },
+                { CAP_SYS_BOOT,       "reboot"                 },
+                { CAP_SYSLOG,         "syslog"                 },
+                { CAP_SYS_TTY_CONFIG, "vhangup"                },
+
+                /*
+                 * The following syscalls and groups are knowingly excluded:
+                 *
+                 * @cpu-emulation
+                 * @keyring           (NB: keyring is not namespaced!)
+                 * @obsolete
+                 * @swap
+                 *
+                 * bpf                (NB: bpffs is not namespaced!)
+                 * fanotify_init
+                 * fanotify_mark
+                 * kexec_file_load
+                 * kexec_load
+                 * lookup_dcookie
+                 * nfsservctl
+                 * open_by_handle_at
+                 * perf_event_open
+                 * pkey_alloc
+                 * pkey_free
+                 * pkey_mprotect
+                 * quotactl
+                 */
         };
 
-        for (i = 0; i < ELEMENTSOF(blacklist); i++) {
-                if (blacklist[i].capability != 0 && (cap_list_retain & (1ULL << blacklist[i].capability)))
+        int r, c = 0;
+        size_t i;
+        char **p;
+
+        for (i = 0; i < ELEMENTSOF(whitelist); i++) {
+                if (whitelist[i].capability != 0 && (cap_list_retain & (1ULL << whitelist[i].capability)) == 0)
                         continue;
 
-                r = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), blacklist[i].syscall_num, 0);
-                if (r == -EFAULT)
-                        continue; /* unknown syscall */
-                if (r < 0) {
-                        log_error_errno(r, "Failed to block syscall: %m");
-                        return r;
-                }
+                r = seccomp_add_syscall_filter_item(ctx, whitelist[i].name, SCMP_ACT_ALLOW, syscall_blacklist);
+                if (r < 0)
+                        /* If the system call is not known on this architecture, then that's fine, let's ignore it */
+                        log_debug_errno(r, "Failed to add rule for system call %s on %s, ignoring: %m", whitelist[i].name, seccomp_arch_to_string(arch));
+                else
+                        c++;
         }
 
-        return 0;
+        STRV_FOREACH(p, syscall_whitelist) {
+                r = seccomp_add_syscall_filter_item(ctx, *p, SCMP_ACT_ALLOW, syscall_blacklist);
+                if (r < 0)
+                        log_debug_errno(r, "Failed to add rule for system call %s on %s, ignoring: %m", *p, seccomp_arch_to_string(arch));
+                else
+                        c++;
+        }
+
+        return c;
 }
 
-int setup_seccomp(uint64_t cap_list_retain) {
-        scmp_filter_ctx seccomp;
+int setup_seccomp(uint64_t cap_list_retain, char **syscall_whitelist, char **syscall_blacklist) {
+        uint32_t arch;
         int r;
 
-        seccomp = seccomp_init(SCMP_ACT_ALLOW);
-        if (!seccomp)
-                return log_oom();
-
-        r = seccomp_add_secondary_archs(seccomp);
-        if (r < 0) {
-                log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m");
-                goto finish;
+        if (!is_seccomp_available()) {
+                log_debug("SECCOMP features not detected in the kernel, disabling SECCOMP filtering");
+                return 0;
         }
 
-        r = seccomp_add_default_syscall_filter(seccomp, cap_list_retain);
-        if (r < 0)
-                goto finish;
-
-        /*
-           Audit is broken in containers, much of the userspace audit
-           hookup will fail if running inside a container. We don't
-           care and just turn off creation of audit sockets.
-
-           This will make socket(AF_NETLINK, *, NETLINK_AUDIT) fail
-           with EAFNOSUPPORT which audit userspace uses as indication
-           that audit is disabled in the kernel.
-         */
-
-        r = seccomp_rule_add(
-                        seccomp,
-                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
-                        SCMP_SYS(socket),
-                        2,
-                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
-                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
-        if (r < 0) {
-                log_error_errno(r, "Failed to add audit seccomp rule: %m");
-                goto finish;
-        }
+        SECCOMP_FOREACH_LOCAL_ARCH(arch) {
+                _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
 
-        r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
-        if (r < 0) {
-                log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m");
-                goto finish;
-        }
+                log_debug("Applying whitelist on architecture: %s", seccomp_arch_to_string(arch));
+
+                r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ERRNO(EPERM));
+                if (r < 0)
+                        return log_error_errno(r, "Failed to allocate seccomp object: %m");
 
-        r = seccomp_load(seccomp);
-        if (r == -EINVAL) {
-                log_debug_errno(r, "Kernel is probably not configured with CONFIG_SECCOMP. Disabling seccomp audit filter: %m");
-                r = 0;
-                goto finish;
+                r = seccomp_add_default_syscall_filter(seccomp, arch, cap_list_retain, syscall_whitelist, syscall_blacklist);
+                if (r < 0)
+                        return r;
+
+                r = seccomp_load(seccomp);
+                if (IN_SET(r, -EPERM, -EACCES))
+                        return log_error_errno(r, "Failed to install seccomp filter: %m");
+                if (r < 0)
+                        log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
         }
-        if (r < 0) {
-                log_error_errno(r, "Failed to install seccomp audit filter: %m");
-                goto finish;
+
+        SECCOMP_FOREACH_LOCAL_ARCH(arch) {
+                _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
+
+                log_debug("Applying NETLINK_AUDIT mask on architecture: %s", seccomp_arch_to_string(arch));
+
+                r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to allocate seccomp object: %m");
+
+                /*
+                  Audit is broken in containers, much of the userspace audit hookup will fail if running inside a
+                  container. We don't care and just turn off creation of audit sockets.
+
+                  This will make socket(AF_NETLINK, *, NETLINK_AUDIT) fail with EAFNOSUPPORT which audit userspace uses
+                  as indication that audit is disabled in the kernel.
+                */
+
+                r = seccomp_rule_add_exact(
+                                seccomp,
+                                SCMP_ACT_ERRNO(EAFNOSUPPORT),
+                                SCMP_SYS(socket),
+                                2,
+                                SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
+                                SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
+                if (r < 0) {
+                        log_debug_errno(r, "Failed to add audit seccomp rule, ignoring: %m");
+                        continue;
+                }
+
+                r = seccomp_load(seccomp);
+                if (IN_SET(r, -EPERM, -EACCES))
+                        return log_error_errno(r, "Failed to install seccomp audit filter: %m");
+                if (r < 0)
+                        log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
         }
 
-finish:
-        seccomp_release(seccomp);
-        return r;
+        return 0;
 }
 
 #else
 
-int setup_seccomp(uint64_t cap_list_retain) {
+int setup_seccomp(uint64_t cap_list_retain, char **syscall_whitelist, char **syscall_blacklist) {
         return 0;
 }