From: Lukas Sismis
Date: Fri, 3 Jan 2025 15:08:36 +0000 (+0100)
Subject: threading: support thread autopinning and interface-specific affinity
X-Git-Tag: suricata-8.0.0-rc1~134
X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=8817a959e8911d02eea4c8828f9c87f868f8f6d0;p=thirdparty%2Fsuricata.git

threading: support thread autopinning and interface-specific affinity

Using the new configuration format, it is now possible to set CPU affinity
per interface. The threading.autopin option has been added to automatically
use CPUs from the same NUMA node as the interface. The autopin option
requires hwloc-devel / hwloc-dev to be installed and the --enable-hwloc
flag to be passed to the configure script.

Ticket: 7036
---

diff --git a/configure.ac b/configure.ac
index 595371fb33..bda1c5ebdc 100644
--- a/configure.ac
+++ b/configure.ac
@@ -744,6 +744,33 @@
         exit 1
     fi
 
+    LIBHWLOC=""
+    AC_ARG_ENABLE(hwloc,
+        AS_HELP_STRING([--enable-hwloc], [Enable hwloc support [default=no]]),
+        [enable_hwloc=$enableval],[enable_hwloc=no])
+    AS_IF([test "x$enable_hwloc" = "xyes"], [
+        PKG_CHECK_MODULES([HWLOC], [hwloc >= 2.0.0],
+            [AC_DEFINE([HAVE_HWLOC], [1], [Define if hwloc library is present and meets version requirements])],
+            LIBHWLOC="no")
+
+        if test "$LIBHWLOC" = "no"; then
+            echo
+            echo "   ERROR! hwloc library version >= 2.0.0 not found, go get it"
+            echo "   from https://www.open-mpi.org/projects/hwloc/"
+            echo "   or your distribution:"
+            echo
+            echo "   Ubuntu: apt-get install hwloc libhwloc-dev"
+            echo "   Fedora: dnf install hwloc hwloc-devel"
+            echo "   CentOS/RHEL: yum install hwloc hwloc-devel"
+            echo
+            exit 1
+        else
+            CFLAGS="${CFLAGS} ${HWLOC_CFLAGS}"
+            LDFLAGS="${LDFLAGS} ${HWLOC_LIBS}"
+            enable_hwloc="yes"
+        fi
+    ])
+
 # libpthread
 AC_ARG_WITH(libpthread_includes, [  --with-libpthread-includes=DIR  libpthread include directory],
@@ -2535,6 +2562,7 @@ SURICATA_BUILD_CONF="Suricata Configuration:
   JA3 support:               ${enable_ja3}
   JA4 support:               ${enable_ja4}
   Hyperscan support:         ${enable_hyperscan}
+  Hwloc support:             ${enable_hwloc}
   Libnet support:            ${enable_libnet}
   liblz4 support:            ${enable_liblz4}
   Landlock support:          ${enable_landlock}
diff --git a/doc/userguide/configuration/suricata-yaml.rst b/doc/userguide/configuration/suricata-yaml.rst
index 76668b9ec2..94fa3bb488 100644
--- a/doc/userguide/configuration/suricata-yaml.rst
+++ b/doc/userguide/configuration/suricata-yaml.rst
@@ -961,6 +961,7 @@ per available CPU/CPU core.
 
   threading:
     set-cpu-affinity: yes
+    autopin: no
    cpu-affinity:
       management-cpu-set:
         cpu: [ 0 ]  # include only these cpus in affinity settings
@@ -977,6 +978,13 @@ per available CPU/CPU core.
           medium: [ "1-2" ]
           high: [ 3 ]
           default: "medium"
+      interface-specific-cpu-set:
+        - interface: "enp4s0f0" # 0000:3b:00.0 # net_bonding0 # ens1f0
+          cpu: [ 1,3,5,7,9 ]
+          mode: "exclusive"
+          prio:
+            high: [ "all" ]
+            default: "medium"
       verdict-cpu-set:
         cpu: [ 0 ]
         prio:
@@ -1013,6 +1021,80 @@ Runmode Workers::
 
   worker-cpu-set - used for receive,streamtcp,decode,detect,output(logging),respond/reject, verdict
 
+Interface-specific CPU affinity settings
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Using the new configuration format introduced in Suricata 8.0, it is possible
+to set CPU affinity settings per interface. This can be useful when you have
+multiple interfaces and want to dedicate specific CPU cores to specific
+interfaces, for example when Suricata runs on multiple NUMA nodes and reads
+from interfaces on each NUMA node.
+
+Interface-specific affinity settings can be configured for the
+``worker-cpu-set`` and the ``receive-cpu-set`` (the latter is only used in
+autofp mode). This feature is available for capture modes that work with
+interfaces (af-packet, dpdk, etc.). The value of the interface key can be the
+kernel interface name (e.g. eth0 for af-packet), the PCI address of the
+interface (e.g. 0000:3b:00.0 for the DPDK capture mode), or the name of the
+virtual device interface (e.g. net_bonding0 for the DPDK capture mode).
+The interface names need to be unique and must be specified in the capture
+mode configuration.
+
+The interface-specific settings override the global settings of the
+``worker-cpu-set`` and ``receive-cpu-set``. The CPUs do not need to be
+contained in the parent node settings. If no interface-specific settings are
+defined, the global settings are used.
+
+::
+
+  threading:
+    set-cpu-affinity: yes
+    cpu-affinity:
+      worker-cpu-set:
+        interface-specific-cpu-set:
+          - interface: "eth0" # 0000:3b:00.0 # net_bonding0
+            cpu: [ 1,3,5,7,9 ]
+            mode: "exclusive"
+            prio:
+              high: [ "all" ]
+              default: "medium"
+
+Automatic NUMA-aware CPU core pinning
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+When Suricata runs on a system with multiple NUMA nodes, it can automatically
+use CPUs from the same NUMA node as the network capture interface. Using CPU
+cores local to the interface's NUMA node reduces memory access latency and
+can increase the performance of Suricata.
+This is enabled by setting the ``autopin`` option to ``yes`` in the threading
+section. The option applies to the ``worker-cpu-set`` and the
+``receive-cpu-set``.
+
+::
+
+  threading:
+    set-cpu-affinity: yes
+    autopin: yes
+    cpu-affinity:
+      worker-cpu-set:
+        cpu: [ "all" ]
+        mode: "exclusive"
+        prio:
+          high: [ "all" ]
+
+Consider two interfaces defined in the capture mode configuration, one on each
+NUMA node, with the ``autopin`` option enabled and the ``worker-cpu-set`` set
+to use all CPUs. Worker threads of the interface on the first NUMA node are
+pinned to CPUs on the first NUMA node, and worker threads of the interface on
+the second NUMA node are pinned to CPUs on the second NUMA node. If the CPU
+cores of a given NUMA node are exhausted, the remaining worker threads are
+pinned to CPUs on the other NUMA node.
+
+To use the ``autopin`` option, the ``hwloc`` library (hwloc-devel /
+hwloc-dev) must be installed and Suricata must be built with
+``--enable-hwloc`` passed to the configure script.
+The ``threading.autopin`` option can also be combined with the
+interface-specific CPU affinity settings, as shown in the example below.
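+A minimal sketch of such a combined configuration (``eth0`` only stands in
+for a real capture interface name) could look like::
+
+  threading:
+    set-cpu-affinity: yes
+    autopin: yes
+    cpu-affinity:
+      worker-cpu-set:
+        cpu: [ "all" ]
+        mode: "exclusive"
+        interface-specific-cpu-set:
+          - interface: "eth0"
+            cpu: [ 1,3,5,7,9 ]
+            mode: "exclusive"
+
+Workers for ``eth0`` then pick only from the listed CPUs, while workers for
+any other interface pick from all CPUs; in both cases CPUs from the
+interface's NUMA node are preferred.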
 
 IP Defrag
 ---------
 
diff --git a/src/runmode-dpdk.c b/src/runmode-dpdk.c
index 9a6f885c2b..496d830a28 100644
--- a/src/runmode-dpdk.c
+++ b/src/runmode-dpdk.c
@@ -387,12 +387,17 @@ static int ConfigSetThreads(DPDKIfaceConfig *iconf, const char *entry_str)
         SCReturnInt(-EINVAL);
     }
 
-    ThreadsAffinityType *wtaf = GetAffinityTypeFromName("worker-cpu-set");
+    bool wtaf_periface = true;
+    ThreadsAffinityType *wtaf = GetAffinityTypeForNameAndIface("worker-cpu-set", iconf->iface);
     if (wtaf == NULL) {
-        SCLogError("Specify worker-cpu-set list in the threading section");
-        SCReturnInt(-EINVAL);
+        wtaf_periface = false;
+        wtaf = GetAffinityTypeForNameAndIface("worker-cpu-set", NULL); // mandatory
+        if (wtaf == NULL) {
+            SCLogError("Specify worker-cpu-set list in the threading section");
+            SCReturnInt(-EINVAL);
+        }
     }
-    ThreadsAffinityType *mtaf = GetAffinityTypeFromName("management-cpu-set");
+    ThreadsAffinityType *mtaf = GetAffinityTypeForNameAndIface("management-cpu-set", NULL);
     if (mtaf == NULL) {
         SCLogError("Specify management-cpu-set list in the threading section");
         SCReturnInt(-EINVAL);
@@ -425,7 +430,13 @@ static int ConfigSetThreads(DPDKIfaceConfig *iconf, const char *entry_str)
     }
 
     if (strcmp(entry_str, "auto") == 0) {
-        uint16_t live_dev_count = (uint16_t)LiveGetDeviceCount();
+        if (wtaf_periface) {
+            iconf->threads = (uint16_t)sched_cpus;
+            SCLogConfig("%s: auto-assigned %u threads", iconf->iface, iconf->threads);
+            SCReturnInt(0);
+        }
+
+        uint16_t live_dev_count = (uint16_t)LiveGetDeviceCountWithoutAssignedThreading();
         if (live_dev_count == 0) {
             SCLogError("No live devices found, cannot auto-assign threads");
             SCReturnInt(-EINVAL);
@@ -1019,23 +1030,46 @@ static int ConfigLoad(DPDKIfaceConfig *iconf, const char *iface)
     SCReturnInt(0);
 }
 
-static int32_t ConfigValidateThreads(uint16_t iface_threads)
+static bool ConfigThreadsGenericIsValid(uint16_t iface_threads, ThreadsAffinityType *wtaf)
 {
     static uint32_t total_cpus = 0;
     total_cpus += iface_threads;
-    ThreadsAffinityType *wtaf = GetAffinityTypeFromName("worker-cpu-set");
     if (wtaf == NULL) {
         SCLogError("Specify worker-cpu-set list in the threading section");
-        return -1;
+        return false;
     }
     if (total_cpus > UtilAffinityGetAffinedCPUNum(wtaf)) {
-        SCLogError("Interfaces requested more cores than configured in the threading section "
-                   "(requested %d configured %d",
+        SCLogError("Interfaces requested more cores than configured in the worker-cpu-set "
+                   "threading section (requested %d, configured %d)",
                    total_cpus, UtilAffinityGetAffinedCPUNum(wtaf));
-        return -1;
+        return false;
     }
 
-    return 0;
+    return true;
+}
+
+static bool ConfigThreadsInterfaceIsValid(uint16_t iface_threads, ThreadsAffinityType *itaf)
+{
+    if (iface_threads > UtilAffinityGetAffinedCPUNum(itaf)) {
+        SCLogError("Interface requested more cores than configured in the interface-specific "
+                   "threading section (requested %d, configured %d)",
+                   iface_threads, UtilAffinityGetAffinedCPUNum(itaf));
+        return false;
+    }
+
+    return true;
+}
+
+static bool ConfigIsThreadingValid(uint16_t iface_threads, const char *iface)
+{
+    ThreadsAffinityType *itaf = GetAffinityTypeForNameAndIface("worker-cpu-set", iface);
+    ThreadsAffinityType *wtaf = GetAffinityTypeForNameAndIface("worker-cpu-set", NULL);
+    if (itaf && !ConfigThreadsInterfaceIsValid(iface_threads, itaf)) {
+        return false;
+    } else if (itaf == NULL && !ConfigThreadsGenericIsValid(iface_threads, wtaf)) {
+        return false;
+    }
+    return true;
 }
 
 static DPDKIfaceConfig *ConfigParse(const char *iface)
@@ -1048,7 +1082,7 @@ static DPDKIfaceConfig *ConfigParse(const char *iface)
 
     ConfigInit(&iconf);
     retval = ConfigLoad(iconf, iface);
-    if (retval < 0 || ConfigValidateThreads(iconf->threads) != 0) {
+    if (retval < 0 || !ConfigIsThreadingValid(iconf->threads, iface)) {
         iconf->DerefFunc(iconf);
         SCReturnPtr(NULL, "void *");
     }
diff --git a/src/suricata.c b/src/suricata.c
index 4a87c4e57c..742ed6d000 100644
--- a/src/suricata.c
+++ b/src/suricata.c
@@ -112,6 +112,7 @@
 #include "tmqh-packetpool.h"
 #include "tm-queuehandlers.h"
 
+#include "util-affinity.h"
 #include "util-byte.h"
 #include "util-conf.h"
 #include "util-coredump-config.h"
@@ -2319,6 +2320,9 @@ void PostRunDeinit(const int runmode, struct timeval *start_time)
     StreamTcpFreeConfig(STREAM_VERBOSE);
     DefragDestroy();
     HttpRangeContainersDestroy();
+#ifdef HAVE_HWLOC
+    TopologyDestroy();
+#endif /* HAVE_HWLOC */
 
     TmqResetQueues();
 #ifdef PROFILING
diff --git a/src/threadvars.h b/src/threadvars.h
index d645d0645e..e94f6244ec 100644
--- a/src/threadvars.h
+++ b/src/threadvars.h
@@ -136,6 +136,9 @@ typedef struct ThreadVars_ {
     struct FlowQueue_ *flow_queue;
     bool break_loop;
 
+    /** Interface-specific thread affinity */
+    char *iface_name;
+
     Storage storage[];
 } ThreadVars;
diff --git a/src/tm-threads.c b/src/tm-threads.c
index c3998a5114..fa05e72753 100644
--- a/src/tm-threads.c
+++ b/src/tm-threads.c
@@ -874,8 +874,24 @@ TmEcode TmThreadSetupOptions(ThreadVars *tv)
         TmThreadSetPrio(tv);
     if (tv->thread_setup_flags & THREAD_SET_AFFTYPE) {
         ThreadsAffinityType *taf = &thread_affinity[tv->cpu_affinity];
+        ThreadsAffinityType *iface_taf = FindAffinityByInterface(taf, tv->iface_name);
+        bool use_iface_affinity =
+                iface_taf != NULL &&
+                ((RunmodeIsAutofp() && tv->cpu_affinity == RECEIVE_CPU_SET) ||
+                        (RunmodeIsWorkers() && tv->cpu_affinity == WORKER_CPU_SET));
+
+        if (use_iface_affinity) {
+            taf = iface_taf;
+        }
+
+        if (UtilAffinityGetAffinedCPUNum(taf) == 0 && !taf->nocpu_warned) {
+            SCLogWarning("No CPU affinity set for %s", AffinityGetYamlPath(taf));
+            taf->nocpu_warned = true;
+        }
+
         if (taf->mode_flag == EXCLUSIVE_AFFINITY) {
-            uint16_t cpu = AffinityGetNextCPU(taf);
+            uint16_t cpu = AffinityGetNextCPU(tv, taf);
             SetCPUAffinity(cpu);
             /* If CPU is in a set overwrite the default thread prio */
             if (CPU_ISSET(cpu, &taf->lowprio_cpu)) {
@@ -1621,6 +1637,10 @@ static void TmThreadFree(ThreadVars *tv)
         SCFree(tv->printable_name);
     }
 
+    if (tv->iface_name) {
+        SCFree(tv->iface_name);
+    }
+
     if (tv->stream_pq_local) {
         BUG_ON(tv->stream_pq_local->len);
         SCMutexDestroy(&tv->stream_pq_local->mutex_q);
diff --git a/src/util-affinity.c b/src/util-affinity.c
index 0665374cdc..52edbffa2c 100644
--- a/src/util-affinity.c
+++ b/src/util-affinity.c
@@ -31,50 +31,177 @@
 #include "util-cpu.h"
 #include "util-byte.h"
 #include "util-debug.h"
+#include "util-dpdk.h"
 
 ThreadsAffinityType thread_affinity[MAX_CPU_SET] = {
     {
         .name = "receive-cpu-set",
         .mode_flag = EXCLUSIVE_AFFINITY,
         .prio = PRIO_MEDIUM,
-        .lcpu = 0,
+        .lcpu = { 0 },
     },
     {
         .name = "worker-cpu-set",
         .mode_flag = EXCLUSIVE_AFFINITY,
         .prio = PRIO_MEDIUM,
-        .lcpu = 0,
+        .lcpu = { 0 },
     },
     {
         .name = "verdict-cpu-set",
         .mode_flag = BALANCED_AFFINITY,
         .prio = PRIO_MEDIUM,
-        .lcpu = 0,
+        .lcpu = { 0 },
     },
     {
         .name = "management-cpu-set",
         .mode_flag = BALANCED_AFFINITY,
         .prio = PRIO_MEDIUM,
-        .lcpu = 0,
+        .lcpu = { 0 },
     },
 };
 
 int thread_affinity_init_done = 0;
 
+#if !defined __CYGWIN__ && !defined OS_WIN32 && !defined __OpenBSD__ && !defined sun
+#ifdef HAVE_HWLOC
+static hwloc_topology_t topology = NULL;
+#endif /* HAVE_HWLOC */
+#endif /* OS_WIN32 and __OpenBSD__ */
+
+static ThreadsAffinityType *AllocAndInitAffinityType(
+        const char *name, const char *interface_name, ThreadsAffinityType *parent)
+{
+    ThreadsAffinityType *new_affinity = SCCalloc(1, sizeof(ThreadsAffinityType));
+    if (new_affinity == NULL) {
+        FatalError("Unable to allocate memory for new CPU affinity type");
+    }
+
+    new_affinity->name = SCStrdup(interface_name);
+    if (new_affinity->name == NULL) {
+        FatalError("Unable to allocate memory for new CPU affinity type name");
+    }
+    new_affinity->parent = parent;
+    new_affinity->mode_flag = EXCLUSIVE_AFFINITY;
+    new_affinity->prio = PRIO_MEDIUM;
+    for (int i = 0; i < MAX_NUMA_NODES; i++) {
+        new_affinity->lcpu[i] = 0;
+    }
+
+    if (parent != NULL) {
+        if (parent->nb_children == parent->nb_children_capacity) {
+            if (parent->nb_children_capacity == 0) {
+                parent->nb_children_capacity = 2;
+            } else {
+                parent->nb_children_capacity *= 2;
+            }
+            void *p = SCRealloc(
+                    parent->children, parent->nb_children_capacity * sizeof(ThreadsAffinityType *));
+            if (p == NULL) {
+                FatalError("Unable to reallocate memory for children CPU affinity types");
+            }
+            parent->children = p;
+        }
+        parent->children[parent->nb_children++] = new_affinity;
+    }
+
+    return new_affinity;
+}
+
+ThreadsAffinityType *FindAffinityByInterface(
+        ThreadsAffinityType *parent, const char *interface_name)
+{
+    if (parent == NULL || interface_name == NULL || parent->nb_children == 0 ||
+            parent->children == NULL) {
+        return NULL;
+    }
+
+    for (uint32_t i = 0; i < parent->nb_children; i++) {
+        if (parent->children[i] && parent->children[i]->name &&
+                strcmp(parent->children[i]->name, interface_name) == 0) {
+            return parent->children[i];
+        }
+    }
+    return NULL;
+}
+
 /**
- * \brief find affinity by its name
+ * \brief Find affinity by name (*-cpu-set name) and an interface name.
+ * \param name the name of the affinity (e.g. worker-cpu-set, receive-cpu-set).
+ *      The name is required and cannot be NULL.
+ * \param interface_name the name of the interface.
+ *      If NULL, the affinity is looked up by name only.
+ * \retval a pointer to the affinity or NULL if not found
+ */
+ThreadsAffinityType *GetAffinityTypeForNameAndIface(const char *name, const char *interface_name)
+{
+    if (name == NULL || *name == '\0') {
+        return NULL;
+    }
+
+    ThreadsAffinityType *parent_affinity = NULL;
+    for (int i = 0; i < MAX_CPU_SET; i++) {
+        if (thread_affinity[i].name != NULL && strcmp(thread_affinity[i].name, name) == 0) {
+            parent_affinity = &thread_affinity[i];
+            break;
+        }
+    }
+
+    if (parent_affinity == NULL) {
+        SCLogError("CPU affinity with name \"%s\" not found", name);
+        return NULL;
+    }
+
+    if (interface_name != NULL) {
+        ThreadsAffinityType *child_affinity =
+                FindAffinityByInterface(parent_affinity, interface_name);
+        // return the child affinity, or NULL if the interface is not configured
+        return child_affinity;
+    }
+
+    return parent_affinity;
+}
+
+/**
+ * \brief Finds affinity by its name and interface name.
+ *      Interfaces are children of cpu-set names. If the queried interface is
+ *      not found, it is allocated, initialized and assigned to the queried
+ *      cpu-set.
+ * \param name the name of the affinity (e.g. worker-cpu-set, receive-cpu-set).
+ *      The name is required and cannot be NULL.
+ * \param interface_name the name of the interface.
+ *      If NULL, the affinity is looked up by name only.
 * \retval a pointer to the affinity or NULL if not found
 */
-ThreadsAffinityType * GetAffinityTypeFromName(const char *name)
+ThreadsAffinityType *GetOrAllocAffinityTypeForIfaceOfName(
+        const char *name, const char *interface_name)
 {
     int i;
+    ThreadsAffinityType *parent_affinity = NULL;
+
     for (i = 0; i < MAX_CPU_SET; i++) {
-        if (!strcmp(thread_affinity[i].name, name)) {
-            return &thread_affinity[i];
+        if (strcmp(thread_affinity[i].name, name) == 0) {
+            parent_affinity = &thread_affinity[i];
+            break;
         }
     }
-    return NULL;
+
+    if (parent_affinity == NULL) {
+        SCLogError("CPU affinity with name \"%s\" not found", name);
+        return NULL;
+    }
+
+    if (interface_name != NULL) {
+        ThreadsAffinityType *child_affinity =
+                FindAffinityByInterface(parent_affinity, interface_name);
+        if (child_affinity != NULL) {
+            return child_affinity;
+        }
+
+        // If not found, allocate and initialize a new child affinity
+        return AllocAndInitAffinityType(name, interface_name, parent_affinity);
+    }
+
+    return parent_affinity;
 }
 
 #if !defined __CYGWIN__ && !defined OS_WIN32 && !defined __OpenBSD__ && !defined sun
@@ -284,38 +411,132 @@ static int SetupAffinityThreads(ThreadsAffinityType *taf, SCConfNode *affinity)
     return 0;
 }
 
-static bool AllCPUsUsed(ThreadsAffinityType *taf)
+/**
+ * \brief Get the YAML path for the given affinity type.
+ *      The path is built using the parent name (if available) and the
+ *      affinity name. Do not free the returned string.
+ * \param taf the affinity type - if NULL, the path is built for the root node
+ * \return a string containing the YAML path, or NULL if the path is too long
+ */
+char *AffinityGetYamlPath(ThreadsAffinityType *taf)
 {
-    if (taf->lcpu < UtilCpuGetNumProcessorsOnline()) {
-        return false;
+    static char rootpath[] = "threading.cpu-affinity";
+    static char path[1024] = { 0 };
+    char subpath[256] = { 0 };
+
+    if (taf == NULL) {
+        return rootpath;
+    }
+
+    if (taf->parent != NULL) {
+        long r = snprintf(
+                subpath, sizeof(subpath), "%s.interface-specific-cpu-set.", taf->parent->name);
+        if (r < 0 || r >= (long)sizeof(subpath)) {
+            SCLogError("Unable to build YAML path for CPU affinity %s.%s", taf->parent->name,
+                    taf->name);
+            return NULL;
+        }
+    } else {
+        subpath[0] = '\0';
+    }
+
+    long r = snprintf(path, sizeof(path), "%s.%s%s", rootpath, subpath, taf->name);
+    if (r < 0 || r >= (long)sizeof(path)) {
+        SCLogError("Unable to build YAML path for CPU affinity %s", taf->name);
+        return NULL;
     }
-    return true;
+
+    return path;
 }
 
 static void ResetCPUs(ThreadsAffinityType *taf)
 {
-    taf->lcpu = 0;
+    for (int i = 0; i < MAX_NUMA_NODES; i++) {
+        taf->lcpu[i] = 0;
+    }
+}
+
+/**
+ * \brief Check if the set name corresponds to a worker CPU set.
+ */
+static bool IsWorkerCpuSet(const char *setname)
+{
+    return (strcmp(setname, "worker-cpu-set") == 0);
 }
 
-static uint16_t GetNextAvailableCPU(ThreadsAffinityType *taf)
+/**
+ * \brief Check if the set name corresponds to a receive CPU set.
+ */
+static bool IsReceiveCpuSet(const char *setname)
 {
-    uint16_t cpu = taf->lcpu;
-    int attempts = 0;
+    return (strcmp(setname, "receive-cpu-set") == 0);
+}
 
-    while (!CPU_ISSET(cpu, &taf->cpu_set) && attempts < 2) {
-        cpu = (cpu + 1) % UtilCpuGetNumProcessorsOnline();
-        if (cpu == 0)
-            attempts++;
+/**
+ * \brief Set up affinity configuration for a single interface.
+ * \retval 0 on success, -1 on error
+ */
+static int SetupSingleIfaceAffinity(ThreadsAffinityType *taf, SCConfNode *iface_node)
+{
+    SCConfNode *child_node;
+    const char *interface_name = NULL;
+    TAILQ_FOREACH (child_node, &iface_node->head, next) {
+        if (strcmp(child_node->name, "interface") == 0) {
+            interface_name = child_node->val;
+            break;
+        }
+    }
+    if (interface_name == NULL) {
+        return 0;
     }
-    taf->lcpu = cpu + 1;
 
+    ThreadsAffinityType *iface_taf =
+            GetOrAllocAffinityTypeForIfaceOfName(taf->name, interface_name);
+    if (iface_taf == NULL) {
+        SCLogError("Failed to allocate CPU affinity type for interface: %s", interface_name);
+        return -1;
+    }
 
-    if (attempts == 2) {
-        SCLogError(
-                "cpu_set does not contain available CPUs, CPU affinity configuration is invalid");
+    SetupCpuSets(iface_taf, iface_node, interface_name);
+    if (SetupAffinityPriority(iface_taf, iface_node, interface_name) < 0) {
+        return -1;
+    }
+    if (SetupAffinityMode(iface_taf, iface_node) < 0) {
+        return -1;
     }
+    if (SetupAffinityThreads(iface_taf, iface_node) < 0) {
+        return -1;
+    }
+    return 0;
+}
 
-    return cpu;
+/**
+ * \brief Set up per-interface affinity configurations.
+ * \retval 0 on success, -1 on error
+ */
+static int SetupPerIfaceAffinity(ThreadsAffinityType *taf, SCConfNode *affinity)
+{
+    char if_af[] = "interface-specific-cpu-set";
+    SCConfNode *per_iface_node = SCConfNodeLookupChild(affinity, if_af);
+    if (per_iface_node == NULL) {
+        return 0;
+    }
+
+    SCConfNode *iface_node;
+    TAILQ_FOREACH (iface_node, &per_iface_node->head, next) {
+        if (strcmp(iface_node->val, "interface") == 0) {
+            if (SetupSingleIfaceAffinity(taf, iface_node) < 0) {
+                return -1;
+            }
+        } else {
+            SCLogWarning("Unknown node in %s: %s", if_af, iface_node->name);
+        }
+    }
+    return 0;
 }
 
 /**
@@ -333,7 +554,7 @@ static bool AffinityConfigIsLegacy(void)
         return is_using_legacy_affinity_format;
     }
 
-    SCConfNode *root = SCConfGetNode("threading.cpu-affinity");
+    SCConfNode *root = SCConfGetNode(AffinityGetYamlPath(NULL));
     if (root == NULL) {
         return is_using_legacy_affinity_format;
     }
@@ -365,10 +586,10 @@ void AffinitySetupLoadFromConfig(void)
         thread_affinity_init_done = 1;
     }
 
-    SCLogDebug("Loading threading.cpu-affinity from config");
-    SCConfNode *root = SCConfGetNode("threading.cpu-affinity");
+    SCLogDebug("Loading %s from config", AffinityGetYamlPath(NULL));
+    SCConfNode *root = SCConfGetNode(AffinityGetYamlPath(NULL));
     if (root == NULL) {
-        SCLogInfo("Cannot find threading.cpu-affinity node in config");
+        SCLogInfo("Cannot find %s node in config", AffinityGetYamlPath(NULL));
         return;
     }
 
@@ -380,7 +601,7 @@ void AffinitySetupLoadFromConfig(void)
             continue;
         }
 
-        ThreadsAffinityType *taf = GetAffinityTypeFromName(setname);
+        ThreadsAffinityType *taf = GetOrAllocAffinityTypeForIfaceOfName(setname, NULL);
         if (taf == NULL) {
             SCLogError("Failed to allocate CPU affinity type: %s", setname);
             continue;
         }
@@ -402,25 +623,402 @@ void AffinitySetupLoadFromConfig(void)
             SCLogError("Failed to setup threads for CPU affinity type: %s", setname);
             continue;
         }
+
+        if (!AffinityConfigIsLegacy() && (IsWorkerCpuSet(setname) || IsReceiveCpuSet(setname))) {
+            if (SetupPerIfaceAffinity(taf, affinity) < 0) {
+                SCLogError("Failed to setup per-interface affinity for CPU affinity type: %s",
+                        setname);
+                continue;
+            }
+        }
     }
 #endif /* OS_WIN32 and __OpenBSD__ */
 }
 
+#if !defined __CYGWIN__ && !defined OS_WIN32 && !defined __OpenBSD__ && !defined sun
+#ifdef HAVE_HWLOC
+static int HwLocDeviceNumaGet(hwloc_topology_t topo, hwloc_obj_t obj)
+{
+#if HWLOC_VERSION_MAJOR > 2 || (HWLOC_VERSION_MAJOR == 2 && HWLOC_VERSION_MINOR >= 5)
+    hwloc_obj_t nodes[MAX_NUMA_NODES];
+    unsigned num_nodes = MAX_NUMA_NODES;
+    struct hwloc_location location;
+
+    location.type = HWLOC_LOCATION_TYPE_OBJECT;
+    location.location.object = obj;
+
+    int result = hwloc_get_local_numanode_objs(topo, &location, &num_nodes, nodes, 0);
+    if (result == 0 && num_nodes > 0 && num_nodes <= MAX_NUMA_NODES) {
+        return nodes[0]->logical_index;
+    }
+    return -1;
+#else /* hwloc < 2.5 */
+    hwloc_obj_t non_io_ancestor = hwloc_get_non_io_ancestor_obj(topo, obj);
+    if (non_io_ancestor == NULL) {
+        return -1;
+    }
+
+    // Iterate over NUMA nodes and check their nodeset
+    hwloc_obj_t numa_node = NULL;
+    while ((numa_node = hwloc_get_next_obj_by_type(topo, HWLOC_OBJ_NUMANODE, numa_node)) != NULL) {
+        if (hwloc_bitmap_isset(non_io_ancestor->nodeset, numa_node->os_index)) {
+            return numa_node->logical_index;
+        }
+    }
+
+    return -1;
+#endif /* hwloc >= 2.5 */
+}
+
+static hwloc_obj_t HwLocDeviceGetByKernelName(hwloc_topology_t topo, const char *interface_name)
+{
+    hwloc_obj_t obj = NULL;
+
+    while ((obj = hwloc_get_next_osdev(topo, obj)) != NULL) {
+        if (obj->attr->osdev.type == HWLOC_OBJ_OSDEV_NETWORK &&
+                strcmp(obj->name, interface_name) == 0) {
+            hwloc_obj_t parent = obj->parent;
+            while (parent) {
+                if (parent->type == HWLOC_OBJ_PCI_DEVICE) {
+                    return parent;
+                }
+                parent = parent->parent;
+            }
+        }
+    }
+    return NULL;
+}
+
 /**
- * \brief Return next cpu to use for a given thread family
- * \retval the cpu to used given by its id
+ * \brief Parse a PCIe address string into its individual components
+ * \param[in] pcie_address PCIe address string
+ * \param[out] domain Domain component
+ * \param[out] bus Bus component
+ * \param[out] device Device component
+ * \param[out] function Function component
+ * \retval 0 on success, -1 on error
  */
-uint16_t AffinityGetNextCPU(ThreadsAffinityType *taf)
+static int PcieAddressToComponents(const char *pcie_address, unsigned int *domain,
+        unsigned int *bus, unsigned int *device, unsigned int *function)
 {
-    uint16_t ncpu = 0;
-#if !defined __CYGWIN__ && !defined OS_WIN32 && !defined __OpenBSD__ && !defined sun
+    // Handle both full and short PCIe address formats
+    if (sscanf(pcie_address, "%x:%x:%x.%x", domain, bus, device, function) != 4) {
+        if (sscanf(pcie_address, "%x:%x.%x", bus, device, function) != 3) {
+            return -1;
+        }
+        *domain = 0; // Default domain to 0 if not provided
+    }
+    return 0;
+}
+
+// Convert a PCIe address to a hwloc object
+static hwloc_obj_t HwLocDeviceGetByPcie(hwloc_topology_t topo, const char *pcie_address)
+{
+    hwloc_obj_t obj = NULL;
+    unsigned int domain, bus, device, function;
+    int r = PcieAddressToComponents(pcie_address, &domain, &bus, &device, &function);
+    if (r == 0) {
+        while ((obj = hwloc_get_next_pcidev(topo, obj)) != NULL) {
+            if (obj->attr->pcidev.domain == domain && obj->attr->pcidev.bus == bus &&
+                    obj->attr->pcidev.dev == device && obj->attr->pcidev.func == function) {
+                return obj;
+            }
+        }
+    }
+    return NULL;
+}
+
+static void HwlocObjectDump(hwloc_obj_t obj, const char *iface_name)
+{
+    if (!obj) {
+        SCLogDebug("No object found for the given PCIe address");
+        return;
+    }
+
+    static char pcie_address[32];
+    snprintf(pcie_address, sizeof(pcie_address), "%04x:%02x:%02x.%x", obj->attr->pcidev.domain,
+            obj->attr->pcidev.bus, obj->attr->pcidev.dev, obj->attr->pcidev.func);
+    SCLogDebug("Interface (%s / %s) has NUMA ID %d", iface_name, pcie_address,
+            HwLocDeviceNumaGet(topology, obj));
+
+    SCLogDebug("Object type: %s", hwloc_obj_type_string(obj->type));
+    SCLogDebug("Logical index: %u", obj->logical_index);
+    SCLogDebug("Depth: %u", obj->depth);
+    SCLogDebug("Attributes:");
+    if (obj->type == HWLOC_OBJ_PCI_DEVICE) {
+        SCLogDebug("  Domain: %04x", obj->attr->pcidev.domain);
+        SCLogDebug("  Bus: %02x", obj->attr->pcidev.bus);
+        SCLogDebug("  Device: %02x", obj->attr->pcidev.dev);
+        SCLogDebug("  Function: %01x", obj->attr->pcidev.func);
+        SCLogDebug("  Class ID: %04x", obj->attr->pcidev.class_id);
+        SCLogDebug("  Vendor ID: %04x", obj->attr->pcidev.vendor_id);
+        SCLogDebug("  Device ID: %04x", obj->attr->pcidev.device_id);
+        SCLogDebug("  Subvendor ID: %04x", obj->attr->pcidev.subvendor_id);
+        SCLogDebug("  Subdevice ID: %04x", obj->attr->pcidev.subdevice_id);
+        SCLogDebug("  Revision: %02x", obj->attr->pcidev.revision);
+        SCLogDebug("  Link speed: %f GB/s", obj->attr->pcidev.linkspeed);
+    } else {
+        SCLogDebug("  No PCI device attributes available");
+    }
+}
+
+static bool TopologyShouldAutopin(ThreadVars *tv, ThreadsAffinityType *taf)
+{
+    bool cond;
     SCMutexLock(&taf->taf_mutex);
-    ncpu = GetNextAvailableCPU(taf);
+    cond = tv->type == TVT_PPT && tv->iface_name &&
+           (strcmp(tv->iface_name, taf->name) == 0 ||
+                   (strcmp("worker-cpu-set", taf->name) == 0 && RunmodeIsWorkers()) ||
+                   (strcmp("receive-cpu-set", taf->name) == 0 && RunmodeIsAutofp()));
+    SCMutexUnlock(&taf->taf_mutex);
+    return cond;
+}
+
+/**
+ * \brief Initialize the hardware topology.
+ * \retval 0 on success, -1 on error
+ */
+static int TopologyInitialize(void)
+{
+    if (topology == NULL) {
+        if (hwloc_topology_init(&topology) == -1) {
+            SCLogError("Failed to initialize topology");
+            return -1;
+        }
+
+        if (hwloc_topology_set_flags(topology, HWLOC_TOPOLOGY_FLAG_WHOLE_SYSTEM) == -1 ||
+                hwloc_topology_set_io_types_filter(topology, HWLOC_TYPE_FILTER_KEEP_ALL) == -1 ||
+                hwloc_topology_load(topology) == -1) {
+            SCLogError("Failed to set/load topology");
+            hwloc_topology_destroy(topology);
+            topology = NULL;
+            return -1;
+        }
+    }
+    return 0;
+}
+
+void TopologyDestroy(void)
+{
+    if (topology != NULL) {
+        hwloc_topology_destroy(topology);
+        topology = NULL;
+    }
+}
+
+static int InterfaceGetNumaNode(ThreadVars *tv)
+{
+    hwloc_obj_t if_obj = HwLocDeviceGetByKernelName(topology, tv->iface_name);
+    if (if_obj == NULL) {
+        if_obj = HwLocDeviceGetByPcie(topology, tv->iface_name);
+    }
+
+    if (if_obj != NULL && SCLogGetLogLevel() == SC_LOG_DEBUG) {
+        HwlocObjectDump(if_obj, tv->iface_name);
+    }
+
+    int32_t numa_id = -1;
+    if (if_obj != NULL) {
+        numa_id = HwLocDeviceNumaGet(topology, if_obj);
+    }
+    if (numa_id < 0 && SCRunmodeGet() == RUNMODE_DPDK) {
+        // DPDK fallback for e.g. net_bonding (vdev) PMDs
+        int32_t r = DPDKDeviceNameSetSocketID(tv->iface_name, &numa_id);
+        if (r < 0) {
+            numa_id = -1;
+        }
+    }
+
+    if (numa_id < 0) {
+        SCLogDebug("Unable to find NUMA node for interface %s", tv->iface_name);
+    }
+
+    return numa_id;
+}
+#endif /* HAVE_HWLOC */
+
+static bool CPUIsFromNuma(uint16_t ncpu, uint16_t numa)
+{
+#ifdef HAVE_HWLOC
+    int core_id = ncpu;
+    int depth = hwloc_get_type_depth(topology, HWLOC_OBJ_NUMANODE);
+    hwloc_obj_t numa_node = NULL;
+    bool found = false;
+    uint16_t found_numa = 0;
+
+    // Invalid depth or no NUMA nodes available
+    if (depth == HWLOC_TYPE_DEPTH_UNKNOWN) {
+        return false;
+    }
+
+    while ((numa_node = hwloc_get_next_obj_by_depth(topology, depth, numa_node)) != NULL) {
+        hwloc_cpuset_t cpuset = hwloc_bitmap_alloc();
+        if (cpuset == NULL) {
+            SCLogDebug("Failed to allocate cpuset");
+            continue;
+        }
+        hwloc_bitmap_copy(cpuset, numa_node->cpuset);
+
+        if (hwloc_bitmap_isset(cpuset, core_id)) {
+            SCLogDebug("Core %d - NUMA %d", core_id, numa_node->logical_index);
+            found = true;
+            found_numa = numa_node->logical_index;
+            hwloc_bitmap_free(cpuset);
+            break;
+        }
+        hwloc_bitmap_free(cpuset);
+    }
+
+    // Check whether the CPU was found and matches the requested NUMA node
+    if (found && numa == found_numa) {
+        return true;
+    }
+
+    // CPU was not found in any NUMA node or did not match the requested one
+#endif /* HAVE_HWLOC */
+
+    return false;
+}
+
+static int16_t FindCPUInNumaNode(int numa_node, ThreadsAffinityType *taf)
+{
+    if (numa_node < 0) {
+        return -1;
+    }
+
+    if (taf->lcpu[numa_node] >= UtilCpuGetNumProcessorsOnline()) {
+        return -1;
+    }
+
+    uint16_t cpu = taf->lcpu[numa_node];
+    while (cpu < UtilCpuGetNumProcessorsOnline() &&
+            (!CPU_ISSET(cpu, &taf->cpu_set) || !CPUIsFromNuma(cpu, (uint16_t)numa_node))) {
+        cpu++;
+    }
+
+    bool found = cpu < UtilCpuGetNumProcessorsOnline() && CPU_ISSET(cpu, &taf->cpu_set) &&
+                 CPUIsFromNuma(cpu, (uint16_t)numa_node);
+    taf->lcpu[numa_node] = found ? (uint16_t)(cpu + 1) : UtilCpuGetNumProcessorsOnline();
+    return found ? (int16_t)cpu : -1;
+}
+
+static int16_t CPUSelectFromNuma(int iface_numa, ThreadsAffinityType *taf)
+{
+    if (iface_numa != -1) {
+        return FindCPUInNumaNode(iface_numa, taf);
+    }
+    return -1;
+}
+
+static int16_t CPUSelectAlternative(int iface_numa, ThreadsAffinityType *taf)
+{
+    for (int nid = 0; nid < MAX_NUMA_NODES; nid++) {
+        if (iface_numa == nid) {
+            continue;
+        }
+
+        int16_t cpu = FindCPUInNumaNode(nid, taf);
+        if (cpu != -1) {
+            SCLogPerf("CPU %d from NUMA %d assigned to a network interface located on NUMA %d",
+                    cpu, nid, iface_numa);
+            return cpu;
+        }
+    }
+    return -1;
+}
+
+/**
+ * \brief Select the next available CPU for the given affinity type.
+ *      taf->cpu_set is a bit array where each bit represents a CPU core.
+ *      The function iterates over the bit array and returns the first
+ *      available CPU. If the last used CPU core index is higher than the
+ *      indexes of the available cores, the end of the array is reached and
+ *      the CPU selection is reset. On the second reset attempt, the function
+ *      bails out and returns a default value. The second attempt should only
+ *      happen with an empty CPU set.
+ */
+static uint16_t CPUSelectDefault(ThreadsAffinityType *taf)
+{
+    uint16_t cpu = taf->lcpu[0];
+    int attempts = 0;
+    while (!CPU_ISSET(cpu, &taf->cpu_set) && attempts < 2) {
+        cpu = (cpu + 1) % UtilCpuGetNumProcessorsOnline();
+        if (cpu == 0) {
+            attempts++;
+        }
+    }
+
+    taf->lcpu[0] = cpu + 1;
+    return cpu;
+}
+
+static uint16_t CPUSelectFromNumaOrDefault(int iface_numa, ThreadsAffinityType *taf)
+{
+    uint16_t attempts = 0;
+    int16_t cpu = -1;
+    while (attempts < 2) {
+        cpu = CPUSelectFromNuma(iface_numa, taf);
+        if (cpu == -1) {
+            cpu = CPUSelectAlternative(iface_numa, taf);
+            if (cpu == -1) {
+                // All CPUs from all NUMA nodes are used at this point
+                ResetCPUs(taf);
+                attempts++;
+            }
+        }
+
+        if (cpu >= 0) {
+            return (uint16_t)cpu;
+        }
+    }
+    return CPUSelectDefault(taf);
+}
+
+static uint16_t GetNextAvailableCPU(int iface_numa, ThreadsAffinityType *taf)
+{
+    if (iface_numa < 0) {
+        return CPUSelectDefault(taf);
+    }
+
+    return CPUSelectFromNumaOrDefault(iface_numa, taf);
+}
+
+static bool AutopinEnabled(void)
+{
+    int autopin = 0;
+    if (SCConfGetBool("threading.autopin", &autopin) != 1) {
+        return false;
+    }
+    return (bool)autopin;
+}
+
+#endif /* OS_WIN32 and __OpenBSD__ */
+
+uint16_t AffinityGetNextCPU(ThreadVars *tv, ThreadsAffinityType *taf)
+{
+    uint16_t ncpu = 0;
+#if !defined __CYGWIN__ && !defined OS_WIN32 && !defined __OpenBSD__ && !defined sun
+    int iface_numa = -1;
+    if (AutopinEnabled()) {
+#ifdef HAVE_HWLOC
+        if (TopologyShouldAutopin(tv, taf)) {
+            if (TopologyInitialize() < 0) {
+                SCLogError("Failed to initialize topology for CPU affinity");
+                return ncpu;
+            }
+            iface_numa = InterfaceGetNumaNode(tv);
+        }
+#else
+        static bool printed = false;
+        if (!printed) {
+            printed = true;
+            SCLogWarning(
+                    "threading.autopin option is enabled but hwloc support is not compiled in. "
+                    "Make sure to pass --enable-hwloc to configure when building Suricata.");
+        }
+#endif /* HAVE_HWLOC */
+    }
+
+    SCMutexLock(&taf->taf_mutex);
+    ncpu = GetNextAvailableCPU(iface_numa, taf);
     SCLogDebug("Setting affinity on CPU %d", ncpu);
     SCMutexUnlock(&taf->taf_mutex);
 #endif /* OS_WIN32 and __OpenBSD__ */
diff --git a/src/util-affinity.h b/src/util-affinity.h
index c4e0aab285..202f5ba8b0 100644
--- a/src/util-affinity.h
+++ b/src/util-affinity.h
@@ -26,6 +26,11 @@
 #include "suricata-common.h"
 #include "conf.h"
 #include "threads.h"
+#include "threadvars.h"
+
+#ifdef HAVE_HWLOC
+#include <hwloc.h>
+#endif /* HAVE_HWLOC */
 
 #if defined OS_FREEBSD
 #include
@@ -62,12 +67,12 @@ enum {
     MAX_AFFINITY
 };
 
+#define MAX_NUMA_NODES 16
+
 typedef struct ThreadsAffinityType_ {
     const char *name;
-    uint8_t mode_flag;
-    uint16_t lcpu; /* use by exclusive mode */
-    int prio;
-    uint32_t nb_threads;
+    struct ThreadsAffinityType_ **children;
+    struct ThreadsAffinityType_ *parent; // e.g. worker-cpu-set for interfaces
 
     SCMutex taf_mutex;
 
 #if !defined __CYGWIN__ && !defined OS_WIN32 && !defined __OpenBSD__ && !defined sun
@@ -76,6 +81,14 @@ typedef struct ThreadsAffinityType_ {
     cpu_set_t medprio_cpu;
     cpu_set_t hiprio_cpu;
 #endif
+    int prio;
+    uint32_t nb_threads;
+    uint32_t nb_children;
+    uint32_t nb_children_capacity;
+    uint16_t lcpu[MAX_NUMA_NODES]; /* used by exclusive mode */
+    uint8_t mode_flag;
+    // a flag to avoid repeated warnings when no CPU is set
+    bool nocpu_warned;
 } ThreadsAffinityType;
 
 /** store thread affinity mode for all type of threads */
@@ -83,10 +96,16 @@ typedef struct ThreadsAffinityType_ {
 extern ThreadsAffinityType thread_affinity[MAX_CPU_SET];
 #endif
 
+char *AffinityGetYamlPath(ThreadsAffinityType *taf);
 void AffinitySetupLoadFromConfig(void);
-ThreadsAffinityType * GetAffinityTypeFromName(const char *name);
+ThreadsAffinityType *GetOrAllocAffinityTypeForIfaceOfName(
+        const char *name, const char *interface_name);
+ThreadsAffinityType *GetAffinityTypeForNameAndIface(const char *name, const char *interface_name);
+ThreadsAffinityType *FindAffinityByInterface(
+        ThreadsAffinityType *parent, const char *interface_name);
 
-uint16_t AffinityGetNextCPU(ThreadsAffinityType *taf);
+void TopologyDestroy(void);
+uint16_t AffinityGetNextCPU(ThreadVars *tv, ThreadsAffinityType *taf);
 uint16_t UtilAffinityGetAffinedCPUNum(ThreadsAffinityType *taf);
 #ifdef HAVE_DPDK
 uint16_t UtilAffinityCpusOverlap(ThreadsAffinityType *taf1, ThreadsAffinityType *taf2);
diff --git a/src/util-device.c b/src/util-device.c
index 764d398410..0d5409c3a6 100644
--- a/src/util-device.c
+++ b/src/util-device.c
@@ -24,6 +24,7 @@
 #include "device-storage.h"
 #include "util-debug.h"
+#include "util-affinity.h"
 
 #define MAX_DEVNAME 10
 
@@ -178,6 +179,20 @@ int LiveGetDeviceCount(void)
     return i;
 }
 
+int LiveGetDeviceCountWithoutAssignedThreading(void)
+{
+    int i = 0;
+    LiveDevice *pd;
+
+    TAILQ_FOREACH (pd, &live_devices, next) {
+        if (GetAffinityTypeForNameAndIface("worker-cpu-set", pd->dev) == NULL) {
+            i++;
+        }
+    }
+
+    return i;
+}
+
 /**
  * \brief Get a pointer to the device name at idx
  *
diff --git a/src/util-device.h b/src/util-device.h
index ebddcdd00d..8c91c75b3d 100644
--- a/src/util-device.h
+++ b/src/util-device.h
@@ -52,6 +52,7 @@ void LiveDevAddBypassStats(LiveDevice *dev, uint64_t cnt, int family);
 void LiveDevSubBypassStats(LiveDevice *dev, uint64_t cnt, int family);
 void LiveDevAddBypassFail(LiveDevice *dev, uint64_t cnt, int family);
 void LiveDevAddBypassSuccess(LiveDevice *dev, uint64_t cnt, int family);
+int LiveGetDeviceCountWithoutAssignedThreading(void);
 int LiveGetDeviceCount(void);
 const char *LiveGetDeviceName(int number);
 LiveDevice *LiveGetDevice(const char *dev);
diff --git a/src/util-runmodes.c b/src/util-runmodes.c
index cac50203f3..53ff10a7fd 100644
--- a/src/util-runmodes.c
+++ b/src/util-runmodes.c
@@ -175,6 +175,11 @@ int RunModeSetLiveCaptureAutoFp(ConfigIfaceParserFunc ConfigParser,
             FatalError("TmThreadsCreate failed");
         }
         tv_receive->printable_name = printable_threadname;
+        tv_receive->iface_name = SCStrdup(dev);
+        if (tv_receive->iface_name == NULL) {
+            FatalError("Failed to allocate memory for iface name");
+        }
+
         TmModule *tm_module = TmModuleGetByName(recv_mod_name);
         if (tm_module == NULL) {
             FatalError("TmModuleGetByName failed for %s", recv_mod_name);
@@ -283,6 +288,14 @@ static int RunModeSetLiveCaptureWorkersForDevice(ConfigIfaceThreadsCountFunc Mod
             FatalError("TmThreadsCreate failed");
         }
         tv->printable_name = printable_threadname;
+        if (live_dev) {
+            tv->iface_name = SCStrdup(live_dev);
+            if (tv->iface_name == NULL) {
+                FatalError("Failed to allocate memory for iface name");
+            }
+        } else {
+            tv->iface_name = NULL;
+        }
 
         tm_module = TmModuleGetByName(recv_mod_name);
         if (tm_module == NULL) {
diff --git a/suricata.yaml.in b/suricata.yaml.in
index 138a094bd5..49ee5eb92b 100644
--- a/suricata.yaml.in
+++ b/suricata.yaml.in
@@ -1867,6 +1867,7 @@ spm-algo: auto
 # Suricata is multi-threaded. Here the threading can be influenced.
 threading:
   set-cpu-affinity: no
+  autopin: no
   # Tune cpu affinity of threads. Each family of threads can be bound
   # to specific CPUs.
   #
@@ -1883,6 +1884,13 @@ threading:
       cpu: [ 0 ]  # include only these CPUs in affinity settings
     receive-cpu-set:
       cpu: [ 0 ]  # include only these CPUs in affinity settings
+      # interface-specific-cpu-set:
+      #   - interface: "enp4s0f0"
+      #     cpu: [ 1,3,5,7,9 ]
+      #     mode: "exclusive"
+      #     prio:
+      #       high: [ "all" ]
+      #       default: "medium"
     worker-cpu-set:
       cpu: [ "all" ]
       mode: "exclusive"
@@ -1894,6 +1902,13 @@ threading:
         medium: [ "1-2" ]
         high: [ 3 ]
         default: "medium"
+      interface-specific-cpu-set:
+        - interface: "enp4s0f0" # 0000:3b:00.0 # net_bonding0 # ens1f0
+          cpu: [ 1,3,5,7,9 ]
+          mode: "exclusive"
+          prio:
+            high: [ "all" ]
+            default: "medium"
     #verdict-cpu-set:
     #  cpu: [ 0 ]
    #  prio: