s390/hiperdispatch: Introduce hiperdispatch

author Mete Durlu <meted@linux.ibm.com>

Mon, 12 Aug 2024 11:39:34 +0000 (13:39 +0200)

committer Vasily Gorbik <gor@linux.ibm.com>

Thu, 29 Aug 2024 20:56:35 +0000 (22:56 +0200)
author Mete Durlu <meted@linux.ibm.com>
Mon, 12 Aug 2024 11:39:34 +0000 (13:39 +0200)
committer Vasily Gorbik <gor@linux.ibm.com>
Thu, 29 Aug 2024 20:56:35 +0000 (22:56 +0200)
diff --git a/arch/s390/include/asm/hiperdispatch.h b/arch/s390/include/asm/hiperdispatch.h

new file mode 100644 (file)

index 0000000..27e23aa
--- /dev/null
+++ b/arch/s390/include/asm/hiperdispatch.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright IBM Corp. 2024
+ */
+
+#ifndef _ASM_HIPERDISPATCH_H
+#define _ASM_HIPERDISPATCH_H
+
+void hd_reset_state(void);
+void hd_add_core(int cpu);
+void hd_disable_hiperdispatch(void);
+int hd_enable_hiperdispatch(void);
+
+#endif /* _ASM_HIPERDISPATCH_H */
diff --git a/arch/s390/kernel/Makefile b/arch/s390/kernel/Makefile

index badeaa5ccd83e1c9417c3dcf4d113aafb42f01bd..5ceb08b338d32d8d533a2dd625bc4fb5aa2dafb1 100644 (file)
--- a/arch/s390/kernel/Makefile
+++ b/arch/s390/kernel/Makefile
@@ -51,7 +51,7 @@ obj-$(CONFIG_SYSFS)           += nospec-sysfs.o
  CFLAGS_REMOVE_nospec-branch.o  += $(CC_FLAGS_EXPOLINE)
  
  obj-$(CONFIG_MODULES)          += module.o
-obj-$(CONFIG_SCHED_TOPOLOGY)   += topology.o
+obj-$(CONFIG_SCHED_TOPOLOGY)   += topology.o hiperdispatch.o
  obj-$(CONFIG_NUMA)             += numa.o
  obj-$(CONFIG_AUDIT)            += audit.o
  compat-obj-$(CONFIG_AUDIT)     += compat_audit.o
diff --git a/arch/s390/kernel/hiperdispatch.c b/arch/s390/kernel/hiperdispatch.c

new file mode 100644 (file)

index 0000000..233872d
--- /dev/null
+++ b/arch/s390/kernel/hiperdispatch.c
@@ -0,0 +1,199 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright IBM Corp. 2024
+ */
+
+#define KMSG_COMPONENT "hd"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+/*
+ * Hiperdispatch:
+ * Dynamically calculates the optimum number of high capacity COREs
+ * by considering the state the system is in. When hiperdispatch decides
+ * that a capacity update is necessary, it schedules a topology update.
+ * During topology updates the CPU capacities are always re-adjusted.
+ *
+ * There are two places where CPU capacities are accessed within
+ * hiperdispatch:
+ * -> hiperdispatch's recurring work function reads CPU capacities to
+ *    determine the high capacity CPU count.
+ * -> during a topology update hiperdispatch's adjustment function
+ *    updates CPU capacities.
+ * These two can run on different CPUs in parallel which can cause
+ * hiperdispatch to make wrong decisions. This can potentially cause
+ * some overhead by leading to extra rebuild_sched_domains() calls
+ * for correction. Access to capacities within hiperdispatch has to be
+ * serialized to prevent the overhead.
+ *
+ * Hiperdispatch decision making revolves around steal time.
+ * HD_STEAL_THRESHOLD value is taken as reference. Whenever steal time
+ * crosses the threshold value hiperdispatch falls back to giving high
+ * capacities to entitled CPUs. When steal time drops below the
+ * threshold boundary, hiperdispatch utilizes all CPUs by giving all
+ * of them high capacity.
+ *
+ * The theory behind HD_STEAL_THRESHOLD is related to the SMP thread
+ * performance. Comparing the throughput of:
+ * - a single CORE, with N threads, running N tasks
+ * - N separate COREs running N tasks,
+ * using individual COREs for individual tasks yields better
+ * performance. This performance difference is roughly 30% (can change
+ * between machine generations).
+ *
+ * Hiperdispatch tries to hint the scheduler to use individual COREs
+ * for each task, as long as steal time on those COREs is less than
+ * 30%, thereby delaying the throughput loss caused by using SMP threads.
+ */
+
+#include <linux/cpumask.h>
+#include <linux/kernel_stat.h>
+#include <linux/ktime.h>
+#include <linux/workqueue.h>
+#include <asm/hiperdispatch.h>
+#include <asm/smp.h>
+#include <asm/topology.h>
+
+#define HD_DELAY_FACTOR                        (4)
+#define HD_DELAY_INTERVAL              (HZ / 4)
+#define HD_STEAL_THRESHOLD             30
+
+static cpumask_t hd_vl_coremask;       /* Mask containing all vertical low COREs */
+static cpumask_t hd_vmvl_cpumask;      /* Mask containing vertical medium and low CPUs */
+static int hd_high_capacity_cores;     /* Current CORE count with high capacity */
+static int hd_entitled_cores;          /* Total vertical high and medium CORE count */
+static int hd_online_cores;            /* Current online CORE count */
+
+static unsigned long hd_previous_steal;        /* Previous iteration's CPU steal timer total */
+
+static void hd_capacity_work_fn(struct work_struct *work);
+static DECLARE_DELAYED_WORK(hd_capacity_work, hd_capacity_work_fn);
+
+void hd_reset_state(void)
+{
+       cpumask_clear(&hd_vl_coremask);
+       cpumask_clear(&hd_vmvl_cpumask);
+       hd_entitled_cores = 0;
+       hd_online_cores = 0;
+}
+
+void hd_add_core(int cpu)
+{
+       const struct cpumask *siblings;
+       int polarization;
+
+       hd_online_cores++;
+       polarization = smp_cpu_get_polarization(cpu);
+       siblings = topology_sibling_cpumask(cpu);
+       switch (polarization) {
+       case POLARIZATION_VH:
+               hd_entitled_cores++;
+               break;
+       case POLARIZATION_VM:
+               hd_entitled_cores++;
+               cpumask_or(&hd_vmvl_cpumask, &hd_vmvl_cpumask, siblings);
+               break;
+       case POLARIZATION_VL:
+               cpumask_set_cpu(cpu, &hd_vl_coremask);
+               cpumask_or(&hd_vmvl_cpumask, &hd_vmvl_cpumask, siblings);
+               break;
+       }
+}
+
+static void hd_update_capacities(void)
+{
+       int cpu, upscaling_cores;
+       unsigned long capacity;
+
+       upscaling_cores = hd_high_capacity_cores - hd_entitled_cores;
+       capacity = upscaling_cores > 0 ? CPU_CAPACITY_HIGH : CPU_CAPACITY_LOW;
+       hd_high_capacity_cores = hd_entitled_cores;
+       for_each_cpu(cpu, &hd_vl_coremask) {
+               smp_set_core_capacity(cpu, capacity);
+               if (capacity != CPU_CAPACITY_HIGH)
+                       continue;
+               hd_high_capacity_cores++;
+               upscaling_cores--;
+               if (upscaling_cores == 0)
+                       capacity = CPU_CAPACITY_LOW;
+       }
+}
+
+void hd_disable_hiperdispatch(void)
+{
+       cancel_delayed_work_sync(&hd_capacity_work);
+       hd_high_capacity_cores = hd_online_cores;
+       hd_previous_steal = 0;
+}
+
+int hd_enable_hiperdispatch(void)
+{
+       if (hd_entitled_cores == 0)
+               return 0;
+       if (hd_online_cores <= hd_entitled_cores)
+               return 0;
+       mod_delayed_work(system_wq, &hd_capacity_work, HD_DELAY_INTERVAL * HD_DELAY_FACTOR);
+       hd_update_capacities();
+       return 1;
+}
+
+static unsigned long hd_calculate_steal_percentage(void)
+{
+       unsigned long time_delta, steal_delta, steal, percentage;
+       static ktime_t prev;
+       int cpus, cpu;
+       ktime_t now;
+
+       cpus = 0;
+       steal = 0;
+       percentage = 0;
+       for_each_cpu(cpu, &hd_vmvl_cpumask) {
+               steal += kcpustat_cpu(cpu).cpustat[CPUTIME_STEAL];
+               cpus++;
+       }
+       /*
+        * If there are no vertical medium or low CPUs, steal time
+        * is 0, as vertical high CPUs shouldn't experience steal time.
+        */
+       if (cpus == 0)
+               return percentage;
+       now = ktime_get();
+       time_delta = ktime_to_ns(ktime_sub(now, prev));
+       if (steal > hd_previous_steal && hd_previous_steal != 0) {
+               steal_delta = (steal - hd_previous_steal) * 100 / time_delta;
+               percentage = steal_delta / cpus;
+       }
+       hd_previous_steal = steal;
+       prev = now;
+       return percentage;
+}
+
+static void hd_capacity_work_fn(struct work_struct *work)
+{
+       unsigned long steal_percentage, new_cores;
+
+       mutex_lock(&smp_cpu_state_mutex);
+       /*
+        * If online cores are less than or equal to entitled cores,
+        * hiperdispatch does not need to make any adjustments; schedule a
+        * topology update to disable hiperdispatch.
+        * Normally this check is handled on topology update, but during cpu
+        * unhotplug, topology and cpu mask updates are done in reverse
+        * order, causing hd_enable_hiperdispatch() to get stale data.
+        */
+       if (hd_online_cores <= hd_entitled_cores) {
+               topology_schedule_update();
+               mutex_unlock(&smp_cpu_state_mutex);
+               return;
+       }
+       steal_percentage = hd_calculate_steal_percentage();
+       if (steal_percentage < HD_STEAL_THRESHOLD)
+               new_cores = hd_online_cores;
+       else
+               new_cores = hd_entitled_cores;
+       if (hd_high_capacity_cores != new_cores) {
+               hd_high_capacity_cores = new_cores;
+               topology_schedule_update();
+       }
+       mutex_unlock(&smp_cpu_state_mutex);
+       schedule_delayed_work(&hd_capacity_work, HD_DELAY_INTERVAL);
+}
diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c

index 58da6d1bae45bcc492214007b18ab16fe42a0097..813e5da9a9737e05ec3ef85a495fe38cbb40c49b 100644 (file)
--- a/arch/s390/kernel/topology.c
+++ b/arch/s390/kernel/topology.c
@@ -24,6 +24,7 @@
  #include <linux/mm.h>
  #include <linux/nodemask.h>
  #include <linux/node.h>
+#include <asm/hiperdispatch.h>
  #include <asm/sysinfo.h>
  
  #define PTF_HORIZONTAL (0UL)
@@ -47,6 +48,7 @@ static int topology_mode = TOPOLOGY_MODE_UNINITIALIZED;
  static void set_topology_timer(void);
  static void topology_work_fn(struct work_struct *work);
  static struct sysinfo_15_1_x *tl_info;
+static int cpu_management;
  
  static DECLARE_WORK(topology_work, topology_work_fn);
  
@@ -144,6 +146,7 @@ static void add_cpus_to_mask(struct topology_core *tl_core,
                         cpumask_set_cpu(cpu, &book->mask);
                         cpumask_set_cpu(cpu, &socket->mask);
                         smp_cpu_set_polarization(cpu, tl_core->pp);
+                       smp_cpu_set_capacity(cpu, CPU_CAPACITY_HIGH);
                 }
         }
  }
@@ -270,6 +273,7 @@ void update_cpu_masks(void)
                         topo->drawer_id = id;
                 }
         }
+       hd_reset_state();
         for_each_online_cpu(cpu) {
                 topo = &cpu_topology[cpu];
                 pkg_first = cpumask_first(&topo->core_mask);
@@ -278,8 +282,10 @@ void update_cpu_masks(void)
                         for_each_cpu(sibling, &topo->core_mask) {
                                 topo_sibling = &cpu_topology[sibling];
                                 smt_first = cpumask_first(&topo_sibling->thread_mask);
-                               if (sibling == smt_first)
+                               if (sibling == smt_first) {
                                         topo_package->booted_cores++;
+                                       hd_add_core(sibling);
+                               }
                         }
                 } else {
                         topo->booted_cores = topo_package->booted_cores;
@@ -303,8 +309,10 @@ static void __arch_update_dedicated_flag(void *arg)
  static int __arch_update_cpu_topology(void)
  {
         struct sysinfo_15_1_x *info = tl_info;
-       int rc = 0;
+       int rc, hd_status;
  
+       hd_status = 0;
+       rc = 0;
         mutex_lock(&smp_cpu_state_mutex);
         if (MACHINE_HAS_TOPOLOGY) {
                 rc = 1;
@@ -314,7 +322,11 @@ static int __arch_update_cpu_topology(void)
         update_cpu_masks();
         if (!MACHINE_HAS_TOPOLOGY)
                 topology_update_polarization_simple();
+       if (cpu_management == 1)
+               hd_status = hd_enable_hiperdispatch();
         mutex_unlock(&smp_cpu_state_mutex);
+       if (hd_status == 0)
+               hd_disable_hiperdispatch();
         return rc;
  }
  
@@ -374,8 +386,6 @@ void topology_expect_change(void)
         set_topology_timer();
  }
  
-static int cpu_management;
-
  static int set_polarization(int polarization)
  {
         int rc = 0;
author	Mete Durlu <meted@linux.ibm.com>
	Mon, 12 Aug 2024 11:39:34 +0000 (13:39 +0200)
committer	Vasily Gorbik <gor@linux.ibm.com>
	Thu, 29 Aug 2024 20:56:35 +0000 (22:56 +0200)
arch/s390/include/asm/hiperdispatch.h	[new file with mode: 0644]	patch \| blob
arch/s390/kernel/Makefile		patch \| blob \| blame \| history
arch/s390/kernel/hiperdispatch.c	[new file with mode: 0644]	patch \| blob
arch/s390/kernel/topology.c		patch \| blob \| blame \| history