core: introduce NUMAPolicy and NUMAMask options

author Michal Sekletar <msekleta@redhat.com>

Tue, 12 Mar 2019 17:58:26 +0000 (18:58 +0100)

committer Zbigniew Jędrzejewski-Szmek <zbyszek@in.waw.pl>

Mon, 24 Jun 2019 14:58:54 +0000 (16:58 +0200)
author Michal Sekletar <msekleta@redhat.com>
Tue, 12 Mar 2019 17:58:26 +0000 (18:58 +0100)
committer Zbigniew Jędrzejewski-Szmek <zbyszek@in.waw.pl>
Mon, 24 Jun 2019 14:58:54 +0000 (16:58 +0200)
diff --git a/NEWS b/NEWS

index e5a66126479b3213d978462dd9fc24ed7c0edba4..d4a60c9d061fa93983a14290962188dd43ab245b 100644 (file)
--- a/NEWS
+++ b/NEWS
@@ -104,6 +104,12 @@ CHANGES WITH 243 in spe:
            all" pattern instead, e.g. OriginalName=* or Name=* in case all
            interfaces should really be matched.
  
+        * A new setting NUMAPolicy= may be used to set process memory
+          allocation policy. Setting can be specified in system.conf and
+          hence will set the default policy for PID1. Default policy can be
+          overridden on per-service basis. Related setting NUMAMask= is used to
+          specify NUMA node mask that should be associated with the selected
+          policy.
            …
  
  CHANGES WITH 242:
diff --git a/man/systemd-system.conf.xml b/man/systemd-system.conf.xml

index f5d419c5196a4b30ba3bd46b5250a9bdf5419a58..9de04a7879e3cc90e7c6c9f35b0690cadb91c1cd 100644 (file)
--- a/man/systemd-system.conf.xml
+++ b/man/systemd-system.conf.xml
@@ -106,6 +106,25 @@
          <citerefentry><refentrytitle>systemd.exec</refentrytitle><manvolnum>5</manvolnum></citerefentry>.</para></listitem>
        </varlistentry>
  
+      <varlistentry>
+        <term><varname>NUMAPolicy=</varname></term>
+
+        <listitem><para>Configures the NUMA memory policy for the service manager and the default NUMA memory policy
+        for all forked off processes. Individual services may override the default policy with the
+        <varname>NUMAPolicy=</varname> setting in unit files, see
+        <citerefentry><refentrytitle>systemd.exec</refentrytitle><manvolnum>5</manvolnum></citerefentry>.</para></listitem>
+      </varlistentry>
+
+      <varlistentry>
+        <term><varname>NUMAMask=</varname></term>
+
+        <listitem><para>Configures the NUMA node mask that will be associated with the selected NUMA policy. Note that
+        <option>default</option> and <option>local</option> NUMA policies don't require explicit NUMA node mask and
+        value of the option can be empty. Similarly to <varname>NUMAPolicy=</varname>, value can be overridden
+        by individual services in unit files, see
+        <citerefentry><refentrytitle>systemd.exec</refentrytitle><manvolnum>5</manvolnum></citerefentry>.</para></listitem>
+      </varlistentry>
+
        <varlistentry>
          <term><varname>RuntimeWatchdogSec=</varname></term>
          <term><varname>ShutdownWatchdogSec=</varname></term>
diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml

index 8f7d64d0176f79113901dad51bc29c07f5ed21ce..8963764bf6bd05944712fd83c1feb1e6ddc44519 100644 (file)
--- a/man/systemd.exec.xml
+++ b/man/systemd.exec.xml
@@ -762,6 +762,28 @@ CapabilityBoundingSet=~CAP_B CAP_C</programlisting>
          details.</para></listitem>
        </varlistentry>
  
+      <varlistentry>
+        <term><varname>NUMAPolicy=</varname></term>
+
+        <listitem><para>Controls the NUMA memory policy of the executed processes. Takes a policy type, one of:
+        <option>default</option>, <option>preferred</option>, <option>bind</option>, <option>interleave</option> and
+        <option>local</option>. A list of NUMA nodes that should be associated with the policy must be specified
+        in <varname>NUMAMask=</varname>. For more details on each policy please see
+        <citerefentry><refentrytitle>set_mempolicy</refentrytitle><manvolnum>2</manvolnum></citerefentry>. For an
+        overview of NUMA support in Linux see
+        <citerefentry><refentrytitle>numa</refentrytitle><manvolnum>7</manvolnum></citerefentry>.
+        </para></listitem>
+      </varlistentry>
+
+      <varlistentry>
+        <term><varname>NUMAMask=</varname></term>
+
+        <listitem><para>Controls the NUMA node list which will be applied alongside the selected NUMA policy.
+        Takes a list of NUMA nodes and has the same syntax as a list of CPUs for <varname>CPUAffinity=</varname>
+        option. Note that the list of NUMA nodes is not required for <option>default</option> and <option>local</option>
+        policies and for <option>preferred</option> policy we expect a single NUMA node.</para></listitem>
+      </varlistentry>
+
        <varlistentry>
          <term><varname>IOSchedulingClass=</varname></term>
  
@@ -2918,6 +2940,12 @@ StandardInputData=SWNrIHNpdHplIGRhIHVuJyBlc3NlIEtsb3BzLAp1ZmYgZWVtYWwga2xvcHAncy
              <entry><constant>EXIT_CONFIGURATION_DIRECTORY</constant></entry>
              <entry>Failed to set up unit's configuration directory. See <varname>ConfigurationDirectory=</varname> above.</entry>
            </row>
+          <row>
+            <entry>242</entry>
+            <entry><constant>EXIT_NUMA_POLICY</constant></entry>
+            <entry>Failed to set up unit's NUMA memory policy. See <varname>NUMAPolicy=</varname> and <varname>NUMAMask=</varname> above.</entry>
+          </row>
+
          </tbody>
        </tgroup>
      </table>
diff --git a/meson.build b/meson.build

index 0a9b3d5b857e1e679314f8caf8aae9116163eeea..e9c44bbb94f4e20938e608604c7b68ae441a8796 100644 (file)
--- a/meson.build
+++ b/meson.build
@@ -496,6 +496,10 @@ foreach ident : [
                                   #include <unistd.h>'''],
          ['explicit_bzero' ,   '''#include <string.h>'''],
          ['reallocarray',      '''#include <malloc.h>'''],
+        ['set_mempolicy',     '''#include <stdlib.h>
+                                 #include <unistd.h>'''],
+        ['get_mempolicy',     '''#include <stdlib.h>
+                                 #include <unistd.h>'''],
  ]
  
          have = cc.has_function(ident[0], prefix : ident[1], args : '-D_GNU_SOURCE')
diff --git a/src/basic/missing_syscall.h b/src/basic/missing_syscall.h

index d1aa32218b34ad34df72bc5d8bd843cc93a22838..cd455eb47cda51da4bc69808969a4dce74d13b84 100644 (file)
--- a/src/basic/missing_syscall.h
+++ b/src/basic/missing_syscall.h
@@ -444,3 +444,46 @@ static inline ssize_t missing_statx(int dfd, const char *filename, unsigned flag
  
  #  define statx missing_statx
  #endif
+
+#if !HAVE_SET_MEMPOLICY
+/* Compiled only when the build did not detect set_mempolicy(): supplies the MPOL_* mode constants and a raw syscall wrapper. */
+enum {
+        MPOL_DEFAULT,
+        MPOL_PREFERRED,
+        MPOL_BIND,
+        MPOL_INTERLEAVE,
+        MPOL_LOCAL,
+};
+/* Returns the syscall() result; where __NR_set_mempolicy is not defined it returns -1 with errno set to ENOSYS. */
+static inline long missing_set_mempolicy(int mode, const unsigned long *nodemask,
+                           unsigned long maxnode) {
+        long i;
+#  ifdef __NR_set_mempolicy
+        i = syscall(__NR_set_mempolicy, mode, nodemask, maxnode);
+#  else
+        errno = ENOSYS;
+        i = -1;
+#  endif
+        return i;
+}
+/* Route every use of set_mempolicy() in including files to the wrapper above. */
+#  define set_mempolicy missing_set_mempolicy
+#endif
+
+
+#if !HAVE_GET_MEMPOLICY /* raw-syscall wrapper, compiled only when the build did not detect get_mempolicy() */
+static inline long missing_get_mempolicy(int *mode, unsigned long *nodemask,
+                           unsigned long maxnode, void *addr,
+                           unsigned long flags) {
+        long i; /* syscall() result, or -1 when no syscall number is known */
+#  ifdef __NR_get_mempolicy
+        i = syscall(__NR_get_mempolicy, mode, nodemask, maxnode, addr, flags);
+#  else
+        errno = ENOSYS; /* no syscall number at build time: fail the same way an unimplemented syscall would */
+        i = -1;
+#  endif
+        return i;
+}
+/* Route every use of get_mempolicy() in including files to the wrapper above. */
+#define get_mempolicy missing_get_mempolicy
+#endif
diff --git a/src/core/dbus-execute.c b/src/core/dbus-execute.c

index 4d5fb2eb1089234cac0b6c0e491881d7b03beffc..c816569f2b6fb6604cea677318a6d6ebf4d7174a 100644 (file)
--- a/src/core/dbus-execute.c
+++ b/src/core/dbus-execute.c
@@ -225,6 +225,48 @@ static int property_get_cpu_affinity(
          return sd_bus_message_append_array(reply, 'y', array, allocated);
  }
  
+static int property_get_numa_mask(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *reply,
+                void *userdata,
+                sd_bus_error *error) {
+        /* D-Bus getter for "NUMAMask": appends the ExecContext's NUMA node set to reply as a byte array ("ay"). */
+        ExecContext *c = userdata;
+        _cleanup_free_ uint8_t *array = NULL;
+        size_t allocated = 0; /* must be defined even if serialization fails, since it is used as the array length below */
+
+        assert(bus);
+        assert(reply);
+        assert(c);
+        /* Best effort: a failure is deliberately ignored, in which case the reply is built from (NULL, 0). */
+        (void) cpu_set_to_dbus(&c->numa_policy.nodes, &array, &allocated);
+
+        return sd_bus_message_append_array(reply, 'y', array, allocated);
+}
+
+static int property_get_numa_policy(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *reply,
+                void *userdata,
+                sd_bus_error *error) {
+        ExecContext *c = userdata;
+        int32_t policy;
+        /* D-Bus getter for "NUMAPolicy": appends the effective policy type of the ExecContext to reply as an int32 ('i'). */
+        assert(bus);
+        assert(reply);
+        assert(c);
+        /* Read through the accessor rather than .type: it can derive the type from the node mask when .type is negative. */
+        policy = numa_policy_get_type(&c->numa_policy);
+
+        return sd_bus_message_append_basic(reply, 'i', &policy);
+}
+
  static int property_get_timer_slack_nsec(
                  sd_bus *bus,
                  const char *path,
@@ -700,6 +742,8 @@ const sd_bus_vtable bus_exec_vtable[] = {
          SD_BUS_PROPERTY("CPUSchedulingPolicy", "i", property_get_cpu_sched_policy, 0, SD_BUS_VTABLE_PROPERTY_CONST),
          SD_BUS_PROPERTY("CPUSchedulingPriority", "i", property_get_cpu_sched_priority, 0, SD_BUS_VTABLE_PROPERTY_CONST),
          SD_BUS_PROPERTY("CPUAffinity", "ay", property_get_cpu_affinity, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("NUMAPolicy", "i", property_get_numa_policy, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("NUMAMask", "ay", property_get_numa_mask, 0, SD_BUS_VTABLE_PROPERTY_CONST),
          SD_BUS_PROPERTY("TimerSlackNSec", "t", property_get_timer_slack_nsec, 0, SD_BUS_VTABLE_PROPERTY_CONST),
          SD_BUS_PROPERTY("CPUSchedulingResetOnFork", "b", bus_property_get_bool, offsetof(ExecContext, cpu_sched_reset_on_fork), SD_BUS_VTABLE_PROPERTY_CONST),
          SD_BUS_PROPERTY("NonBlocking", "b", bus_property_get_bool, offsetof(ExecContext, non_blocking), SD_BUS_VTABLE_PROPERTY_CONST),
@@ -1650,9 +1694,10 @@ int bus_exec_context_set_transient_property(
                  return 1;
          }
  #endif
-        if (streq(name, "CPUAffinity")) {
+        if (STR_IN_SET(name, "CPUAffinity", "NUMAMask")) {
                  const void *a;
                  size_t n;
+                bool affinity = streq(name, "CPUAffinity");
                  _cleanup_(cpu_set_reset) CPUSet set = {};
  
                  r = sd_bus_message_read_array(message, 'y', &a, &n);
@@ -1665,7 +1710,7 @@ int bus_exec_context_set_transient_property(
  
                  if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
                          if (n == 0) {
-                                cpu_set_reset(&c->cpu_set);
+                                cpu_set_reset(affinity ? &c->cpu_set : &c->numa_policy.nodes);
                                  unit_write_settingf(u, flags, name, "%s=", name);
                          } else {
                                  _cleanup_free_ char *str = NULL;
@@ -1677,7 +1722,7 @@ int bus_exec_context_set_transient_property(
                                  /* We forego any optimizations here, and always create the structure using
                                   * cpu_set_add_all(), because we don't want to care if the existing size we
                                   * got over dbus is appropriate. */
-                                r = cpu_set_add_all(&c->cpu_set, &set);
+                                r = cpu_set_add_all(affinity ? &c->cpu_set : &c->numa_policy.nodes, &set);
                                  if (r < 0)
                                          return r;
  
@@ -1687,6 +1732,20 @@ int bus_exec_context_set_transient_property(
  
                  return 1;
  
+        } else if (streq(name, "NUMAPolicy")) {
+                int32_t type;
+
+                r = sd_bus_message_read(message, "i", &type);
+                if (r < 0)
+                        return r;
+
+                if (!mpol_is_valid(type))
+                        return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid NUMAPolicy value: %i", type);
+
+                if (!UNIT_WRITE_FLAGS_NOOP(flags))
+                        c->numa_policy.type = type;
+
+                return 1;
          } else if (streq(name, "Nice")) {
                  int32_t q;
  
diff --git a/src/core/execute.c b/src/core/execute.c

index 921449391d4530faaf3e61012355acf2774bbbe3..426e57b8e0e6ace14aee07c956ac84170fcb386d 100644 (file)
--- a/src/core/execute.c
+++ b/src/core/execute.c
@@ -3148,6 +3148,16 @@ static int exec_child(
                          return log_unit_error_errno(unit, errno, "Failed to set up CPU affinity: %m");
                  }
  
+        if (mpol_is_valid(numa_policy_get_type(&context->numa_policy))) {
+                r = apply_numa_policy(&context->numa_policy);
+                if (r == -EOPNOTSUPP)
+                        log_unit_debug_errno(unit, SYNTHETIC_ERRNO(r), "NUMA support not available, ignoring.");
+                else if (r < 0) {
+                        *exit_status = EXIT_NUMA_POLICY;
+                        return log_unit_error_errno(unit, r, "Failed to set NUMA memory policy: %m");
+                }
+        }
+
          if (context->ioprio_set)
                  if (ioprio_set(IOPRIO_WHO_PROCESS, 0, context->ioprio) < 0) {
                          *exit_status = EXIT_IOPRIO;
@@ -3854,6 +3864,7 @@ void exec_context_init(ExecContext *c) {
          assert_cc(NAMESPACE_FLAGS_INITIAL != NAMESPACE_FLAGS_ALL);
          c->restrict_namespaces = NAMESPACE_FLAGS_INITIAL;
          c->log_level_max = -1;
+        numa_policy_reset(&c->numa_policy);
  }
  
  void exec_context_done(ExecContext *c) {
@@ -3898,6 +3909,7 @@ void exec_context_done(ExecContext *c) {
          c->n_temporary_filesystems = 0;
  
          cpu_set_reset(&c->cpu_set);
+        numa_policy_reset(&c->numa_policy);
  
          c->utmp_id = mfree(c->utmp_id);
          c->selinux_context = mfree(c->selinux_context);
@@ -4336,6 +4348,14 @@ void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) {
                  fprintf(f, "%sCPUAffinity: %s\n", prefix, affinity);
          }
  
+        if (mpol_is_valid(numa_policy_get_type(&c->numa_policy))) {
+                _cleanup_free_ char *nodes = NULL;
+
+                nodes = cpu_set_to_range_string(&c->numa_policy.nodes);
+                fprintf(f, "%sNUMAPolicy: %s\n", prefix, mpol_to_string(numa_policy_get_type(&c->numa_policy)));
+                fprintf(f, "%sNUMAMask: %s\n", prefix, strnull(nodes));
+        }
+
          if (c->timer_slack_nsec != NSEC_INFINITY)
                  fprintf(f, "%sTimerSlackNSec: "NSEC_FMT "\n", prefix, c->timer_slack_nsec);
  
diff --git a/src/core/execute.h b/src/core/execute.h

index 780876826f2fd0afdb4adf34f7be011412416068..609e15fc07383cba8fcfa48eaa59d007de4f61f0 100644 (file)
--- a/src/core/execute.h
+++ b/src/core/execute.h
@@ -167,6 +167,7 @@ struct ExecContext {
          int cpu_sched_priority;
  
          CPUSet cpu_set;
+        NUMAPolicy numa_policy;
  
          ExecInput std_input;
          ExecOutput std_output;
diff --git a/src/core/load-fragment-gperf.gperf.m4 b/src/core/load-fragment-gperf.gperf.m4

index b868a367f1ebd299ea6563f355f67c9c58a0d03b..5e6fb640939862a18180327929b556a218fb26b6 100644 (file)
--- a/src/core/load-fragment-gperf.gperf.m4
+++ b/src/core/load-fragment-gperf.gperf.m4
@@ -36,6 +36,8 @@ $1.CPUSchedulingPolicy,          config_parse_exec_cpu_sched_policy, 0,
  $1.CPUSchedulingPriority,        config_parse_exec_cpu_sched_prio,   0,                             offsetof($1, exec_context)
  $1.CPUSchedulingResetOnFork,     config_parse_bool,                  0,                             offsetof($1, exec_context.cpu_sched_reset_on_fork)
  $1.CPUAffinity,                  config_parse_exec_cpu_affinity,     0,                             offsetof($1, exec_context)
+$1.NUMAPolicy,                   config_parse_numa_policy,           0,                             offsetof($1, exec_context.numa_policy.type)
+$1.NUMAMask,                     config_parse_numa_mask,             0,                             offsetof($1, exec_context.numa_policy)
  $1.UMask,                        config_parse_mode,                  0,                             offsetof($1, exec_context.umask)
  $1.Environment,                  config_parse_environ,               0,                             offsetof($1, exec_context.environment)
  $1.EnvironmentFile,              config_parse_unit_env_file,         0,                             offsetof($1, exec_context.environment_files)
diff --git a/src/core/load-fragment.c b/src/core/load-fragment.c

index 5c413be08fc658ff5e7aef5af7c49257e6d3a6e7..274d9d2fef315430ac3ee5482747f44fffe09776 100644 (file)
--- a/src/core/load-fragment.c
+++ b/src/core/load-fragment.c
@@ -92,6 +92,7 @@ DEFINE_CONFIG_PARSE_PTR(config_parse_blockio_weight, cg_blkio_weight_parse, uint
  DEFINE_CONFIG_PARSE_PTR(config_parse_cg_weight, cg_weight_parse, uint64_t, "Invalid weight");
  DEFINE_CONFIG_PARSE_PTR(config_parse_cpu_shares, cg_cpu_shares_parse, uint64_t, "Invalid CPU shares");
  DEFINE_CONFIG_PARSE_PTR(config_parse_exec_mount_flags, mount_propagation_flags_from_string, unsigned long, "Failed to parse mount flag");
+DEFINE_CONFIG_PARSE_ENUM_WITH_DEFAULT(config_parse_numa_policy, mpol, int, -1, "Invalid NUMA policy type");
  
  int config_parse_unit_deps(
                  const char *unit,
@@ -1211,6 +1212,33 @@ int config_parse_exec_cpu_sched_policy(const char *unit,
          return 0;
  }
  
+int config_parse_numa_mask(const char *unit,
+                           const char *filename,
+                           unsigned line,
+                           const char *section,
+                           unsigned section_line,
+                           const char *lvalue,
+                           int ltype,
+                           const char *rvalue,
+                           void *data,
+                           void *userdata) {
+        int r;
+        NUMAPolicy *p = data;
+        /* Config parser for NUMAMask=: parses rvalue with the CPU-set parser into the node set of the NUMAPolicy passed as data. */
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+        assert(data);
+        /* A parse failure is logged and then reported as success (0), i.e. the bad value is ignored. */
+        r = parse_cpu_set_extend(rvalue, &p->nodes, true, unit, filename, line, lvalue);
+        if (r < 0) {
+                log_syntax(unit, LOG_ERR, filename, line, r, "Failed to parse NUMA node mask, ignoring: %s", rvalue);
+                return 0;
+        }
+        /* Otherwise pass through the (non-negative) result of parse_cpu_set_extend(). */
+        return r;
+}
+
  int config_parse_exec_cpu_sched_prio(const char *unit,
                                       const char *filename,
                                       unsigned line,
diff --git a/src/core/load-fragment.h b/src/core/load-fragment.h

index 0891f367604f1bb9c22a8265cb2d5b6bfa45dd0d..ddcc8d216da34b7d00e77cae87b556f535c89a32 100644 (file)
--- a/src/core/load-fragment.h
+++ b/src/core/load-fragment.h
@@ -108,6 +108,8 @@ CONFIG_PARSER_PROTOTYPE(config_parse_pid_file);
  CONFIG_PARSER_PROTOTYPE(config_parse_exit_status);
  CONFIG_PARSER_PROTOTYPE(config_parse_disable_controllers);
  CONFIG_PARSER_PROTOTYPE(config_parse_oom_policy);
+CONFIG_PARSER_PROTOTYPE(config_parse_numa_policy);
+CONFIG_PARSER_PROTOTYPE(config_parse_numa_mask);
  
  /* gperf prototypes */
  const struct ConfigPerfItem* load_fragment_gperf_lookup(const char *key, GPERF_LEN_TYPE length);
diff --git a/src/core/main.c b/src/core/main.c

index 3a41573ef04a8d8a411fe805c02b5fc215ca17d6..d74e8737e6bc6952eae02c7aa4beffe9a5aa6861 100644 (file)
--- a/src/core/main.c
+++ b/src/core/main.c
@@ -142,6 +142,7 @@ static sd_id128_t arg_machine_id;
  static EmergencyAction arg_cad_burst_action;
  static OOMPolicy arg_default_oom_policy;
  static CPUSet arg_cpu_affinity;
+static NUMAPolicy arg_numa_policy;
  
  static int parse_configuration(void);
  
@@ -720,6 +721,8 @@ static int parse_config_file(void) {
                  { "Manager", "CrashReboot",               config_parse_bool,             0, &arg_crash_reboot                      },
                  { "Manager", "ShowStatus",                config_parse_show_status,      0, &arg_show_status                       },
                  { "Manager", "CPUAffinity",               config_parse_cpu_affinity2,    0, &arg_cpu_affinity                      },
+                { "Manager", "NUMAPolicy",                config_parse_numa_policy,      0, &arg_numa_policy.type                  },
+                { "Manager", "NUMAMask",                  config_parse_numa_mask,        0, &arg_numa_policy                       },
                  { "Manager", "JoinControllers",           config_parse_warn_compat,      DISABLED_CONFIGURATION, NULL              },
                  { "Manager", "RuntimeWatchdogSec",        config_parse_sec,              0, &arg_runtime_watchdog                  },
                  { "Manager", "ShutdownWatchdogSec",       config_parse_sec,              0, &arg_shutdown_watchdog                 },
@@ -1753,6 +1756,27 @@ static void update_cpu_affinity(bool skip_setup) {
                  log_warning_errno(errno, "Failed to set CPU affinity: %m");
  }
  
+static void update_numa_policy(bool skip_setup) {
+        int r;
+        _cleanup_free_ char *nodes = NULL;
+        const char * policy = NULL;
+        /* Applies arg_numa_policy via apply_numa_policy(); does nothing if skip_setup is set or no valid policy type is configured. */
+        if (skip_setup || !mpol_is_valid(numa_policy_get_type(&arg_numa_policy)))
+                return;
+        /* Only build the human-readable strings when debug logging is enabled. */
+        if (DEBUG_LOGGING) {
+                policy = mpol_to_string(numa_policy_get_type(&arg_numa_policy));
+                nodes = cpu_set_to_range_string(&arg_numa_policy.nodes);
+                log_debug("Setting NUMA policy to %s, with nodes %s.", strnull(policy), strnull(nodes));
+        }
+        /* Failures are only logged, never propagated: -EOPNOTSUPP at debug level, any other error as a warning. */
+        r = apply_numa_policy(&arg_numa_policy);
+        if (r == -EOPNOTSUPP)
+                log_debug_errno(r, "NUMA support not available, ignoring.");
+        else if (r < 0)
+                log_warning_errno(r, "Failed to set NUMA memory policy: %m");
+}
+
  static void do_reexecute(
                  int argc,
                  char *argv[],
@@ -1924,6 +1948,7 @@ static int invoke_main_loop(
                          set_manager_defaults(m);
  
                          update_cpu_affinity(false);
+                        update_numa_policy(false);
  
                          if (saved_log_level >= 0)
                                  manager_override_log_level(m, saved_log_level);
@@ -2084,6 +2109,7 @@ static int initialize_runtime(
                  return 0;
  
          update_cpu_affinity(skip_setup);
+        update_numa_policy(skip_setup);
  
          if (arg_system) {
                  /* Make sure we leave a core dump without panicking the kernel. */
@@ -2262,6 +2288,7 @@ static void reset_arguments(void) {
          arg_default_oom_policy = OOM_STOP;
  
          cpu_set_reset(&arg_cpu_affinity);
+        numa_policy_reset(&arg_numa_policy);
  }
  
  static int parse_configuration(void) {
diff --git a/src/core/system.conf.in b/src/core/system.conf.in

index 548e6dfb8c99fa031feb43c522977c11a3164fa1..20f56969cc801e8925f8d5a15ec022a620beb4d2 100644 (file)
--- a/src/core/system.conf.in
+++ b/src/core/system.conf.in
@@ -23,6 +23,8 @@
  #CrashReboot=no
  #CtrlAltDelBurstAction=reboot-force
  #CPUAffinity=1 2
+#NUMAPolicy=default
+#NUMAMask=
  #RuntimeWatchdogSec=0
  #ShutdownWatchdogSec=10min
  #WatchdogDevice=
diff --git a/src/shared/bus-unit-util.c b/src/shared/bus-unit-util.c

index fd26b86359b5aa470d16b02b681c902358f2ec8b..bb30e8f1514a57f2b91b17a3f6af10dcf5b20757 100644 (file)
--- a/src/shared/bus-unit-util.c
+++ b/src/shared/bus-unit-util.c
@@ -1049,6 +1049,34 @@ static int bus_append_execute_property(sd_bus_message *m, const char *field, con
                  return bus_append_byte_array(m, field, array, allocated);
          }
  
+        if (streq(field, "NUMAPolicy")) {
+                r = mpol_from_string(eq);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to parse %s value: %s", field, eq);
+
+                r = sd_bus_message_append(m, "(sv)", field, "i", (int32_t) r);
+                if (r < 0)
+                        return bus_log_create_error(r);
+
+                return 1;
+        }
+
+        if (streq(field, "NUMAMask")) {
+                _cleanup_(cpu_set_reset) CPUSet nodes = {};
+                _cleanup_free_ uint8_t *array = NULL;
+                size_t allocated;
+
+                r = parse_cpu_set(eq, &nodes);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to parse %s value: %s", field, eq);
+
+                r = cpu_set_to_dbus(&nodes, &array, &allocated);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to serialize NUMAMask: %m");
+
+                return bus_append_byte_array(m, field, array, allocated);
+        }
+
          if (STR_IN_SET(field, "RestrictAddressFamilies", "SystemCallFilter")) {
                  int whitelist = 1;
                  const char *p = eq;
diff --git a/src/shared/cpu-set-util.c b/src/shared/cpu-set-util.c

index b0036c7f619bea6736e6496265413e25e392ec1e..f27543dfe2aa8b9da09ade00158691422071f88d 100644 (file)
--- a/src/shared/cpu-set-util.c
+++ b/src/shared/cpu-set-util.c
@@ -7,12 +7,20 @@
  
  #include "alloc-util.h"
  #include "cpu-set-util.h"
+#include "dirent-util.h"
+#include "errno-util.h"
  #include "extract-word.h"
+#include "fd-util.h"
  #include "log.h"
  #include "macro.h"
  #include "memory-util.h"
+#include "missing_syscall.h"
  #include "parse-util.h"
+#include "stat-util.h"
  #include "string-util.h"
+#include "string-table.h"
+#include "strv.h"
+#include "util.h"
  
  char* cpu_set_to_string(const CPUSet *a) {
          _cleanup_free_ char *str = NULL;
@@ -287,3 +295,88 @@ int cpu_set_from_dbus(const uint8_t *bits, size_t size, CPUSet *set) {
          s = (CPUSet) {};
          return 0;
  }
+
+bool numa_policy_is_valid(const NUMAPolicy *policy) {
+        assert(policy);
+        /* A policy is valid if its effective type is a known MPOL_* value and the node mask fits that type. */
+        if (!mpol_is_valid(numa_policy_get_type(policy)))
+                return false;
+        /* Without a node mask, only the default, local and preferred types are accepted. */
+        if (!policy->nodes.set &&
+            !IN_SET(numa_policy_get_type(policy), MPOL_DEFAULT, MPOL_LOCAL, MPOL_PREFERRED))
+                return false;
+        /* The preferred type with a node mask requires exactly one node to be set in it. */
+        if (policy->nodes.set &&
+            numa_policy_get_type(policy) == MPOL_PREFERRED &&
+            CPU_COUNT_S(policy->nodes.allocated, policy->nodes.set) != 1)
+                return false;
+
+        return true;
+}
+
+static int numa_policy_to_mempolicy(const NUMAPolicy *policy, unsigned long *ret_maxnode, unsigned long **ret_nodes) {
+        unsigned node, bits = 0, ulong_bits;
+        _cleanup_free_ unsigned long *out = NULL;
+        /* Builds the (nodemask, maxnode) pair for set_mempolicy(). Returns 0 or -ENOMEM; on success the caller owns *ret_nodes. */
+        assert(policy);
+        assert(ret_maxnode);
+        assert(ret_nodes);
+        /* The default and local types, and preferred without a mask, are applied with no node mask at all (NULL, 0). */
+        if (IN_SET(numa_policy_get_type(policy), MPOL_DEFAULT, MPOL_LOCAL) ||
+            (numa_policy_get_type(policy) == MPOL_PREFERRED && !policy->nodes.set)) {
+                *ret_nodes = NULL;
+                *ret_maxnode = 0;
+                return 0;
+        }
+        /* nodes.allocated is treated as a size in bytes, hence the factor 8 to get the number of node bits to scan. */
+        bits = policy->nodes.allocated * 8;
+        ulong_bits = sizeof(unsigned long) * 8;
+
+        out = new0(unsigned long, DIV_ROUND_UP(policy->nodes.allocated, sizeof(unsigned long)));
+        if (!out)
+                return -ENOMEM;
+
+        /* We don't make any assumptions about the internal type libc uses to store the node mask, so the mask
+         * is copied bit by bit into the array of unsigned long that set_mempolicy() expects. */
+        for (node = 0; node < bits; node++)
+                if (CPU_ISSET_S(node, policy->nodes.allocated, policy->nodes.set))
+                        out[node / ulong_bits] |= 1ul << (node % ulong_bits);
+        /* NOTE(review): maxnode is bits + 1, presumably because the kernel evaluates one bit less than maxnode; confirm in set_mempolicy(2). */
+        *ret_nodes = TAKE_PTR(out);
+        *ret_maxnode = bits + 1;
+        return 0;
+}
+
+int apply_numa_policy(const NUMAPolicy *policy) {
+        int r;
+        _cleanup_free_ unsigned long *nodes = NULL;
+        unsigned long maxnode;
+        /* Applies policy with set_mempolicy(). Returns 0, -EOPNOTSUPP (syscall unavailable), -EINVAL (inconsistent policy) or another negative errno. */
+        assert(policy);
+        /* Probe with an argument-less get_mempolicy() call; only ENOSYS is taken to mean "no NUMA support". */
+        if (get_mempolicy(NULL, NULL, 0, 0, 0) < 0 && errno == ENOSYS)
+                return -EOPNOTSUPP;
+
+        if (!numa_policy_is_valid(policy))
+                return -EINVAL;
+        /* Convert the node set into the nodemask/maxnode form; nodes is freed automatically when this function returns. */
+        r = numa_policy_to_mempolicy(policy, &maxnode, &nodes);
+        if (r < 0)
+                return r;
+
+        r = set_mempolicy(numa_policy_get_type(policy), nodes, maxnode);
+        if (r < 0)
+                return -errno;
+
+        return 0;
+}
+
+static const char* const mpol_table[] = { /* policy names as used for NUMAPolicy=, indexed by MPOL_* value */
+        [MPOL_DEFAULT]    = "default",
+        [MPOL_PREFERRED]  = "preferred",
+        [MPOL_BIND]       = "bind",
+        [MPOL_INTERLEAVE] = "interleave",
+        [MPOL_LOCAL]      = "local",
+};
+/* Presumably expands to the mpol_to_string()/mpol_from_string() pair declared in cpu-set-util.h; confirm in string-table.h. */
+DEFINE_STRING_TABLE_LOOKUP(mpol, int);
diff --git a/src/shared/cpu-set-util.h b/src/shared/cpu-set-util.h

index fd6a15f4468fa4becce701bfa8b4ec584e8a22bf..27812dfd5923606675a3b73bb0f295053ae9bce3 100644 (file)
--- a/src/shared/cpu-set-util.h
+++ b/src/shared/cpu-set-util.h
@@ -4,6 +4,7 @@
  #include <sched.h>
  
  #include "macro.h"
+#include "missing_syscall.h"
  
  /* This wraps the libc interface with a variable to keep the allocated size. */
  typedef struct CPUSet {
@@ -48,3 +49,30 @@ int cpu_set_to_dbus(const CPUSet *set, uint8_t **ret, size_t *allocated);
  int cpu_set_from_dbus(const uint8_t *bits, size_t size, CPUSet *set);
  
  int cpus_in_affinity_mask(void);
+
+static inline bool mpol_is_valid(int t) {
+        return t >= MPOL_DEFAULT && t <= MPOL_LOCAL; /* range check: relies on MPOL_DEFAULT..MPOL_LOCAL being contiguous */
+}
+
+typedef struct NUMAPolicy {
+        /* Always use numa_policy_get_type() to read the value: a negative type is resolved there using the node mask */
+        int type;
+        CPUSet nodes; /* NUMA node mask, kept in the same container type that is used for CPU sets */
+} NUMAPolicy;
+
+bool numa_policy_is_valid(const NUMAPolicy *p);
+
+static inline int numa_policy_get_type(const NUMAPolicy *p) {
+        return p->type < 0 ? (p->nodes.set ? MPOL_PREFERRED : -1) : p->type; /* negative type: MPOL_PREFERRED if a node mask is set, else -1 */
+}
+
+static inline void numa_policy_reset(NUMAPolicy *p) {
+        assert(p);
+        cpu_set_reset(&p->nodes); /* drop the node mask */
+        p->type = -1; /* negative type: mpol_is_valid() rejects it unless a node mask is set later */
+}
+
+int apply_numa_policy(const NUMAPolicy *policy);
+
+const char* mpol_to_string(int i) _const_;
+int mpol_from_string(const char *s) _pure_;
diff --git a/src/shared/exit-status.c b/src/shared/exit-status.c

index 26b3060d9b939198a3240954ca7af1ae7a6b5df2..58ebc3ca4d6558cf1d28ba544f9e9bcf81c43da0 100644 (file)
--- a/src/shared/exit-status.c
+++ b/src/shared/exit-status.c
@@ -157,6 +157,9 @@ const char* exit_status_to_string(int status, ExitStatusLevel level) {
                  case EXIT_CONFIGURATION_DIRECTORY:
                          return "CONFIGURATION_DIRECTORY";
  
+                case EXIT_NUMA_POLICY:
+                        return "NUMA_POLICY";
+
                  case EXIT_EXCEPTION:
                          return "EXCEPTION";
                  }
diff --git a/src/shared/exit-status.h b/src/shared/exit-status.h

index 510eb319cf6b2c06ce93d5074e724a5f82624e0f..5637e6aa04ddd9df731caf843baebc5b29667539 100644 (file)
--- a/src/shared/exit-status.h
+++ b/src/shared/exit-status.h
@@ -69,6 +69,7 @@ enum {
          EXIT_CACHE_DIRECTORY,
          EXIT_LOGS_DIRECTORY, /* 240 */
          EXIT_CONFIGURATION_DIRECTORY,
+        EXIT_NUMA_POLICY,
  
          EXIT_EXCEPTION = 255,  /* Whenever we want to propagate an abnormal/signal exit, in line with bash */
  };
diff --git a/src/systemctl/systemctl.c b/src/systemctl/systemctl.c

index 31bc776449eec0b8984720beee49288296d68d6b..cf0c612923f9e4ed347315c2358cdc1fe176d641 100644 (file)
--- a/src/systemctl/systemctl.c
+++ b/src/systemctl/systemctl.c
@@ -4838,6 +4838,16 @@ static int print_property(const char *name, const char *expected_value, sd_bus_m
                          else if (all)
                                  bus_print_property_value(name, expected_value, value, "[not set]");
  
+                        return 1;
+                } else if (streq(name, "NUMAPolicy")) {
+                        int32_t i;
+
+                        r = sd_bus_message_read_basic(m, bus_type, &i);
+                        if (r < 0)
+                                return r;
+
+                        bus_print_property_valuef(name, expected_value, value, "%s", strna(mpol_to_string(i)));
+
                          return 1;
                  }
                  break;
@@ -5451,7 +5461,7 @@ static int print_property(const char *name, const char *expected_value, sd_bus_m
                                  bus_print_property_value(name, expected_value, value, strempty(fields));
  
                          return 1;
-                } else if (contents[0] == SD_BUS_TYPE_BYTE && streq(name, "CPUAffinity")) {
+                } else if (contents[0] == SD_BUS_TYPE_BYTE && STR_IN_SET(name, "CPUAffinity", "NUMAMask")) {
                          _cleanup_free_ char *affinity = NULL;
                          _cleanup_(cpu_set_reset) CPUSet set = {};
                          const void *a;
@@ -5463,7 +5473,7 @@ static int print_property(const char *name, const char *expected_value, sd_bus_m
  
                          r = cpu_set_from_dbus(a, n, &set);
                          if (r < 0)
-                                return log_error_errno(r, "Failed to deserialize CPUAffinity: %m");
+                                return log_error_errno(r, "Failed to deserialize %s: %m", name);
  
                          affinity = cpu_set_to_range_string(&set);
                          if (!affinity)
author	Michal Sekletar <msekleta@redhat.com>
	Tue, 12 Mar 2019 17:58:26 +0000 (18:58 +0100)
committer	Zbigniew Jędrzejewski-Szmek <zbyszek@in.waw.pl>
	Mon, 24 Jun 2019 14:58:54 +0000 (16:58 +0200)
NEWS		patch \| blob \| blame \| history
man/systemd-system.conf.xml		patch \| blob \| blame \| history
man/systemd.exec.xml		patch \| blob \| blame \| history
meson.build		patch \| blob \| blame \| history
src/basic/missing_syscall.h		patch \| blob \| blame \| history
src/core/dbus-execute.c		patch \| blob \| blame \| history
src/core/execute.c		patch \| blob \| blame \| history
src/core/execute.h		patch \| blob \| blame \| history
src/core/load-fragment-gperf.gperf.m4		patch \| blob \| blame \| history
src/core/load-fragment.c		patch \| blob \| blame \| history
src/core/load-fragment.h		patch \| blob \| blame \| history
src/core/main.c		patch \| blob \| blame \| history
src/core/system.conf.in		patch \| blob \| blame \| history
src/shared/bus-unit-util.c		patch \| blob \| blame \| history
src/shared/cpu-set-util.c		patch \| blob \| blame \| history
src/shared/cpu-set-util.h		patch \| blob \| blame \| history
src/shared/exit-status.c		patch \| blob \| blame \| history
src/shared/exit-status.h		patch \| blob \| blame \| history
src/systemctl/systemctl.c		patch \| blob \| blame \| history