NUMA balancing: optimize page placement for memory tiering system

author Huang Ying <ying.huang@intel.com>

Tue, 22 Mar 2022 21:46:23 +0000 (14:46 -0700)

committer Linus Torvalds <torvalds@linux-foundation.org>

Tue, 22 Mar 2022 22:57:09 +0000 (15:57 -0700)
author Huang Ying <ying.huang@intel.com>
Tue, 22 Mar 2022 21:46:23 +0000 (14:46 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Tue, 22 Mar 2022 22:57:09 +0000 (15:57 -0700)
diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst

index d359bcfadd39a79a404ae32000f32a991fcb3f61..fdfd2b6848220b56b006806da997c569b36df6ea 100644 (file)
--- a/Documentation/admin-guide/sysctl/kernel.rst
+++ b/Documentation/admin-guide/sysctl/kernel.rst
@@ -595,16 +595,23 @@ Documentation/admin-guide/kernel-parameters.rst).
  numa_balancing
  ==============
  
-Enables/disables automatic page fault based NUMA memory
-balancing. Memory is moved automatically to nodes
-that access it often.
+Enables/disables and configures automatic page fault based NUMA memory
+balancing.  Memory is moved automatically to nodes that access it often.
+The value to set can be the result of ORing the following:
  
-Enables/disables automatic NUMA memory balancing. On NUMA machines, there
-is a performance penalty if remote memory is accessed by a CPU. When this
-feature is enabled the kernel samples what task thread is accessing memory
-by periodically unmapping pages and later trapping a page fault. At the
-time of the page fault, it is determined if the data being accessed should
-be migrated to a local memory node.
+= =================================
+0 NUMA_BALANCING_DISABLED
+1 NUMA_BALANCING_NORMAL
+2 NUMA_BALANCING_MEMORY_TIERING
+= =================================
+
+OR in NUMA_BALANCING_NORMAL to optimize page placement among different
+NUMA nodes to reduce remote accesses.  On NUMA machines, there is a
+performance penalty if remote memory is accessed by a CPU. When this
+feature is enabled the kernel samples what task thread is accessing
+memory by periodically unmapping pages and later trapping a page
+fault. At the time of the page fault, it is determined if the data
+being accessed should be migrated to a local memory node.
  
  The unmapping of pages and trapping faults incur additional overhead that
  ideally is offset by improved memory locality but there is no universal
@@ -615,6 +622,10 @@ faults may be controlled by the `numa_balancing_scan_period_min_ms,
  numa_balancing_scan_delay_ms, numa_balancing_scan_period_max_ms,
  numa_balancing_scan_size_mb`_, and numa_balancing_settle_count sysctls.
  
+OR in NUMA_BALANCING_MEMORY_TIERING to optimize page placement among
+different types of memory (represented as different NUMA nodes) to
+place the hot pages in the fast memory.  This is also implemented by
+unmapping pages and trapping the resulting page faults.
  
  numa_balancing_scan_period_min_ms, numa_balancing_scan_delay_ms, numa_balancing_scan_period_max_ms, numa_balancing_scan_size_mb
  ===============================================================================================================================
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h

index 310b6e7ce58aa30ee43deb98eca108c738135fc0..962b14d403e8fc4a8144ed0e02f2593883843ecd 100644 (file)
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -353,6 +353,7 @@ enum zone_watermarks {
         WMARK_MIN,
         WMARK_LOW,
         WMARK_HIGH,
+       WMARK_PROMO,
         NR_WMARK
  };
  
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h

index c19dd5a2c05c679ad7d47f6edd4d9ac27af330d0..b5eec8854c5a5b4dd63999a82d9d99681c4fb996 100644 (file)
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -23,6 +23,16 @@ enum sched_tunable_scaling {
         SCHED_TUNABLESCALING_END,
  };
  
+#define NUMA_BALANCING_DISABLED                0x0
+#define NUMA_BALANCING_NORMAL          0x1
+#define NUMA_BALANCING_MEMORY_TIERING  0x2
+
+#ifdef CONFIG_NUMA_BALANCING
+extern int sysctl_numa_balancing_mode;
+#else
+#define sysctl_numa_balancing_mode     0
+#endif
+
  /*
   *  control realtime throttling:
   *
diff --git a/kernel/sched/core.c b/kernel/sched/core.c

index 9745613d531ce078ab4e2908a8633b716ef3c0ed..da6a60383645e923eb0f2cb82f1adcc1961c807a 100644 (file)
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4279,7 +4279,9 @@ DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
  
  #ifdef CONFIG_NUMA_BALANCING
  
-void set_numabalancing_state(bool enabled)
+int sysctl_numa_balancing_mode;
+
+static void __set_numabalancing_state(bool enabled)
  {
         if (enabled)
                 static_branch_enable(&sched_numa_balancing);
@@ -4287,13 +4289,22 @@ void set_numabalancing_state(bool enabled)
                 static_branch_disable(&sched_numa_balancing);
  }
  
+void set_numabalancing_state(bool enabled)
+{
+       if (enabled)
+               sysctl_numa_balancing_mode = NUMA_BALANCING_NORMAL;
+       else
+               sysctl_numa_balancing_mode = NUMA_BALANCING_DISABLED;
+       __set_numabalancing_state(enabled);
+}
+
  #ifdef CONFIG_PROC_SYSCTL
  int sysctl_numa_balancing(struct ctl_table *table, int write,
                           void *buffer, size_t *lenp, loff_t *ppos)
  {
         struct ctl_table t;
         int err;
-       int state = static_branch_likely(&sched_numa_balancing);
+       int state = sysctl_numa_balancing_mode;
  
         if (write && !capable(CAP_SYS_ADMIN))
                 return -EPERM;
@@ -4303,8 +4314,10 @@ int sysctl_numa_balancing(struct ctl_table *table, int write,
         err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
         if (err < 0)
                 return err;
-       if (write)
-               set_numabalancing_state(state);
+       if (write) {
+               sysctl_numa_balancing_mode = state;
+               __set_numabalancing_state(state);
+       }
         return err;
  }
  #endif
diff --git a/kernel/sysctl.c b/kernel/sysctl.c

index 730ab56d9e92e65759198f1635dd92564dfd7cbc..3395b99d59a46c4955fcdd77e6b0def0dcba87af 100644 (file)
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1696,7 +1696,7 @@ static struct ctl_table kern_table[] = {
                 .mode           = 0644,
                 .proc_handler   = sysctl_numa_balancing,
                 .extra1         = SYSCTL_ZERO,
-               .extra2         = SYSCTL_ONE,
+               .extra2         = SYSCTL_FOUR,
         },
  #endif /* CONFIG_NUMA_BALANCING */
         {
diff --git a/mm/migrate.c b/mm/migrate.c

index dc4adf9792018d6fa4702e5ff2d6776ec1d75ebb..78b2cf87946d28c09b31d364ff96d19c4d8349c7 100644 (file)
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -51,6 +51,7 @@
  #include <linux/oom.h>
  #include <linux/memory.h>
  #include <linux/random.h>
+#include <linux/sched/sysctl.h>
  
  #include <asm/tlbflush.h>
  
@@ -2031,16 +2032,27 @@ static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
  {
         int page_lru;
         int nr_pages = thp_nr_pages(page);
+       int order = compound_order(page);
  
-       VM_BUG_ON_PAGE(compound_order(page) && !PageTransHuge(page), page);
+       VM_BUG_ON_PAGE(order && !PageTransHuge(page), page);
  
         /* Do not migrate THP mapped by multiple processes */
         if (PageTransHuge(page) && total_mapcount(page) > 1)
                 return 0;
  
         /* Avoid migrating to a node that is nearly full */
-       if (!migrate_balanced_pgdat(pgdat, nr_pages))
+       if (!migrate_balanced_pgdat(pgdat, nr_pages)) {
+               int z;
+
+               if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING))
+                       return 0;
+               for (z = pgdat->nr_zones - 1; z >= 0; z--) {
+                       if (populated_zone(pgdat->node_zones + z))
+                               break;
+               }
+               wakeup_kswapd(pgdat->node_zones + z, 0, order, ZONE_MOVABLE);
                 return 0;
+       }
  
         if (isolate_lru_page(page))
                 return 0;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c

index a573aa9f5160b11b27c3a4327737b47559a82972..8b18a077c409724ec32cdb4fae81cc5668a9d44e 100644 (file)
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -8441,7 +8441,8 @@ static void __setup_per_zone_wmarks(void)
  
                 zone->watermark_boost = 0;
                 zone->_watermark[WMARK_LOW]  = min_wmark_pages(zone) + tmp;
-               zone->_watermark[WMARK_HIGH] = min_wmark_pages(zone) + tmp * 2;
+               zone->_watermark[WMARK_HIGH] = low_wmark_pages(zone) + tmp;
+               zone->_watermark[WMARK_PROMO] = high_wmark_pages(zone) + tmp;
  
                 spin_unlock_irqrestore(&zone->lock, flags);
         }
diff --git a/mm/vmscan.c b/mm/vmscan.c

index f5ec53f19f3b28d317c347a7cc4fe0295ffdc234..499fa86e754a07c0c387b5e4cf11aef090cad107 100644 (file)
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -56,6 +56,7 @@
  
  #include <linux/swapops.h>
  #include <linux/balloon_compaction.h>
+#include <linux/sched/sysctl.h>
  
  #include "internal.h"
  
@@ -3895,7 +3896,10 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int highest_zoneidx)
                 if (!managed_zone(zone))
                         continue;
  
-               mark = high_wmark_pages(zone);
+               if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING)
+                       mark = wmark_pages(zone, WMARK_PROMO);
+               else
+                       mark = high_wmark_pages(zone);
                 if (zone_watermark_ok_safe(zone, order, mark, highest_zoneidx))
                         return true;
         }
author	Huang Ying <ying.huang@intel.com>
	Tue, 22 Mar 2022 21:46:23 +0000 (14:46 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Tue, 22 Mar 2022 22:57:09 +0000 (15:57 -0700)
Documentation/admin-guide/sysctl/kernel.rst		patch \| blob \| blame \| history
include/linux/mmzone.h		patch \| blob \| blame \| history
include/linux/sched/sysctl.h		patch \| blob \| blame \| history
kernel/sched/core.c		patch \| blob \| blame \| history
kernel/sysctl.c		patch \| blob \| blame \| history
mm/migrate.c		patch \| blob \| blame \| history
mm/page_alloc.c		patch \| blob \| blame \| history
mm/vmscan.c		patch \| blob \| blame \| history