mm/migrate: move node demotion code to near its user

author Huang Ying <ying.huang@intel.com>

Fri, 14 Jan 2022 22:08:49 +0000 (14:08 -0800)

committer Linus Torvalds <torvalds@linux-foundation.org>

Sat, 15 Jan 2022 14:30:31 +0000 (16:30 +0200)
author Huang Ying <ying.huang@intel.com>
Fri, 14 Jan 2022 22:08:49 +0000 (14:08 -0800)
committer Linus Torvalds <torvalds@linux-foundation.org>
Sat, 15 Jan 2022 14:30:31 +0000 (16:30 +0200)
diff --git a/mm/migrate.c b/mm/migrate.c

index f50087d3ebf2959deed0c77e5f64018651750947..e50b80534d805a3097fb4acd38f7aa5864974395 100644 (file)
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1093,139 +1093,6 @@ out:
         return rc;
  }
  
-
-/*
- * node_demotion[] example:
- *
- * Consider a system with two sockets.  Each socket has
- * three classes of memory attached: fast, medium and slow.
- * Each memory class is placed in its own NUMA node.  The
- * CPUs are placed in the node with the "fast" memory.  The
- * 6 NUMA nodes (0-5) might be split among the sockets like
- * this:
- *
- *     Socket A: 0, 1, 2
- *     Socket B: 3, 4, 5
- *
- * When Node 0 fills up, its memory should be migrated to
- * Node 1.  When Node 1 fills up, it should be migrated to
- * Node 2.  The migration path start on the nodes with the
- * processors (since allocations default to this node) and
- * fast memory, progress through medium and end with the
- * slow memory:
- *
- *     0 -> 1 -> 2 -> stop
- *     3 -> 4 -> 5 -> stop
- *
- * This is represented in the node_demotion[] like this:
- *
- *     {  nr=1, nodes[0]=1 }, // Node 0 migrates to 1
- *     {  nr=1, nodes[0]=2 }, // Node 1 migrates to 2
- *     {  nr=0, nodes[0]=-1 }, // Node 2 does not migrate
- *     {  nr=1, nodes[0]=4 }, // Node 3 migrates to 4
- *     {  nr=1, nodes[0]=5 }, // Node 4 migrates to 5
- *     {  nr=0, nodes[0]=-1 }, // Node 5 does not migrate
- *
- * Moreover some systems may have multiple slow memory nodes.
- * Suppose a system has one socket with 3 memory nodes, node 0
- * is fast memory type, and node 1/2 both are slow memory
- * type, and the distance between fast memory node and slow
- * memory node is same. So the migration path should be:
- *
- *     0 -> 1/2 -> stop
- *
- * This is represented in the node_demotion[] like this:
- *     { nr=2, {nodes[0]=1, nodes[1]=2} }, // Node 0 migrates to node 1 and node 2
- *     { nr=0, nodes[0]=-1, }, // Node 1 dose not migrate
- *     { nr=0, nodes[0]=-1, }, // Node 2 does not migrate
- */
-
-/*
- * Writes to this array occur without locking.  Cycles are
- * not allowed: Node X demotes to Y which demotes to X...
- *
- * If multiple reads are performed, a single rcu_read_lock()
- * must be held over all reads to ensure that no cycles are
- * observed.
- */
-#define DEFAULT_DEMOTION_TARGET_NODES 15
-
-#if MAX_NUMNODES < DEFAULT_DEMOTION_TARGET_NODES
-#define DEMOTION_TARGET_NODES  (MAX_NUMNODES - 1)
-#else
-#define DEMOTION_TARGET_NODES  DEFAULT_DEMOTION_TARGET_NODES
-#endif
-
-struct demotion_nodes {
-       unsigned short nr;
-       short nodes[DEMOTION_TARGET_NODES];
-};
-
-static struct demotion_nodes *node_demotion __read_mostly;
-
-/**
- * next_demotion_node() - Get the next node in the demotion path
- * @node: The starting node to lookup the next node
- *
- * Return: node id for next memory node in the demotion path hierarchy
- * from @node; NUMA_NO_NODE if @node is terminal.  This does not keep
- * @node online or guarantee that it *continues* to be the next demotion
- * target.
- */
-int next_demotion_node(int node)
-{
-       struct demotion_nodes *nd;
-       unsigned short target_nr, index;
-       int target;
-
-       if (!node_demotion)
-               return NUMA_NO_NODE;
-
-       nd = &node_demotion[node];
-
-       /*
-        * node_demotion[] is updated without excluding this
-        * function from running.  RCU doesn't provide any
-        * compiler barriers, so the READ_ONCE() is required
-        * to avoid compiler reordering or read merging.
-        *
-        * Make sure to use RCU over entire code blocks if
-        * node_demotion[] reads need to be consistent.
-        */
-       rcu_read_lock();
-       target_nr = READ_ONCE(nd->nr);
-
-       switch (target_nr) {
-       case 0:
-               target = NUMA_NO_NODE;
-               goto out;
-       case 1:
-               index = 0;
-               break;
-       default:
-               /*
-                * If there are multiple target nodes, just select one
-                * target node randomly.
-                *
-                * In addition, we can also use round-robin to select
-                * target node, but we should introduce another variable
-                * for node_demotion[] to record last selected target node,
-                * that may cause cache ping-pong due to the changing of
-                * last target node. Or introducing per-cpu data to avoid
-                * caching issue, which seems more complicated. So selecting
-                * target node randomly seems better until now.
-                */
-               index = get_random_int() % target_nr;
-               break;
-       }
-
-       target = READ_ONCE(nd->nodes[index]);
-
-out:
-       rcu_read_unlock();
-       return target;
-}
-
  /*
   * Obtain the lock on page, remove all ptes and migrate the page
   * to the newly allocated page in newpage.
@@ -3059,6 +2926,138 @@ void migrate_vma_finalize(struct migrate_vma *migrate)
  EXPORT_SYMBOL(migrate_vma_finalize);
  #endif /* CONFIG_DEVICE_PRIVATE */
  
+/*
+ * node_demotion[] example:
+ *
+ * Consider a system with two sockets.  Each socket has
+ * three classes of memory attached: fast, medium and slow.
+ * Each memory class is placed in its own NUMA node.  The
+ * CPUs are placed in the node with the "fast" memory.  The
+ * 6 NUMA nodes (0-5) might be split among the sockets like
+ * this:
+ *
+ *     Socket A: 0, 1, 2
+ *     Socket B: 3, 4, 5
+ *
+ * When Node 0 fills up, its memory should be migrated to
+ * Node 1.  When Node 1 fills up, it should be migrated to
+ * Node 2.  The migration path starts on the nodes with the
+ * processors (since allocations default to this node) and
+ * fast memory, progresses through medium and ends with the
+ * slow memory:
+ *
+ *     0 -> 1 -> 2 -> stop
+ *     3 -> 4 -> 5 -> stop
+ *
+ * This is represented in the node_demotion[] like this:
+ *
+ *     {  nr=1, nodes[0]=1 }, // Node 0 migrates to 1
+ *     {  nr=1, nodes[0]=2 }, // Node 1 migrates to 2
+ *     {  nr=0, nodes[0]=-1 }, // Node 2 does not migrate
+ *     {  nr=1, nodes[0]=4 }, // Node 3 migrates to 4
+ *     {  nr=1, nodes[0]=5 }, // Node 4 migrates to 5
+ *     {  nr=0, nodes[0]=-1 }, // Node 5 does not migrate
+ *
+ * Moreover some systems may have multiple slow memory nodes.
+ * Suppose a system has one socket with 3 memory nodes, node 0
+ * is fast memory type, and node 1/2 both are slow memory
+ * type, and the distance between fast memory node and slow
+ * memory node is the same. So the migration path should be:
+ *
+ *     0 -> 1/2 -> stop
+ *
+ * This is represented in the node_demotion[] like this:
+ *     { nr=2, {nodes[0]=1, nodes[1]=2} }, // Node 0 migrates to node 1 and node 2
+ *     { nr=0, nodes[0]=-1, }, // Node 1 does not migrate
+ *     { nr=0, nodes[0]=-1, }, // Node 2 does not migrate
+ */
+
+/*
+ * Writes to this array occur without locking.  Cycles are
+ * not allowed: Node X demotes to Y which demotes to X...
+ *
+ * If multiple reads are performed, a single rcu_read_lock()
+ * must be held over all reads to ensure that no cycles are
+ * observed.
+ */
+#define DEFAULT_DEMOTION_TARGET_NODES 15
+
+#if MAX_NUMNODES < DEFAULT_DEMOTION_TARGET_NODES
+#define DEMOTION_TARGET_NODES  (MAX_NUMNODES - 1)
+#else
+#define DEMOTION_TARGET_NODES  DEFAULT_DEMOTION_TARGET_NODES
+#endif
+
+struct demotion_nodes {
+       unsigned short nr;
+       short nodes[DEMOTION_TARGET_NODES];
+};
+
+static struct demotion_nodes *node_demotion __read_mostly;
+
+/**
+ * next_demotion_node() - Get the next node in the demotion path
+ * @node: The starting node to lookup the next node
+ *
+ * Return: node id for next memory node in the demotion path hierarchy
+ * from @node; NUMA_NO_NODE if @node is terminal.  This does not keep
+ * @node online or guarantee that it *continues* to be the next demotion
+ * target.
+ */
+int next_demotion_node(int node)
+{
+       struct demotion_nodes *nd;
+       unsigned short target_nr, index;
+       int target;
+
+       if (!node_demotion)
+               return NUMA_NO_NODE;
+
+       nd = &node_demotion[node];
+
+       /*
+        * node_demotion[] is updated without excluding this
+        * function from running.  RCU doesn't provide any
+        * compiler barriers, so the READ_ONCE() is required
+        * to avoid compiler reordering or read merging.
+        *
+        * Make sure to use RCU over entire code blocks if
+        * node_demotion[] reads need to be consistent.
+        */
+       rcu_read_lock();
+       target_nr = READ_ONCE(nd->nr);
+
+       switch (target_nr) {
+       case 0:
+               target = NUMA_NO_NODE;
+               goto out;
+       case 1:
+               index = 0;
+               break;
+       default:
+               /*
+                * If there are multiple target nodes, just select one
+                * target node randomly.
+                *
+                * In addition, we can also use round-robin to select
+                * target node, but we should introduce another variable
+                * for node_demotion[] to record last selected target node,
+                * that may cause cache ping-pong due to the changing of
+                * last target node. Or introducing per-cpu data to avoid
+                * caching issue, which seems more complicated. So selecting
+                * target node randomly seems better until now.
+                */
+               index = get_random_int() % target_nr;
+               break;
+       }
+
+       target = READ_ONCE(nd->nodes[index]);
+
+out:
+       rcu_read_unlock();
+       return target;
+}
+
  #if defined(CONFIG_HOTPLUG_CPU)
  /* Disable reclaim-based migration. */
  static void __disable_all_migrate_targets(void)
author	Huang Ying <ying.huang@intel.com>
	Fri, 14 Jan 2022 22:08:49 +0000 (14:08 -0800)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Sat, 15 Jan 2022 14:30:31 +0000 (16:30 +0200)