// SPDX-License-Identifier: GPL-2.0
#include <linux/slab.h>
#include <linux/lockdep.h>
#include <linux/sysfs.h>
#include <linux/kobject.h>
#include <linux/memory.h>
#include <linux/memory-tiers.h>

#include "internal.h"

struct memory_tier {
	/* hierarchy of memory tiers */
	struct list_head list;
	/* list of all memory types part of this tier */
	struct list_head memory_types;
	/*
	 * start value of abstract distance. memory tier maps
	 * an abstract distance range,
	 * adistance_start .. adistance_start + MEMTIER_CHUNK_SIZE
	 */
	int adistance_start;
	struct device dev;
	/* All the nodes that are part of all the lower memory tiers. */
	nodemask_t lower_tier_mask;
};

struct demotion_nodes {
	nodemask_t preferred;
};

struct node_memory_type_map {
	struct memory_dev_type *memtype;
	int map_count;
};

static DEFINE_MUTEX(memory_tier_lock);
static LIST_HEAD(memory_tiers);
static struct node_memory_type_map node_memory_types[MAX_NUMNODES];
static struct memory_dev_type *default_dram_type;

static struct bus_type memory_tier_subsys = {
	.name = "memory_tiering",
	.dev_name = "memory_tier",
};

#ifdef CONFIG_MIGRATION
static int top_tier_adistance;
/*
 * node_demotion[] examples:
 *
 * Example 1:
 *
 * Node 0 & 1 are CPU + DRAM nodes, node 2 & 3 are PMEM nodes.
 *
 * memory_tiers0 = 0-1
 * memory_tiers1 = 2-3
 *
 * node_demotion[0].preferred = 2
 * node_demotion[1].preferred = 3
 * node_demotion[2].preferred = <empty>
 * node_demotion[3].preferred = <empty>
 *
 * Example 2:
 *
 * Node 0 & 1 are CPU + DRAM nodes, node 2 is a memory-only DRAM node.
 *
 * memory_tiers0 = 0-2
 *
 * node_demotion[0].preferred = <empty>
 * node_demotion[1].preferred = <empty>
 * node_demotion[2].preferred = <empty>
 *
 * Example 3:
 *
 * Node 0 is a CPU + DRAM node, node 1 is an HBM node, node 2 is a PMEM node.
 *
 * memory_tiers0 = 1
 * memory_tiers1 = 0
 * memory_tiers2 = 2
 *
 * node_demotion[0].preferred = 2
 * node_demotion[1].preferred = 0
 * node_demotion[2].preferred = <empty>
 */
static struct demotion_nodes *node_demotion __read_mostly;
#endif /* CONFIG_MIGRATION */
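
/* Map a memory tier sysfs device back to its containing struct memory_tier. */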
static inline struct memory_tier *to_memory_tier(struct device *device)
{
	return container_of(device, struct memory_tier, dev);
}
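
/*
 * Return the union of the nodemasks of all memory_dev_types linked to
 * this memory tier.
 */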
static __always_inline nodemask_t get_memtier_nodemask(struct memory_tier *memtier)
{
	nodemask_t nodes = NODE_MASK_NONE;
	struct memory_dev_type *memtype;

	list_for_each_entry(memtype, &memtier->memory_types, tier_sibiling)
		nodes_or(nodes, nodes, memtype->nodes);

	return nodes;
}
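
/* Device release callback: free the memory tier once no references remain. */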
static void memory_tier_device_release(struct device *dev)
{
	struct memory_tier *tier = to_memory_tier(dev);

	/*
	 * synchronize_rcu in clear_node_memory_tier makes sure
	 * we don't have rcu access to this memory tier.
	 */
	kfree(tier);
}
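
/* sysfs "nodelist" attribute: list the nodes that belong to this memory tier. */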
static ssize_t nodelist_show(struct device *dev,
			     struct device_attribute *attr, char *buf)
{
	int ret;
	nodemask_t nmask;

	mutex_lock(&memory_tier_lock);
	nmask = get_memtier_nodemask(to_memory_tier(dev));
	ret = sysfs_emit(buf, "%*pbl\n", nodemask_pr_args(&nmask));
	mutex_unlock(&memory_tier_lock);
	return ret;
}
static DEVICE_ATTR_RO(nodelist);

static struct attribute *memtier_dev_attrs[] = {
	&dev_attr_nodelist.attr,
	NULL
};

static const struct attribute_group memtier_dev_group = {
	.attrs = memtier_dev_attrs,
};

static const struct attribute_group *memtier_dev_groups[] = {
	&memtier_dev_group,
	NULL
};
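
/*
 * Find the memory tier covering this memory type's abstract distance,
 * creating and registering a new tier if none exists, and link the memory
 * type into that tier. Must be called with memory_tier_lock held.
 */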
static struct memory_tier *find_create_memory_tier(struct memory_dev_type *memtype)
{
	int ret;
	bool found_slot = false;
	struct memory_tier *memtier, *new_memtier;
	int adistance = memtype->adistance;
	unsigned int memtier_adistance_chunk_size = MEMTIER_CHUNK_SIZE;

	lockdep_assert_held_once(&memory_tier_lock);

	adistance = round_down(adistance, memtier_adistance_chunk_size);
	/*
	 * If the memtype is already part of a memory tier,
	 * just return that.
	 */
	if (!list_empty(&memtype->tier_sibiling)) {
		list_for_each_entry(memtier, &memory_tiers, list) {
			if (adistance == memtier->adistance_start)
				return memtier;
		}
		WARN_ON(1);
		return ERR_PTR(-EINVAL);
	}

	list_for_each_entry(memtier, &memory_tiers, list) {
		if (adistance == memtier->adistance_start) {
			goto link_memtype;
		} else if (adistance < memtier->adistance_start) {
			found_slot = true;
			break;
		}
	}

	new_memtier = kzalloc(sizeof(struct memory_tier), GFP_KERNEL);
	if (!new_memtier)
		return ERR_PTR(-ENOMEM);

	new_memtier->adistance_start = adistance;
	INIT_LIST_HEAD(&new_memtier->list);
	INIT_LIST_HEAD(&new_memtier->memory_types);
	if (found_slot)
		list_add_tail(&new_memtier->list, &memtier->list);
	else
		list_add_tail(&new_memtier->list, &memory_tiers);

	new_memtier->dev.id = adistance >> MEMTIER_CHUNK_BITS;
	new_memtier->dev.bus = &memory_tier_subsys;
	new_memtier->dev.release = memory_tier_device_release;
	new_memtier->dev.groups = memtier_dev_groups;

	ret = device_register(&new_memtier->dev);
	if (ret) {
		list_del(&new_memtier->list);
		put_device(&new_memtier->dev);
		return ERR_PTR(ret);
	}
	memtier = new_memtier;

link_memtype:
	list_add(&memtype->tier_sibiling, &memtier->memory_types);
	return memtier;
}
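
/* Return the memory tier of @node; the caller must hold memory_tier_lock. */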
static struct memory_tier *__node_get_memory_tier(int node)
{
	pg_data_t *pgdat;

	pgdat = NODE_DATA(node);
	if (!pgdat)
		return NULL;
	/*
	 * Since we hold memory_tier_lock, we can avoid
	 * RCU read locks when accessing the details. No
	 * parallel updates are possible here.
	 */
	return rcu_dereference_check(pgdat->memtier,
				     lockdep_is_held(&memory_tier_lock));
}

#ifdef CONFIG_MIGRATION
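/*
 * Return true if @node belongs to a tier whose abstract distance start is
 * at or below top_tier_adistance, i.e. a tier from which pages are not
 * promoted any further.
 */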
bool node_is_toptier(int node)
{
	bool toptier;
	pg_data_t *pgdat;
	struct memory_tier *memtier;

	pgdat = NODE_DATA(node);
	if (!pgdat)
		return false;

	rcu_read_lock();
	memtier = rcu_dereference(pgdat->memtier);
	if (!memtier) {
		toptier = true;
		goto out;
	}
	if (memtier->adistance_start <= top_tier_adistance)
		toptier = true;
	else
		toptier = false;
out:
	rcu_read_unlock();
	return toptier;
}
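
/*
 * Fill @targets with the nodes in all tiers below @pgdat's tier, i.e. the
 * nodes this node is allowed to demote to.
 */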
void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets)
{
	struct memory_tier *memtier;

	/*
	 * pg_data_t.memtier updates include a synchronize_rcu()
	 * which ensures that we either find NULL or a valid memtier
	 * in NODE_DATA. Protect the access via rcu_read_lock().
	 */
	rcu_read_lock();
	memtier = rcu_dereference(pgdat->memtier);
	if (memtier)
		*targets = memtier->lower_tier_mask;
	else
		*targets = NODE_MASK_NONE;
	rcu_read_unlock();
}

/**
 * next_demotion_node() - Get the next node in the demotion path
 * @node: The starting node to lookup the next node
 *
 * Return: node id for next memory node in the demotion path hierarchy
 * from @node; NUMA_NO_NODE if @node is terminal. This does not keep
 * @node online or guarantee that it *continues* to be the next demotion
 * target.
 */
int next_demotion_node(int node)
{
	struct demotion_nodes *nd;
	int target;

	if (!node_demotion)
		return NUMA_NO_NODE;

	nd = &node_demotion[node];

	/*
	 * node_demotion[] is updated without excluding this
	 * function from running.
	 *
	 * Make sure to use RCU over entire code blocks if
	 * node_demotion[] reads need to be consistent.
	 */
	rcu_read_lock();
	/*
	 * If there are multiple target nodes, just select one
	 * target node randomly.
	 *
	 * We could also use round-robin to select the target node, but
	 * that needs another variable in node_demotion[] to record the
	 * last selected target node, which may cause cache ping-pong as
	 * it keeps changing. Introducing per-cpu data to avoid that
	 * caching issue seems more complicated. So selecting the target
	 * node randomly seems better for now.
	 */
	target = node_random(&nd->preferred);
	rcu_read_unlock();

	return target;
}
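
/*
 * Clear every node's preferred demotion targets and every tier's
 * lower_tier_mask, then wait for concurrent RCU readers to finish.
 */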
static void disable_all_demotion_targets(void)
{
	struct memory_tier *memtier;
	int node;

	for_each_node_state(node, N_MEMORY) {
		node_demotion[node].preferred = NODE_MASK_NONE;
		/*
		 * We are holding memory_tier_lock, it is safe
		 * to access pgdat->memtier.
		 */
		memtier = __node_get_memory_tier(node);
		if (memtier)
			memtier->lower_tier_mask = NODE_MASK_NONE;
	}
	/*
	 * Ensure that the "disable" is visible across the system.
	 * Readers will see either a combination of before+disable
	 * state or disable+after. They will never see before and
	 * after state together.
	 */
	synchronize_rcu();
}

/*
 * Find an automatic demotion target for all memory
 * nodes. Failing here is OK. It might just indicate
 * being at the end of a chain.
 */
static void establish_demotion_targets(void)
{
	struct memory_tier *memtier;
	struct demotion_nodes *nd;
	int target = NUMA_NO_NODE, node;
	int distance, best_distance;
	nodemask_t tier_nodes, lower_tier;

	lockdep_assert_held_once(&memory_tier_lock);

	if (!node_demotion)
		return;

	disable_all_demotion_targets();

	for_each_node_state(node, N_MEMORY) {
		best_distance = -1;
		nd = &node_demotion[node];

		memtier = __node_get_memory_tier(node);
		if (!memtier || list_is_last(&memtier->list, &memory_tiers))
			continue;
		/*
		 * Get the lower memtier to find the demotion node list.
		 */
		memtier = list_next_entry(memtier, list);
		tier_nodes = get_memtier_nodemask(memtier);
		/*
		 * find_next_best_node() uses the 'used' nodemask as a skip
		 * list. Add all memory nodes except the selected memory tier
		 * nodelist to the skip list so that we find the best node
		 * from the memtier nodelist.
		 */
		nodes_andnot(tier_nodes, node_states[N_MEMORY], tier_nodes);

		/*
		 * Find all the nodes in the memory tier node list of the same
		 * best distance and add them to the preferred mask. We
		 * randomly select between nodes in the preferred mask when
		 * allocating pages during demotion.
		 */
		do {
			target = find_next_best_node(node, &tier_nodes);
			if (target == NUMA_NO_NODE)
				break;

			distance = node_distance(node, target);
			if (distance == best_distance || best_distance == -1) {
				best_distance = distance;
				node_set(target, nd->preferred);
			} else {
				break;
			}
		} while (1);
	}
	/*
	 * Promotion is allowed from a memory tier to a higher memory tier
	 * only if the source tier doesn't include compute. We want to skip
	 * promotion from a memory tier if any node that is part of that
	 * tier has CPUs. Once we detect such a memory tier, we consider
	 * it the top tier, from which promotion is not allowed.
	 */
	list_for_each_entry_reverse(memtier, &memory_tiers, list) {
		tier_nodes = get_memtier_nodemask(memtier);
		nodes_and(tier_nodes, node_states[N_CPU], tier_nodes);
		if (!nodes_empty(tier_nodes)) {
			/*
			 * abstract distance below the max value of this
			 * memtier is considered toptier.
			 */
			top_tier_adistance = memtier->adistance_start +
						MEMTIER_CHUNK_SIZE - 1;
			break;
		}
	}
	/*
	 * Now build the lower_tier mask for each node, collecting the node
	 * mask from all memory tiers below it. This allows us to fall back
	 * demotion page allocation to a set of nodes that is closer to the
	 * preferred node selected above.
	 */
	lower_tier = node_states[N_MEMORY];
	list_for_each_entry(memtier, &memory_tiers, list) {
		/*
		 * Keep removing the current tier from lower_tier nodes.
		 * This removes all nodes in the current and above memory
		 * tiers from the lower_tier mask.
		 */
		tier_nodes = get_memtier_nodemask(memtier);
		nodes_andnot(lower_tier, lower_tier, tier_nodes);
		memtier->lower_tier_mask = lower_tier;
	}
}

#else
static inline void establish_demotion_targets(void) {}
#endif /* CONFIG_MIGRATION */
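
/*
 * Record @memtype as @node's memory type if the node does not have one yet
 * and take a reference on the memtype for the node's first mapping.
 * Must be called with memory_tier_lock held.
 */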
static inline void __init_node_memory_type(int node, struct memory_dev_type *memtype)
{
	if (!node_memory_types[node].memtype)
		node_memory_types[node].memtype = memtype;
	/*
	 * For each device getting added in the same NUMA node
	 * with this specific memtype, bump the map count. We
	 * only take the memtype device reference once, so that
	 * changing a node's memtype can be done by dropping the
	 * only reference count taken here.
	 */

	if (node_memory_types[node].memtype == memtype) {
		if (!node_memory_types[node].map_count++)
			kref_get(&memtype->kref);
	}
}
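
/*
 * Place @node in a memory tier based on its memory type (defaulting to the
 * DRAM type) and publish the tier in the node's pg_data_t.
 */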
static struct memory_tier *set_node_memory_tier(int node)
{
	struct memory_tier *memtier;
	struct memory_dev_type *memtype;
	pg_data_t *pgdat = NODE_DATA(node);

	lockdep_assert_held_once(&memory_tier_lock);

	if (!node_state(node, N_MEMORY))
		return ERR_PTR(-EINVAL);

	__init_node_memory_type(node, default_dram_type);

	memtype = node_memory_types[node].memtype;
	node_set(node, memtype->nodes);
	memtier = find_create_memory_tier(memtype);
	if (!IS_ERR(memtier))
		rcu_assign_pointer(pgdat->memtier, memtier);
	return memtier;
}

static void destroy_memory_tier(struct memory_tier *memtier)
{
	list_del(&memtier->list);
	device_unregister(&memtier->dev);
}
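
/*
 * Detach @node from its memory tier, destroying the tier if it becomes
 * empty. Returns true if the node was attached to a tier.
 */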
static bool clear_node_memory_tier(int node)
{
	bool cleared = false;
	pg_data_t *pgdat;
	struct memory_tier *memtier;

	pgdat = NODE_DATA(node);
	if (!pgdat)
		return false;

	/*
	 * Make sure that anybody looking at NODE_DATA who finds
	 * a valid memtier finds memory_dev_types with nodes still
	 * linked to the memtier. We achieve this by waiting for
	 * rcu read section to finish using synchronize_rcu.
	 * This also enables us to free the destroyed memory tier
	 * with kfree instead of kfree_rcu.
	 */
	memtier = __node_get_memory_tier(node);
	if (memtier) {
		struct memory_dev_type *memtype;

		rcu_assign_pointer(pgdat->memtier, NULL);
		synchronize_rcu();
		memtype = node_memory_types[node].memtype;
		node_clear(node, memtype->nodes);
		if (nodes_empty(memtype->nodes)) {
			list_del_init(&memtype->tier_sibiling);
			if (list_empty(&memtier->memory_types))
				destroy_memory_tier(memtier);
		}
		cleared = true;
	}
	return cleared;
}

static void release_memtype(struct kref *kref)
{
	struct memory_dev_type *memtype;

	memtype = container_of(kref, struct memory_dev_type, kref);
	kfree(memtype);
}
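
/*
 * Allocate and initialise a memory_dev_type with the given abstract
 * distance; it is freed via put_memory_type() when the last reference
 * is dropped.
 */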
struct memory_dev_type *alloc_memory_type(int adistance)
{
	struct memory_dev_type *memtype;

	memtype = kmalloc(sizeof(*memtype), GFP_KERNEL);
	if (!memtype)
		return ERR_PTR(-ENOMEM);

	memtype->adistance = adistance;
	INIT_LIST_HEAD(&memtype->tier_sibiling);
	memtype->nodes = NODE_MASK_NONE;
	kref_init(&memtype->kref);
	return memtype;
}
EXPORT_SYMBOL_GPL(alloc_memory_type);

void put_memory_type(struct memory_dev_type *memtype)
{
	kref_put(&memtype->kref, release_memtype);
}
EXPORT_SYMBOL_GPL(put_memory_type);

void init_node_memory_type(int node, struct memory_dev_type *memtype)
{
	mutex_lock(&memory_tier_lock);
	__init_node_memory_type(node, memtype);
	mutex_unlock(&memory_tier_lock);
}
EXPORT_SYMBOL_GPL(init_node_memory_type);

void clear_node_memory_type(int node, struct memory_dev_type *memtype)
{
	mutex_lock(&memory_tier_lock);
	if (node_memory_types[node].memtype == memtype)
		node_memory_types[node].map_count--;
	/*
	 * If we unmapped all the attached devices to this node,
	 * clear the node memory type.
	 */
	if (!node_memory_types[node].map_count) {
		node_memory_types[node].memtype = NULL;
		put_memory_type(memtype);
	}
	mutex_unlock(&memory_tier_lock);
}
EXPORT_SYMBOL_GPL(clear_node_memory_type);
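
/*
 * Memory hotplug notifier: rebuild the node's tier assignment and the
 * demotion targets when a node gains or loses memory.
 */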
static int __meminit memtier_hotplug_callback(struct notifier_block *self,
					      unsigned long action, void *_arg)
{
	struct memory_tier *memtier;
	struct memory_notify *arg = _arg;

	/*
	 * Only update the node migration order when a node is
	 * changing status, like online->offline.
	 */
	if (arg->status_change_nid < 0)
		return notifier_from_errno(0);

	switch (action) {
	case MEM_OFFLINE:
		mutex_lock(&memory_tier_lock);
		if (clear_node_memory_tier(arg->status_change_nid))
			establish_demotion_targets();
		mutex_unlock(&memory_tier_lock);
		break;
	case MEM_ONLINE:
		mutex_lock(&memory_tier_lock);
		memtier = set_node_memory_tier(arg->status_change_nid);
		if (!IS_ERR(memtier))
			establish_demotion_targets();
		mutex_unlock(&memory_tier_lock);
		break;
	}

	return notifier_from_errno(0);
}
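
/*
 * Boot-time setup: register the memory_tiering subsystem, create the
 * default DRAM memory type, assign every N_MEMORY node to a tier, build
 * the initial demotion targets and register the hotplug notifier.
 */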
static int __init memory_tier_init(void)
{
	int ret, node;
	struct memory_tier *memtier;

	ret = subsys_virtual_register(&memory_tier_subsys, NULL);
	if (ret)
		panic("%s() failed to register memory tier subsystem\n", __func__);

#ifdef CONFIG_MIGRATION
	node_demotion = kcalloc(nr_node_ids, sizeof(struct demotion_nodes),
				GFP_KERNEL);
	WARN_ON(!node_demotion);
#endif
	mutex_lock(&memory_tier_lock);
	/*
	 * For now we can have 4 faster memory tiers with smaller adistance
	 * than default DRAM tier.
	 */
	default_dram_type = alloc_memory_type(MEMTIER_ADISTANCE_DRAM);
	if (IS_ERR(default_dram_type))
		panic("%s() failed to allocate default DRAM tier\n", __func__);

	/*
	 * Look at all the existing N_MEMORY nodes and add them to
	 * default memory tier or to a tier if we already have memory
	 * types assigned.
	 */
	for_each_node_state(node, N_MEMORY) {
		memtier = set_node_memory_tier(node);
		if (IS_ERR(memtier))
			/*
			 * Continue with memtiers we are able to setup
			 */
			break;
	}
	establish_demotion_targets();
	mutex_unlock(&memory_tier_lock);

	hotplug_memory_notifier(memtier_hotplug_callback, MEMTIER_HOTPLUG_PRI);
	return 0;
}
subsys_initcall(memory_tier_init);

bool numa_demotion_enabled = false;

#ifdef CONFIG_MIGRATION
#ifdef CONFIG_SYSFS
static ssize_t demotion_enabled_show(struct kobject *kobj,
				     struct kobj_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%s\n",
			  numa_demotion_enabled ? "true" : "false");
}
static ssize_t demotion_enabled_store(struct kobject *kobj,
				      struct kobj_attribute *attr,
				      const char *buf, size_t count)
{
	ssize_t ret;

	ret = kstrtobool(buf, &numa_demotion_enabled);
	if (ret)
		return ret;

	return count;
}

static struct kobj_attribute numa_demotion_enabled_attr =
	__ATTR_RW(demotion_enabled);

static struct attribute *numa_attrs[] = {
	&numa_demotion_enabled_attr.attr,
	NULL,
};

static const struct attribute_group numa_attr_group = {
	.attrs = numa_attrs,
};
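
/* Create /sys/kernel/mm/numa and expose the demotion_enabled knob there. */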
static int __init numa_init_sysfs(void)
{
	int err;
	struct kobject *numa_kobj;

	numa_kobj = kobject_create_and_add("numa", mm_kobj);
	if (!numa_kobj) {
		pr_err("failed to create numa kobject\n");
		return -ENOMEM;
	}
	err = sysfs_create_group(numa_kobj, &numa_attr_group);
	if (err) {
		pr_err("failed to register numa group\n");
		goto delete_obj;
	}
	return 0;

delete_obj:
	kobject_put(numa_kobj);
	return err;
}
subsys_initcall(numa_init_sysfs);
#endif /* CONFIG_SYSFS */