/*
 * zsmalloc memory allocator
 *
 * Copyright (C) 2011  Nitin Gupta
 * Copyright (C) 2012, 2013 Minchan Kim
 *
 * This code is released using a dual license strategy: BSD/GPL
 * You can choose the license that better fits your requirements.
 *
 *   Released under the terms of 3-clause BSD License
 *   Released under the terms of GNU General Public License Version 2.0
 */

/*
 * Following is how we use various fields and flags of underlying
 * struct page(s) to form a zspage.
 *
 * Usage of struct page fields:
 *	page->private: points to zspage
 *	page->index: links together all component pages of a zspage
 *		For the huge page, this is always 0, so we use this field
 *		to store handle.
 *	page->page_type: first object offset in a subpage of zspage
 *
 * Usage of struct page flags:
 *	PG_private: identifies the first component page
 *	PG_owner_priv_1: identifies the huge component page
 */
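/*
 * Illustrative layout (not from the original source), for a 3-page zspage:
 *
 *	zspage->first_page -> page0 -(index)-> page1 -(index)-> page2
 *
 * page0 carries PG_private, and every component page's ->private points
 * back at the zspage, as set up by create_page_chain() below.
 */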
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/bitops.h>
#include <linux/errno.h>
#include <linux/highmem.h>
#include <linux/string.h>
#include <linux/slab.h>
#include <linux/pgtable.h>
#include <asm/tlbflush.h>
#include <linux/cpumask.h>
#include <linux/cpu.h>
#include <linux/vmalloc.h>
#include <linux/preempt.h>
#include <linux/spinlock.h>
#include <linux/shrinker.h>
#include <linux/types.h>
#include <linux/debugfs.h>
#include <linux/zsmalloc.h>
#include <linux/zpool.h>
#include <linux/migrate.h>
#include <linux/wait.h>
#include <linux/pagemap.h>
#include <linux/local_lock.h>
#define ZSPAGE_MAGIC	0x58

/*
 * This must be power of 2 and greater than or equal to sizeof(link_free).
 * These two conditions ensure that any 'struct link_free' itself doesn't
 * span more than 1 page which avoids complex case of mapping 2 pages simply
 * to restore link_free pointer values.
 */
#define ZS_ALIGN		8

/*
 * A single 'zspage' is composed of up to 2^N discontiguous 0-order (single)
 * pages. ZS_MAX_ZSPAGE_ORDER defines upper limit on N.
 */
#define ZS_MAX_ZSPAGE_ORDER	2
#define ZS_MAX_PAGES_PER_ZSPAGE	(_AC(1, UL) << ZS_MAX_ZSPAGE_ORDER)
#define ZS_HANDLE_SIZE (sizeof(unsigned long))

/*
 * Object location (<PFN>, <obj_idx>) is encoded as
 * a single (unsigned long) handle value.
 *
 * Note that object index <obj_idx> starts from 0.
 *
 * This is made more complicated by various memory models and PAE.
 */
#ifndef MAX_POSSIBLE_PHYSMEM_BITS
#ifdef MAX_PHYSMEM_BITS
#define MAX_POSSIBLE_PHYSMEM_BITS MAX_PHYSMEM_BITS
#else
/*
 * If this definition of MAX_PHYSMEM_BITS is used, OBJ_INDEX_BITS will just
 * be PAGE_SHIFT.
 */
#define MAX_POSSIBLE_PHYSMEM_BITS BITS_PER_LONG
#endif
#endif
#define _PFN_BITS		(MAX_POSSIBLE_PHYSMEM_BITS - PAGE_SHIFT)

/*
 * The head of an allocated object carries OBJ_ALLOCATED_TAG in its least
 * significant bit to identify whether the object is allocated.  It is safe
 * to use that bit because the header stores a handle, which is an address
 * aligned to at least 4 bytes, so the low two bits are always free for
 * tagging.
 */
#define OBJ_ALLOCATED_TAG	1
#define OBJ_TAG_BITS		1
#define OBJ_INDEX_BITS	(BITS_PER_LONG - _PFN_BITS - OBJ_TAG_BITS)
#define OBJ_INDEX_MASK	((_AC(1, UL) << OBJ_INDEX_BITS) - 1)
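/*
 * Illustrative bit budget (not from the original source): on a 64-bit
 * build with MAX_POSSIBLE_PHYSMEM_BITS == 46 and PAGE_SHIFT == 12,
 * _PFN_BITS is 34 and OBJ_INDEX_BITS is 64 - 34 - 1 = 29, so an encoded
 * object value is laid out as
 *
 *	obj = ((pfn << OBJ_INDEX_BITS) | obj_idx) << OBJ_TAG_BITS
 *
 * matching location_to_obj()/obj_to_location() below; the low
 * OBJ_TAG_BITS bit is reserved for OBJ_ALLOCATED_TAG.
 */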
#define HUGE_BITS	1
#define FULLNESS_BITS	2
#define CLASS_BITS	8
#define ISOLATED_BITS	3
#define MAGIC_VAL_BITS	8

#define MAX(a, b) ((a) >= (b) ? (a) : (b))

/* ZS_MIN_ALLOC_SIZE must be multiple of ZS_ALIGN */
#define ZS_MIN_ALLOC_SIZE \
	MAX(32, (ZS_MAX_PAGES_PER_ZSPAGE << PAGE_SHIFT >> OBJ_INDEX_BITS))
/* each chunk includes extra space to keep handle */
#define ZS_MAX_ALLOC_SIZE	PAGE_SIZE
/*
 * On systems with 4K page size, this gives 255 size classes! There is a
 * trade-off here:
 *  - Large number of size classes is potentially wasteful as free pages are
 *    spread across these classes
 *  - Small number of size classes causes large internal fragmentation
 *  - It is probably better to use specific size classes (empirically
 *    determined). NOTE: all those class sizes must be set as multiple of
 *    ZS_ALIGN to make sure link_free itself never has to span 2 pages.
 *
 *  ZS_MIN_ALLOC_SIZE and ZS_SIZE_CLASS_DELTA must be multiple of ZS_ALIGN
 *  (reason above)
 */
#define ZS_SIZE_CLASS_DELTA	(PAGE_SIZE >> CLASS_BITS)
#define ZS_SIZE_CLASSES	(DIV_ROUND_UP(ZS_MAX_ALLOC_SIZE - ZS_MIN_ALLOC_SIZE, \
				      ZS_SIZE_CLASS_DELTA) + 1)
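/*
 * Worked example (illustrative, assuming 4K pages and CLASS_BITS == 8):
 * ZS_SIZE_CLASS_DELTA = 4096 >> 8 = 16 bytes and ZS_MIN_ALLOC_SIZE = 32,
 * so ZS_SIZE_CLASSES = DIV_ROUND_UP(4096 - 32, 16) + 1 = 255, matching
 * the "255 size classes" noted above.
 */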
enum fullness_group {
	ZS_EMPTY,
	ZS_ALMOST_EMPTY,
	ZS_ALMOST_FULL,
	ZS_FULL,
	NR_ZS_FULLNESS,
};

enum class_stat_type {
	CLASS_EMPTY,
	CLASS_ALMOST_EMPTY,
	CLASS_ALMOST_FULL,
	CLASS_FULL,
	OBJ_ALLOCATED,
	OBJ_USED,
	NR_ZS_STAT_TYPE,
};

struct zs_size_stat {
	unsigned long objs[NR_ZS_STAT_TYPE];
};

#ifdef CONFIG_ZSMALLOC_STAT
static struct dentry *zs_stat_root;
#endif
/*
 * We assign a zspage to ZS_ALMOST_EMPTY fullness group when:
 *	n <= N * 3 / f, where
 * n = number of allocated objects
 * N = total number of objects zspage can store
 * f = fullness_threshold_frac
 *
 * Similarly, we assign zspage to:
 *	ZS_ALMOST_FULL	when n > N * 3 / f
 *	ZS_EMPTY	when n == 0
 *	ZS_FULL		when n == N
 *
 * (see: fix_fullness_group())
 */
static const int fullness_threshold_frac = 4;
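/*
 * Example (illustrative): with fullness_threshold_frac == 4 and a class
 * whose zspages hold N == 128 objects, get_fullness_group() reports a
 * zspage with 1..96 in-use objects (3 * 128 / 4 == 96) as ZS_ALMOST_EMPTY
 * and one with 97..127 as ZS_ALMOST_FULL; n == 0 and n == 128 map to
 * ZS_EMPTY and ZS_FULL respectively.
 */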
static size_t huge_class_size;

struct size_class {
	spinlock_t lock;
	struct list_head fullness_list[NR_ZS_FULLNESS];
	/*
	 * Size of objects stored in this class. Must be multiple
	 * of ZS_ALIGN.
	 */
	int size;
	int objs_per_zspage;
	/* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */
	int pages_per_zspage;

	unsigned int index;
	struct zs_size_stat stats;
};

/*
 * Placed within free objects to form a singly linked list.
 * For every zspage, zspage->freeobj gives head of this list.
 *
 * This must be power of 2 and less than or equal to ZS_ALIGN
 */
struct link_free {
	union {
		/*
		 * Free object index;
		 * It's valid for non-allocated object
		 */
		unsigned long next;
		/*
		 * Handle of allocated object.
		 */
		unsigned long handle;
	};
};
struct zs_pool {
	const char *name;

	struct size_class *size_class[ZS_SIZE_CLASSES];
	struct kmem_cache *handle_cachep;
	struct kmem_cache *zspage_cachep;

	atomic_long_t pages_allocated;

	struct zs_pool_stats stats;

	/* Compact classes */
	struct shrinker shrinker;

#ifdef CONFIG_ZSMALLOC_STAT
	struct dentry *stat_dentry;
#endif
#ifdef CONFIG_COMPACTION
	struct work_struct free_work;
#endif
	/* protect page/zspage migration */
	rwlock_t migrate_lock;
};
struct zspage {
	struct {
		unsigned int huge:HUGE_BITS;
		unsigned int fullness:FULLNESS_BITS;
		unsigned int class:CLASS_BITS + 1;
		unsigned int isolated:ISOLATED_BITS;
		unsigned int magic:MAGIC_VAL_BITS;
	};
	unsigned int inuse;
	unsigned int freeobj;
	struct page *first_page;
	struct list_head list; /* fullness list */
	struct zs_pool *pool;
#ifdef CONFIG_COMPACTION
	rwlock_t lock;
#endif
};
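/*
 * Sanity note (illustrative): HUGE_BITS + FULLNESS_BITS + (CLASS_BITS + 1)
 * + ISOLATED_BITS + MAGIC_VAL_BITS = 1 + 2 + 9 + 3 + 8 = 23 bits, so all
 * of the metadata bitfields above still fit into a single unsigned int.
 */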
struct mapping_area {
	local_lock_t lock;
	char *vm_buf; /* copy buffer for objects that span pages */
	char *vm_addr; /* address of kmap_atomic()'ed pages */
	enum zs_mapmode vm_mm; /* mapping mode */
};

/* huge object: pages_per_zspage == 1 && maxobj_per_zspage == 1 */
static void SetZsHugePage(struct zspage *zspage)
{
	zspage->huge = 1;
}

static bool ZsHugePage(struct zspage *zspage)
{
	return zspage->huge;
}
#ifdef CONFIG_COMPACTION
static void migrate_lock_init(struct zspage *zspage);
static void migrate_read_lock(struct zspage *zspage);
static void migrate_read_unlock(struct zspage *zspage);
static void migrate_write_lock(struct zspage *zspage);
static void migrate_write_lock_nested(struct zspage *zspage);
static void migrate_write_unlock(struct zspage *zspage);
static void kick_deferred_free(struct zs_pool *pool);
static void init_deferred_free(struct zs_pool *pool);
static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage);
#else
static void migrate_lock_init(struct zspage *zspage) {}
static void migrate_read_lock(struct zspage *zspage) {}
static void migrate_read_unlock(struct zspage *zspage) {}
static void migrate_write_lock(struct zspage *zspage) {}
static void migrate_write_lock_nested(struct zspage *zspage) {}
static void migrate_write_unlock(struct zspage *zspage) {}
static void kick_deferred_free(struct zs_pool *pool) {}
static void init_deferred_free(struct zs_pool *pool) {}
static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage) {}
#endif
static int create_cache(struct zs_pool *pool)
{
	pool->handle_cachep = kmem_cache_create("zs_handle", ZS_HANDLE_SIZE,
					0, 0, NULL);
	if (!pool->handle_cachep)
		return 1;

	pool->zspage_cachep = kmem_cache_create("zspage", sizeof(struct zspage),
					0, 0, NULL);
	if (!pool->zspage_cachep) {
		kmem_cache_destroy(pool->handle_cachep);
		pool->handle_cachep = NULL;
		return 1;
	}

	return 0;
}

static void destroy_cache(struct zs_pool *pool)
{
	kmem_cache_destroy(pool->handle_cachep);
	kmem_cache_destroy(pool->zspage_cachep);
}
static unsigned long cache_alloc_handle(struct zs_pool *pool, gfp_t gfp)
{
	return (unsigned long)kmem_cache_alloc(pool->handle_cachep,
			gfp & ~(__GFP_HIGHMEM|__GFP_MOVABLE));
}

static void cache_free_handle(struct zs_pool *pool, unsigned long handle)
{
	kmem_cache_free(pool->handle_cachep, (void *)handle);
}

static struct zspage *cache_alloc_zspage(struct zs_pool *pool, gfp_t flags)
{
	return kmem_cache_zalloc(pool->zspage_cachep,
			flags & ~(__GFP_HIGHMEM|__GFP_MOVABLE));
}

static void cache_free_zspage(struct zs_pool *pool, struct zspage *zspage)
{
	kmem_cache_free(pool->zspage_cachep, zspage);
}
/* class->lock (which owns the handle) synchronizes races */
static void record_obj(unsigned long handle, unsigned long obj)
{
	*(unsigned long *)handle = obj;
}
#ifdef CONFIG_ZPOOL

static void *zs_zpool_create(const char *name, gfp_t gfp,
			     const struct zpool_ops *zpool_ops,
			     struct zpool *zpool)
{
	/*
	 * Ignore global gfp flags: zs_malloc() may be invoked from
	 * different contexts and its caller must provide a valid
	 * gfp mask.
	 */
	return zs_create_pool(name);
}

static void zs_zpool_destroy(void *pool)
{
	zs_destroy_pool(pool);
}

static int zs_zpool_malloc(void *pool, size_t size, gfp_t gfp,
			unsigned long *handle)
{
	*handle = zs_malloc(pool, size, gfp);

	if (IS_ERR((void *)(*handle)))
		return PTR_ERR((void *)*handle);
	return 0;
}

static void zs_zpool_free(void *pool, unsigned long handle)
{
	zs_free(pool, handle);
}

static void *zs_zpool_map(void *pool, unsigned long handle,
			enum zpool_mapmode mm)
{
	enum zs_mapmode zs_mm;

	switch (mm) {
	case ZPOOL_MM_RO:
		zs_mm = ZS_MM_RO;
		break;
	case ZPOOL_MM_WO:
		zs_mm = ZS_MM_WO;
		break;
	case ZPOOL_MM_RW:
	default:
		zs_mm = ZS_MM_RW;
		break;
	}

	return zs_map_object(pool, handle, zs_mm);
}

static void zs_zpool_unmap(void *pool, unsigned long handle)
{
	zs_unmap_object(pool, handle);
}

static u64 zs_zpool_total_size(void *pool)
{
	return zs_get_total_pages(pool) << PAGE_SHIFT;
}

static struct zpool_driver zs_zpool_driver = {
	.type =			  "zsmalloc",
	.owner =		  THIS_MODULE,
	.create =		  zs_zpool_create,
	.destroy =		  zs_zpool_destroy,
	.malloc_support_movable = true,
	.malloc =		  zs_zpool_malloc,
	.free =			  zs_zpool_free,
	.map =			  zs_zpool_map,
	.unmap =		  zs_zpool_unmap,
	.total_size =		  zs_zpool_total_size,
};

MODULE_ALIAS("zpool-zsmalloc");
#endif /* CONFIG_ZPOOL */
/* per-cpu VM mapping areas for zspage accesses that cross page boundaries */
static DEFINE_PER_CPU(struct mapping_area, zs_map_area) = {
	.lock	= INIT_LOCAL_LOCK(lock),
};

static __maybe_unused int is_first_page(struct page *page)
{
	return PagePrivate(page);
}
/* Protected by class->lock */
static inline int get_zspage_inuse(struct zspage *zspage)
{
	return zspage->inuse;
}

static inline void mod_zspage_inuse(struct zspage *zspage, int val)
{
	zspage->inuse += val;
}

static inline struct page *get_first_page(struct zspage *zspage)
{
	struct page *first_page = zspage->first_page;

	VM_BUG_ON_PAGE(!is_first_page(first_page), first_page);
	return first_page;
}

static inline unsigned int get_first_obj_offset(struct page *page)
{
	return page->page_type;
}

static inline void set_first_obj_offset(struct page *page, unsigned int offset)
{
	page->page_type = offset;
}

static inline unsigned int get_freeobj(struct zspage *zspage)
{
	return zspage->freeobj;
}

static inline void set_freeobj(struct zspage *zspage, unsigned int obj)
{
	zspage->freeobj = obj;
}
static void get_zspage_mapping(struct zspage *zspage,
				unsigned int *class_idx,
				enum fullness_group *fullness)
{
	BUG_ON(zspage->magic != ZSPAGE_MAGIC);

	*fullness = zspage->fullness;
	*class_idx = zspage->class;
}

static struct size_class *zspage_class(struct zs_pool *pool,
					struct zspage *zspage)
{
	return pool->size_class[zspage->class];
}

static void set_zspage_mapping(struct zspage *zspage,
				unsigned int class_idx,
				enum fullness_group fullness)
{
	zspage->class = class_idx;
	zspage->fullness = fullness;
}
/*
 * zsmalloc divides the pool into various size classes where each
 * class maintains a list of zspages where each zspage is divided
 * into equal sized chunks. Each allocation falls into one of these
 * classes depending on its size. This function returns the index of the
 * size class which has chunk size big enough to hold the given size.
 */
static int get_size_class_index(int size)
{
	int idx = 0;

	if (likely(size > ZS_MIN_ALLOC_SIZE))
		idx = DIV_ROUND_UP(size - ZS_MIN_ALLOC_SIZE,
				ZS_SIZE_CLASS_DELTA);

	return min_t(int, ZS_SIZE_CLASSES - 1, idx);
}
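/*
 * Example (illustrative, assuming 4K pages): a request of 400 bytes maps
 * to idx = DIV_ROUND_UP(400 - 32, 16) = 23, i.e. the class whose chunk
 * size is 32 + 23 * 16 = 400 bytes.
 */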
/* type can be of enum type class_stat_type or fullness_group */
static inline void class_stat_inc(struct size_class *class,
				int type, unsigned long cnt)
{
	class->stats.objs[type] += cnt;
}

/* type can be of enum type class_stat_type or fullness_group */
static inline void class_stat_dec(struct size_class *class,
				int type, unsigned long cnt)
{
	class->stats.objs[type] -= cnt;
}

/* type can be of enum type class_stat_type or fullness_group */
static inline unsigned long zs_stat_get(struct size_class *class, int type)
{
	return class->stats.objs[type];
}
#ifdef CONFIG_ZSMALLOC_STAT

static void __init zs_stat_init(void)
{
	if (!debugfs_initialized()) {
		pr_warn("debugfs not available, stat dir not created\n");
		return;
	}

	zs_stat_root = debugfs_create_dir("zsmalloc", NULL);
}

static void __exit zs_stat_exit(void)
{
	debugfs_remove_recursive(zs_stat_root);
}

static unsigned long zs_can_compact(struct size_class *class);

static int zs_stats_size_show(struct seq_file *s, void *v)
{
	int i;
	struct zs_pool *pool = s->private;
	struct size_class *class;
	int objs_per_zspage;
	unsigned long class_almost_full, class_almost_empty;
	unsigned long obj_allocated, obj_used, pages_used, freeable;
	unsigned long total_class_almost_full = 0, total_class_almost_empty = 0;
	unsigned long total_objs = 0, total_used_objs = 0, total_pages = 0;
	unsigned long total_freeable = 0;

	seq_printf(s, " %5s %5s %11s %12s %13s %10s %10s %16s %8s\n",
			"class", "size", "almost_full", "almost_empty",
			"obj_allocated", "obj_used", "pages_used",
			"pages_per_zspage", "freeable");

	for (i = 0; i < ZS_SIZE_CLASSES; i++) {
		class = pool->size_class[i];

		if (class->index != i)
			continue;

		spin_lock(&class->lock);
		class_almost_full = zs_stat_get(class, CLASS_ALMOST_FULL);
		class_almost_empty = zs_stat_get(class, CLASS_ALMOST_EMPTY);
		obj_allocated = zs_stat_get(class, OBJ_ALLOCATED);
		obj_used = zs_stat_get(class, OBJ_USED);
		freeable = zs_can_compact(class);
		spin_unlock(&class->lock);

		objs_per_zspage = class->objs_per_zspage;
		pages_used = obj_allocated / objs_per_zspage *
				class->pages_per_zspage;

		seq_printf(s, " %5u %5u %11lu %12lu %13lu"
				" %10lu %10lu %16d %8lu\n",
			i, class->size, class_almost_full, class_almost_empty,
			obj_allocated, obj_used, pages_used,
			class->pages_per_zspage, freeable);

		total_class_almost_full += class_almost_full;
		total_class_almost_empty += class_almost_empty;
		total_objs += obj_allocated;
		total_used_objs += obj_used;
		total_pages += pages_used;
		total_freeable += freeable;
	}

	seq_puts(s, "\n");
	seq_printf(s, " %5s %5s %11lu %12lu %13lu %10lu %10lu %16s %8lu\n",
			"Total", "", total_class_almost_full,
			total_class_almost_empty, total_objs,
			total_used_objs, total_pages, "", total_freeable);

	return 0;
}
DEFINE_SHOW_ATTRIBUTE(zs_stats_size);

static void zs_pool_stat_create(struct zs_pool *pool, const char *name)
{
	if (!zs_stat_root) {
		pr_warn("no root stat dir, not creating <%s> stat dir\n", name);
		return;
	}

	pool->stat_dentry = debugfs_create_dir(name, zs_stat_root);

	debugfs_create_file("classes", S_IFREG | 0444, pool->stat_dentry, pool,
			    &zs_stats_size_fops);
}

static void zs_pool_stat_destroy(struct zs_pool *pool)
{
	debugfs_remove_recursive(pool->stat_dentry);
}

#else /* CONFIG_ZSMALLOC_STAT */
static void __init zs_stat_init(void)
{
}

static void __exit zs_stat_exit(void)
{
}

static inline void zs_pool_stat_create(struct zs_pool *pool, const char *name)
{
}

static inline void zs_pool_stat_destroy(struct zs_pool *pool)
{
}
#endif
/*
 * For each size class, zspages are divided into different groups
 * depending on how "full" they are. This was done so that we could
 * easily find empty or nearly empty zspages when we try to shrink
 * the pool (not yet implemented). This function returns fullness
 * status of the given zspage.
 */
static enum fullness_group get_fullness_group(struct size_class *class,
						struct zspage *zspage)
{
	int inuse, objs_per_zspage;
	enum fullness_group fg;

	inuse = get_zspage_inuse(zspage);
	objs_per_zspage = class->objs_per_zspage;

	if (inuse == 0)
		fg = ZS_EMPTY;
	else if (inuse == objs_per_zspage)
		fg = ZS_FULL;
	else if (inuse <= 3 * objs_per_zspage / fullness_threshold_frac)
		fg = ZS_ALMOST_EMPTY;
	else
		fg = ZS_ALMOST_FULL;

	return fg;
}
/*
 * Each size class maintains various freelists and zspages are assigned
 * to one of these freelists based on the number of live objects they
 * have. This function inserts the given zspage into the freelist
 * identified by <class, fullness_group>.
 */
static void insert_zspage(struct size_class *class,
				struct zspage *zspage,
				enum fullness_group fullness)
{
	struct zspage *head;

	class_stat_inc(class, fullness, 1);
	head = list_first_entry_or_null(&class->fullness_list[fullness],
					struct zspage, list);
	/*
	 * We want to see more ZS_FULL pages and less almost empty/full.
	 * Put pages with higher ->inuse first.
	 */
	if (head && get_zspage_inuse(zspage) < get_zspage_inuse(head))
		list_add(&zspage->list, &head->list);
	else
		list_add(&zspage->list, &class->fullness_list[fullness]);
}

/*
 * This function removes the given zspage from the freelist identified
 * by <class, fullness_group>.
 */
static void remove_zspage(struct size_class *class,
				struct zspage *zspage,
				enum fullness_group fullness)
{
	VM_BUG_ON(list_empty(&class->fullness_list[fullness]));

	list_del_init(&zspage->list);
	class_stat_dec(class, fullness, 1);
}
/*
 * Each size class maintains zspages in different fullness groups depending
 * on the number of live objects they contain. When allocating or freeing
 * objects, the fullness status of the page can change, say, from ALMOST_FULL
 * to ALMOST_EMPTY when freeing an object. This function checks if such
 * a status change has occurred for the given page and accordingly moves the
 * page from the freelist of the old fullness group to that of the new
 * fullness group.
 */
static enum fullness_group fix_fullness_group(struct size_class *class,
						struct zspage *zspage)
{
	unsigned int class_idx;
	enum fullness_group currfg, newfg;

	get_zspage_mapping(zspage, &class_idx, &currfg);
	newfg = get_fullness_group(class, zspage);
	if (newfg != currfg) {
		remove_zspage(class, zspage, currfg);
		insert_zspage(class, zspage, newfg);
		set_zspage_mapping(zspage, class_idx, newfg);
	}

	return newfg;
}
/*
 * We have to decide on how many pages to link together
 * to form a zspage for each size class. This is important
 * to reduce wastage due to unusable space left at end of
 * each zspage which is given as:
 *	wastage = Zp % class_size
 *	usage = Zp - wastage
 * where Zp = zspage size = k * PAGE_SIZE where k = 1, 2, ...
 *
 * For example, for size class of 3/8 * PAGE_SIZE, we should
 * link together 3 PAGE_SIZE sized pages to form a zspage
 * since then we can perfectly fit in 8 such objects.
 */
static int get_pages_per_zspage(int class_size)
{
	int i, max_usedpc = 0;
	/* zspage order which gives maximum used size per KB */
	int max_usedpc_order = 1;

	for (i = 1; i <= ZS_MAX_PAGES_PER_ZSPAGE; i++) {
		int zspage_size;
		int waste, usedpc;

		zspage_size = i * PAGE_SIZE;
		waste = zspage_size % class_size;
		usedpc = (zspage_size - waste) * 100 / zspage_size;

		if (usedpc > max_usedpc) {
			max_usedpc = usedpc;
			max_usedpc_order = i;
		}
	}

	return max_usedpc_order;
}
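/*
 * Worked example (illustrative, assuming 4K pages): for class_size == 1536
 * (3/8 of PAGE_SIZE), the loop sees 75% usage with 1 page, 93% with 2
 * pages and 100% with 3 pages, so 3 pages per zspage is chosen - matching
 * the 3/8 * PAGE_SIZE example in the comment above.
 */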
static struct zspage *get_zspage(struct page *page)
{
	struct zspage *zspage = (struct zspage *)page_private(page);

	BUG_ON(zspage->magic != ZSPAGE_MAGIC);
	return zspage;
}

static struct page *get_next_page(struct page *page)
{
	struct zspage *zspage = get_zspage(page);

	if (unlikely(ZsHugePage(zspage)))
		return NULL;

	return (struct page *)page->index;
}
/**
 * obj_to_location - get (<page>, <obj_idx>) from encoded object value
 * @obj: the encoded object value
 * @page: page object resides in zspage
 * @obj_idx: object index
 */
static void obj_to_location(unsigned long obj, struct page **page,
				unsigned int *obj_idx)
{
	obj >>= OBJ_TAG_BITS;
	*page = pfn_to_page(obj >> OBJ_INDEX_BITS);
	*obj_idx = (obj & OBJ_INDEX_MASK);
}

static void obj_to_page(unsigned long obj, struct page **page)
{
	obj >>= OBJ_TAG_BITS;
	*page = pfn_to_page(obj >> OBJ_INDEX_BITS);
}

/**
 * location_to_obj - get obj value encoded from (<page>, <obj_idx>)
 * @page: page object resides in zspage
 * @obj_idx: object index
 */
static unsigned long location_to_obj(struct page *page, unsigned int obj_idx)
{
	unsigned long obj;

	obj = page_to_pfn(page) << OBJ_INDEX_BITS;
	obj |= obj_idx & OBJ_INDEX_MASK;
	obj <<= OBJ_TAG_BITS;

	return obj;
}
static unsigned long handle_to_obj(unsigned long handle)
{
	return *(unsigned long *)handle;
}

static bool obj_allocated(struct page *page, void *obj, unsigned long *phandle)
{
	unsigned long handle;
	struct zspage *zspage = get_zspage(page);

	if (unlikely(ZsHugePage(zspage))) {
		VM_BUG_ON_PAGE(!is_first_page(page), page);
		handle = page->index;
	} else
		handle = *(unsigned long *)obj;

	if (!(handle & OBJ_ALLOCATED_TAG))
		return false;

	*phandle = handle & ~OBJ_ALLOCATED_TAG;
	return true;
}

static void reset_page(struct page *page)
{
	__ClearPageMovable(page);
	ClearPagePrivate(page);
	set_page_private(page, 0);
	page_mapcount_reset(page);
	page->index = 0;
}
static int trylock_zspage(struct zspage *zspage)
{
	struct page *cursor, *fail;

	for (cursor = get_first_page(zspage); cursor != NULL; cursor =
					get_next_page(cursor)) {
		if (!trylock_page(cursor)) {
			fail = cursor;
			goto unlock;
		}
	}

	return 1;
unlock:
	for (cursor = get_first_page(zspage); cursor != fail; cursor =
					get_next_page(cursor))
		unlock_page(cursor);

	return 0;
}
static void __free_zspage(struct zs_pool *pool, struct size_class *class,
				struct zspage *zspage)
{
	struct page *page, *next;
	enum fullness_group fg;
	unsigned int class_idx;

	get_zspage_mapping(zspage, &class_idx, &fg);

	assert_spin_locked(&class->lock);

	VM_BUG_ON(get_zspage_inuse(zspage));
	VM_BUG_ON(fg != ZS_EMPTY);

	next = page = get_first_page(zspage);
	do {
		VM_BUG_ON_PAGE(!PageLocked(page), page);
		next = get_next_page(page);
		reset_page(page);
		unlock_page(page);
		dec_zone_page_state(page, NR_ZSPAGES);
		put_page(page);
		page = next;
	} while (page != NULL);

	cache_free_zspage(pool, zspage);

	class_stat_dec(class, OBJ_ALLOCATED, class->objs_per_zspage);
	atomic_long_sub(class->pages_per_zspage,
				&pool->pages_allocated);
}

static void free_zspage(struct zs_pool *pool, struct size_class *class,
				struct zspage *zspage)
{
	VM_BUG_ON(get_zspage_inuse(zspage));
	VM_BUG_ON(list_empty(&zspage->list));

	/*
	 * Since zs_free couldn't be sleepable, this function cannot call
	 * lock_page. The page locks trylock_zspage got will be released
	 * by __free_zspage.
	 */
	if (!trylock_zspage(zspage)) {
		kick_deferred_free(pool);
		return;
	}

	remove_zspage(class, zspage, ZS_EMPTY);
	__free_zspage(pool, class, zspage);
}
/* Initialize a newly allocated zspage */
static void init_zspage(struct size_class *class, struct zspage *zspage)
{
	unsigned int freeobj = 1;
	unsigned long off = 0;
	struct page *page = get_first_page(zspage);

	while (page) {
		struct page *next_page;
		struct link_free *link;
		void *vaddr;

		set_first_obj_offset(page, off);

		vaddr = kmap_atomic(page);
		link = (struct link_free *)vaddr + off / sizeof(*link);

		while ((off += class->size) < PAGE_SIZE) {
			link->next = freeobj++ << OBJ_TAG_BITS;
			link += class->size / sizeof(*link);
		}

		/*
		 * We now come to the last (full or partial) object on this
		 * page, which must point to the first object on the next
		 * page (if present).
		 */
		next_page = get_next_page(page);
		if (next_page) {
			link->next = freeobj++ << OBJ_TAG_BITS;
		} else {
			/*
			 * Reset OBJ_TAG_BITS bit to last link to tell
			 * whether it's allocated object or not.
			 */
			link->next = -1UL << OBJ_TAG_BITS;
		}
		kunmap_atomic(vaddr);
		page = next_page;
		off %= PAGE_SIZE;
	}

	set_freeobj(zspage, 0);
}
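/*
 * Example (illustrative): the freelist built above runs 0 -> 1 -> 2 -> ...
 * through the whole zspage, with the very last link holding
 * -1UL << OBJ_TAG_BITS as an end-of-list marker; set_freeobj(zspage, 0)
 * then makes object 0 the first one handed out by obj_malloc().
 */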
static void create_page_chain(struct size_class *class, struct zspage *zspage,
				struct page *pages[])
{
	int i;
	struct page *page;
	struct page *prev_page = NULL;
	int nr_pages = class->pages_per_zspage;

	/*
	 * Allocate individual pages and link them together as:
	 * 1. all pages are linked together using page->index
	 * 2. each sub-page points to zspage using page->private
	 *
	 * we set PG_private to identify the first page (i.e. no other sub-page
	 * has this flag set).
	 */
	for (i = 0; i < nr_pages; i++) {
		page = pages[i];
		set_page_private(page, (unsigned long)zspage);
		page->index = 0;
		if (i == 0) {
			zspage->first_page = page;
			SetPagePrivate(page);
			if (unlikely(class->objs_per_zspage == 1 &&
					class->pages_per_zspage == 1))
				SetZsHugePage(zspage);
		} else {
			prev_page->index = (unsigned long)page;
		}
		prev_page = page;
	}
}
/*
 * Allocate a zspage for the given size class
 */
static struct zspage *alloc_zspage(struct zs_pool *pool,
					struct size_class *class,
					gfp_t gfp)
{
	int i;
	struct page *pages[ZS_MAX_PAGES_PER_ZSPAGE];
	struct zspage *zspage = cache_alloc_zspage(pool, gfp);

	if (!zspage)
		return NULL;

	zspage->magic = ZSPAGE_MAGIC;
	migrate_lock_init(zspage);

	for (i = 0; i < class->pages_per_zspage; i++) {
		struct page *page;

		page = alloc_page(gfp);
		if (!page) {
			while (--i >= 0) {
				dec_zone_page_state(pages[i], NR_ZSPAGES);
				__free_page(pages[i]);
			}
			cache_free_zspage(pool, zspage);
			return NULL;
		}

		inc_zone_page_state(page, NR_ZSPAGES);
		pages[i] = page;
	}

	create_page_chain(class, zspage, pages);
	init_zspage(class, zspage);
	zspage->pool = pool;

	return zspage;
}
static struct zspage *find_get_zspage(struct size_class *class)
{
	int i;
	struct zspage *zspage;

	for (i = ZS_ALMOST_FULL; i >= ZS_EMPTY; i--) {
		zspage = list_first_entry_or_null(&class->fullness_list[i],
				struct zspage, list);
		if (zspage)
			break;
	}

	return zspage;
}
static inline int __zs_cpu_up(struct mapping_area *area)
{
	/*
	 * Make sure we don't leak memory if a cpu UP notification
	 * and zs_init() race and both call zs_cpu_up() on the same cpu
	 */
	if (area->vm_buf)
		return 0;
	area->vm_buf = kmalloc(ZS_MAX_ALLOC_SIZE, GFP_KERNEL);
	if (!area->vm_buf)
		return -ENOMEM;
	return 0;
}

static inline void __zs_cpu_down(struct mapping_area *area)
{
	kfree(area->vm_buf);
	area->vm_buf = NULL;
}
static void *__zs_map_object(struct mapping_area *area,
			struct page *pages[2], int off, int size)
{
	int sizes[2];
	void *addr;
	char *buf = area->vm_buf;

	/* disable page faults to match kmap_atomic() return conditions */
	pagefault_disable();

	/* no read fastpath */
	if (area->vm_mm == ZS_MM_WO)
		goto out;

	sizes[0] = PAGE_SIZE - off;
	sizes[1] = size - sizes[0];

	/* copy object to per-cpu buffer */
	addr = kmap_atomic(pages[0]);
	memcpy(buf, addr + off, sizes[0]);
	kunmap_atomic(addr);
	addr = kmap_atomic(pages[1]);
	memcpy(buf + sizes[0], addr, sizes[1]);
	kunmap_atomic(addr);
out:
	return area->vm_buf;
}

static void __zs_unmap_object(struct mapping_area *area,
			struct page *pages[2], int off, int size)
{
	int sizes[2];
	void *addr;
	char *buf;

	/* no write fastpath */
	if (area->vm_mm == ZS_MM_RO)
		goto out;

	buf = area->vm_buf;
	buf = buf + ZS_HANDLE_SIZE;
	size -= ZS_HANDLE_SIZE;
	off += ZS_HANDLE_SIZE;

	sizes[0] = PAGE_SIZE - off;
	sizes[1] = size - sizes[0];

	/* copy per-cpu buffer to object */
	addr = kmap_atomic(pages[0]);
	memcpy(addr + off, buf, sizes[0]);
	kunmap_atomic(addr);
	addr = kmap_atomic(pages[1]);
	memcpy(addr, buf + sizes[0], sizes[1]);
	kunmap_atomic(addr);

out:
	/* enable page faults to match kunmap_atomic() return conditions */
	pagefault_enable();
}
static int zs_cpu_prepare(unsigned int cpu)
{
	struct mapping_area *area;

	area = &per_cpu(zs_map_area, cpu);
	return __zs_cpu_up(area);
}

static int zs_cpu_dead(unsigned int cpu)
{
	struct mapping_area *area;

	area = &per_cpu(zs_map_area, cpu);
	__zs_cpu_down(area);
	return 0;
}
static bool can_merge(struct size_class *prev, int pages_per_zspage,
					int objs_per_zspage)
{
	if (prev->pages_per_zspage == pages_per_zspage &&
		prev->objs_per_zspage == objs_per_zspage)
		return true;

	return false;
}

static bool zspage_full(struct size_class *class, struct zspage *zspage)
{
	return get_zspage_inuse(zspage) == class->objs_per_zspage;
}

unsigned long zs_get_total_pages(struct zs_pool *pool)
{
	return atomic_long_read(&pool->pages_allocated);
}
EXPORT_SYMBOL_GPL(zs_get_total_pages);
/**
 * zs_map_object - get address of allocated object from handle.
 * @pool: pool from which the object was allocated
 * @handle: handle returned from zs_malloc
 * @mm: mapping mode to use
 *
 * Before using an object allocated from zs_malloc, it must be mapped using
 * this function. When done with the object, it must be unmapped using
 * zs_unmap_object.
 *
 * Only one object can be mapped per cpu at a time. There is no protection
 * against nested mappings.
 *
 * This function returns with preemption and page faults disabled.
 */
void *zs_map_object(struct zs_pool *pool, unsigned long handle,
			enum zs_mapmode mm)
{
	struct zspage *zspage;
	struct page *page;
	unsigned long obj, off;
	unsigned int obj_idx;

	struct size_class *class;
	struct mapping_area *area;
	struct page *pages[2];
	void *ret;

	/*
	 * Because we use per-cpu mapping areas shared among the
	 * pools/users, we can't allow mapping in interrupt context
	 * because it can corrupt another user's mappings.
	 */
	BUG_ON(in_interrupt());

	/* It guarantees it can get zspage from handle safely */
	read_lock(&pool->migrate_lock);
	obj = handle_to_obj(handle);
	obj_to_location(obj, &page, &obj_idx);
	zspage = get_zspage(page);

	/*
	 * migration cannot move any zpages in this zspage. Here, class->lock
	 * is too heavy since callers would take some time until they calls
	 * zs_unmap_object API so delegate the locking from class to zspage
	 * which is smaller granularity.
	 */
	migrate_read_lock(zspage);
	read_unlock(&pool->migrate_lock);

	class = zspage_class(pool, zspage);
	off = (class->size * obj_idx) & ~PAGE_MASK;

	local_lock(&zs_map_area.lock);
	area = this_cpu_ptr(&zs_map_area);
	area->vm_mm = mm;
	if (off + class->size <= PAGE_SIZE) {
		/* this object is contained entirely within a page */
		area->vm_addr = kmap_atomic(page);
		ret = area->vm_addr + off;
		goto out;
	}

	/* this object spans two pages */
	pages[0] = page;
	pages[1] = get_next_page(page);
	BUG_ON(!pages[1]);

	ret = __zs_map_object(area, pages, off, class->size);
out:
	if (likely(!ZsHugePage(zspage)))
		ret += ZS_HANDLE_SIZE;

	return ret;
}
EXPORT_SYMBOL_GPL(zs_map_object);
void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
{
	struct zspage *zspage;
	struct page *page;
	unsigned long obj, off;
	unsigned int obj_idx;

	struct size_class *class;
	struct mapping_area *area;

	obj = handle_to_obj(handle);
	obj_to_location(obj, &page, &obj_idx);
	zspage = get_zspage(page);
	class = zspage_class(pool, zspage);
	off = (class->size * obj_idx) & ~PAGE_MASK;

	area = this_cpu_ptr(&zs_map_area);
	if (off + class->size <= PAGE_SIZE)
		kunmap_atomic(area->vm_addr);
	else {
		struct page *pages[2];

		pages[0] = page;
		pages[1] = get_next_page(page);
		BUG_ON(!pages[1]);

		__zs_unmap_object(area, pages, off, class->size);
	}
	local_unlock(&zs_map_area.lock);

	migrate_read_unlock(zspage);
}
EXPORT_SYMBOL_GPL(zs_unmap_object);
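/*
 * Typical call sequence (illustrative sketch, not from the original source;
 * error handling elided):
 *
 *	unsigned long handle = zs_malloc(pool, len, GFP_KERNEL);
 *	void *dst = zs_map_object(pool, handle, ZS_MM_WO);
 *	memcpy(dst, src, len);
 *	zs_unmap_object(pool, handle);
 *	...
 *	zs_free(pool, handle);
 *
 * The mapped window must be used only briefly and atomically, since
 * zs_map_object() returns with preemption and page faults disabled, as
 * documented above.
 */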
/**
 * zs_huge_class_size() - Returns the size (in bytes) of the first huge
 *                        zsmalloc &size_class.
 * @pool: zsmalloc pool to use
 *
 * The function returns the size of the first huge class - any object of equal
 * or bigger size will be stored in zspage consisting of a single physical
 * page.
 *
 * Context: Any context.
 *
 * Return: the size (in bytes) of the first huge zsmalloc &size_class.
 */
size_t zs_huge_class_size(struct zs_pool *pool)
{
	return huge_class_size;
}
EXPORT_SYMBOL_GPL(zs_huge_class_size);
static unsigned long obj_malloc(struct zs_pool *pool,
				struct zspage *zspage, unsigned long handle)
{
	int i, nr_page, offset;
	unsigned long obj;
	struct link_free *link;
	struct size_class *class;

	struct page *m_page;
	unsigned long m_offset;
	void *vaddr;

	class = pool->size_class[zspage->class];
	handle |= OBJ_ALLOCATED_TAG;
	obj = get_freeobj(zspage);

	offset = obj * class->size;
	nr_page = offset >> PAGE_SHIFT;
	m_offset = offset & ~PAGE_MASK;
	m_page = get_first_page(zspage);

	for (i = 0; i < nr_page; i++)
		m_page = get_next_page(m_page);

	vaddr = kmap_atomic(m_page);
	link = (struct link_free *)vaddr + m_offset / sizeof(*link);
	set_freeobj(zspage, link->next >> OBJ_TAG_BITS);
	if (likely(!ZsHugePage(zspage)))
		/* record handle in the header of allocated chunk */
		link->handle = handle;
	else
		/* record handle to page->index */
		zspage->first_page->index = handle;

	kunmap_atomic(vaddr);
	mod_zspage_inuse(zspage, 1);

	obj = location_to_obj(m_page, obj);

	return obj;
}
/**
 * zs_malloc - Allocate block of given size from pool.
 * @pool: pool to allocate from
 * @size: size of block to allocate
 * @gfp: gfp flags when allocating object
 *
 * On success, handle to the allocated object is returned,
 * otherwise an ERR_PTR().
 * Allocation requests with size > ZS_MAX_ALLOC_SIZE will fail.
 */
unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp)
{
	unsigned long handle, obj;
	struct size_class *class;
	enum fullness_group newfg;
	struct zspage *zspage;

	if (unlikely(!size || size > ZS_MAX_ALLOC_SIZE))
		return (unsigned long)ERR_PTR(-EINVAL);

	handle = cache_alloc_handle(pool, gfp);
	if (!handle)
		return (unsigned long)ERR_PTR(-ENOMEM);

	/* extra space in chunk to keep the handle */
	size += ZS_HANDLE_SIZE;
	class = pool->size_class[get_size_class_index(size)];

	/* class->lock effectively protects the zpage migration */
	spin_lock(&class->lock);
	zspage = find_get_zspage(class);
	if (likely(zspage)) {
		obj = obj_malloc(pool, zspage, handle);
		/* Now move the zspage to another fullness group, if required */
		fix_fullness_group(class, zspage);
		record_obj(handle, obj);
		class_stat_inc(class, OBJ_USED, 1);
		spin_unlock(&class->lock);

		return handle;
	}

	spin_unlock(&class->lock);

	zspage = alloc_zspage(pool, class, gfp);
	if (!zspage) {
		cache_free_handle(pool, handle);
		return (unsigned long)ERR_PTR(-ENOMEM);
	}

	spin_lock(&class->lock);
	obj = obj_malloc(pool, zspage, handle);
	newfg = get_fullness_group(class, zspage);
	insert_zspage(class, zspage, newfg);
	set_zspage_mapping(zspage, class->index, newfg);
	record_obj(handle, obj);
	atomic_long_add(class->pages_per_zspage,
				&pool->pages_allocated);
	class_stat_inc(class, OBJ_ALLOCATED, class->objs_per_zspage);
	class_stat_inc(class, OBJ_USED, 1);

	/* We completely set up zspage so mark them as movable */
	SetZsPageMovable(pool, zspage);
	spin_unlock(&class->lock);

	return handle;
}
EXPORT_SYMBOL_GPL(zs_malloc);
static void obj_free(int class_size, unsigned long obj)
{
	struct link_free *link;
	struct zspage *zspage;
	struct page *f_page;
	unsigned long f_offset;
	unsigned int f_objidx;
	void *vaddr;

	obj_to_location(obj, &f_page, &f_objidx);
	f_offset = (class_size * f_objidx) & ~PAGE_MASK;
	zspage = get_zspage(f_page);

	vaddr = kmap_atomic(f_page);

	/* Insert this object in containing zspage's freelist */
	link = (struct link_free *)(vaddr + f_offset);
	if (likely(!ZsHugePage(zspage)))
		link->next = get_freeobj(zspage) << OBJ_TAG_BITS;
	else
		f_page->index = 0;
	kunmap_atomic(vaddr);
	set_freeobj(zspage, f_objidx);
	mod_zspage_inuse(zspage, -1);
}
void zs_free(struct zs_pool *pool, unsigned long handle)
{
	struct zspage *zspage;
	struct page *f_page;
	unsigned long obj;
	struct size_class *class;
	enum fullness_group fullness;

	if (IS_ERR_OR_NULL((void *)handle))
		return;

	/*
	 * The pool->migrate_lock protects the race with zpage's migration
	 * so it's safe to get the page from handle.
	 */
	read_lock(&pool->migrate_lock);
	obj = handle_to_obj(handle);
	obj_to_page(obj, &f_page);
	zspage = get_zspage(f_page);
	class = zspage_class(pool, zspage);
	spin_lock(&class->lock);
	read_unlock(&pool->migrate_lock);

	obj_free(class->size, obj);
	class_stat_dec(class, OBJ_USED, 1);
	fullness = fix_fullness_group(class, zspage);
	if (fullness == ZS_EMPTY)
		free_zspage(pool, class, zspage);

	spin_unlock(&class->lock);
	cache_free_handle(pool, handle);
}
EXPORT_SYMBOL_GPL(zs_free);
static void zs_object_copy(struct size_class *class, unsigned long dst,
				unsigned long src)
{
	struct page *s_page, *d_page;
	unsigned int s_objidx, d_objidx;
	unsigned long s_off, d_off;
	void *s_addr, *d_addr;
	int s_size, d_size, size;
	int written = 0;

	s_size = d_size = class->size;

	obj_to_location(src, &s_page, &s_objidx);
	obj_to_location(dst, &d_page, &d_objidx);

	s_off = (class->size * s_objidx) & ~PAGE_MASK;
	d_off = (class->size * d_objidx) & ~PAGE_MASK;

	if (s_off + class->size > PAGE_SIZE)
		s_size = PAGE_SIZE - s_off;

	if (d_off + class->size > PAGE_SIZE)
		d_size = PAGE_SIZE - d_off;

	s_addr = kmap_atomic(s_page);
	d_addr = kmap_atomic(d_page);

	while (1) {
		size = min(s_size, d_size);
		memcpy(d_addr + d_off, s_addr + s_off, size);
		written += size;

		if (written == class->size)
			break;

		s_off += size;
		s_size -= size;
		d_off += size;
		d_size -= size;

		/*
		 * Calling kunmap_atomic(d_addr) is necessary. kunmap_atomic()
		 * calls must occur in reverse order of calls to kmap_atomic().
		 * So, to call kunmap_atomic(s_addr) we should first call
		 * kunmap_atomic(d_addr). For more details see
		 * Documentation/mm/highmem.rst.
		 */
		if (s_off >= PAGE_SIZE) {
			kunmap_atomic(d_addr);
			kunmap_atomic(s_addr);
			s_page = get_next_page(s_page);
			s_addr = kmap_atomic(s_page);
			d_addr = kmap_atomic(d_page);
			s_size = class->size - written;
			s_off = 0;
		}

		if (d_off >= PAGE_SIZE) {
			kunmap_atomic(d_addr);
			d_page = get_next_page(d_page);
			d_addr = kmap_atomic(d_page);
			d_size = class->size - written;
			d_off = 0;
		}
	}

	kunmap_atomic(d_addr);
	kunmap_atomic(s_addr);
}
/*
 * Find alloced object in zspage from index object and
 * return handle.
 */
static unsigned long find_alloced_obj(struct size_class *class,
					struct page *page, int *obj_idx)
{
	unsigned int offset;
	int index = *obj_idx;
	unsigned long handle = 0;
	void *addr = kmap_atomic(page);

	offset = get_first_obj_offset(page);
	offset += class->size * index;

	while (offset < PAGE_SIZE) {
		if (obj_allocated(page, addr + offset, &handle))
			break;

		offset += class->size;
		index++;
	}

	kunmap_atomic(addr);

	*obj_idx = index;

	return handle;
}
struct zs_compact_control {
	/* Source spage for migration which could be a subpage of zspage */
	struct page *s_page;
	/* Destination page for migration which should be a first page
	 * of zspage. */
	struct page *d_page;
	/* Starting object index within @s_page which is used for live
	 * objects in the subpage. */
	int obj_idx;
};
static int migrate_zspage(struct zs_pool *pool, struct size_class *class,
				struct zs_compact_control *cc)
{
	unsigned long used_obj, free_obj;
	unsigned long handle;
	struct page *s_page = cc->s_page;
	struct page *d_page = cc->d_page;
	int obj_idx = cc->obj_idx;
	int ret = 0;

	while (1) {
		handle = find_alloced_obj(class, s_page, &obj_idx);
		if (!handle) {
			s_page = get_next_page(s_page);
			if (!s_page)
				break;
			obj_idx = 0;
			continue;
		}

		/* Stop if there is no more space */
		if (zspage_full(class, get_zspage(d_page))) {
			ret = -ENOMEM;
			break;
		}

		used_obj = handle_to_obj(handle);
		free_obj = obj_malloc(pool, get_zspage(d_page), handle);
		zs_object_copy(class, free_obj, used_obj);
		obj_idx++;
		record_obj(handle, free_obj);
		obj_free(class->size, used_obj);
	}

	/* Remember last position in this iteration */
	cc->s_page = s_page;
	cc->obj_idx = obj_idx;

	return ret;
}
*isolate_zspage(struct size_class
*class, bool source
)
1673 struct zspage
*zspage
;
1674 enum fullness_group fg
[2] = {ZS_ALMOST_EMPTY
, ZS_ALMOST_FULL
};
1677 fg
[0] = ZS_ALMOST_FULL
;
1678 fg
[1] = ZS_ALMOST_EMPTY
;
1681 for (i
= 0; i
< 2; i
++) {
1682 zspage
= list_first_entry_or_null(&class->fullness_list
[fg
[i
]],
1683 struct zspage
, list
);
1685 remove_zspage(class, zspage
, fg
[i
]);
/*
 * putback_zspage - add @zspage into right class's fullness list
 * @class: destination class
 * @zspage: target page
 *
 * Return @zspage's fullness_group
 */
static enum fullness_group putback_zspage(struct size_class *class,
			struct zspage *zspage)
{
	enum fullness_group fullness;

	fullness = get_fullness_group(class, zspage);
	insert_zspage(class, zspage, fullness);
	set_zspage_mapping(zspage, class->index, fullness);

	return fullness;
}
#ifdef CONFIG_COMPACTION
/*
 * To prevent zspage destroy during migration, zspage freeing should
 * hold locks of all pages in the zspage.
 */
static void lock_zspage(struct zspage *zspage)
{
	struct page *curr_page, *page;

	/*
	 * Pages we haven't locked yet can be migrated off the list while we're
	 * trying to lock them, so we need to be careful and only attempt to
	 * lock each page under migrate_read_lock(). Otherwise, the page we lock
	 * may no longer belong to the zspage. This means that we may wait for
	 * the wrong page to unlock, so we must take a reference to the page
	 * prior to waiting for it to unlock outside migrate_read_lock().
	 */
	while (1) {
		migrate_read_lock(zspage);
		page = get_first_page(zspage);
		if (trylock_page(page))
			break;
		get_page(page);
		migrate_read_unlock(zspage);
		wait_on_page_locked(page);
		put_page(page);
	}

	curr_page = page;
	while ((page = get_next_page(curr_page))) {
		if (trylock_page(page)) {
			curr_page = page;
		} else {
			get_page(page);
			migrate_read_unlock(zspage);
			wait_on_page_locked(page);
			put_page(page);
			migrate_read_lock(zspage);
		}
	}
	migrate_read_unlock(zspage);
}
static void migrate_lock_init(struct zspage *zspage)
{
	rwlock_init(&zspage->lock);
}

static void migrate_read_lock(struct zspage *zspage) __acquires(&zspage->lock)
{
	read_lock(&zspage->lock);
}

static void migrate_read_unlock(struct zspage *zspage) __releases(&zspage->lock)
{
	read_unlock(&zspage->lock);
}

static void migrate_write_lock(struct zspage *zspage)
{
	write_lock(&zspage->lock);
}

static void migrate_write_lock_nested(struct zspage *zspage)
{
	write_lock_nested(&zspage->lock, SINGLE_DEPTH_NESTING);
}

static void migrate_write_unlock(struct zspage *zspage)
{
	write_unlock(&zspage->lock);
}
/* Number of isolated subpages for *page migration* in this zspage */
static void inc_zspage_isolation(struct zspage *zspage)
{
	zspage->isolated++;
}

static void dec_zspage_isolation(struct zspage *zspage)
{
	VM_BUG_ON(zspage->isolated == 0);
	zspage->isolated--;
}
static const struct movable_operations zsmalloc_mops;

static void replace_sub_page(struct size_class *class, struct zspage *zspage,
				struct page *newpage, struct page *oldpage)
{
	struct page *page;
	struct page *pages[ZS_MAX_PAGES_PER_ZSPAGE] = {NULL, };
	int idx = 0;

	page = get_first_page(zspage);
	do {
		if (page == oldpage)
			pages[idx] = newpage;
		else
			pages[idx] = page;
		idx++;
	} while ((page = get_next_page(page)) != NULL);

	create_page_chain(class, zspage, pages);
	set_first_obj_offset(newpage, get_first_obj_offset(oldpage));
	if (unlikely(ZsHugePage(zspage)))
		newpage->index = oldpage->index;
	__SetPageMovable(newpage, &zsmalloc_mops);
}
static bool zs_page_isolate(struct page *page, isolate_mode_t mode)
{
	struct zspage *zspage;

	/*
	 * Page is locked so zspage couldn't be destroyed. For detail, look at
	 * lock_zspage in free_zspage.
	 */
	VM_BUG_ON_PAGE(!PageMovable(page), page);
	VM_BUG_ON_PAGE(PageIsolated(page), page);

	zspage = get_zspage(page);
	migrate_write_lock(zspage);
	inc_zspage_isolation(zspage);
	migrate_write_unlock(zspage);

	return true;
}
static int zs_page_migrate(struct page *newpage, struct page *page,
		enum migrate_mode mode)
{
	struct zs_pool *pool;
	struct size_class *class;
	struct zspage *zspage;
	struct page *dummy;
	void *s_addr, *d_addr, *addr;
	unsigned int offset;
	unsigned long handle;
	unsigned long old_obj, new_obj;
	unsigned int obj_idx;

	/*
	 * We cannot support the _NO_COPY case here, because copy needs to
	 * happen under the zs lock, which does not work with
	 * MIGRATE_SYNC_NO_COPY workflow.
	 */
	if (mode == MIGRATE_SYNC_NO_COPY)
		return -EINVAL;

	VM_BUG_ON_PAGE(!PageMovable(page), page);
	VM_BUG_ON_PAGE(!PageIsolated(page), page);

	/* The page is locked, so this pointer must remain valid */
	zspage = get_zspage(page);
	pool = zspage->pool;

	/*
	 * The pool migrate_lock protects the race between zpage migration
	 * and zs_free.
	 */
	write_lock(&pool->migrate_lock);
	class = zspage_class(pool, zspage);

	/*
	 * the class lock protects zpage alloc/free in the zspage.
	 */
	spin_lock(&class->lock);
	/* the migrate_write_lock protects zpage access via zs_map_object */
	migrate_write_lock(zspage);

	offset = get_first_obj_offset(page);
	s_addr = kmap_atomic(page);

	/*
	 * Here, no user can access the objects in this zspage, so it is safe
	 * to copy them to the new page.
	 */
	d_addr = kmap_atomic(newpage);
	memcpy(d_addr, s_addr, PAGE_SIZE);
	kunmap_atomic(d_addr);

	for (addr = s_addr + offset; addr < s_addr + PAGE_SIZE;
					addr += class->size) {
		if (obj_allocated(page, addr, &handle)) {

			old_obj = handle_to_obj(handle);
			obj_to_location(old_obj, &dummy, &obj_idx);
			new_obj = (unsigned long)location_to_obj(newpage,
								obj_idx);
			record_obj(handle, new_obj);
		}
	}
	kunmap_atomic(s_addr);

	replace_sub_page(class, zspage, newpage, page);
	/*
	 * Since the data copy is complete and the new zspage structure is
	 * set up, it's okay to release the migrate_lock.
	 */
	write_unlock(&pool->migrate_lock);
	spin_unlock(&class->lock);
	dec_zspage_isolation(zspage);
	migrate_write_unlock(zspage);

	get_page(newpage);
	if (page_zone(newpage) != page_zone(page)) {
		dec_zone_page_state(page, NR_ZSPAGES);
		inc_zone_page_state(newpage, NR_ZSPAGES);
	}

	reset_page(page);
	put_page(page);

	return MIGRATEPAGE_SUCCESS;
}
static void zs_page_putback(struct page *page)
{
	struct zspage *zspage;

	VM_BUG_ON_PAGE(!PageMovable(page), page);
	VM_BUG_ON_PAGE(!PageIsolated(page), page);

	zspage = get_zspage(page);
	migrate_write_lock(zspage);
	dec_zspage_isolation(zspage);
	migrate_write_unlock(zspage);
}

static const struct movable_operations zsmalloc_mops = {
	.isolate_page = zs_page_isolate,
	.migrate_page = zs_page_migrate,
	.putback_page = zs_page_putback,
};
/*
 * Caller should hold page_lock of all pages in the zspage
 * In here, we cannot use zspage meta data.
 */
static void async_free_zspage(struct work_struct *work)
{
	int i;
	struct size_class *class;
	unsigned int class_idx;
	enum fullness_group fullness;
	struct zspage *zspage, *tmp;
	LIST_HEAD(free_pages);
	struct zs_pool *pool = container_of(work, struct zs_pool,
					free_work);

	for (i = 0; i < ZS_SIZE_CLASSES; i++) {
		class = pool->size_class[i];
		if (class->index != i)
			continue;

		spin_lock(&class->lock);
		list_splice_init(&class->fullness_list[ZS_EMPTY], &free_pages);
		spin_unlock(&class->lock);
	}

	list_for_each_entry_safe(zspage, tmp, &free_pages, list) {
		list_del(&zspage->list);
		lock_zspage(zspage);

		get_zspage_mapping(zspage, &class_idx, &fullness);
		VM_BUG_ON(fullness != ZS_EMPTY);
		class = pool->size_class[class_idx];
		spin_lock(&class->lock);
		__free_zspage(pool, class, zspage);
		spin_unlock(&class->lock);
	}
}
static void kick_deferred_free(struct zs_pool *pool)
{
	schedule_work(&pool->free_work);
}

static void zs_flush_migration(struct zs_pool *pool)
{
	flush_work(&pool->free_work);
}

static void init_deferred_free(struct zs_pool *pool)
{
	INIT_WORK(&pool->free_work, async_free_zspage);
}

static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage)
{
	struct page *page = get_first_page(zspage);

	do {
		WARN_ON(!trylock_page(page));
		__SetPageMovable(page, &zsmalloc_mops);
		unlock_page(page);
	} while ((page = get_next_page(page)) != NULL);
}
#else
static inline void zs_flush_migration(struct zs_pool *pool) { }
#endif /* CONFIG_COMPACTION */
/*
 * Based on the number of unused allocated objects calculate
 * and return the number of pages that we can free.
 */
static unsigned long zs_can_compact(struct size_class *class)
{
	unsigned long obj_wasted;
	unsigned long obj_allocated = zs_stat_get(class, OBJ_ALLOCATED);
	unsigned long obj_used = zs_stat_get(class, OBJ_USED);

	if (obj_allocated <= obj_used)
		return 0;

	obj_wasted = obj_allocated - obj_used;
	obj_wasted /= class->objs_per_zspage;

	return obj_wasted * class->pages_per_zspage;
}
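/*
 * Example (illustrative): with obj_allocated == 1000, obj_used == 400 and
 * a class where objs_per_zspage == 10 and pages_per_zspage == 3, there are
 * 600 unused-but-allocated objects, i.e. 60 whole zspages' worth, so
 * zs_can_compact() reports 60 * 3 = 180 freeable pages.
 */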
static unsigned long __zs_compact(struct zs_pool *pool,
				  struct size_class *class)
{
	struct zs_compact_control cc;
	struct zspage *src_zspage;
	struct zspage *dst_zspage = NULL;
	unsigned long pages_freed = 0;

	/* protect the race between zpage migration and zs_free */
	write_lock(&pool->migrate_lock);
	/* protect zpage allocation/free */
	spin_lock(&class->lock);
	while ((src_zspage = isolate_zspage(class, true))) {
		/* protect someone accessing the zspage(i.e., zs_map_object) */
		migrate_write_lock(src_zspage);

		if (!zs_can_compact(class))
			break;

		cc.obj_idx = 0;
		cc.s_page = get_first_page(src_zspage);

		while ((dst_zspage = isolate_zspage(class, false))) {
			migrate_write_lock_nested(dst_zspage);

			cc.d_page = get_first_page(dst_zspage);
			/*
			 * If there is no more space in dst_page, resched
			 * and see if anyone had allocated another zspage.
			 */
			if (!migrate_zspage(pool, class, &cc))
				break;

			putback_zspage(class, dst_zspage);
			migrate_write_unlock(dst_zspage);
			dst_zspage = NULL;
			if (rwlock_is_contended(&pool->migrate_lock))
				break;
		}

		/* Stop if we couldn't find slot */
		if (dst_zspage == NULL)
			break;

		putback_zspage(class, dst_zspage);
		migrate_write_unlock(dst_zspage);

		if (putback_zspage(class, src_zspage) == ZS_EMPTY) {
			migrate_write_unlock(src_zspage);
			free_zspage(pool, class, src_zspage);
			pages_freed += class->pages_per_zspage;
		} else
			migrate_write_unlock(src_zspage);
		spin_unlock(&class->lock);
		write_unlock(&pool->migrate_lock);
		cond_resched();
		write_lock(&pool->migrate_lock);
		spin_lock(&class->lock);
	}

	if (src_zspage) {
		putback_zspage(class, src_zspage);
		migrate_write_unlock(src_zspage);
	}

	spin_unlock(&class->lock);
	write_unlock(&pool->migrate_lock);

	return pages_freed;
}
unsigned long zs_compact(struct zs_pool *pool)
{
	int i;
	struct size_class *class;
	unsigned long pages_freed = 0;

	for (i = ZS_SIZE_CLASSES - 1; i >= 0; i--) {
		class = pool->size_class[i];
		if (class->index != i)
			continue;
		pages_freed += __zs_compact(pool, class);
	}
	atomic_long_add(pages_freed, &pool->stats.pages_compacted);

	return pages_freed;
}
EXPORT_SYMBOL_GPL(zs_compact);
void zs_pool_stats(struct zs_pool *pool, struct zs_pool_stats *stats)
{
	memcpy(stats, &pool->stats, sizeof(struct zs_pool_stats));
}
EXPORT_SYMBOL_GPL(zs_pool_stats);
static unsigned long zs_shrinker_scan(struct shrinker *shrinker,
		struct shrink_control *sc)
{
	unsigned long pages_freed;
	struct zs_pool *pool = container_of(shrinker, struct zs_pool,
			shrinker);

	/*
	 * Compact classes and calculate compaction delta.
	 * Can run concurrently with a manually triggered
	 * (by user) compaction.
	 */
	pages_freed = zs_compact(pool);

	return pages_freed ? pages_freed : SHRINK_STOP;
}

static unsigned long zs_shrinker_count(struct shrinker *shrinker,
		struct shrink_control *sc)
{
	int i;
	struct size_class *class;
	unsigned long pages_to_free = 0;
	struct zs_pool *pool = container_of(shrinker, struct zs_pool,
			shrinker);

	for (i = ZS_SIZE_CLASSES - 1; i >= 0; i--) {
		class = pool->size_class[i];
		if (class->index != i)
			continue;

		pages_to_free += zs_can_compact(class);
	}

	return pages_to_free;
}

static void zs_unregister_shrinker(struct zs_pool *pool)
{
	unregister_shrinker(&pool->shrinker);
}

static int zs_register_shrinker(struct zs_pool *pool)
{
	pool->shrinker.scan_objects = zs_shrinker_scan;
	pool->shrinker.count_objects = zs_shrinker_count;
	pool->shrinker.batch = 0;
	pool->shrinker.seeks = DEFAULT_SEEKS;

	return register_shrinker(&pool->shrinker, "mm-zspool:%s",
				 pool->name);
}
/**
 * zs_create_pool - Creates an allocation pool to work from.
 * @name: pool name to be created
 *
 * This function must be called before anything when using
 * the zsmalloc allocator.
 *
 * On success, a pointer to the newly created pool is returned,
 * otherwise NULL.
 */
struct zs_pool *zs_create_pool(const char *name)
{
	int i;
	struct zs_pool *pool;
	struct size_class *prev_class = NULL;

	pool = kzalloc(sizeof(*pool), GFP_KERNEL);
	if (!pool)
		return NULL;

	init_deferred_free(pool);
	rwlock_init(&pool->migrate_lock);

	pool->name = kstrdup(name, GFP_KERNEL);
	if (!pool->name)
		goto err;

	if (create_cache(pool))
		goto err;

	/*
	 * Iterate in reverse, because the size of the size_class that we
	 * want to use for merging should be larger than or equal to the
	 * current size.
	 */
	for (i = ZS_SIZE_CLASSES - 1; i >= 0; i--) {
		int size;
		int pages_per_zspage;
		int objs_per_zspage;
		struct size_class *class;
		int fullness;

		size = ZS_MIN_ALLOC_SIZE + i * ZS_SIZE_CLASS_DELTA;
		if (size > ZS_MAX_ALLOC_SIZE)
			size = ZS_MAX_ALLOC_SIZE;
		pages_per_zspage = get_pages_per_zspage(size);
		objs_per_zspage = pages_per_zspage * PAGE_SIZE / size;

		/*
		 * We iterate from biggest down to smallest classes,
		 * so huge_class_size holds the size of the first huge
		 * class. Any object bigger than or equal to that will
		 * end up in the huge class.
		 */
		if (pages_per_zspage != 1 && objs_per_zspage != 1 &&
				!huge_class_size) {
			huge_class_size = size;
			/*
			 * The object uses ZS_HANDLE_SIZE bytes to store the
			 * handle. We need to subtract it, because zs_malloc()
			 * unconditionally adds handle size before it performs
			 * size class search - so object may be smaller than
			 * huge class size, yet it still can end up in the huge
			 * class because it grows by ZS_HANDLE_SIZE extra bytes
			 * right before class lookup.
			 */
			huge_class_size -= (ZS_HANDLE_SIZE - 1);
		}

		/*
		 * size_class is used for normal zsmalloc operation such
		 * as alloc/free for that size. Although it is natural that we
		 * have one size_class for each size, there is a chance that we
		 * can get more memory utilization if we use one size_class for
		 * many different sizes whose size_class have same
		 * characteristics. So, we make size_class point to
		 * previous size_class if possible.
		 */
		if (prev_class &&
		    can_merge(prev_class, pages_per_zspage, objs_per_zspage)) {
			pool->size_class[i] = prev_class;
			continue;
		}

		class = kzalloc(sizeof(struct size_class), GFP_KERNEL);
		if (!class)
			goto err;

		class->size = size;
		class->index = i;
		class->pages_per_zspage = pages_per_zspage;
		class->objs_per_zspage = objs_per_zspage;
		spin_lock_init(&class->lock);
		pool->size_class[i] = class;
		for (fullness = ZS_EMPTY; fullness < NR_ZS_FULLNESS;
							fullness++)
			INIT_LIST_HEAD(&class->fullness_list[fullness]);

		prev_class = class;
	}

	/* debug only, don't abort if it fails */
	zs_pool_stat_create(pool, name);

	/*
	 * Not critical since shrinker is only used to trigger internal
	 * defragmentation of the pool which is pretty optional thing. If
	 * registration fails we still can use the pool normally and user can
	 * trigger compaction manually. Thus, ignore return code.
	 */
	zs_register_shrinker(pool);

	return pool;

err:
	zs_destroy_pool(pool);
	return NULL;
}
EXPORT_SYMBOL_GPL(zs_create_pool);
void zs_destroy_pool(struct zs_pool *pool)
{
	int i;

	zs_unregister_shrinker(pool);
	zs_flush_migration(pool);
	zs_pool_stat_destroy(pool);

	for (i = 0; i < ZS_SIZE_CLASSES; i++) {
		int fg;
		struct size_class *class = pool->size_class[i];

		if (!class)
			continue;

		if (class->index != i)
			continue;

		for (fg = ZS_EMPTY; fg < NR_ZS_FULLNESS; fg++) {
			if (!list_empty(&class->fullness_list[fg])) {
				pr_info("Freeing non-empty class with size %db, fullness group %d\n",
					class->size, fg);
			}
		}
		kfree(class);
	}

	destroy_cache(pool);
	kfree(pool->name);
	kfree(pool);
}
EXPORT_SYMBOL_GPL(zs_destroy_pool);
static int __init zs_init(void)
{
	int ret;

	ret = cpuhp_setup_state(CPUHP_MM_ZS_PREPARE, "mm/zsmalloc:prepare",
				zs_cpu_prepare, zs_cpu_dead);
	if (ret)
		return ret;

#ifdef CONFIG_ZPOOL
	zpool_register_driver(&zs_zpool_driver);
#endif

	zs_stat_init();

	return 0;
}

static void __exit zs_exit(void)
{
#ifdef CONFIG_ZPOOL
	zpool_unregister_driver(&zs_zpool_driver);
#endif
	cpuhp_remove_state(CPUHP_MM_ZS_PREPARE);

	zs_stat_exit();
}

module_init(zs_init);
module_exit(zs_exit);
MODULE_LICENSE("Dual BSD/GPL");
MODULE_AUTHOR("Nitin Gupta <ngupta@vflare.org>");