* page->index: links together all component pages of a zspage
* For the huge page, this is always 0, so we use this field
* to store handle.
- * page->page_type: first object offset in a subpage of zspage
+ * page->page_type: PGTY_zsmalloc, lower 24 bits hold the offset of the
+ * first object in a subpage of a zspage
*
* Usage of struct page flags:
* PG_private: identifies the first component page
/*
* lock ordering:
* page_lock
- * pool->lock
+ * pool->migrate_lock
+ * class->lock
* zspage->lock
*/
#include <linux/vmalloc.h>
#include <linux/preempt.h>
#include <linux/spinlock.h>
+#include <linux/sprintf.h>
#include <linux/shrinker.h>
#include <linux/types.h>
#include <linux/debugfs.h>
#define CLASS_BITS 8
#define MAGIC_VAL_BITS 8
-#define MAX(a, b) ((a) >= (b) ? (a) : (b))
-
#define ZS_MAX_PAGES_PER_ZSPAGE (_AC(CONFIG_ZSMALLOC_CHAIN_SIZE, UL))
/* ZS_MIN_ALLOC_SIZE must be multiple of ZS_ALIGN */
static size_t huge_class_size;
struct size_class {
+ spinlock_t lock;
struct list_head fullness_list[NR_FULLNESS_GROUPS];
/*
* Size of objects stored in this class. Must be multiple
#ifdef CONFIG_COMPACTION
struct work_struct free_work;
#endif
- spinlock_t lock;
+ /* protect page/zspage migration */
+ rwlock_t migrate_lock;
atomic_t compaction_in_progress;
};
static int create_cache(struct zs_pool *pool)
{
- pool->handle_cachep = kmem_cache_create("zs_handle", ZS_HANDLE_SIZE,
- 0, 0, NULL);
+ char *name;
+
+ name = kasprintf(GFP_KERNEL, "zs_handle-%s", pool->name);
+ if (!name)
+ return -ENOMEM;
+ pool->handle_cachep = kmem_cache_create(name, ZS_HANDLE_SIZE,
+ 0, 0, NULL);
+ kfree(name);
if (!pool->handle_cachep)
- return 1;
+ return -EINVAL;
- pool->zspage_cachep = kmem_cache_create("zspage", sizeof(struct zspage),
- 0, 0, NULL);
+ name = kasprintf(GFP_KERNEL, "zspage-%s", pool->name);
+ if (!name)
+ return -ENOMEM;
+ pool->zspage_cachep = kmem_cache_create(name, sizeof(struct zspage),
+ 0, 0, NULL);
+ kfree(name);
if (!pool->zspage_cachep) {
kmem_cache_destroy(pool->handle_cachep);
pool->handle_cachep = NULL;
- return 1;
+ return -EINVAL;
}
return 0;
kmem_cache_free(pool->zspage_cachep, zspage);
}
-/* pool->lock(which owns the handle) synchronizes races */
+/* class->lock(which owns the handle) synchronizes races */
static void record_obj(unsigned long handle, unsigned long obj)
{
*(unsigned long *)handle = obj;
return PagePrivate(page);
}
-/* Protected by pool->lock */
+/* Protected by class->lock */
static inline int get_zspage_inuse(struct zspage *zspage)
{
return zspage->inuse;
return first_page;
}
+#define FIRST_OBJ_PAGE_TYPE_MASK 0xffffff
+
static inline unsigned int get_first_obj_offset(struct page *page)
{
- return page->page_type;
+ VM_WARN_ON_ONCE(!PageZsmalloc(page));
+ return page->page_type & FIRST_OBJ_PAGE_TYPE_MASK;
}
static inline void set_first_obj_offset(struct page *page, unsigned int offset)
{
- page->page_type = offset;
+ /* With 24 bits available, we can support offsets into 16 MiB pages. */
+ BUILD_BUG_ON(PAGE_SIZE > SZ_16M);
+ VM_WARN_ON_ONCE(!PageZsmalloc(page));
+ VM_WARN_ON_ONCE(offset & ~FIRST_OBJ_PAGE_TYPE_MASK);
+ page->page_type &= ~FIRST_OBJ_PAGE_TYPE_MASK;
+ page->page_type |= offset & FIRST_OBJ_PAGE_TYPE_MASK;
}
static inline unsigned int get_freeobj(struct zspage *zspage)
return min_t(int, ZS_SIZE_CLASSES - 1, idx);
}
-static inline void class_stat_inc(struct size_class *class,
- int type, unsigned long cnt)
+static inline void class_stat_add(struct size_class *class, int type,
+ unsigned long cnt)
{
class->stats.objs[type] += cnt;
}
-static inline void class_stat_dec(struct size_class *class,
- int type, unsigned long cnt)
+static inline void class_stat_sub(struct size_class *class, int type,
+ unsigned long cnt)
{
class->stats.objs[type] -= cnt;
}
-static inline unsigned long zs_stat_get(struct size_class *class, int type)
+static inline unsigned long class_stat_read(struct size_class *class, int type)
{
return class->stats.objs[type];
}
if (class->index != i)
continue;
- spin_lock(&pool->lock);
+ spin_lock(&class->lock);
seq_printf(s, " %5u %5u ", i, class->size);
for (fg = ZS_INUSE_RATIO_10; fg < NR_FULLNESS_GROUPS; fg++) {
- inuse_totals[fg] += zs_stat_get(class, fg);
- seq_printf(s, "%9lu ", zs_stat_get(class, fg));
+ inuse_totals[fg] += class_stat_read(class, fg);
+ seq_printf(s, "%9lu ", class_stat_read(class, fg));
}
- obj_allocated = zs_stat_get(class, ZS_OBJS_ALLOCATED);
- obj_used = zs_stat_get(class, ZS_OBJS_INUSE);
+ obj_allocated = class_stat_read(class, ZS_OBJS_ALLOCATED);
+ obj_used = class_stat_read(class, ZS_OBJS_INUSE);
freeable = zs_can_compact(class);
- spin_unlock(&pool->lock);
+ spin_unlock(&class->lock);
objs_per_zspage = class->objs_per_zspage;
pages_used = obj_allocated / objs_per_zspage *
struct zspage *zspage,
int fullness)
{
- class_stat_inc(class, fullness, 1);
+ class_stat_add(class, fullness, 1);
list_add(&zspage->list, &class->fullness_list[fullness]);
zspage->fullness = fullness;
}
VM_BUG_ON(list_empty(&class->fullness_list[fullness]));
list_del_init(&zspage->list);
- class_stat_dec(class, fullness, 1);
+ class_stat_sub(class, fullness, 1);
}
/*
__ClearPageMovable(page);
ClearPagePrivate(page);
set_page_private(page, 0);
- page_mapcount_reset(page);
page->index = 0;
+ __ClearPageZsmalloc(page);
}
static int trylock_zspage(struct zspage *zspage)
{
struct page *page, *next;
- assert_spin_locked(&pool->lock);
+ assert_spin_locked(&class->lock);
VM_BUG_ON(get_zspage_inuse(zspage));
VM_BUG_ON(zspage->fullness != ZS_INUSE_RATIO_0);
cache_free_zspage(pool, zspage);
- class_stat_dec(class, ZS_OBJS_ALLOCATED, class->objs_per_zspage);
+ class_stat_sub(class, ZS_OBJS_ALLOCATED, class->objs_per_zspage);
atomic_long_sub(class->pages_per_zspage, &pool->pages_allocated);
}
if (!page) {
while (--i >= 0) {
dec_zone_page_state(pages[i], NR_ZSPAGES);
+ __ClearPageZsmalloc(pages[i]);
__free_page(pages[i]);
}
cache_free_zspage(pool, zspage);
return NULL;
}
+ __SetPageZsmalloc(page);
inc_zone_page_state(page, NR_ZSPAGES);
pages[i] = page;
BUG_ON(in_interrupt());
/* It guarantees it can get zspage from handle safely */
- spin_lock(&pool->lock);
+ read_lock(&pool->migrate_lock);
obj = handle_to_obj(handle);
obj_to_location(obj, &page, &obj_idx);
zspage = get_zspage(page);
/*
- * migration cannot move any zpages in this zspage. Here, pool->lock
+ * migration cannot move any zpages in this zspage. Here, class->lock
* is too heavy since callers would take some time until they calls
* zs_unmap_object API so delegate the locking from class to zspage
* which is smaller granularity.
*/
migrate_read_lock(zspage);
- spin_unlock(&pool->lock);
+ read_unlock(&pool->migrate_lock);
class = zspage_class(pool, zspage);
off = offset_in_page(class->size * obj_idx);
void *vaddr;
class = pool->size_class[zspage->class];
- handle |= OBJ_ALLOCATED_TAG;
obj = get_freeobj(zspage);
offset = obj * class->size;
set_freeobj(zspage, link->next >> OBJ_TAG_BITS);
if (likely(!ZsHugePage(zspage)))
/* record handle in the header of allocated chunk */
- link->handle = handle;
+ link->handle = handle | OBJ_ALLOCATED_TAG;
else
/* record handle to page->index */
- zspage->first_page->index = handle;
+ zspage->first_page->index = handle | OBJ_ALLOCATED_TAG;
kunmap_atomic(vaddr);
mod_zspage_inuse(zspage, 1);
obj = location_to_obj(m_page, obj);
+ record_obj(handle, obj);
return obj;
}
*/
unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp)
{
- unsigned long handle, obj;
+ unsigned long handle;
struct size_class *class;
int newfg;
struct zspage *zspage;
size += ZS_HANDLE_SIZE;
class = pool->size_class[get_size_class_index(size)];
- /* pool->lock effectively protects the zpage migration */
- spin_lock(&pool->lock);
+ /* class->lock effectively protects the zpage migration */
+ spin_lock(&class->lock);
zspage = find_get_zspage(class);
if (likely(zspage)) {
- obj = obj_malloc(pool, zspage, handle);
+ obj_malloc(pool, zspage, handle);
/* Now move the zspage to another fullness group, if required */
fix_fullness_group(class, zspage);
- record_obj(handle, obj);
- class_stat_inc(class, ZS_OBJS_INUSE, 1);
+ class_stat_add(class, ZS_OBJS_INUSE, 1);
goto out;
}
- spin_unlock(&pool->lock);
+ spin_unlock(&class->lock);
zspage = alloc_zspage(pool, class, gfp);
if (!zspage) {
return (unsigned long)ERR_PTR(-ENOMEM);
}
- spin_lock(&pool->lock);
- obj = obj_malloc(pool, zspage, handle);
+ spin_lock(&class->lock);
+ obj_malloc(pool, zspage, handle);
newfg = get_fullness_group(class, zspage);
insert_zspage(class, zspage, newfg);
- record_obj(handle, obj);
atomic_long_add(class->pages_per_zspage, &pool->pages_allocated);
- class_stat_inc(class, ZS_OBJS_ALLOCATED, class->objs_per_zspage);
- class_stat_inc(class, ZS_OBJS_INUSE, 1);
+ class_stat_add(class, ZS_OBJS_ALLOCATED, class->objs_per_zspage);
+ class_stat_add(class, ZS_OBJS_INUSE, 1);
/* We completely set up zspage so mark them as movable */
SetZsPageMovable(pool, zspage);
out:
- spin_unlock(&pool->lock);
+ spin_unlock(&class->lock);
return handle;
}
return;
/*
- * The pool->lock protects the race with zpage's migration
+ * The pool->migrate_lock protects the race with zpage's migration
* so it's safe to get the page from handle.
*/
- spin_lock(&pool->lock);
+ read_lock(&pool->migrate_lock);
obj = handle_to_obj(handle);
obj_to_page(obj, &f_page);
zspage = get_zspage(f_page);
class = zspage_class(pool, zspage);
+ spin_lock(&class->lock);
+ read_unlock(&pool->migrate_lock);
- class_stat_dec(class, ZS_OBJS_INUSE, 1);
+ class_stat_sub(class, ZS_OBJS_INUSE, 1);
obj_free(class->size, obj);
fullness = fix_fullness_group(class, zspage);
if (fullness == ZS_INUSE_RATIO_0)
free_zspage(pool, class, zspage);
- spin_unlock(&pool->lock);
+ spin_unlock(&class->lock);
cache_free_handle(pool, handle);
}
EXPORT_SYMBOL_GPL(zs_free);
free_obj = obj_malloc(pool, dst_zspage, handle);
zs_object_copy(class, free_obj, used_obj);
obj_idx++;
- record_obj(handle, free_obj);
obj_free(class->size, used_obj);
/* Stop if there is no more space */
unsigned long old_obj, new_obj;
unsigned int obj_idx;
- /*
- * We cannot support the _NO_COPY case here, because copy needs to
- * happen under the zs lock, which does not work with
- * MIGRATE_SYNC_NO_COPY workflow.
- */
- if (mode == MIGRATE_SYNC_NO_COPY)
- return -EINVAL;
-
VM_BUG_ON_PAGE(!PageIsolated(page), page);
+ /* We're committed, tell the world that this is a Zsmalloc page. */
+ __SetPageZsmalloc(newpage);
+
/* The page is locked, so this pointer must remain valid */
zspage = get_zspage(page);
pool = zspage->pool;
/*
- * The pool's lock protects the race between zpage migration
+ * The pool's migrate_lock guards against the race between zpage migration
* and zs_free.
*/
- spin_lock(&pool->lock);
+ write_lock(&pool->migrate_lock);
class = zspage_class(pool, zspage);
+ /*
+ * the class lock protects zpage alloc/free in the zspage.
+ */
+ spin_lock(&class->lock);
/* the migrate_write_lock protects zpage access via zs_map_object */
migrate_write_lock(zspage);
replace_sub_page(class, zspage, newpage, page);
/*
* Since we complete the data copy and set up new zspage structure,
- * it's okay to release the pool's lock.
+ * it's okay to release the pool's migrate_lock.
*/
- spin_unlock(&pool->lock);
+ write_unlock(&pool->migrate_lock);
+ spin_unlock(&class->lock);
migrate_write_unlock(zspage);
get_page(newpage);
if (class->index != i)
continue;
- spin_lock(&pool->lock);
+ spin_lock(&class->lock);
list_splice_init(&class->fullness_list[ZS_INUSE_RATIO_0],
&free_pages);
- spin_unlock(&pool->lock);
+ spin_unlock(&class->lock);
}
list_for_each_entry_safe(zspage, tmp, &free_pages, list) {
list_del(&zspage->list);
lock_zspage(zspage);
- spin_lock(&pool->lock);
class = zspage_class(pool, zspage);
+ spin_lock(&class->lock);
+ class_stat_sub(class, ZS_INUSE_RATIO_0, 1);
__free_zspage(pool, class, zspage);
- spin_unlock(&pool->lock);
+ spin_unlock(&class->lock);
}
};
static unsigned long zs_can_compact(struct size_class *class)
{
unsigned long obj_wasted;
- unsigned long obj_allocated = zs_stat_get(class, ZS_OBJS_ALLOCATED);
- unsigned long obj_used = zs_stat_get(class, ZS_OBJS_INUSE);
+ unsigned long obj_allocated = class_stat_read(class, ZS_OBJS_ALLOCATED);
+ unsigned long obj_used = class_stat_read(class, ZS_OBJS_INUSE);
if (obj_allocated <= obj_used)
return 0;
* protect the race between zpage migration and zs_free
* as well as zpage allocation/free
*/
- spin_lock(&pool->lock);
+ write_lock(&pool->migrate_lock);
+ spin_lock(&class->lock);
while (zs_can_compact(class)) {
int fg;
src_zspage = NULL;
if (get_fullness_group(class, dst_zspage) == ZS_INUSE_RATIO_100
- || spin_is_contended(&pool->lock)) {
+ || rwlock_is_contended(&pool->migrate_lock)) {
putback_zspage(class, dst_zspage);
dst_zspage = NULL;
- spin_unlock(&pool->lock);
+ spin_unlock(&class->lock);
+ write_unlock(&pool->migrate_lock);
cond_resched();
- spin_lock(&pool->lock);
+ write_lock(&pool->migrate_lock);
+ spin_lock(&class->lock);
}
}
if (dst_zspage)
putback_zspage(class, dst_zspage);
- spin_unlock(&pool->lock);
+ spin_unlock(&class->lock);
+ write_unlock(&pool->migrate_lock);
return pages_freed;
}
unsigned long pages_freed = 0;
/*
- * Pool compaction is performed under pool->lock so it is basically
+ * Pool compaction is performed under pool->migrate_lock so it is basically
* single-threaded. Having more than one thread in __zs_compact()
- * will increase pool->lock contention, which will impact other
- * zsmalloc operations that need pool->lock.
+ * will increase pool->migrate_lock contention, which will impact other
+ * zsmalloc operations that need pool->migrate_lock.
*/
if (atomic_xchg(&pool->compaction_in_progress, 1))
return 0;
return NULL;
init_deferred_free(pool);
- spin_lock_init(&pool->lock);
+ rwlock_init(&pool->migrate_lock);
atomic_set(&pool->compaction_in_progress, 0);
pool->name = kstrdup(name, GFP_KERNEL);
class->index = i;
class->pages_per_zspage = pages_per_zspage;
class->objs_per_zspage = objs_per_zspage;
+ spin_lock_init(&class->lock);
pool->size_class[i] = class;
fullness = ZS_INUSE_RATIO_0;
MODULE_LICENSE("Dual BSD/GPL");
MODULE_AUTHOR("Nitin Gupta <ngupta@vflare.org>");
+MODULE_DESCRIPTION("zsmalloc memory allocator");