mm/zsmalloc.c blobdiff, thirdparty/kernel/stable.git (merge tag 'sched_ext-for-6.12' of git://git.kernel.org/pub/scm/linux/kernel/git/tj...)

diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index b42d3545ca856f19645c73c938864065631b261d..16a07def09c96a9fe17ecd6bc3578c782dff951e 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -20,7 +20,8 @@
  *     page->index: links together all component pages of a zspage
  *             For the huge page, this is always 0, so we use this field
  *             to store handle.
- *     page->page_type: first object offset in a subpage of zspage
+ *     page->page_type: PGTY_zsmalloc, lower 24 bits locate the first object
+ *             offset in a subpage of a zspage
  *
  * Usage of struct page flags:
  *     PG_private: identifies the first component page
@@ -33,7 +34,8 @@
 /*
  * lock ordering:
  *     page_lock
- *     pool->lock
+ *     pool->migrate_lock
+ *     class->lock
  *     zspage->lock
  */
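For illustration, a minimal sketch of how the new ordering is taken in practice, mirroring the zs_free() hunk further down in this diff: the pool-level rwlock is held only long enough to resolve the handle, then the finer-grained per-class spinlock takes over.

	read_lock(&pool->migrate_lock);		/* keep handle -> zspage stable while resolving it */
	zspage = get_zspage(f_page);
	class = zspage_class(pool, zspage);
	spin_lock(&class->lock);		/* covers zspage alloc/free within this size class */
	read_unlock(&pool->migrate_lock);	/* hand off to the finer-grained lock */
	/* ... free the object, fix the fullness group ... */
	spin_unlock(&class->lock);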
 
@@ -52,6 +54,7 @@
 #include <linux/vmalloc.h>
 #include <linux/preempt.h>
 #include <linux/spinlock.h>
+#include <linux/sprintf.h>
 #include <linux/shrinker.h>
 #include <linux/types.h>
 #include <linux/debugfs.h>
 #define CLASS_BITS     8
 #define MAGIC_VAL_BITS 8
 
-#define MAX(a, b) ((a) >= (b) ? (a) : (b))
-
 #define ZS_MAX_PAGES_PER_ZSPAGE        (_AC(CONFIG_ZSMALLOC_CHAIN_SIZE, UL))
 
 /* ZS_MIN_ALLOC_SIZE must be multiple of ZS_ALIGN */
@@ -182,6 +183,7 @@ static struct dentry *zs_stat_root;
 static size_t huge_class_size;
 
 struct size_class {
+       spinlock_t lock;
        struct list_head fullness_list[NR_FULLNESS_GROUPS];
        /*
         * Size of objects stored in this class. Must be multiple
@@ -236,7 +238,8 @@ struct zs_pool {
 #ifdef CONFIG_COMPACTION
        struct work_struct free_work;
 #endif
-       spinlock_t lock;
+       /* protect page/zspage migration */
+       rwlock_t migrate_lock;
        atomic_t compaction_in_progress;
 };
 
@@ -291,17 +294,27 @@ static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage) {}
 
 static int create_cache(struct zs_pool *pool)
 {
-       pool->handle_cachep = kmem_cache_create("zs_handle", ZS_HANDLE_SIZE,
-                                       0, 0, NULL);
+       char *name;
+
+       name = kasprintf(GFP_KERNEL, "zs_handle-%s", pool->name);
+       if (!name)
+               return -ENOMEM;
+       pool->handle_cachep = kmem_cache_create(name, ZS_HANDLE_SIZE,
+                                               0, 0, NULL);
+       kfree(name);
        if (!pool->handle_cachep)
-               return 1;
+               return -EINVAL;
 
-       pool->zspage_cachep = kmem_cache_create("zspage", sizeof(struct zspage),
-                                       0, 0, NULL);
+       name = kasprintf(GFP_KERNEL, "zspage-%s", pool->name);
+       if (!name)
+               return -ENOMEM;
+       pool->zspage_cachep = kmem_cache_create(name, sizeof(struct zspage),
+                                               0, 0, NULL);
+       kfree(name);
        if (!pool->zspage_cachep) {
                kmem_cache_destroy(pool->handle_cachep);
                pool->handle_cachep = NULL;
-               return 1;
+               return -EINVAL;
        }
 
        return 0;
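A side effect of the kasprintf() change above is that every pool now registers its own handle and zspage caches, and failures propagate as negative errnos rather than 1. A small usage sketch (the pool names here are made up for illustration):

	/* hypothetical pool names, for illustration only */
	struct zs_pool *a = zs_create_pool("zram0");	/* registers "zs_handle-zram0" and "zspage-zram0" */
	struct zs_pool *b = zs_create_pool("zswap");	/* registers "zs_handle-zswap" and "zspage-zswap" */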
@@ -335,7 +348,7 @@ static void cache_free_zspage(struct zs_pool *pool, struct zspage *zspage)
        kmem_cache_free(pool->zspage_cachep, zspage);
 }
 
-/* pool->lock(which owns the handle) synchronizes races */
+/* class->lock(which owns the handle) synchronizes races */
 static void record_obj(unsigned long handle, unsigned long obj)
 {
        *(unsigned long *)handle = obj;
@@ -430,7 +443,7 @@ static __maybe_unused int is_first_page(struct page *page)
        return PagePrivate(page);
 }
 
-/* Protected by pool->lock */
+/* Protected by class->lock */
 static inline int get_zspage_inuse(struct zspage *zspage)
 {
        return zspage->inuse;
@@ -450,14 +463,22 @@ static inline struct page *get_first_page(struct zspage *zspage)
        return first_page;
 }
 
+#define FIRST_OBJ_PAGE_TYPE_MASK       0xffffff
+
 static inline unsigned int get_first_obj_offset(struct page *page)
 {
-       return page->page_type;
+       VM_WARN_ON_ONCE(!PageZsmalloc(page));
+       return page->page_type & FIRST_OBJ_PAGE_TYPE_MASK;
 }
 
 static inline void set_first_obj_offset(struct page *page, unsigned int offset)
 {
-       page->page_type = offset;
+       /* With 24 bits available, we can support offsets into 16 MiB pages. */
+       BUILD_BUG_ON(PAGE_SIZE > SZ_16M);
+       VM_WARN_ON_ONCE(!PageZsmalloc(page));
+       VM_WARN_ON_ONCE(offset & ~FIRST_OBJ_PAGE_TYPE_MASK);
+       page->page_type &= ~FIRST_OBJ_PAGE_TYPE_MASK;
+       page->page_type |= offset & FIRST_OBJ_PAGE_TYPE_MASK;
 }
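A quick sanity check on the arithmetic behind the new BUILD_BUG_ON(): 1 << 24 bytes is exactly 16 MiB, so the low 24 bits of page_type can hold any in-page offset while the upper bits keep the PGTY_zsmalloc type. Illustrative asserts, not part of the patch:

	/* illustrative only: the 24-bit mask covers offsets up to 16 MiB */
	static_assert((1UL << 24) == SZ_16M);			/* 16777216 bytes */
	static_assert(FIRST_OBJ_PAGE_TYPE_MASK == SZ_16M - 1);	/* low 24 bits */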
 
 static inline unsigned int get_freeobj(struct zspage *zspage)
@@ -494,19 +515,19 @@ static int get_size_class_index(int size)
        return min_t(int, ZS_SIZE_CLASSES - 1, idx);
 }
 
-static inline void class_stat_inc(struct size_class *class,
-                               int type, unsigned long cnt)
+static inline void class_stat_add(struct size_class *class, int type,
+                                 unsigned long cnt)
 {
        class->stats.objs[type] += cnt;
 }
 
-static inline void class_stat_dec(struct size_class *class,
-                               int type, unsigned long cnt)
+static inline void class_stat_sub(struct size_class *class, int type,
+                                 unsigned long cnt)
 {
        class->stats.objs[type] -= cnt;
 }
 
-static inline unsigned long zs_stat_get(struct size_class *class, int type)
+static inline unsigned long class_stat_read(struct size_class *class, int type)
 {
        return class->stats.objs[type];
 }
@@ -554,18 +575,18 @@ static int zs_stats_size_show(struct seq_file *s, void *v)
                if (class->index != i)
                        continue;
 
-               spin_lock(&pool->lock);
+               spin_lock(&class->lock);
 
                seq_printf(s, " %5u %5u ", i, class->size);
                for (fg = ZS_INUSE_RATIO_10; fg < NR_FULLNESS_GROUPS; fg++) {
-                       inuse_totals[fg] += zs_stat_get(class, fg);
-                       seq_printf(s, "%9lu ", zs_stat_get(class, fg));
+                       inuse_totals[fg] += class_stat_read(class, fg);
+                       seq_printf(s, "%9lu ", class_stat_read(class, fg));
                }
 
-               obj_allocated = zs_stat_get(class, ZS_OBJS_ALLOCATED);
-               obj_used = zs_stat_get(class, ZS_OBJS_INUSE);
+               obj_allocated = class_stat_read(class, ZS_OBJS_ALLOCATED);
+               obj_used = class_stat_read(class, ZS_OBJS_INUSE);
                freeable = zs_can_compact(class);
-               spin_unlock(&pool->lock);
+               spin_unlock(&class->lock);
 
                objs_per_zspage = class->objs_per_zspage;
                pages_used = obj_allocated / objs_per_zspage *
@@ -668,7 +689,7 @@ static void insert_zspage(struct size_class *class,
                                struct zspage *zspage,
                                int fullness)
 {
-       class_stat_inc(class, fullness, 1);
+       class_stat_add(class, fullness, 1);
        list_add(&zspage->list, &class->fullness_list[fullness]);
        zspage->fullness = fullness;
 }
@@ -684,7 +705,7 @@ static void remove_zspage(struct size_class *class, struct zspage *zspage)
        VM_BUG_ON(list_empty(&class->fullness_list[fullness]));
 
        list_del_init(&zspage->list);
-       class_stat_dec(class, fullness, 1);
+       class_stat_sub(class, fullness, 1);
 }
 
 /*
@@ -791,8 +812,8 @@ static void reset_page(struct page *page)
        __ClearPageMovable(page);
        ClearPagePrivate(page);
        set_page_private(page, 0);
-       page_mapcount_reset(page);
        page->index = 0;
+       __ClearPageZsmalloc(page);
 }
 
 static int trylock_zspage(struct zspage *zspage)
@@ -821,7 +842,7 @@ static void __free_zspage(struct zs_pool *pool, struct size_class *class,
 {
        struct page *page, *next;
 
-       assert_spin_locked(&pool->lock);
+       assert_spin_locked(&class->lock);
 
        VM_BUG_ON(get_zspage_inuse(zspage));
        VM_BUG_ON(zspage->fullness != ZS_INUSE_RATIO_0);
@@ -839,7 +860,7 @@ static void __free_zspage(struct zs_pool *pool, struct size_class *class,
 
        cache_free_zspage(pool, zspage);
 
-       class_stat_dec(class, ZS_OBJS_ALLOCATED, class->objs_per_zspage);
+       class_stat_sub(class, ZS_OBJS_ALLOCATED, class->objs_per_zspage);
        atomic_long_sub(class->pages_per_zspage, &pool->pages_allocated);
 }
 
@@ -965,11 +986,13 @@ static struct zspage *alloc_zspage(struct zs_pool *pool,
                if (!page) {
                        while (--i >= 0) {
                                dec_zone_page_state(pages[i], NR_ZSPAGES);
+                               __ClearPageZsmalloc(pages[i]);
                                __free_page(pages[i]);
                        }
                        cache_free_zspage(pool, zspage);
                        return NULL;
                }
+               __SetPageZsmalloc(page);
 
                inc_zone_page_state(page, NR_ZSPAGES);
                pages[i] = page;
@@ -1178,19 +1201,19 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
        BUG_ON(in_interrupt());
 
        /* It guarantees it can get zspage from handle safely */
-       spin_lock(&pool->lock);
+       read_lock(&pool->migrate_lock);
        obj = handle_to_obj(handle);
        obj_to_location(obj, &page, &obj_idx);
        zspage = get_zspage(page);
 
        /*
-        * migration cannot move any zpages in this zspage. Here, pool->lock
+        * migration cannot move any zpages in this zspage. Here, class->lock
         * is too heavy since callers would take some time until they calls
         * zs_unmap_object API so delegate the locking from class to zspage
         * which is smaller granularity.
         */
        migrate_read_lock(zspage);
-       spin_unlock(&pool->lock);
+       read_unlock(&pool->migrate_lock);
 
        class = zspage_class(pool, zspage);
        off = offset_in_page(class->size * obj_idx);
@@ -1285,7 +1308,6 @@ static unsigned long obj_malloc(struct zs_pool *pool,
        void *vaddr;
 
        class = pool->size_class[zspage->class];
-       handle |= OBJ_ALLOCATED_TAG;
        obj = get_freeobj(zspage);
 
        offset = obj * class->size;
@@ -1301,15 +1323,16 @@ static unsigned long obj_malloc(struct zs_pool *pool,
        set_freeobj(zspage, link->next >> OBJ_TAG_BITS);
        if (likely(!ZsHugePage(zspage)))
                /* record handle in the header of allocated chunk */
-               link->handle = handle;
+               link->handle = handle | OBJ_ALLOCATED_TAG;
        else
                /* record handle to page->index */
-               zspage->first_page->index = handle;
+               zspage->first_page->index = handle | OBJ_ALLOCATED_TAG;
 
        kunmap_atomic(vaddr);
        mod_zspage_inuse(zspage, 1);
 
        obj = location_to_obj(m_page, obj);
+       record_obj(handle, obj);
 
        return obj;
 }
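Since record_obj() now happens inside obj_malloc() and OBJ_ALLOCATED_TAG is applied at the point where the handle is stored in the zspage, callers no longer need the return value. A before/after sketch of the calling pattern (compare the zs_malloc() and migrate_zspage() hunks below):

	/* before: the caller published the object location itself */
	obj = obj_malloc(pool, zspage, handle);
	record_obj(handle, obj);

	/* after: obj_malloc() records the location into the handle directly */
	obj_malloc(pool, zspage, handle);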
@@ -1327,7 +1350,7 @@ static unsigned long obj_malloc(struct zs_pool *pool,
  */
 unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp)
 {
-       unsigned long handle, obj;
+       unsigned long handle;
        struct size_class *class;
        int newfg;
        struct zspage *zspage;
@@ -1346,20 +1369,19 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp)
        size += ZS_HANDLE_SIZE;
        class = pool->size_class[get_size_class_index(size)];
 
-       /* pool->lock effectively protects the zpage migration */
-       spin_lock(&pool->lock);
+       /* class->lock effectively protects the zpage migration */
+       spin_lock(&class->lock);
        zspage = find_get_zspage(class);
        if (likely(zspage)) {
-               obj = obj_malloc(pool, zspage, handle);
+               obj_malloc(pool, zspage, handle);
                /* Now move the zspage to another fullness group, if required */
                fix_fullness_group(class, zspage);
-               record_obj(handle, obj);
-               class_stat_inc(class, ZS_OBJS_INUSE, 1);
+               class_stat_add(class, ZS_OBJS_INUSE, 1);
 
                goto out;
        }
 
-       spin_unlock(&pool->lock);
+       spin_unlock(&class->lock);
 
        zspage = alloc_zspage(pool, class, gfp);
        if (!zspage) {
@@ -1367,19 +1389,18 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp)
                return (unsigned long)ERR_PTR(-ENOMEM);
        }
 
-       spin_lock(&pool->lock);
-       obj = obj_malloc(pool, zspage, handle);
+       spin_lock(&class->lock);
+       obj_malloc(pool, zspage, handle);
        newfg = get_fullness_group(class, zspage);
        insert_zspage(class, zspage, newfg);
-       record_obj(handle, obj);
        atomic_long_add(class->pages_per_zspage, &pool->pages_allocated);
-       class_stat_inc(class, ZS_OBJS_ALLOCATED, class->objs_per_zspage);
-       class_stat_inc(class, ZS_OBJS_INUSE, 1);
+       class_stat_add(class, ZS_OBJS_ALLOCATED, class->objs_per_zspage);
+       class_stat_add(class, ZS_OBJS_INUSE, 1);
 
        /* We completely set up zspage so mark them as movable */
        SetZsPageMovable(pool, zspage);
 out:
-       spin_unlock(&pool->lock);
+       spin_unlock(&class->lock);
 
        return handle;
 }
@@ -1424,23 +1445,25 @@ void zs_free(struct zs_pool *pool, unsigned long handle)
                return;
 
        /*
-        * The pool->lock protects the race with zpage's migration
+        * The pool->migrate_lock protects the race with zpage's migration
         * so it's safe to get the page from handle.
         */
-       spin_lock(&pool->lock);
+       read_lock(&pool->migrate_lock);
        obj = handle_to_obj(handle);
        obj_to_page(obj, &f_page);
        zspage = get_zspage(f_page);
        class = zspage_class(pool, zspage);
+       spin_lock(&class->lock);
+       read_unlock(&pool->migrate_lock);
 
-       class_stat_dec(class, ZS_OBJS_INUSE, 1);
+       class_stat_sub(class, ZS_OBJS_INUSE, 1);
        obj_free(class->size, obj);
 
        fullness = fix_fullness_group(class, zspage);
        if (fullness == ZS_INUSE_RATIO_0)
                free_zspage(pool, class, zspage);
 
-       spin_unlock(&pool->lock);
+       spin_unlock(&class->lock);
        cache_free_handle(pool, handle);
 }
 EXPORT_SYMBOL_GPL(zs_free);
@@ -1568,7 +1591,6 @@ static void migrate_zspage(struct zs_pool *pool, struct zspage *src_zspage,
                free_obj = obj_malloc(pool, dst_zspage, handle);
                zs_object_copy(class, free_obj, used_obj);
                obj_idx++;
-               record_obj(handle, free_obj);
                obj_free(class->size, used_obj);
 
                /* Stop if there is no more space */
@@ -1752,27 +1774,26 @@ static int zs_page_migrate(struct page *newpage, struct page *page,
        unsigned long old_obj, new_obj;
        unsigned int obj_idx;
 
-       /*
-        * We cannot support the _NO_COPY case here, because copy needs to
-        * happen under the zs lock, which does not work with
-        * MIGRATE_SYNC_NO_COPY workflow.
-        */
-       if (mode == MIGRATE_SYNC_NO_COPY)
-               return -EINVAL;
-
        VM_BUG_ON_PAGE(!PageIsolated(page), page);
 
+       /* We're committed, tell the world that this is a Zsmalloc page. */
+       __SetPageZsmalloc(newpage);
+
        /* The page is locked, so this pointer must remain valid */
        zspage = get_zspage(page);
        pool = zspage->pool;
 
        /*
-        * The pool's lock protects the race between zpage migration
+        * The pool migrate_lock protects the race between zpage migration
         * and zs_free.
         */
-       spin_lock(&pool->lock);
+       write_lock(&pool->migrate_lock);
        class = zspage_class(pool, zspage);
 
+       /*
+        * the class lock protects zpage alloc/free in the zspage.
+        */
+       spin_lock(&class->lock);
        /* the migrate_write_lock protects zpage access via zs_map_object */
        migrate_write_lock(zspage);
 
@@ -1802,9 +1823,10 @@ static int zs_page_migrate(struct page *newpage, struct page *page,
        replace_sub_page(class, zspage, newpage, page);
        /*
         * Since we complete the data copy and set up new zspage structure,
-        * it's okay to release the pool's lock.
+        * it's okay to release migration_lock.
         */
-       spin_unlock(&pool->lock);
+       write_unlock(&pool->migrate_lock);
+       spin_unlock(&class->lock);
        migrate_write_unlock(zspage);
 
        get_page(newpage);
@@ -1848,20 +1870,21 @@ static void async_free_zspage(struct work_struct *work)
                if (class->index != i)
                        continue;
 
-               spin_lock(&pool->lock);
+               spin_lock(&class->lock);
                list_splice_init(&class->fullness_list[ZS_INUSE_RATIO_0],
                                 &free_pages);
-               spin_unlock(&pool->lock);
+               spin_unlock(&class->lock);
        }
 
        list_for_each_entry_safe(zspage, tmp, &free_pages, list) {
                list_del(&zspage->list);
                lock_zspage(zspage);
 
-               spin_lock(&pool->lock);
                class = zspage_class(pool, zspage);
+               spin_lock(&class->lock);
+               class_stat_sub(class, ZS_INUSE_RATIO_0, 1);
                __free_zspage(pool, class, zspage);
-               spin_unlock(&pool->lock);
+               spin_unlock(&class->lock);
        }
 };
 
@@ -1902,8 +1925,8 @@ static inline void zs_flush_migration(struct zs_pool *pool) { }
 static unsigned long zs_can_compact(struct size_class *class)
 {
        unsigned long obj_wasted;
-       unsigned long obj_allocated = zs_stat_get(class, ZS_OBJS_ALLOCATED);
-       unsigned long obj_used = zs_stat_get(class, ZS_OBJS_INUSE);
+       unsigned long obj_allocated = class_stat_read(class, ZS_OBJS_ALLOCATED);
+       unsigned long obj_used = class_stat_read(class, ZS_OBJS_INUSE);
 
        if (obj_allocated <= obj_used)
                return 0;
@@ -1925,7 +1948,8 @@ static unsigned long __zs_compact(struct zs_pool *pool,
         * protect the race between zpage migration and zs_free
         * as well as zpage allocation/free
         */
-       spin_lock(&pool->lock);
+       write_lock(&pool->migrate_lock);
+       spin_lock(&class->lock);
        while (zs_can_compact(class)) {
                int fg;
 
@@ -1951,13 +1975,15 @@ static unsigned long __zs_compact(struct zs_pool *pool,
                src_zspage = NULL;
 
                if (get_fullness_group(class, dst_zspage) == ZS_INUSE_RATIO_100
-                   || spin_is_contended(&pool->lock)) {
+                   || rwlock_is_contended(&pool->migrate_lock)) {
                        putback_zspage(class, dst_zspage);
                        dst_zspage = NULL;
 
-                       spin_unlock(&pool->lock);
+                       spin_unlock(&class->lock);
+                       write_unlock(&pool->migrate_lock);
                        cond_resched();
-                       spin_lock(&pool->lock);
+                       write_lock(&pool->migrate_lock);
+                       spin_lock(&class->lock);
                }
        }
 
@@ -1967,7 +1993,8 @@ static unsigned long __zs_compact(struct zs_pool *pool,
        if (dst_zspage)
                putback_zspage(class, dst_zspage);
 
-       spin_unlock(&pool->lock);
+       spin_unlock(&class->lock);
+       write_unlock(&pool->migrate_lock);
 
        return pages_freed;
 }
@@ -1979,10 +2006,10 @@ unsigned long zs_compact(struct zs_pool *pool)
        unsigned long pages_freed = 0;
 
        /*
-        * Pool compaction is performed under pool->lock so it is basically
+        * Pool compaction is performed under pool->migrate_lock so it is basically
         * single-threaded. Having more than one thread in __zs_compact()
-        * will increase pool->lock contention, which will impact other
-        * zsmalloc operations that need pool->lock.
+        * will increase pool->migrate_lock contention, which will impact other
+        * zsmalloc operations that need pool->migrate_lock.
         */
        if (atomic_xchg(&pool->compaction_in_progress, 1))
                return 0;
@@ -2104,7 +2131,7 @@ struct zs_pool *zs_create_pool(const char *name)
                return NULL;
 
        init_deferred_free(pool);
-       spin_lock_init(&pool->lock);
+       rwlock_init(&pool->migrate_lock);
        atomic_set(&pool->compaction_in_progress, 0);
 
        pool->name = kstrdup(name, GFP_KERNEL);
@@ -2176,6 +2203,7 @@ struct zs_pool *zs_create_pool(const char *name)
                class->index = i;
                class->pages_per_zspage = pages_per_zspage;
                class->objs_per_zspage = objs_per_zspage;
+               spin_lock_init(&class->lock);
                pool->size_class[i] = class;
 
                fullness = ZS_INUSE_RATIO_0;
@@ -2276,3 +2304,4 @@ module_exit(zs_exit);
 
 MODULE_LICENSE("Dual BSD/GPL");
 MODULE_AUTHOR("Nitin Gupta <ngupta@vflare.org>");
+MODULE_DESCRIPTION("zsmalloc memory allocator");