mm/device-public-memory: device memory cache coherent with CPU
author Jérôme Glisse <jglisse@redhat.com>
Fri, 8 Sep 2017 23:12:24 +0000 (16:12 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Sat, 9 Sep 2017 01:26:46 +0000 (18:26 -0700)
Platforms with an advanced system bus (like CAPI or CCIX) allow device
memory to be accessible from the CPU in a cache-coherent fashion.  Add a
new type of ZONE_DEVICE to represent such memory.  The use cases are the
same as for un-addressable device memory, but without all the corner cases.
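
For context, a minimal sketch of how callers are expected to treat the new
memory type (illustrative only; my_page_allows_longterm_pin() is a
hypothetical caller, while the two is_device_*_page() helpers are the ones
this series provides):

/*
 * Device public memory must stay evictable, so refuse long-term pins
 * on it; device private memory is not CPU addressable at all.
 */
static bool my_page_allows_longterm_pin(struct page *page)
{
	if (is_device_public_page(page))
		return false;	/* coherent, but must remain migratable */
	if (is_device_private_page(page))
		return false;	/* not accessible from the CPU */
	return true;
}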

Link: http://lkml.kernel.org/r/20170817000548.32038-19-jglisse@redhat.com
Signed-off-by: Jérôme Glisse <jglisse@redhat.com>
Cc: Aneesh Kumar <aneesh.kumar@linux.vnet.ibm.com>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
Cc: Balbir Singh <bsingharora@gmail.com>
Cc: David Nellans <dnellans@nvidia.com>
Cc: Evgeny Baskakov <ebaskakov@nvidia.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Mark Hairgrove <mhairgrove@nvidia.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Sherry Cheung <SCheung@nvidia.com>
Cc: Subhash Gutti <sgutti@nvidia.com>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Bob Liu <liubo95@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
14 files changed:
fs/proc/task_mmu.c
include/linux/hmm.h
include/linux/ioport.h
include/linux/memremap.h
include/linux/mm.h
kernel/memremap.c
mm/Kconfig
mm/gup.c
mm/hmm.c
mm/madvise.c
mm/memcontrol.c
mm/memory.c
mm/migrate.c
mm/swap.c

index 90ab657f8e5653ca2b284015a537e51e455afdb3..281880c7e694ec521f61234731338b3220335998 100644 (file)
@@ -1267,7 +1267,7 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm,
                if (pm->show_pfn)
                        frame = pte_pfn(pte);
                flags |= PM_PRESENT;
-               page = vm_normal_page(vma, addr, pte);
+               page = _vm_normal_page(vma, addr, pte, true);
                if (pte_soft_dirty(pte))
                        flags |= PM_SOFT_DIRTY;
        } else if (is_swap_pte(pte)) {
index 67a03b20a2dbcd2d08e72887b7e39782d6b9c3f8..6d3b0b4fed4ee004dc87ce2e7e931df50d333a7e 100644 (file)
@@ -327,7 +327,7 @@ int hmm_vma_fault(struct vm_area_struct *vma,
 #endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */
 
 
-#if IS_ENABLED(CONFIG_DEVICE_PRIVATE)
+#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) || IS_ENABLED(CONFIG_DEVICE_PUBLIC)
 struct hmm_devmem;
 
 struct page *hmm_vma_alloc_locked_page(struct vm_area_struct *vma,
@@ -494,7 +494,7 @@ struct hmm_device {
  */
 struct hmm_device *hmm_device_new(void *drvdata);
 void hmm_device_put(struct hmm_device *hmm_device);
-#endif /* IS_ENABLED(CONFIG_DEVICE_PRIVATE) */
+#endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */
 
 
 /* Below are for HMM internal use only! Not to be used by device driver! */
index 3a4f69137bc2fa732dca27167573a9cec506dd2a..f5cf32e800411ecafd0aef552d7ae675dceefeb7 100644 (file)
@@ -131,6 +131,7 @@ enum {
        IORES_DESC_PERSISTENT_MEMORY            = 4,
        IORES_DESC_PERSISTENT_MEMORY_LEGACY     = 5,
        IORES_DESC_DEVICE_PRIVATE_MEMORY        = 6,
+       IORES_DESC_DEVICE_PUBLIC_MEMORY         = 7,
 };
 
 /* helpers to define resources */
index 8aa6b82679e2381e6b60cd5a829fc370269bedb9..f8ee1c73ad2d67e3f2968366590ce1ee3e0cc86b 100644 (file)
@@ -57,10 +57,18 @@ static inline struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start)
  *
  * A more complete discussion of unaddressable memory may be found in
  * include/linux/hmm.h and Documentation/vm/hmm.txt.
+ *
+ * MEMORY_DEVICE_PUBLIC:
+ * Device memory that is cache coherent from the device and CPU point of view.
+ * This is used on platforms that have an advanced system bus (like CAPI or
+ * CCIX). A driver can hotplug device memory using ZONE_DEVICE with this
+ * memory type. Any page of a process can be migrated to such memory, but no
+ * one should be allowed to pin such memory so that it can always be evicted.
  */
 enum memory_type {
        MEMORY_DEVICE_HOST = 0,
        MEMORY_DEVICE_PRIVATE,
+       MEMORY_DEVICE_PUBLIC,
 };
 
 /*
@@ -92,6 +100,8 @@ enum memory_type {
  * The page_free() callback is called once the page refcount reaches 1
  * (ZONE_DEVICE pages never reach 0 refcount unless there is a refcount bug.
  * This allows the device driver to implement its own memory management.)
+ *
+ * For MEMORY_DEVICE_PUBLIC only the page_free() callback matters.
  */
 typedef int (*dev_page_fault_t)(struct vm_area_struct *vma,
                                unsigned long addr,
@@ -134,6 +144,12 @@ static inline bool is_device_private_page(const struct page *page)
        return is_zone_device_page(page) &&
                page->pgmap->type == MEMORY_DEVICE_PRIVATE;
 }
+
+static inline bool is_device_public_page(const struct page *page)
+{
+       return is_zone_device_page(page) &&
+               page->pgmap->type == MEMORY_DEVICE_PUBLIC;
+}
 #else
 static inline void *devm_memremap_pages(struct device *dev,
                struct resource *res, struct percpu_ref *ref,
@@ -157,6 +173,11 @@ static inline bool is_device_private_page(const struct page *page)
 {
        return false;
 }
+
+static inline bool is_device_public_page(const struct page *page)
+{
+       return false;
+}
 #endif
 
 /**
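
Since only page_free() matters for MEMORY_DEVICE_PUBLIC, a driver callback can
stay minimal. A hedged sketch (struct my_dev and my_dev_freelist_add() are
hypothetical; the callback signature assumed here is the dev_page_free_t
declared in this header):

/*
 * Called when the page refcount drops back to 1, i.e. the last user is
 * gone; hand the page back to the device driver's free pool.
 */
static void my_dev_page_free(struct page *page, void *data)
{
	struct my_dev *mdev = data;

	my_dev_freelist_add(mdev, page);
}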
index eccdab4bb44a5312cf5849379c445a34be3881ed..de66a1127db4f8d266c87ac949076ab5cb6c2375 100644 (file)
@@ -800,15 +800,16 @@ static inline bool is_zone_device_page(const struct page *page)
 }
 #endif
 
-#ifdef CONFIG_DEVICE_PRIVATE
-void put_zone_device_private_page(struct page *page);
+#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) || IS_ENABLED(CONFIG_DEVICE_PUBLIC)
+void put_zone_device_private_or_public_page(struct page *page);
 #else
-static inline void put_zone_device_private_page(struct page *page)
+static inline void put_zone_device_private_or_public_page(struct page *page)
 {
 }
-#endif
+#endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */
 
 static inline bool is_device_private_page(const struct page *page);
+static inline bool is_device_public_page(const struct page *page);
 
 DECLARE_STATIC_KEY_FALSE(device_private_key);
 
@@ -834,8 +835,9 @@ static inline void put_page(struct page *page)
         * include/linux/memremap.h and HMM for details.
         */
        if (static_branch_unlikely(&device_private_key) &&
-           unlikely(is_device_private_page(page))) {
-               put_zone_device_private_page(page);
+           unlikely(is_device_private_page(page) ||
+                    is_device_public_page(page))) {
+               put_zone_device_private_or_public_page(page);
                return;
        }
 
@@ -1224,8 +1226,10 @@ struct zap_details {
        pgoff_t last_index;                     /* Highest page->index to unmap */
 };
 
-struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
-               pte_t pte);
+struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
+                            pte_t pte, bool with_public_device);
+#define vm_normal_page(vma, addr, pte) _vm_normal_page(vma, addr, pte, false)
+
 struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
                                pmd_t pmd);
 
index ea0e18a2a5f2267d8cfbe5ebc993ae14b6110cb4..6bcbfbf1a8fdfd2f1008cde707db9a798a68cdc6 100644 (file)
@@ -501,8 +501,8 @@ struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start)
 #endif /* CONFIG_ZONE_DEVICE */
 
 
-#ifdef CONFIG_DEVICE_PRIVATE
-void put_zone_device_private_page(struct page *page)
+#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) || IS_ENABLED(CONFIG_DEVICE_PUBLIC)
+void put_zone_device_private_or_public_page(struct page *page)
 {
        int count = page_ref_dec_return(page);
 
@@ -522,5 +522,5 @@ void put_zone_device_private_page(struct page *page)
        } else if (!count)
                __put_page(page);
 }
-EXPORT_SYMBOL(put_zone_device_private_page);
-#endif /* CONFIG_DEVICE_PRIVATE */
+EXPORT_SYMBOL(put_zone_device_private_or_public_page);
+#endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */
index ec27855db1339da1f24c616ee1d6750d0bc33d1e..7bea16697d87a3e9ba3d9be93224a250c411cb9e 100644 (file)
@@ -720,12 +720,23 @@ config HMM_MIRROR
 config DEVICE_PRIVATE
        bool "Unaddressable device memory (GPU memory, ...)"
        depends on ARCH_HAS_HMM
+       select HMM
 
        help
          Allows creation of struct pages to represent unaddressable device
          memory; i.e., memory that is only accessible from the device (or
          group of devices). You likely also want to select HMM_MIRROR.
 
+config DEVICE_PUBLIC
+       bool "Addressable device memory (like GPU memory)"
+       depends on ARCH_HAS_HMM
+       select HMM
+
+       help
+         Allows creation of struct pages to represent addressable device
+         memory; i.e., memory that is accessible from both the device and
+         the CPU.
+
 config FRAME_VECTOR
        bool
 
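Out-of-tree or driver code that must build whether or not the new option is
set can mirror the guard used throughout this patch. A sketch
(my_hmm_devmem_available() is hypothetical):

#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) || IS_ENABLED(CONFIG_DEVICE_PUBLIC)
/* ZONE_DEVICE page helpers for HMM are compiled in. */
static inline bool my_hmm_devmem_available(void)
{
	return true;
}
#else
static inline bool my_hmm_devmem_available(void)
{
	return false;
}
#endif
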
index 76fd199aaae2446004b691baca51f2e5d004cb9f..b2b4d4263768d82d61b99d2a2125251345fa778c 100644 (file)
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -456,6 +456,13 @@ static int get_gate_page(struct mm_struct *mm, unsigned long address,
                if ((gup_flags & FOLL_DUMP) || !is_zero_pfn(pte_pfn(*pte)))
                        goto unmap;
                *page = pte_page(*pte);
+
+               /*
+                * This should never happen (a device public page in the gate
+                * area).
+                */
+               if (is_device_public_page(*page))
+                       goto unmap;
        }
        get_page(*page);
 out:
index c9d23ef80552ff394cc2e8335a858dae6b29e92d..b31d56662202c7c2dbda0d934e7a9cb0edcf30d3 100644 (file)
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -737,7 +737,7 @@ EXPORT_SYMBOL(hmm_vma_fault);
 #endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */
 
 
-#if IS_ENABLED(CONFIG_DEVICE_PRIVATE)
+#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) || IS_ENABLED(CONFIG_DEVICE_PUBLIC)
 struct page *hmm_vma_alloc_locked_page(struct vm_area_struct *vma,
                                       unsigned long addr)
 {
@@ -1177,4 +1177,4 @@ static int __init hmm_init(void)
 }
 
 device_initcall(hmm_init);
-#endif /* IS_ENABLED(CONFIG_DEVICE_PRIVATE) */
+#endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */
index eea1c733286fba68fa7ed540aa77a1ea7f0c8f59..21261ff0466fb99d1254ee3927dd1730642d5356 100644 (file)
@@ -355,7 +355,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
                        continue;
                }
 
-               page = vm_normal_page(vma, addr, ptent);
+               page = _vm_normal_page(vma, addr, ptent, true);
                if (!page)
                        continue;
 
index 8aa98f9bc72354795492d092bbd60ee8860c4784..126a939b600a12f0b77ab4933896a08b502f01d3 100644 (file)
@@ -4623,10 +4623,11 @@ out:
  *   2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
  *     target for charge migration. if @target is not NULL, the entry is stored
  *     in target->ent.
- *   3(MC_TARGET_DEVICE): like MC_TARGET_PAGE  but page is MEMORY_DEVICE_PRIVATE
- *     (so ZONE_DEVICE page and thus not on the lru). For now we such page is
- *     charge like a regular page would be as for all intent and purposes it is
- *     just special memory taking the place of a regular page.
+ *   3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but the page is
+ *     MEMORY_DEVICE_PUBLIC or MEMORY_DEVICE_PRIVATE (so a ZONE_DEVICE page
+ *     and thus not on the lru). For now such a page is charged like a
+ *     regular page would be, as for all intents and purposes it is just
+ *     special memory taking the place of a regular page.
  *
  *     See Documentations/vm/hmm.txt and include/linux/hmm.h
  *
@@ -4657,7 +4658,8 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
                 */
                if (page->mem_cgroup == mc.from) {
                        ret = MC_TARGET_PAGE;
-                       if (is_device_private_page(page))
+                       if (is_device_private_page(page) ||
+                           is_device_public_page(page))
                                ret = MC_TARGET_DEVICE;
                        if (target)
                                target->page = page;
index 079eeac0b009790ac6c707eb65f05562f9ddcb86..ad0ea1af1f4497684ba49a39185a10d8ee997bda 100644 (file)
@@ -818,8 +818,8 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
 #else
 # define HAVE_PTE_SPECIAL 0
 #endif
-struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
-                               pte_t pte)
+struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
+                            pte_t pte, bool with_public_device)
 {
        unsigned long pfn = pte_pfn(pte);
 
@@ -830,8 +830,31 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
                        return vma->vm_ops->find_special_page(vma, addr);
                if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
                        return NULL;
-               if (!is_zero_pfn(pfn))
-                       print_bad_pte(vma, addr, pte, NULL);
+               if (is_zero_pfn(pfn))
+                       return NULL;
+
+               /*
+                * Device public pages are special pages (they are ZONE_DEVICE
+                * pages but different from persistent memory). They behave
+                * almost like normal pages. The difference is that they are
+                * not on the lru and thus should never be involved with
+                * anything that involves lru manipulation (mlock, numa
+                * balancing, ...).
+                *
+                * This is why we still want to return NULL for such pages
+                * from vm_normal_page() so that we do not have to
+                * special-case all call sites of vm_normal_page().
+                */
+               if (likely(pfn < highest_memmap_pfn)) {
+                       struct page *page = pfn_to_page(pfn);
+
+                       if (is_device_public_page(page)) {
+                               if (with_public_device)
+                                       return page;
+                               return NULL;
+                       }
+               }
+               print_bad_pte(vma, addr, pte, NULL);
                return NULL;
        }
 
@@ -1012,6 +1035,19 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
                get_page(page);
                page_dup_rmap(page, false);
                rss[mm_counter(page)]++;
+       } else if (pte_devmap(pte)) {
+               page = pte_page(pte);
+
+               /*
+                * Cache coherent device memory behaves like a regular page and
+                * not like a persistent memory page. For more information see
+                * MEMORY_DEVICE_PUBLIC in include/linux/memremap.h.
+                */
+               if (is_device_public_page(page)) {
+                       get_page(page);
+                       page_dup_rmap(page, false);
+                       rss[mm_counter(page)]++;
+               }
        }
 
 out_set_pte:
@@ -1267,7 +1303,7 @@ again:
                if (pte_present(ptent)) {
                        struct page *page;
 
-                       page = vm_normal_page(vma, addr, ptent);
+                       page = _vm_normal_page(vma, addr, ptent, true);
                        if (unlikely(details) && page) {
                                /*
                                 * unmap_shared_mapping_pages() wants to
index e581253ef3301164b8e1d2ed2e9a14c28022b853..618aeb5e9cde070858a45176d3b9f38c826b4aec 100644 (file)
@@ -36,6 +36,7 @@
 #include <linux/hugetlb.h>
 #include <linux/hugetlb_cgroup.h>
 #include <linux/gfp.h>
+#include <linux/pfn_t.h>
 #include <linux/memremap.h>
 #include <linux/userfaultfd_k.h>
 #include <linux/balloon_compaction.h>
@@ -239,10 +240,14 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma,
                if (is_write_migration_entry(entry))
                        pte = maybe_mkwrite(pte, vma);
 
-               if (unlikely(is_zone_device_page(new)) &&
-                   is_device_private_page(new)) {
-                       entry = make_device_private_entry(new, pte_write(pte));
-                       pte = swp_entry_to_pte(entry);
+               if (unlikely(is_zone_device_page(new))) {
+                       if (is_device_private_page(new)) {
+                               entry = make_device_private_entry(new, pte_write(pte));
+                               pte = swp_entry_to_pte(entry);
+                       } else if (is_device_public_page(new)) {
+                               pte = pte_mkdevmap(pte);
+                               flush_dcache_page(new);
+                       }
                } else
                        flush_dcache_page(new);
 
@@ -437,12 +442,11 @@ int migrate_page_move_mapping(struct address_space *mapping,
        void **pslot;
 
        /*
-        * ZONE_DEVICE pages have 1 refcount always held by their device
-        *
-        * Note that DAX memory will never reach that point as it does not have
-        * the MEMORY_DEVICE_ALLOW_MIGRATE flag set (see memory_hotplug.h).
+        * Device public or private pages have an extra refcount as they are
+        * ZONE_DEVICE pages.
         */
-       expected_count += is_zone_device_page(page);
+       expected_count += is_device_private_page(page);
+       expected_count += is_device_public_page(page);
 
        if (!mapping) {
                /* Anonymous page without mapping */
@@ -2123,7 +2127,6 @@ out_unlock:
 
 #endif /* CONFIG_NUMA */
 
-
 struct migrate_vma {
        struct vm_area_struct   *vma;
        unsigned long           *dst;
@@ -2263,7 +2266,7 @@ again:
                                pfn = 0;
                                goto next;
                        }
-                       page = vm_normal_page(migrate->vma, addr, pte);
+                       page = _vm_normal_page(migrate->vma, addr, pte, true);
                        mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE;
                        mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0;
                }
@@ -2406,10 +2409,19 @@ static bool migrate_vma_check_page(struct page *page)
                if (is_device_private_page(page))
                        return true;
 
-               /* Other ZONE_DEVICE memory type are not supported */
-               return false;
+               /*
+                * Only allow device public pages to be migrated and account
+                * for the extra reference count implied by ZONE_DEVICE pages.
+                */
+               if (!is_device_public_page(page))
+                       return false;
+               extra++;
        }
 
+       /* For file backed pages */
+       if (page_mapping(page))
+               extra += 1 + page_has_private(page);
+
        if ((page_count(page) - extra) > page_mapcount(page))
                return false;
 
@@ -2647,11 +2659,18 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate,
         */
        __SetPageUptodate(page);
 
-       if (is_zone_device_page(page) && is_device_private_page(page)) {
-               swp_entry_t swp_entry;
-
-               swp_entry = make_device_private_entry(page, vma->vm_flags & VM_WRITE);
-               entry = swp_entry_to_pte(swp_entry);
+       if (is_zone_device_page(page)) {
+               if (is_device_private_page(page)) {
+                       swp_entry_t swp_entry;
+
+                       swp_entry = make_device_private_entry(page, vma->vm_flags & VM_WRITE);
+                       entry = swp_entry_to_pte(swp_entry);
+               } else if (is_device_public_page(page)) {
+                       entry = pte_mkold(mk_pte(page, READ_ONCE(vma->vm_page_prot)));
+                       if (vma->vm_flags & VM_WRITE)
+                               entry = pte_mkwrite(pte_mkdirty(entry));
+                       entry = pte_mkdevmap(entry);
+               }
        } else {
                entry = mk_pte(page, vma->vm_page_prot);
                if (vma->vm_flags & VM_WRITE)
@@ -2768,7 +2787,7 @@ static void migrate_vma_pages(struct migrate_vma *migrate)
                                        migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
                                        continue;
                                }
-                       } else {
+                       } else if (!is_device_public_page(newpage)) {
                                /*
                                 * Other types of ZONE_DEVICE page are not
                                 * supported.
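
For reference, the CPU mapping of a device public page is an ordinary pte
tagged devmap, built exactly as in migrate_vma_insert_page() above. Factored
out as a sketch (my_mk_public_pte() is hypothetical):

static pte_t my_mk_public_pte(struct page *page, struct vm_area_struct *vma)
{
	/* Start from a regular pte, marked old so it ages normally. */
	pte_t entry = pte_mkold(mk_pte(page, READ_ONCE(vma->vm_page_prot)));

	if (vma->vm_flags & VM_WRITE)
		entry = pte_mkwrite(pte_mkdirty(entry));

	/* Tag the pte as devmap so the rest of mm knows what backs it. */
	return pte_mkdevmap(entry);
}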
index 62d96b8e5eb3b01a51f48bcde87aabf591c72d96..9295ae960d6680165f67db4405698c3b48e3b84e 100644 (file)
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -765,6 +765,17 @@ void release_pages(struct page **pages, int nr, bool cold)
                if (is_huge_zero_page(page))
                        continue;
 
+               /* Device public pages cannot be huge pages */
+               if (is_device_public_page(page)) {
+                       if (locked_pgdat) {
+                               spin_unlock_irqrestore(&locked_pgdat->lru_lock,
+                                                      flags);
+                               locked_pgdat = NULL;
+                       }
+                       put_zone_device_private_or_public_page(page);
+                       continue;
+               }
+
                page = compound_head(page);
                if (!put_page_testzero(page))
                        continue;