--- /dev/null
+From 615d6e8756c87149f2d4c1b93d471bca002bd849 Mon Sep 17 00:00:00 2001
+From: Davidlohr Bueso <davidlohr@hp.com>
+Date: Mon, 7 Apr 2014 15:37:25 -0700
+Subject: mm: per-thread vma caching
+
+From: Davidlohr Bueso <davidlohr@hp.com>
+
+commit 615d6e8756c87149f2d4c1b93d471bca002bd849 upstream.
+
+This patch is a continuation of efforts trying to optimize find_vma(),
+avoiding potentially expensive rbtree walks to locate a vma upon faults.
+The original approach (https://lkml.org/lkml/2013/11/1/410), where the
+largest vma was also cached, ended up being too specific and random,
+thus further comparisons with other approaches were needed. There are
+two things to consider when dealing with this, the cache hit rate and
+the latency of find_vma(). Improving the hit-rate does not necessarily
+translate into finding the vma any faster, as the overhead of any fancy
+caching schemes can be too high to consider.
+
+We currently cache the last used vma for the whole address space, which
+provides a nice optimization, reducing the total cycles in find_vma() by
+up to 250%, for workloads with good locality. On the other hand, this
+simple scheme is pretty much useless for workloads with poor locality.
+Analyzing ebizzy runs shows that, no matter how many threads are
+running, the mmap_cache hit rate is less than 2%, and in many situations
+below 1%.
+
+The proposed approach is to replace this scheme with a small per-thread
+cache, maximizing hit rates at a very low maintenance cost.
+Invalidations are performed by simply bumping up a 32-bit sequence
+number. The only expensive operation is in the rare case of a seq
+number overflow, where all caches that share the same address space are
+flushed. Upon a miss, the proposed replacement policy is based on the
+page number that contains the virtual address in question. Concretely,
+the following results are seen on an 80 core, 8 socket x86-64 box:
+
+1) System bootup: Most programs are single threaded, so the per-thread
+ scheme does improve on the ~50% hit rate by just adding a few more slots to
+ the cache.
+
++----------------+----------+------------------+
+| caching scheme | hit-rate | cycles (billion) |
++----------------+----------+------------------+
+| baseline | 50.61% | 19.90 |
+| patched | 73.45% | 13.58 |
++----------------+----------+------------------+
+
+2) Kernel build: This one is already pretty good with the current
+ approach as we're dealing with good locality.
+
++----------------+----------+------------------+
+| caching scheme | hit-rate | cycles (billion) |
++----------------+----------+------------------+
+| baseline | 75.28% | 11.03 |
+| patched | 88.09% | 9.31 |
++----------------+----------+------------------+
+
+3) Oracle 11g Data Mining (4k pages): Similar to the kernel build workload.
+
++----------------+----------+------------------+
+| caching scheme | hit-rate | cycles (billion) |
++----------------+----------+------------------+
+| baseline | 70.66% | 17.14 |
+| patched | 91.15% | 12.57 |
++----------------+----------+------------------+
+
+4) Ebizzy: There's a fair amount of variation from run to run, but this
+ approach always shows nearly perfect hit rates, while baseline is just
+ about non-existent. The number of cycles can fluctuate anywhere
+ from ~60 to ~116 billion for the baseline scheme, but this approach
+ reduces it considerably. For instance, with 80 threads:
+
++----------------+----------+------------------+
+| caching scheme | hit-rate | cycles (billion) |
++----------------+----------+------------------+
+| baseline | 1.06% | 91.54 |
+| patched | 99.97% | 14.18 |
++----------------+----------+------------------+
+
+[akpm@linux-foundation.org: fix nommu build, per Davidlohr]
+[akpm@linux-foundation.org: document vmacache_valid() logic]
+[akpm@linux-foundation.org: attempt to untangle header files]
+[akpm@linux-foundation.org: add vmacache_find() BUG_ON]
+[hughd@google.com: add vmacache_valid_mm() (from Oleg)]
+[akpm@linux-foundation.org: coding-style fixes]
+[akpm@linux-foundation.org: adjust and enhance comments]
+Signed-off-by: Davidlohr Bueso <davidlohr@hp.com>
+Reviewed-by: Rik van Riel <riel@redhat.com>
+Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
+Reviewed-by: Michel Lespinasse <walken@google.com>
+Cc: Oleg Nesterov <oleg@redhat.com>
+Tested-by: Hugh Dickins <hughd@google.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Mel Gorman <mgorman@suse.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/unicore32/include/asm/mmu_context.h | 4 -
+ fs/exec.c | 5 +
+ fs/proc/task_mmu.c | 3
+ include/linux/mm_types.h | 4 -
+ include/linux/sched.h | 7 +
+ include/linux/vmacache.h | 38 ++++++++++
+ kernel/debug/debug_core.c | 14 +++
+ kernel/fork.c | 7 +
+ mm/Makefile | 2
+ mm/mmap.c | 51 +++++++-------
+ mm/nommu.c | 24 ++++--
+ mm/vmacache.c | 112 +++++++++++++++++++++++++++++++
+ 12 files changed, 229 insertions(+), 42 deletions(-)
+
+--- a/arch/unicore32/include/asm/mmu_context.h
++++ b/arch/unicore32/include/asm/mmu_context.h
+@@ -14,6 +14,8 @@
+
+ #include <linux/compiler.h>
+ #include <linux/sched.h>
++#include <linux/mm.h>
++#include <linux/vmacache.h>
+ #include <linux/io.h>
+
+ #include <asm/cacheflush.h>
+@@ -73,7 +75,7 @@ do { \
+ else \
+ mm->mmap = NULL; \
+ rb_erase(&high_vma->vm_rb, &mm->mm_rb); \
+- mm->mmap_cache = NULL; \
++ vmacache_invalidate(mm); \
+ mm->map_count--; \
+ remove_vma(high_vma); \
+ } \
+--- a/fs/exec.c
++++ b/fs/exec.c
+@@ -26,6 +26,7 @@
+ #include <linux/file.h>
+ #include <linux/fdtable.h>
+ #include <linux/mm.h>
++#include <linux/vmacache.h>
+ #include <linux/stat.h>
+ #include <linux/fcntl.h>
+ #include <linux/swap.h>
+@@ -820,7 +821,7 @@ EXPORT_SYMBOL(read_code);
+ static int exec_mmap(struct mm_struct *mm)
+ {
+ struct task_struct *tsk;
+- struct mm_struct * old_mm, *active_mm;
++ struct mm_struct *old_mm, *active_mm;
+
+ /* Notify parent that we're no longer interested in the old VM */
+ tsk = current;
+@@ -846,6 +847,8 @@ static int exec_mmap(struct mm_struct *m
+ tsk->mm = mm;
+ tsk->active_mm = mm;
+ activate_mm(active_mm, mm);
++ tsk->mm->vmacache_seqnum = 0;
++ vmacache_flush(tsk);
+ task_unlock(tsk);
+ if (old_mm) {
+ up_read(&old_mm->mmap_sem);
+--- a/fs/proc/task_mmu.c
++++ b/fs/proc/task_mmu.c
+@@ -1,4 +1,5 @@
+ #include <linux/mm.h>
++#include <linux/vmacache.h>
+ #include <linux/hugetlb.h>
+ #include <linux/huge_mm.h>
+ #include <linux/mount.h>
+@@ -152,7 +153,7 @@ static void *m_start(struct seq_file *m,
+
+ /*
+ * We remember last_addr rather than next_addr to hit with
+- * mmap_cache most of the time. We have zero last_addr at
++ * vmacache most of the time. We have zero last_addr at
+ * the beginning and also after lseek. We will have -1 last_addr
+ * after the end of the vmas.
+ */
+--- a/include/linux/mm_types.h
++++ b/include/linux/mm_types.h
+@@ -342,9 +342,9 @@ struct mm_rss_stat {
+
+ struct kioctx_table;
+ struct mm_struct {
+- struct vm_area_struct * mmap; /* list of VMAs */
++ struct vm_area_struct *mmap; /* list of VMAs */
+ struct rb_root mm_rb;
+- struct vm_area_struct * mmap_cache; /* last find_vma result */
++ u32 vmacache_seqnum; /* per-thread vmacache */
+ #ifdef CONFIG_MMU
+ unsigned long (*get_unmapped_area) (struct file *filp,
+ unsigned long addr, unsigned long len,
+--- a/include/linux/sched.h
++++ b/include/linux/sched.h
+@@ -59,6 +59,10 @@ struct sched_param {
+
+ #define SCHED_ATTR_SIZE_VER0 48 /* sizeof first published struct */
+
++#define VMACACHE_BITS 2
++#define VMACACHE_SIZE (1U << VMACACHE_BITS)
++#define VMACACHE_MASK (VMACACHE_SIZE - 1)
++
+ /*
+ * Extended scheduling parameters data structure.
+ *
+@@ -1228,6 +1232,9 @@ struct task_struct {
+ #ifdef CONFIG_COMPAT_BRK
+ unsigned brk_randomized:1;
+ #endif
++ /* per-thread vma caching */
++ u32 vmacache_seqnum;
++ struct vm_area_struct *vmacache[VMACACHE_SIZE];
+ #if defined(SPLIT_RSS_COUNTING)
+ struct task_rss_stat rss_stat;
+ #endif
+--- /dev/null
++++ b/include/linux/vmacache.h
+@@ -0,0 +1,38 @@
++#ifndef __LINUX_VMACACHE_H
++#define __LINUX_VMACACHE_H
++
++#include <linux/sched.h>
++#include <linux/mm.h>
++
++/*
++ * Slot index for @addr: its page number masked with VMACACHE_MASK. Gives a
++ * good hit rate for workloads with good locality and for random accesses.
++ */
++#define VMACACHE_HASH(addr) (((addr) >> PAGE_SHIFT) & VMACACHE_MASK)
++
++static inline void vmacache_flush(struct task_struct *tsk)
++{
++ memset(tsk->vmacache, 0, sizeof(tsk->vmacache)); /* clear every slot */
++}
++
++extern void vmacache_flush_all(struct mm_struct *mm);
++extern void vmacache_update(unsigned long addr, struct vm_area_struct *newvma);
++extern struct vm_area_struct *vmacache_find(struct mm_struct *mm,
++ unsigned long addr);
++
++#ifndef CONFIG_MMU
++extern struct vm_area_struct *vmacache_find_exact(struct mm_struct *mm,
++ unsigned long start,
++ unsigned long end);
++#endif
++
++static inline void vmacache_invalidate(struct mm_struct *mm)
++{
++ mm->vmacache_seqnum++; /* noticed by threads on next lookup */
++
++ /* on wrap to 0 a stale thread seqnum could match again, so flush */
++ if (unlikely(mm->vmacache_seqnum == 0))
++ vmacache_flush_all(mm);
++}
++
++#endif /* __LINUX_VMACACHE_H */
+--- a/kernel/debug/debug_core.c
++++ b/kernel/debug/debug_core.c
+@@ -49,6 +49,7 @@
+ #include <linux/pid.h>
+ #include <linux/smp.h>
+ #include <linux/mm.h>
++#include <linux/vmacache.h>
+ #include <linux/rcupdate.h>
+
+ #include <asm/cacheflush.h>
+@@ -224,10 +225,17 @@ static void kgdb_flush_swbreak_addr(unsi
+ if (!CACHE_FLUSH_IS_SAFE)
+ return;
+
+- if (current->mm && current->mm->mmap_cache) {
+- flush_cache_range(current->mm->mmap_cache,
+- addr, addr + BREAK_INSTR_SIZE);
++ if (current->mm) {
++ int i;
++
++ for (i = 0; i < VMACACHE_SIZE; i++) {
++ if (!current->vmacache[i])
++ continue;
++ flush_cache_range(current->vmacache[i],
++ addr, addr + BREAK_INSTR_SIZE);
++ }
+ }
++
+ /* Force flush instruction cache if it was outside the mm */
+ flush_icache_range(addr, addr + BREAK_INSTR_SIZE);
+ }
+--- a/kernel/fork.c
++++ b/kernel/fork.c
+@@ -28,6 +28,8 @@
+ #include <linux/mman.h>
+ #include <linux/mmu_notifier.h>
+ #include <linux/fs.h>
++#include <linux/mm.h>
++#include <linux/vmacache.h>
+ #include <linux/nsproxy.h>
+ #include <linux/capability.h>
+ #include <linux/cpu.h>
+@@ -363,7 +365,7 @@ static int dup_mmap(struct mm_struct *mm
+
+ mm->locked_vm = 0;
+ mm->mmap = NULL;
+- mm->mmap_cache = NULL;
++ mm->vmacache_seqnum = 0;
+ mm->map_count = 0;
+ cpumask_clear(mm_cpumask(mm));
+ mm->mm_rb = RB_ROOT;
+@@ -876,6 +878,9 @@ static int copy_mm(unsigned long clone_f
+ if (!oldmm)
+ return 0;
+
++ /* initialize the new vmacache entries */
++ vmacache_flush(tsk);
++
+ if (clone_flags & CLONE_VM) {
+ atomic_inc(&oldmm->mm_users);
+ mm = oldmm;
+--- a/mm/Makefile
++++ b/mm/Makefile
+@@ -16,7 +16,7 @@ obj-y := filemap.o mempool.o oom_kill.
+ readahead.o swap.o truncate.o vmscan.o shmem.o \
+ util.o mmzone.o vmstat.o backing-dev.o \
+ mm_init.o mmu_context.o percpu.o slab_common.o \
+- compaction.o balloon_compaction.o \
++ compaction.o balloon_compaction.o vmacache.o \
+ interval_tree.o list_lru.o $(mmu-y)
+
+ obj-y += init-mm.o
+--- a/mm/mmap.c
++++ b/mm/mmap.c
+@@ -10,6 +10,7 @@
+ #include <linux/slab.h>
+ #include <linux/backing-dev.h>
+ #include <linux/mm.h>
++#include <linux/vmacache.h>
+ #include <linux/shm.h>
+ #include <linux/mman.h>
+ #include <linux/pagemap.h>
+@@ -681,8 +682,9 @@ __vma_unlink(struct mm_struct *mm, struc
+ prev->vm_next = next = vma->vm_next;
+ if (next)
+ next->vm_prev = prev;
+- if (mm->mmap_cache == vma)
+- mm->mmap_cache = prev;
++
++ /* Kill the cache */
++ vmacache_invalidate(mm);
+ }
+
+ /*
+@@ -1989,34 +1991,33 @@ EXPORT_SYMBOL(get_unmapped_area);
+ /* Look up the first VMA which satisfies addr < vm_end, NULL if none. */
+ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
+ {
+- struct vm_area_struct *vma = NULL;
++ struct rb_node *rb_node;
++ struct vm_area_struct *vma;
+
+ /* Check the cache first. */
+- /* (Cache hit rate is typically around 35%.) */
+- vma = ACCESS_ONCE(mm->mmap_cache);
+- if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) {
+- struct rb_node *rb_node;
++ vma = vmacache_find(mm, addr);
++ if (likely(vma))
++ return vma;
+
+- rb_node = mm->mm_rb.rb_node;
+- vma = NULL;
++ rb_node = mm->mm_rb.rb_node;
++ vma = NULL;
+
+- while (rb_node) {
+- struct vm_area_struct *vma_tmp;
++ while (rb_node) {
++ struct vm_area_struct *tmp;
+
+- vma_tmp = rb_entry(rb_node,
+- struct vm_area_struct, vm_rb);
++ tmp = rb_entry(rb_node, struct vm_area_struct, vm_rb);
+
+- if (vma_tmp->vm_end > addr) {
+- vma = vma_tmp;
+- if (vma_tmp->vm_start <= addr)
+- break;
+- rb_node = rb_node->rb_left;
+- } else
+- rb_node = rb_node->rb_right;
+- }
+- if (vma)
+- mm->mmap_cache = vma;
++ if (tmp->vm_end > addr) {
++ vma = tmp;
++ if (tmp->vm_start <= addr)
++ break;
++ rb_node = rb_node->rb_left;
++ } else
++ rb_node = rb_node->rb_right;
+ }
++
++ if (vma)
++ vmacache_update(addr, vma);
+ return vma;
+ }
+
+@@ -2388,7 +2389,9 @@ detach_vmas_to_be_unmapped(struct mm_str
+ } else
+ mm->highest_vm_end = prev ? prev->vm_end : 0;
+ tail_vma->vm_next = NULL;
+- mm->mmap_cache = NULL; /* Kill the cache. */
++
++ /* Kill the cache */
++ vmacache_invalidate(mm);
+ }
+
+ /*
+--- a/mm/nommu.c
++++ b/mm/nommu.c
+@@ -15,6 +15,7 @@
+
+ #include <linux/export.h>
+ #include <linux/mm.h>
++#include <linux/vmacache.h>
+ #include <linux/mman.h>
+ #include <linux/swap.h>
+ #include <linux/file.h>
+@@ -768,16 +769,23 @@ static void add_vma_to_mm(struct mm_stru
+ */
+ static void delete_vma_from_mm(struct vm_area_struct *vma)
+ {
++ int i;
+ struct address_space *mapping;
+ struct mm_struct *mm = vma->vm_mm;
++ struct task_struct *curr = current;
+
+ kenter("%p", vma);
+
+ protect_vma(vma, 0);
+
+ mm->map_count--;
+- if (mm->mmap_cache == vma)
+- mm->mmap_cache = NULL;
++ for (i = 0; i < VMACACHE_SIZE; i++) {
++ /* vma is cached: invalidate its owner, vma->vm_mm, not current->mm */
++ if (curr->vmacache[i] == vma) {
++ vmacache_invalidate(mm);
++ break;
++ }
++ }
+
+ /* remove the VMA from the mapping */
+ if (vma->vm_file) {
+@@ -825,8 +833,8 @@ struct vm_area_struct *find_vma(struct m
+ struct vm_area_struct *vma;
+
+ /* check the cache first */
+- vma = ACCESS_ONCE(mm->mmap_cache);
+- if (vma && vma->vm_start <= addr && vma->vm_end > addr)
++ vma = vmacache_find(mm, addr);
++ if (likely(vma))
+ return vma;
+
+ /* trawl the list (there may be multiple mappings in which addr
+@@ -835,7 +843,7 @@ struct vm_area_struct *find_vma(struct m
+ if (vma->vm_start > addr)
+ return NULL;
+ if (vma->vm_end > addr) {
+- mm->mmap_cache = vma;
++ vmacache_update(addr, vma);
+ return vma;
+ }
+ }
+@@ -874,8 +882,8 @@ static struct vm_area_struct *find_vma_e
+ unsigned long end = addr + len;
+
+ /* check the cache first */
+- vma = mm->mmap_cache;
+- if (vma && vma->vm_start == addr && vma->vm_end == end)
++ vma = vmacache_find_exact(mm, addr, end);
++ if (vma)
+ return vma;
+
+ /* trawl the list (there may be multiple mappings in which addr
+@@ -886,7 +894,7 @@ static struct vm_area_struct *find_vma_e
+ if (vma->vm_start > addr)
+ return NULL;
+ if (vma->vm_end == end) {
+- mm->mmap_cache = vma;
++ vmacache_update(addr, vma);
+ return vma;
+ }
+ }
+--- /dev/null
++++ b/mm/vmacache.c
+@@ -0,0 +1,112 @@
++/*
++ * Copyright (C) 2014 Davidlohr Bueso.
++ */
++#include <linux/sched.h>
++#include <linux/mm.h>
++#include <linux/vmacache.h>
++
++/*
++ * Flush the vma caches of all threads that share the given mm.
++ *
++ * The operation is safe because the caller holds the mmap_sem
++ * exclusively and other threads accessing the vma cache will
++ * have mmap_sem held at least for read, so no extra locking
++ * is required to maintain the vma cache.
++ */
++void vmacache_flush_all(struct mm_struct *mm)
++{
++ struct task_struct *g, *p;
++
++ rcu_read_lock();
++ for_each_process_thread(g, p) {
++ /*
++ * Only clear the cached vma pointers: the mm
++ * seqnum is already set, and each task's own
++ * seqnum is brought back in sync the next
++ * time that task does a lookup.
++ */
++ if (mm == p->mm)
++ vmacache_flush(p);
++ }
++ rcu_read_unlock();
++}
++
++/*
++ * Return true only when @mm is current's own mm and current is not a
++ * kernel thread. A task may be accessing a foreign mm via (for example)
++ * get_user_pages()->find_vma(), but the vmacache is task-local and
++ * pertains to the task's own mm, so it cannot be used in that case.
++ *
++ * Also handle the case where a kernel thread has adopted this mm via use_mm().
++ * That kernel thread's vmacache is not applicable to this mm.
++ */
++static bool vmacache_valid_mm(struct mm_struct *mm)
++{
++ return current->mm == mm && !(current->flags & PF_KTHREAD);
++}
++
++void vmacache_update(unsigned long addr, struct vm_area_struct *newvma)
++{
++ if (vmacache_valid_mm(newvma->vm_mm)) /* never cache a foreign mm's vma */
++ current->vmacache[VMACACHE_HASH(addr)] = newvma; /* replaces old entry */
++}
++
++static bool vmacache_valid(struct mm_struct *mm)
++{
++ struct task_struct *curr;
++
++ if (!vmacache_valid_mm(mm))
++ return false; /* current's cache does not describe this mm */
++
++ curr = current;
++ if (mm->vmacache_seqnum != curr->vmacache_seqnum) {
++ /*
++ * Seqnums differ, so the cached entries are stale (or were
++ * never set up): resync with the mm and start from empty.
++ */
++ curr->vmacache_seqnum = mm->vmacache_seqnum;
++ vmacache_flush(curr);
++ return false;
++ }
++ return true; /* seqnums match: cached entries may be used */
++}
++
++struct vm_area_struct *vmacache_find(struct mm_struct *mm, unsigned long addr)
++{
++ int i;
++
++ if (!vmacache_valid(mm))
++ return NULL; /* cache unusable for this mm, or just reset */
++
++ for (i = 0; i < VMACACHE_SIZE; i++) { /* scan every slot */
++ struct vm_area_struct *vma = current->vmacache[i];
++
++ if (vma && vma->vm_start <= addr && vma->vm_end > addr) {
++ BUG_ON(vma->vm_mm != mm); /* a hit must belong to mm */
++ return vma;
++ }
++ }
++
++ return NULL; /* miss: no cached vma contains addr */
++}
++
++#ifndef CONFIG_MMU /* exact-range lookup, built only without an MMU */
++struct vm_area_struct *vmacache_find_exact(struct mm_struct *mm,
++ unsigned long start,
++ unsigned long end)
++{
++ int i;
++
++ if (!vmacache_valid(mm))
++ return NULL; /* cache unusable for this mm, or just reset */
++
++ for (i = 0; i < VMACACHE_SIZE; i++) { /* scan every slot */
++ struct vm_area_struct *vma = current->vmacache[i];
++
++ if (vma && vma->vm_start == start && vma->vm_end == end)
++ return vma; /* both bounds match exactly */
++ }
++
++ return NULL; /* miss */
++}
++#endif /* !CONFIG_MMU */