diff --git a/src/patches/suse-2.6.27.31/patches.suse/mnt-want-write-speedup.patch b/src/patches/suse-2.6.27.31/patches.suse/mnt-want-write-speedup.patch
new file mode 100644
index 0000000..22a3d39
--- /dev/null
+++ b/src/patches/suse-2.6.27.31/patches.suse/mnt-want-write-speedup.patch
@@ -0,0 +1,417 @@
+From: Nick Piggin <npiggin@suse.de>
+Subject: fs: mnt_want_write speedup
+References: bnc#436953
+Patch-upstream: no (could be submitted)
+
+This patch speeds up the lmbench lat_mmap test by about 8%. lat_mmap basically
+mmaps a 64MB file on tmpfs, faults in its pages, then unmaps it. A
+microbenchmark, yes, but it exercises some important paths in the mm.
+
+Before:
+ avg = 501.9
+ std = 14.7773
+
+After:
+ avg = 462.286
+ std = 5.46106
+
+(50 runs of each; the standard deviation gives reasonable confidence, but
+there is still quite a bit of variation)
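+
+For reference, the measured workload has roughly the following shape (an
+illustrative sketch only, not the lmbench source; the temporary file name and
+the use of /dev/shm as the tmpfs mount are assumptions). The sketch writes to
+each page, so it exercises the write-fault path, which is where
+mnt_want_write() gets hit via the file time update:
+
+#include <fcntl.h>
+#include <sys/mman.h>
+#include <unistd.h>
+
+int main(void)
+{
+	const size_t size = 64 << 20;	/* 64MB backing file */
+	int fd = open("/dev/shm/lat_mmap.tmp", O_RDWR | O_CREAT, 0600);
+
+	if (fd < 0 || ftruncate(fd, size) < 0)
+		return 1;
+	for (int run = 0; run < 50; run++) {	/* 50 runs, as above */
+		char *p = mmap(NULL, size, PROT_READ | PROT_WRITE,
+			       MAP_SHARED, fd, 0);
+
+		if (p == MAP_FAILED)
+			return 1;
+		for (size_t off = 0; off < size; off += 4096)
+			p[off] = 1;	/* write-fault each page (assumes 4K pages) */
+		munmap(p, size);
+	}
+	close(fd);
+	unlink("/dev/shm/lat_mmap.tmp");
+	return 0;
+}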
+
+It does this by removing the complex per-cpu locking and counter-cache and
+replacing them with a percpu counter in struct vfsmount. This makes the code
+much simpler and avoids spinlocks (although the msync is still pretty
+costly, unfortunately). It also results in about 900 bytes less code,
+although it does increase the size of a vfsmount.
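+
+In outline, the write-side fast path and the r/w->r/o transition become the
+following (a condensed restatement of the hunks below, with the !SMP variants
+and the goto-based error handling trimmed):
+
+int mnt_want_write(struct vfsmount *mnt)	/* fast path */
+{
+	int ret = 0;
+
+	preempt_disable();
+	inc_mnt_writers(mnt);	/* per-cpu increment, no lock, no shared atomic */
+	smp_mb();		/* make the count visible before testing the flag */
+	while (mnt->mnt_flags & MNT_WRITE_HOLD)
+		cpu_relax();	/* an r/o transition is summing the counters */
+	smp_rmb();		/* don't load MNT_READONLY until the hold clears */
+	if (__mnt_is_readonly(mnt)) {
+		dec_mnt_writers(mnt);
+		ret = -EROFS;
+	}
+	preempt_enable();
+	return ret;
+}
+
+static int mnt_make_readonly(struct vfsmount *mnt)	/* slow path */
+{
+	int ret = 0;
+
+	spin_lock(&vfsmount_lock);
+	mnt->mnt_flags |= MNT_WRITE_HOLD;	/* park new writers in the loop above */
+	smp_mb();
+	if (count_mnt_writers(mnt) > 0)		/* sum of all per-cpu counts */
+		ret = -EBUSY;
+	else
+		mnt->mnt_flags |= MNT_READONLY;
+	smp_wmb();		/* publish MNT_READONLY before dropping the hold */
+	mnt->mnt_flags &= ~MNT_WRITE_HOLD;
+	spin_unlock(&vfsmount_lock);
+	return ret;
+}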
+
+It should also give a speedup on large systems if CPUs are frequently operating
+on different mounts (because the existing scheme has to operate on an atomic in
+the struct vfsmount when switching between mounts), but I'm most interested in
+single-threaded path performance for the moment.
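+
+For context, the pattern that has to stay cheap is the usual bracketing of a
+write-side operation with mnt_want_write()/mnt_drop_write(); something like
+the caller below (purely illustrative, frob_something() is a made-up name):
+
+static int frob_something(struct file *file)
+{
+	int err;
+
+	err = mnt_want_write(file->f_path.mnt);	/* fails with -EROFS on r/o mounts */
+	if (err)
+		return err;
+	/* ... write-side work: dirty the inode, update file times, ... */
+	mnt_drop_write(file->f_path.mnt);
+	return 0;
+}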
+
+---
+ fs/namespace.c        |  251 +++++++++++++++-----------------------------------
+ include/linux/mount.h |   18 +++
+ 2 files changed, 96 insertions(+), 173 deletions(-)
+
+--- linux-2.6.27.orig/fs/namespace.c
++++ linux-2.6.27/fs/namespace.c
+@@ -130,10 +130,20 @@ struct vfsmount *alloc_vfsmnt(const char
+               INIT_LIST_HEAD(&mnt->mnt_share);
+               INIT_LIST_HEAD(&mnt->mnt_slave_list);
+               INIT_LIST_HEAD(&mnt->mnt_slave);
+-              atomic_set(&mnt->__mnt_writers, 0);
++#ifdef CONFIG_SMP
++              mnt->mnt_writers = alloc_percpu(int);
++              if (!mnt->mnt_writers)
++                      goto out_free_devname;
++#else
++              mnt->mnt_writers = 0;
++#endif
+       }
+       return mnt;
++#ifdef CONFIG_SMP
++out_free_devname:
++      kfree(mnt->mnt_devname);
++#endif
+ out_free_id:
+       mnt_free_id(mnt);
+ out_free_cache:
+@@ -170,65 +180,38 @@ int __mnt_is_readonly(struct vfsmount *m
+ }
+ EXPORT_SYMBOL_GPL(__mnt_is_readonly);
+-struct mnt_writer {
+-      /*
+-       * If holding multiple instances of this lock, they
+-       * must be ordered by cpu number.
+-       */
+-      spinlock_t lock;
+-      struct lock_class_key lock_class; /* compiles out with !lockdep */
+-      unsigned long count;
+-      struct vfsmount *mnt;
+-} ____cacheline_aligned_in_smp;
+-static DEFINE_PER_CPU(struct mnt_writer, mnt_writers);
++static inline void inc_mnt_writers(struct vfsmount *mnt)
++{
++#ifdef CONFIG_SMP
++      (*per_cpu_ptr(mnt->mnt_writers, smp_processor_id()))++;
++#else
++      mnt->mnt_writers++;
++#endif
++}
+-static int __init init_mnt_writers(void)
++static inline void dec_mnt_writers(struct vfsmount *mnt)
+ {
+-      int cpu;
+-      for_each_possible_cpu(cpu) {
+-              struct mnt_writer *writer = &per_cpu(mnt_writers, cpu);
+-              spin_lock_init(&writer->lock);
+-              lockdep_set_class(&writer->lock, &writer->lock_class);
+-              writer->count = 0;
+-      }
+-      return 0;
++#ifdef CONFIG_SMP
++      (*per_cpu_ptr(mnt->mnt_writers, smp_processor_id()))--;
++#else
++      mnt->mnt_writers--;
++#endif
+ }
+-fs_initcall(init_mnt_writers);
+-static void unlock_mnt_writers(void)
++static unsigned int count_mnt_writers(struct vfsmount *mnt)
+ {
++#ifdef CONFIG_SMP
++      unsigned int count = 0;
+       int cpu;
+-      struct mnt_writer *cpu_writer;
+       for_each_possible_cpu(cpu) {
+-              cpu_writer = &per_cpu(mnt_writers, cpu);
+-              spin_unlock(&cpu_writer->lock);
++              count += *per_cpu_ptr(mnt->mnt_writers, cpu);
+       }
+-}
+-static inline void __clear_mnt_count(struct mnt_writer *cpu_writer)
+-{
+-      if (!cpu_writer->mnt)
+-              return;
+-      /*
+-       * This is in case anyone ever leaves an invalid,
+-       * old ->mnt and a count of 0.
+-       */
+-      if (!cpu_writer->count)
+-              return;
+-      atomic_add(cpu_writer->count, &cpu_writer->mnt->__mnt_writers);
+-      cpu_writer->count = 0;
+-}
+- /*
+- * must hold cpu_writer->lock
+- */
+-static inline void use_cpu_writer_for_mount(struct mnt_writer *cpu_writer,
+-                                        struct vfsmount *mnt)
+-{
+-      if (cpu_writer->mnt == mnt)
+-              return;
+-      __clear_mnt_count(cpu_writer);
+-      cpu_writer->mnt = mnt;
++      return count;
++#else
++      return mnt->mnt_writers;
++#endif
+ }
+ /*
+@@ -252,75 +235,34 @@ static inline void use_cpu_writer_for_mo
+ int mnt_want_write(struct vfsmount *mnt)
+ {
+       int ret = 0;
+-      struct mnt_writer *cpu_writer;
+-      cpu_writer = &get_cpu_var(mnt_writers);
+-      spin_lock(&cpu_writer->lock);
++      preempt_disable();
++      inc_mnt_writers(mnt);
++      /*
++       * The store to inc_mnt_writers must be visible before we pass
++       * MNT_WRITE_HOLD loop below, so that the slowpath can see our
++       * incremented count after it has set MNT_WRITE_HOLD.
++       */
++      smp_mb();
++      while (mnt->mnt_flags & MNT_WRITE_HOLD)
++              cpu_relax();
++      /*
++       * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will
++       * be set to match its requirements. So we must not load that until
++       * MNT_WRITE_HOLD is cleared.
++       */
++      smp_rmb();
+       if (__mnt_is_readonly(mnt)) {
++              dec_mnt_writers(mnt);
+               ret = -EROFS;
+               goto out;
+       }
+-      use_cpu_writer_for_mount(cpu_writer, mnt);
+-      cpu_writer->count++;
+ out:
+-      spin_unlock(&cpu_writer->lock);
+-      put_cpu_var(mnt_writers);
++      preempt_enable();
+       return ret;
+ }
+ EXPORT_SYMBOL_GPL(mnt_want_write);
+-static void lock_mnt_writers(void)
+-{
+-      int cpu;
+-      struct mnt_writer *cpu_writer;
+-
+-      for_each_possible_cpu(cpu) {
+-              cpu_writer = &per_cpu(mnt_writers, cpu);
+-              spin_lock(&cpu_writer->lock);
+-              __clear_mnt_count(cpu_writer);
+-              cpu_writer->mnt = NULL;
+-      }
+-}
+-
+-/*
+- * These per-cpu write counts are not guaranteed to have
+- * matched increments and decrements on any given cpu.
+- * A file open()ed for write on one cpu and close()d on
+- * another cpu will imbalance this count.  Make sure it
+- * does not get too far out of whack.
+- */
+-static void handle_write_count_underflow(struct vfsmount *mnt)
+-{
+-      if (atomic_read(&mnt->__mnt_writers) >=
+-          MNT_WRITER_UNDERFLOW_LIMIT)
+-              return;
+-      /*
+-       * It isn't necessary to hold all of the locks
+-       * at the same time, but doing it this way makes
+-       * us share a lot more code.
+-       */
+-      lock_mnt_writers();
+-      /*
+-       * vfsmount_lock is for mnt_flags.
+-       */
+-      spin_lock(&vfsmount_lock);
+-      /*
+-       * If coalescing the per-cpu writer counts did not
+-       * get us back to a positive writer count, we have
+-       * a bug.
+-       */
+-      if ((atomic_read(&mnt->__mnt_writers) < 0) &&
+-          !(mnt->mnt_flags & MNT_IMBALANCED_WRITE_COUNT)) {
+-              WARN(1, KERN_DEBUG "leak detected on mount(%p) writers "
+-                              "count: %d\n",
+-                      mnt, atomic_read(&mnt->__mnt_writers));
+-              /* use the flag to keep the dmesg spam down */
+-              mnt->mnt_flags |= MNT_IMBALANCED_WRITE_COUNT;
+-      }
+-      spin_unlock(&vfsmount_lock);
+-      unlock_mnt_writers();
+-}
+-
+ /**
+  * mnt_drop_write - give up write access to a mount
+  * @mnt: the mount on which to give up write access
+@@ -331,37 +273,9 @@ static void handle_write_count_underflow
+  */
+ void mnt_drop_write(struct vfsmount *mnt)
+ {
+-      int must_check_underflow = 0;
+-      struct mnt_writer *cpu_writer;
+-
+-      cpu_writer = &get_cpu_var(mnt_writers);
+-      spin_lock(&cpu_writer->lock);
+-
+-      use_cpu_writer_for_mount(cpu_writer, mnt);
+-      if (cpu_writer->count > 0) {
+-              cpu_writer->count--;
+-      } else {
+-              must_check_underflow = 1;
+-              atomic_dec(&mnt->__mnt_writers);
+-      }
+-
+-      spin_unlock(&cpu_writer->lock);
+-      /*
+-       * Logically, we could call this each time,
+-       * but the __mnt_writers cacheline tends to
+-       * be cold, and makes this expensive.
+-       */
+-      if (must_check_underflow)
+-              handle_write_count_underflow(mnt);
+-      /*
+-       * This could be done right after the spinlock
+-       * is taken because the spinlock keeps us on
+-       * the cpu, and disables preemption.  However,
+-       * putting it here bounds the amount that
+-       * __mnt_writers can underflow.  Without it,
+-       * we could theoretically wrap __mnt_writers.
+-       */
+-      put_cpu_var(mnt_writers);
++      preempt_disable();
++      dec_mnt_writers(mnt);
++      preempt_enable();
+ }
+ EXPORT_SYMBOL_GPL(mnt_drop_write);
+@@ -369,24 +283,34 @@ static int mnt_make_readonly(struct vfsm
+ {
+       int ret = 0;
+-      lock_mnt_writers();
++      spin_lock(&vfsmount_lock);
++      mnt->mnt_flags |= MNT_WRITE_HOLD;
+       /*
+-       * With all the locks held, this value is stable
++       * After storing MNT_WRITE_HOLD, we'll read the counters. This store
++       * should be visible before we do.
+        */
+-      if (atomic_read(&mnt->__mnt_writers) > 0) {
++      smp_mb();
++
++      /*
++       * With writers on hold, if this value is zero, then there are definitely
++       * no active writers (although held writers may subsequently increment
++       * the count, they'll have to wait, and decrement it after seeing
++       * MNT_READONLY).
++       */
++      if (count_mnt_writers(mnt) > 0) {
+               ret = -EBUSY;
+               goto out;
+       }
+-      /*
+-       * nobody can do a successful mnt_want_write() with all
+-       * of the counts in MNT_DENIED_WRITE and the locks held.
+-       */
+-      spin_lock(&vfsmount_lock);
+       if (!ret)
+               mnt->mnt_flags |= MNT_READONLY;
+-      spin_unlock(&vfsmount_lock);
+ out:
+-      unlock_mnt_writers();
++      /*
++       * MNT_READONLY must become visible before ~MNT_WRITE_HOLD, so writers
++       * that become unheld will see MNT_READONLY.
++       */
++      smp_wmb();
++      mnt->mnt_flags &= ~MNT_WRITE_HOLD;
++      spin_unlock(&vfsmount_lock);
+       return ret;
+ }
+@@ -410,6 +334,9 @@ void free_vfsmnt(struct vfsmount *mnt)
+ {
+       kfree(mnt->mnt_devname);
+       mnt_free_id(mnt);
++#ifdef CONFIG_SMP
++      free_percpu(mnt->mnt_writers);
++#endif
+       kmem_cache_free(mnt_cache, mnt);
+ }
+@@ -604,36 +531,14 @@ static struct vfsmount *clone_mnt(struct
+ static inline void __mntput(struct vfsmount *mnt)
+ {
+-      int cpu;
+       struct super_block *sb = mnt->mnt_sb;
+       /*
+-       * We don't have to hold all of the locks at the
+-       * same time here because we know that we're the
+-       * last reference to mnt and that no new writers
+-       * can come in.
+-       */
+-      for_each_possible_cpu(cpu) {
+-              struct mnt_writer *cpu_writer = &per_cpu(mnt_writers, cpu);
+-              if (cpu_writer->mnt != mnt)
+-                      continue;
+-              spin_lock(&cpu_writer->lock);
+-              atomic_add(cpu_writer->count, &mnt->__mnt_writers);
+-              cpu_writer->count = 0;
+-              /*
+-               * Might as well do this so that no one
+-               * ever sees the pointer and expects
+-               * it to be valid.
+-               */
+-              cpu_writer->mnt = NULL;
+-              spin_unlock(&cpu_writer->lock);
+-      }
+-      /*
+        * This probably indicates that somebody messed
+        * up a mnt_want/drop_write() pair.  If this
+        * happens, the filesystem was probably unable
+        * to make r/w->r/o transitions.
+        */
+-      WARN_ON(atomic_read(&mnt->__mnt_writers));
++      WARN_ON(count_mnt_writers(mnt));
+       dput(mnt->mnt_root);
+       free_vfsmnt(mnt);
+       deactivate_super(sb);
+--- linux-2.6.27.orig/include/linux/mount.h
++++ linux-2.6.27/include/linux/mount.h
+@@ -32,6 +32,7 @@ struct mnt_namespace;
+ #define MNT_SHRINKABLE        0x100
+ #define MNT_IMBALANCED_WRITE_COUNT    0x200 /* just for debugging */
++#define MNT_WRITE_HOLD        0x400
+ #define MNT_SHARED    0x1000  /* if the vfsmount is a shared mount */
+ #define MNT_UNBINDABLE        0x2000  /* if the vfsmount is a unbindable mount */
+@@ -66,13 +67,30 @@ struct vfsmount {
+       int mnt_expiry_mark;            /* true if marked for expiry */
+       int mnt_pinned;
+       int mnt_ghosts;
++#ifdef __GENKSYMS__
+       /*
+        * This value is not stable unless all of the mnt_writers[] spinlocks
+        * are held, and all mnt_writer[]s on this mount have 0 as their ->count
+        */
+       atomic_t __mnt_writers;
++#else
++#ifdef CONFIG_SMP
++      int *mnt_writers;
++#else
++      int mnt_writers;
++#endif
++#endif
+ };
++static inline int *get_mnt_writers_ptr(struct vfsmount *mnt)
++{
++#ifdef CONFIG_SMP
++      return mnt->mnt_writers;
++#else
++      return &mnt->mnt_writers;
++#endif
++}
++
+ static inline struct vfsmount *mntget(struct vfsmount *mnt)
+ {
+       if (mnt)