From: Nick Piggin <npiggin@suse.de>
Subject: fs: mnt_want_write speedup
Patch-upstream: no (could be submitted)

This patch speeds up the lmbench lat_mmap test by about 8%. lat_mmap is set up
basically to mmap a 64MB file on tmpfs, fault in its pages, then unmap it.
A microbenchmark, yes, but it exercises some important paths in the mm.

(50 runs of each, stddev gives a reasonable confidence, but there is quite
a bit of variation there still)

It does this by removing the complex per-cpu locking and counter-cache and
replacing it with a percpu counter in struct vfsmount. This makes the code
much simpler, and avoids spinlocks (although the msync is still pretty
costly, unfortunately). It results in about 900 bytes smaller code too. It
does increase the size of a vfsmount, however.
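
As a rough illustration of the new scheme (a user-space model, not the patch's
kernel code): each mount keeps one plain int of outstanding writers per CPU,
mnt_want_write()/mnt_drop_write() only touch the local CPU's slot, and only the
rare r/w -> r/o transition walks all CPUs to sum them. The demo_mount,
NR_CPUS_DEMO and this_cpu names below are stand-ins for struct vfsmount, the
percpu allocator and smp_processor_id().

#include <stdio.h>

#define NR_CPUS_DEMO 4

struct demo_mount {
	int mnt_writers[NR_CPUS_DEMO];	/* models the alloc_percpu(int) counters */
};

static int this_cpu;			/* stands in for smp_processor_id() */

static void inc_mnt_writers(struct demo_mount *mnt)
{
	mnt->mnt_writers[this_cpu]++;	/* CPU-local: no shared atomic, no lock */
}

static void dec_mnt_writers(struct demo_mount *mnt)
{
	mnt->mnt_writers[this_cpu]--;
}

static int count_mnt_writers(struct demo_mount *mnt)
{
	int cpu, count = 0;

	/* Only the rare r/w -> r/o transition pays for walking every CPU. */
	for (cpu = 0; cpu < NR_CPUS_DEMO; cpu++)
		count += mnt->mnt_writers[cpu];
	return count;
}

int main(void)
{
	struct demo_mount m = { { 0 } };

	this_cpu = 0;
	inc_mnt_writers(&m);	/* mnt_want_write() on CPU 0 */
	this_cpu = 2;
	dec_mnt_writers(&m);	/* mnt_drop_write() on CPU 2 */

	/* The per-CPU slots now hold +1 and -1, but the sum is still right. */
	printf("writers = %d\n", count_mnt_writers(&m));	/* prints 0 */
	return 0;
}

A write taken on one CPU and dropped on another just leaves a +1/-1 pair that
cancels in the sum, which is why the old underflow handling can go away
entirely.
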
It should also give a speedup on large systems if CPUs are frequently operating
on different mounts (because the existing scheme has to operate on an atomic in
the struct vfsmount when switching between mounts). But I'm most interested in
the single threaded path performance for the moment.
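
The r/w -> r/o transition in the hunks below no longer takes per-cpu spinlocks.
mnt_make_readonly() sets MNT_WRITE_HOLD, sums the per-cpu counts, flips
MNT_READONLY only if the sum is zero, and then drops the hold; mnt_want_write()
bumps its local count and spins while the hold is set before checking
MNT_READONLY. The fragment below is a single-threaded sketch of that handshake,
not kernel code: want_write()/make_readonly() are stand-in names, and the
smp_mb()/smp_wmb() ordering the real code relies on is only marked in comments.

#include <stdio.h>

#define MNT_READONLY	0x40
#define MNT_WRITE_HOLD	0x400

static int mnt_flags;
static int writers;		/* stands in for the summed per-CPU counts */

static int want_write(void)
{
	writers++;
	/* smp_mb(): the increment must be visible before we sample the flag */
	while (mnt_flags & MNT_WRITE_HOLD)
		;		/* spin until the slow path drops the hold */
	/* smp_rmb(): only read MNT_READONLY once the hold is clear */
	if (mnt_flags & MNT_READONLY) {
		writers--;
		return -30;	/* -EROFS */
	}
	return 0;
}

static int make_readonly(void)
{
	int ret = 0;

	mnt_flags |= MNT_WRITE_HOLD;	/* park new writers in the loop above */
	/* smp_mb(): the hold must be visible before we sum the counters */
	if (writers > 0)
		ret = -16;		/* -EBUSY: somebody still writes */
	else
		mnt_flags |= MNT_READONLY;
	/* smp_wmb(): publish MNT_READONLY before releasing held writers */
	mnt_flags &= ~MNT_WRITE_HOLD;
	return ret;
}

int main(void)
{
	printf("want_write: %d\n", want_write());	/* 0: mount is r/w */
	printf("make_readonly: %d\n", make_readonly());	/* -16: a writer is active */
	writers--;					/* mnt_drop_write() */
	printf("make_readonly: %d\n", make_readonly());	/* 0: now read-only */
	printf("want_write: %d\n", want_write());	/* -30: r/o mount refuses */
	return 0;
}

The point of the hold flag is that the summed count only has to be stable while
MNT_WRITE_HOLD is set; outside that small window the per-cpu counters are
updated with no shared lock at all.
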
 fs/namespace.c        |  251 +++++++++++++++-----------------------------------
 include/linux/mount.h |   18 +++
 2 files changed, 96 insertions(+), 173 deletions(-)

--- linux-2.6.27.orig/fs/namespace.c
+++ linux-2.6.27/fs/namespace.c
@@ -130,10 +130,20 @@ struct vfsmount *alloc_vfsmnt(const char
INIT_LIST_HEAD(&mnt->mnt_share);
INIT_LIST_HEAD(&mnt->mnt_slave_list);
INIT_LIST_HEAD(&mnt->mnt_slave);
- atomic_set(&mnt->__mnt_writers, 0);
+#ifdef CONFIG_SMP
+ mnt->mnt_writers = alloc_percpu(int);
+ if (!mnt->mnt_writers)
+ goto out_free_devname;
+#else
+ mnt->mnt_writers = 0;
+#endif
+ kfree(mnt->mnt_devname);
@@ -170,65 +180,38 @@ int __mnt_is_readonly(struct vfsmount *m
EXPORT_SYMBOL_GPL(__mnt_is_readonly);
- * If holding multiple instances of this lock, they
- * must be ordered by cpu number.
- struct lock_class_key lock_class; /* compiles out with !lockdep */
- unsigned long count;
- struct vfsmount *mnt;
-} ____cacheline_aligned_in_smp;
-static DEFINE_PER_CPU(struct mnt_writer, mnt_writers);
+static inline void inc_mnt_writers(struct vfsmount *mnt)
+ (*per_cpu_ptr(mnt->mnt_writers, smp_processor_id()))++;
-static int __init init_mnt_writers(void)
+static inline void dec_mnt_writers(struct vfsmount *mnt)
- for_each_possible_cpu(cpu) {
- struct mnt_writer *writer = &per_cpu(mnt_writers, cpu);
- spin_lock_init(&writer->lock);
- lockdep_set_class(&writer->lock, &writer->lock_class);
+ (*per_cpu_ptr(mnt->mnt_writers, smp_processor_id()))--;
-fs_initcall(init_mnt_writers);
-static void unlock_mnt_writers(void)
+static unsigned int count_mnt_writers(struct vfsmount *mnt)
+ unsigned int count = 0;
- struct mnt_writer *cpu_writer;
for_each_possible_cpu(cpu) {
- cpu_writer = &per_cpu(mnt_writers, cpu);
- spin_unlock(&cpu_writer->lock);
+ count += *per_cpu_ptr(mnt->mnt_writers, cpu);
-static inline void __clear_mnt_count(struct mnt_writer *cpu_writer)
- if (!cpu_writer->mnt)
- * This is in case anyone ever leaves an invalid,
- * old ->mnt and a count of 0.
- if (!cpu_writer->count)
- atomic_add(cpu_writer->count, &cpu_writer->mnt->__mnt_writers);
- cpu_writer->count = 0;
- * must hold cpu_writer->lock
-static inline void use_cpu_writer_for_mount(struct mnt_writer *cpu_writer,
- struct vfsmount *mnt)
- if (cpu_writer->mnt == mnt)
- __clear_mnt_count(cpu_writer);
- cpu_writer->mnt = mnt;
+ return mnt->mnt_writers;
@@ -252,75 +235,34 @@ static inline void use_cpu_writer_for_mo
int mnt_want_write(struct vfsmount *mnt)
- struct mnt_writer *cpu_writer;
- cpu_writer = &get_cpu_var(mnt_writers);
- spin_lock(&cpu_writer->lock);
+ inc_mnt_writers(mnt);
+ * The store from inc_mnt_writers must be visible before we pass the
+ * MNT_WRITE_HOLD loop below, so that the slowpath can see our
+ * incremented count after it has set MNT_WRITE_HOLD.
+ while (mnt->mnt_flags & MNT_WRITE_HOLD)
+ cpu_relax();
+ * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will
+ * be set to match its requirements. So we must not load that until
+ * MNT_WRITE_HOLD is cleared.
if (__mnt_is_readonly(mnt)) {
+ dec_mnt_writers(mnt);
- use_cpu_writer_for_mount(cpu_writer, mnt);
- cpu_writer->count++;
- spin_unlock(&cpu_writer->lock);
- put_cpu_var(mnt_writers);
EXPORT_SYMBOL_GPL(mnt_want_write);
-static void lock_mnt_writers(void)
- struct mnt_writer *cpu_writer;
- for_each_possible_cpu(cpu) {
- cpu_writer = &per_cpu(mnt_writers, cpu);
- spin_lock(&cpu_writer->lock);
- __clear_mnt_count(cpu_writer);
- cpu_writer->mnt = NULL;
- * These per-cpu write counts are not guaranteed to have
- * matched increments and decrements on any given cpu.
- * A file open()ed for write on one cpu and close()d on
- * another cpu will imbalance this count. Make sure it
- * does not get too far out of whack.
-static void handle_write_count_underflow(struct vfsmount *mnt)
- if (atomic_read(&mnt->__mnt_writers) >=
- MNT_WRITER_UNDERFLOW_LIMIT)
- * It isn't necessary to hold all of the locks
- * at the same time, but doing it this way makes
- * us share a lot more code.
- lock_mnt_writers();
- * vfsmount_lock is for mnt_flags.
- spin_lock(&vfsmount_lock);
- * If coalescing the per-cpu writer counts did not
- * get us back to a positive writer count, we have
- if ((atomic_read(&mnt->__mnt_writers) < 0) &&
- !(mnt->mnt_flags & MNT_IMBALANCED_WRITE_COUNT)) {
- WARN(1, KERN_DEBUG "leak detected on mount(%p) writers "
- mnt, atomic_read(&mnt->__mnt_writers));
- /* use the flag to keep the dmesg spam down */
- mnt->mnt_flags |= MNT_IMBALANCED_WRITE_COUNT;
- spin_unlock(&vfsmount_lock);
- unlock_mnt_writers();
* mnt_drop_write - give up write access to a mount
* @mnt: the mount on which to give up write access
@@ -331,37 +273,9 @@ static void handle_write_count_underflow
void mnt_drop_write(struct vfsmount *mnt)
- int must_check_underflow = 0;
- struct mnt_writer *cpu_writer;
- cpu_writer = &get_cpu_var(mnt_writers);
- spin_lock(&cpu_writer->lock);
- use_cpu_writer_for_mount(cpu_writer, mnt);
- if (cpu_writer->count > 0) {
- cpu_writer->count--;
- must_check_underflow = 1;
- atomic_dec(&mnt->__mnt_writers);
- spin_unlock(&cpu_writer->lock);
- * Logically, we could call this each time,
- * but the __mnt_writers cacheline tends to
- * be cold, and makes this expensive.
- if (must_check_underflow)
- handle_write_count_underflow(mnt);
- * This could be done right after the spinlock
- * is taken because the spinlock keeps us on
- * the cpu, and disables preemption. However,
- * putting it here bounds the amount that
- * __mnt_writers can underflow. Without it,
- * we could theoretically wrap __mnt_writers.
- put_cpu_var(mnt_writers);
+ dec_mnt_writers(mnt);
EXPORT_SYMBOL_GPL(mnt_drop_write);
@@ -369,24 +283,34 @@ static int mnt_make_readonly(struct vfsm
- lock_mnt_writers();
+ spin_lock(&vfsmount_lock);
+ mnt->mnt_flags |= MNT_WRITE_HOLD;
- * With all the locks held, this value is stable
+ * After storing MNT_WRITE_HOLD, we'll read the counters. This store
+ * should be visible before we do.
- if (atomic_read(&mnt->__mnt_writers) > 0) {
+ * With writers on hold, if this value is zero, then there are definitely
+ * no active writers (although held writers may subsequently increment
+ * the count, they'll have to wait, and decrement it after seeing
+ * MNT_READONLY).
+ if (count_mnt_writers(mnt) > 0) {
- * nobody can do a successful mnt_want_write() with all
- * of the counts in MNT_DENIED_WRITE and the locks held.
- spin_lock(&vfsmount_lock);
mnt->mnt_flags |= MNT_READONLY;
- spin_unlock(&vfsmount_lock);
- unlock_mnt_writers();
+ * MNT_READONLY must become visible before ~MNT_WRITE_HOLD, so writers
+ * that become unheld will see MNT_READONLY.
+ mnt->mnt_flags &= ~MNT_WRITE_HOLD;
+ spin_unlock(&vfsmount_lock);
@@ -410,6 +334,9 @@ void free_vfsmnt(struct vfsmount *mnt)
kfree(mnt->mnt_devname);
+ free_percpu(mnt->mnt_writers);
kmem_cache_free(mnt_cache, mnt);
@@ -604,36 +531,14 @@ static struct vfsmount *clone_mnt(struct
static inline void __mntput(struct vfsmount *mnt)
struct super_block *sb = mnt->mnt_sb;
- * We don't have to hold all of the locks at the
- * same time here because we know that we're the
- * last reference to mnt and that no new writers
- for_each_possible_cpu(cpu) {
- struct mnt_writer *cpu_writer = &per_cpu(mnt_writers, cpu);
- if (cpu_writer->mnt != mnt)
- spin_lock(&cpu_writer->lock);
- atomic_add(cpu_writer->count, &mnt->__mnt_writers);
- cpu_writer->count = 0;
- * Might as well do this so that no one
- * ever sees the pointer and expects
- cpu_writer->mnt = NULL;
- spin_unlock(&cpu_writer->lock);
* This probably indicates that somebody messed
* up a mnt_want/drop_write() pair. If this
* happens, the filesystem was probably unable
* to make r/w->r/o transitions.
- WARN_ON(atomic_read(&mnt->__mnt_writers));
+ WARN_ON(count_mnt_writers(mnt));
deactivate_super(sb);
--- linux-2.6.27.orig/include/linux/mount.h
+++ linux-2.6.27/include/linux/mount.h
@@ -32,6 +32,7 @@ struct mnt_namespace;
#define MNT_SHRINKABLE 0x100
#define MNT_IMBALANCED_WRITE_COUNT 0x200 /* just for debugging */
+#define MNT_WRITE_HOLD 0x400
#define MNT_SHARED 0x1000 /* if the vfsmount is a shared mount */
#define MNT_UNBINDABLE 0x2000 /* if the vfsmount is a unbindable mount */
@@ -66,13 +67,30 @@ struct vfsmount {
int mnt_expiry_mark; /* true if marked for expiry */
- * This value is not stable unless all of the mnt_writers[] spinlocks
- * are held, and all mnt_writer[]s on this mount have 0 as their ->count
- atomic_t __mnt_writers;
+static inline int *get_mnt_writers_ptr(struct vfsmount *mnt)
+{
+#ifdef CONFIG_SMP
+ return mnt->mnt_writers;
+#else
+ return &mnt->mnt_writers;
+#endif
+}
static inline struct vfsmount *mntget(struct vfsmount *mnt)