From: Nick Piggin <npiggin@suse.de>
Subject: fs: mnt_want_write speedup
Patch-upstream: no (could be submitted)

This patch speeds up the lmbench lat_mmap test by about 8%. lat_mmap is set up
basically to mmap a 64MB file on tmpfs, fault in its pages, then unmap it.
A microbenchmark, yes, but it exercises some important paths in the mm.

(50 runs of each, stddev gives a reasonable confidence, but there is quite
a bit of variation there still)

It does this by removing the complex per-cpu locking and counter-cache and
replacing it with a percpu counter in struct vfsmount. This makes the code
much simpler, and avoids spinlocks (although the msync is still pretty
costly, unfortunately). It results in about 900 bytes smaller code too. It
does increase the size of a vfsmount, however.
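
As a rough illustration of the new scheme (a user-space model, not the patch's
kernel code): each mount keeps one plain int of outstanding writers per CPU,
mnt_want_write()/mnt_drop_write() only touch the local CPU's slot, and only the
rare r/w -> r/o transition walks all CPUs to sum them. The demo_mount,
NR_CPUS_DEMO and this_cpu names below are stand-ins for struct vfsmount, the
percpu allocator and smp_processor_id().

#include <stdio.h>

#define NR_CPUS_DEMO 4

struct demo_mount {
	int mnt_writers[NR_CPUS_DEMO];	/* models the alloc_percpu(int) counters */
};

static int this_cpu;			/* stands in for smp_processor_id() */

static void inc_mnt_writers(struct demo_mount *mnt)
{
	mnt->mnt_writers[this_cpu]++;	/* CPU-local: no shared atomic, no lock */
}

static void dec_mnt_writers(struct demo_mount *mnt)
{
	mnt->mnt_writers[this_cpu]--;
}

static int count_mnt_writers(struct demo_mount *mnt)
{
	int cpu, count = 0;

	/* Only the rare r/w -> r/o transition pays for walking every CPU. */
	for (cpu = 0; cpu < NR_CPUS_DEMO; cpu++)
		count += mnt->mnt_writers[cpu];
	return count;
}

int main(void)
{
	struct demo_mount m = { { 0 } };

	this_cpu = 0;
	inc_mnt_writers(&m);	/* mnt_want_write() on CPU 0 */
	this_cpu = 2;
	dec_mnt_writers(&m);	/* mnt_drop_write() on CPU 2 */

	/* The per-CPU slots now hold +1 and -1, but the sum is still right. */
	printf("writers = %d\n", count_mnt_writers(&m));	/* prints 0 */
	return 0;
}

A write taken on one CPU and dropped on another just leaves a +1/-1 pair that
cancels in the sum, which is why the old underflow handling can go away
entirely.
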
It should also give a speedup on large systems if CPUs are frequently operating
on different mounts (because the existing scheme has to operate on an atomic in
the struct vfsmount when switching between mounts). But I'm most interested in
the single threaded path performance for the moment.
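
The r/w -> r/o transition in the hunks below no longer takes per-cpu spinlocks.
mnt_make_readonly() sets MNT_WRITE_HOLD, sums the per-cpu counts, flips
MNT_READONLY only if the sum is zero, and then drops the hold; mnt_want_write()
bumps its local count and spins while the hold is set before checking
MNT_READONLY. The fragment below is a single-threaded sketch of that handshake,
not kernel code: want_write()/make_readonly() are stand-in names, and the
smp_mb()/smp_wmb() ordering the real code relies on is only marked in comments.

#include <stdio.h>

#define MNT_READONLY	0x40
#define MNT_WRITE_HOLD	0x400

static int mnt_flags;
static int writers;		/* stands in for the summed per-CPU counts */

static int want_write(void)
{
	writers++;
	/* smp_mb(): the increment must be visible before we sample the flag */
	while (mnt_flags & MNT_WRITE_HOLD)
		;		/* spin until the slow path drops the hold */
	/* smp_rmb(): only read MNT_READONLY once the hold is clear */
	if (mnt_flags & MNT_READONLY) {
		writers--;
		return -30;	/* -EROFS */
	}
	return 0;
}

static int make_readonly(void)
{
	int ret = 0;

	mnt_flags |= MNT_WRITE_HOLD;	/* park new writers in the loop above */
	/* smp_mb(): the hold must be visible before we sum the counters */
	if (writers > 0)
		ret = -16;		/* -EBUSY: somebody still writes */
	else
		mnt_flags |= MNT_READONLY;
	/* smp_wmb(): publish MNT_READONLY before releasing held writers */
	mnt_flags &= ~MNT_WRITE_HOLD;
	return ret;
}

int main(void)
{
	printf("want_write: %d\n", want_write());	/* 0: mount is r/w */
	printf("make_readonly: %d\n", make_readonly());	/* -16: a writer is active */
	writers--;					/* mnt_drop_write() */
	printf("make_readonly: %d\n", make_readonly());	/* 0: now read-only */
	printf("want_write: %d\n", want_write());	/* -30: r/o mount refuses */
	return 0;
}

The point of the hold flag is that the summed count only has to be stable while
MNT_WRITE_HOLD is set; outside that small window the per-cpu counters are
updated with no shared lock at all.
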
 fs/namespace.c        |  251 +++++++++++++++-----------------------------------
 include/linux/mount.h |   18 +++
 2 files changed, 96 insertions(+), 173 deletions(-)

--- linux-2.6.27.orig/fs/namespace.c
+++ linux-2.6.27/fs/namespace.c
@@ -130,10 +130,20 @@ struct vfsmount *alloc_vfsmnt(const char
INIT_LIST_HEAD(&mnt->mnt_share);
INIT_LIST_HEAD(&mnt->mnt_slave_list);
INIT_LIST_HEAD(&mnt->mnt_slave);
- atomic_set(&mnt->__mnt_writers, 0);
+#ifdef CONFIG_SMP
+ mnt->mnt_writers = alloc_percpu(int);
+ if (!mnt->mnt_writers)
+ goto out_free_devname;
+#else
+ mnt->mnt_writers = 0;
+#endif
+ kfree(mnt->mnt_devname);
@@ -170,65 +180,38 @@ int __mnt_is_readonly(struct vfsmount *m
EXPORT_SYMBOL_GPL(__mnt_is_readonly);
- * If holding multiple instances of this lock, they
- * must be ordered by cpu number.
- struct lock_class_key lock_class; /* compiles out with !lockdep */
- unsigned long count;
- struct vfsmount *mnt;
-} ____cacheline_aligned_in_smp;
-static DEFINE_PER_CPU(struct mnt_writer, mnt_writers);
+static inline void inc_mnt_writers(struct vfsmount *mnt)
+ (*per_cpu_ptr(mnt->mnt_writers, smp_processor_id()))++;
-static int __init init_mnt_writers(void)
+static inline void dec_mnt_writers(struct vfsmount *mnt)
- for_each_possible_cpu(cpu) {
- struct mnt_writer *writer = &per_cpu(mnt_writers, cpu);
- spin_lock_init(&writer->lock);
- lockdep_set_class(&writer->lock, &writer->lock_class);
+ (*per_cpu_ptr(mnt->mnt_writers, smp_processor_id()))--;
-fs_initcall(init_mnt_writers);
-static void unlock_mnt_writers(void)
+static unsigned int count_mnt_writers(struct vfsmount *mnt)
+ unsigned int count = 0;
- struct mnt_writer *cpu_writer;
for_each_possible_cpu(cpu) {
- cpu_writer = &per_cpu(mnt_writers, cpu);
- spin_unlock(&cpu_writer->lock);
+ count += *per_cpu_ptr(mnt->mnt_writers, cpu);
-static inline void __clear_mnt_count(struct mnt_writer *cpu_writer)
- if (!cpu_writer->mnt)
- * This is in case anyone ever leaves an invalid,
- * old ->mnt and a count of 0.
- if (!cpu_writer->count)
- atomic_add(cpu_writer->count, &cpu_writer->mnt->__mnt_writers);
- cpu_writer->count = 0;
- * must hold cpu_writer->lock
-static inline void use_cpu_writer_for_mount(struct mnt_writer *cpu_writer,
- struct vfsmount *mnt)
- if (cpu_writer->mnt == mnt)
- __clear_mnt_count(cpu_writer);
- cpu_writer->mnt = mnt;
+ return mnt->mnt_writers;
@@ -252,75 +235,34 @@ static inline void use_cpu_writer_for_mo
int mnt_want_write(struct vfsmount *mnt)
- struct mnt_writer *cpu_writer;
- cpu_writer = &get_cpu_var(mnt_writers);
- spin_lock(&cpu_writer->lock);
+ inc_mnt_writers(mnt);
+ * The store from inc_mnt_writers must be visible before we pass the
+ * MNT_WRITE_HOLD loop below, so that the slowpath can see our
+ * incremented count after it has set MNT_WRITE_HOLD.
+ while (mnt->mnt_flags & MNT_WRITE_HOLD)
+ cpu_relax();
+ * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will
+ * be set to match its requirements. So we must not load that until
+ * MNT_WRITE_HOLD is cleared.
if (__mnt_is_readonly(mnt)) {
+ dec_mnt_writers(mnt);
- use_cpu_writer_for_mount(cpu_writer, mnt);
- cpu_writer->count++;
- spin_unlock(&cpu_writer->lock);
- put_cpu_var(mnt_writers);
EXPORT_SYMBOL_GPL(mnt_want_write);
-static void lock_mnt_writers(void)
- struct mnt_writer *cpu_writer;
- for_each_possible_cpu(cpu) {
- cpu_writer = &per_cpu(mnt_writers, cpu);
- spin_lock(&cpu_writer->lock);
- __clear_mnt_count(cpu_writer);
- cpu_writer->mnt = NULL;
- * These per-cpu write counts are not guaranteed to have
- * matched increments and decrements on any given cpu.
- * A file open()ed for write on one cpu and close()d on
- * another cpu will imbalance this count. Make sure it
- * does not get too far out of whack.
-static void handle_write_count_underflow(struct vfsmount *mnt)
- if (atomic_read(&mnt->__mnt_writers) >=
- MNT_WRITER_UNDERFLOW_LIMIT)
- * It isn't necessary to hold all of the locks
- * at the same time, but doing it this way makes
- * us share a lot more code.
- lock_mnt_writers();
- * vfsmount_lock is for mnt_flags.
- spin_lock(&vfsmount_lock);
- * If coalescing the per-cpu writer counts did not
- * get us back to a positive writer count, we have
- if ((atomic_read(&mnt->__mnt_writers) < 0) &&
- !(mnt->mnt_flags & MNT_IMBALANCED_WRITE_COUNT)) {
- WARN(1, KERN_DEBUG "leak detected on mount(%p) writers "
- mnt, atomic_read(&mnt->__mnt_writers));
- /* use the flag to keep the dmesg spam down */
- mnt->mnt_flags |= MNT_IMBALANCED_WRITE_COUNT;
- spin_unlock(&vfsmount_lock);
- unlock_mnt_writers();
* mnt_drop_write - give up write access to a mount
* @mnt: the mount on which to give up write access
@@ -331,37 +273,9 @@ static void handle_write_count_underflow
void mnt_drop_write(struct vfsmount *mnt)
- int must_check_underflow = 0;
- struct mnt_writer *cpu_writer;
- cpu_writer = &get_cpu_var(mnt_writers);
- spin_lock(&cpu_writer->lock);
- use_cpu_writer_for_mount(cpu_writer, mnt);
- if (cpu_writer->count > 0) {
- cpu_writer->count--;
- must_check_underflow = 1;
- atomic_dec(&mnt->__mnt_writers);
- spin_unlock(&cpu_writer->lock);
- * Logically, we could call this each time,
- * but the __mnt_writers cacheline tends to
- * be cold, and makes this expensive.
- if (must_check_underflow)
- handle_write_count_underflow(mnt);
- * This could be done right after the spinlock
- * is taken because the spinlock keeps us on
- * the cpu, and disables preemption. However,
- * putting it here bounds the amount that
- * __mnt_writers can underflow. Without it,
- * we could theoretically wrap __mnt_writers.
- put_cpu_var(mnt_writers);
+ dec_mnt_writers(mnt);
EXPORT_SYMBOL_GPL(mnt_drop_write);
@@ -369,24 +283,34 @@ static int mnt_make_readonly(struct vfsm
- lock_mnt_writers();
+ spin_lock(&vfsmount_lock);
+ mnt->mnt_flags |= MNT_WRITE_HOLD;
- * With all the locks held, this value is stable
+ * After storing MNT_WRITE_HOLD, we'll read the counters. This store
+ * should be visible before we do.
- if (atomic_read(&mnt->__mnt_writers) > 0) {
+ * With writers on hold, if this value is zero, then there are definitely
+ * no active writers (although held writers may subsequently increment
+ * the count, they'll have to wait, and decrement it after seeing
+ * MNT_READONLY).
+ if (count_mnt_writers(mnt) > 0) {
- * nobody can do a successful mnt_want_write() with all
- * of the counts in MNT_DENIED_WRITE and the locks held.
- spin_lock(&vfsmount_lock);
mnt->mnt_flags |= MNT_READONLY;
- spin_unlock(&vfsmount_lock);
- unlock_mnt_writers();
+ * MNT_READONLY must become visible before ~MNT_WRITE_HOLD, so writers
+ * that become unheld will see MNT_READONLY.
+ mnt->mnt_flags &= ~MNT_WRITE_HOLD;
+ spin_unlock(&vfsmount_lock);
@@ -410,6 +334,9 @@ void free_vfsmnt(struct vfsmount *mnt)
kfree(mnt->mnt_devname);
+ free_percpu(mnt->mnt_writers);
kmem_cache_free(mnt_cache, mnt);
@@ -604,36 +531,14 @@ static struct vfsmount *clone_mnt(struct
static inline void __mntput(struct vfsmount *mnt)
struct super_block *sb = mnt->mnt_sb;
- * We don't have to hold all of the locks at the
- * same time here because we know that we're the
- * last reference to mnt and that no new writers
- for_each_possible_cpu(cpu) {
- struct mnt_writer *cpu_writer = &per_cpu(mnt_writers, cpu);
- if (cpu_writer->mnt != mnt)
- spin_lock(&cpu_writer->lock);
- atomic_add(cpu_writer->count, &mnt->__mnt_writers);
- cpu_writer->count = 0;
- * Might as well do this so that no one
- * ever sees the pointer and expects
- cpu_writer->mnt = NULL;
- spin_unlock(&cpu_writer->lock);
* This probably indicates that somebody messed
* up a mnt_want/drop_write() pair. If this
* happens, the filesystem was probably unable
* to make r/w->r/o transitions.
- WARN_ON(atomic_read(&mnt->__mnt_writers));
+ WARN_ON(count_mnt_writers(mnt));
deactivate_super(sb);
--- linux-2.6.27.orig/include/linux/mount.h
+++ linux-2.6.27/include/linux/mount.h
@@ -32,6 +32,7 @@ struct mnt_namespace;
#define MNT_SHRINKABLE 0x100
#define MNT_IMBALANCED_WRITE_COUNT 0x200 /* just for debugging */
+#define MNT_WRITE_HOLD 0x400
#define MNT_SHARED 0x1000 /* if the vfsmount is a shared mount */
#define MNT_UNBINDABLE 0x2000 /* if the vfsmount is a unbindable mount */
@@ -66,13 +67,30 @@ struct vfsmount {
int mnt_expiry_mark; /* true if marked for expiry */
- * This value is not stable unless all of the mnt_writers[] spinlocks
- * are held, and all mnt_writer[]s on this mount have 0 as their ->count
- atomic_t __mnt_writers;
+static inline int *get_mnt_writers_ptr(struct vfsmount *mnt)
+{
+#ifdef CONFIG_SMP
+ return mnt->mnt_writers;
+#else
+ return &mnt->mnt_writers;
+#endif
+}
static inline struct vfsmount *mntget(struct vfsmount *mnt)