From: Nick Piggin <npiggin@suse.de>
Subject: fs: mnt_want_write speedup
References: bnc#436953
Patch-upstream: no (could be submitted)

This patch speeds up the lmbench lat_mmap test by about 8%. lat_mmap is set up
basically to mmap a 64MB file on tmpfs, fault in its pages, then unmap it. A
microbenchmark, yes, but it exercises some important paths in the mm.

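For reference, a rough user-space sketch of the access pattern lat_mmap measures
(illustrative only, not the lmbench source; the tmpfs path and the one-byte-per-page
touch are assumptions). Write faults on a shared file mapping are roughly what pushes
this workload through the mnt_want_write()/mnt_drop_write() paths changed below:

#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

#define FILE_SIZE (64UL * 1024 * 1024)	/* 64MB file, as described above */

int main(void)
{
	/* Assumed location: any tmpfs mount will do, e.g. /dev/shm. */
	const char *path = "/dev/shm/lat_mmap_sketch";
	int fd = open(path, O_RDWR | O_CREAT | O_TRUNC, 0600);
	if (fd < 0 || ftruncate(fd, FILE_SIZE) < 0) {
		perror("setup");
		return 1;
	}

	char *map = mmap(NULL, FILE_SIZE, PROT_READ | PROT_WRITE,
			 MAP_SHARED, fd, 0);
	if (map == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	/* Fault in every page by touching one byte per page. */
	long page = sysconf(_SC_PAGESIZE);
	for (unsigned long off = 0; off < FILE_SIZE; off += page)
		map[off] = 1;

	munmap(map, FILE_SIZE);
	close(fd);
	unlink(path);
	return 0;
}
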
Before:
avg = 501.9
std = 14.7773

After:
avg = 462.286
std = 5.46106

(50 runs of each, stddev gives a reasonable confidence, but there is quite
a bit of variation there still)

It does this by removing the complex per-cpu locking and counter-cache and
replacing it with a percpu counter in struct vfsmount. This makes the code
much simpler, and avoids spinlocks (although the msync is still pretty
costly, unfortunately). It also shrinks the code by about 900 bytes. It
does increase the size of a vfsmount, however.
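
To illustrate just the counter layout, here is a minimal user-space analogue
(threads stand in for CPUs; NR_SLOTS, the padding and all names are invented for
illustration, and the MNT_WRITE_HOLD handshake the diff below adds for the
r/w->r/o transition is deliberately left out). Each CPU owns its own slot, the
want/drop fast paths touch only the local slot, and the rare slow path sums all
slots:

#include <pthread.h>
#include <stdio.h>

#define NR_SLOTS 4		/* stands in for the number of possible CPUs */
#define ITERS    1000000

struct slot {
	long count;
	char pad[64 - sizeof(long)];	/* keep slots on separate cache lines */
};

static struct slot writers[NR_SLOTS];

/* mnt_want_write() fast path: bump only this CPU's slot. */
static void inc_writers(int slot) { writers[slot].count++; }

/* mnt_drop_write(): drop only this CPU's slot. */
static void dec_writers(int slot) { writers[slot].count--; }

/* Slow path: sum every slot, like count_mnt_writers() below. */
static long count_writers(void)
{
	long sum = 0;
	for (int i = 0; i < NR_SLOTS; i++)
		sum += writers[i].count;
	return sum;
}

static void *writer(void *arg)
{
	int slot = (int)(long)arg;
	for (int i = 0; i < ITERS; i++) {
		inc_writers(slot);
		dec_writers(slot);
	}
	return NULL;
}

int main(void)
{
	pthread_t t[NR_SLOTS];

	for (long i = 0; i < NR_SLOTS; i++)
		pthread_create(&t[i], NULL, writer, (void *)i);
	for (int i = 0; i < NR_SLOTS; i++)
		pthread_join(t[i], NULL);

	/* Balanced inc/dec pairs mean the summed count returns to zero. */
	printf("outstanding writers: %ld\n", count_writers());
	return 0;
}

Padding each slot to a cache line keeps concurrent writers on different CPUs
from bouncing a shared line, unlike the single atomic in struct vfsmount that
the old scheme fell back to.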

It should also give a speedup on large systems if CPUs are frequently operating
on different mounts (because the existing scheme has to operate on an atomic in
the struct vfsmount when switching between mounts). But I'm most interested in
the single-threaded path performance for the moment.

---
fs/namespace.c | 251 +++++++++++++++-----------------------------
include/linux/mount.h | 18 +++
2 files changed, 96 insertions(+), 173 deletions(-)

--- linux-2.6.27.orig/fs/namespace.c
+++ linux-2.6.27/fs/namespace.c
@@ -130,10 +130,20 @@ struct vfsmount *alloc_vfsmnt(const char
INIT_LIST_HEAD(&mnt->mnt_share);
INIT_LIST_HEAD(&mnt->mnt_slave_list);
INIT_LIST_HEAD(&mnt->mnt_slave);
- atomic_set(&mnt->__mnt_writers, 0);
+#ifdef CONFIG_SMP
+ mnt->mnt_writers = alloc_percpu(int);
+ if (!mnt->mnt_writers)
+ goto out_free_devname;
+#else
+ mnt->mnt_writers = 0;
+#endif
}
return mnt;

+#ifdef CONFIG_SMP
+out_free_devname:
+ kfree(mnt->mnt_devname);
+#endif
out_free_id:
mnt_free_id(mnt);
out_free_cache:
@@ -170,65 +180,38 @@ int __mnt_is_readonly(struct vfsmount *m
}
EXPORT_SYMBOL_GPL(__mnt_is_readonly);

-struct mnt_writer {
- /*
- * If holding multiple instances of this lock, they
- * must be ordered by cpu number.
- */
- spinlock_t lock;
- struct lock_class_key lock_class; /* compiles out with !lockdep */
- unsigned long count;
- struct vfsmount *mnt;
-} ____cacheline_aligned_in_smp;
-static DEFINE_PER_CPU(struct mnt_writer, mnt_writers);
+static inline void inc_mnt_writers(struct vfsmount *mnt)
+{
+#ifdef CONFIG_SMP
+ (*per_cpu_ptr(mnt->mnt_writers, smp_processor_id()))++;
+#else
+ mnt->mnt_writers++;
+#endif
+}

-static int __init init_mnt_writers(void)
+static inline void dec_mnt_writers(struct vfsmount *mnt)
{
- int cpu;
- for_each_possible_cpu(cpu) {
- struct mnt_writer *writer = &per_cpu(mnt_writers, cpu);
- spin_lock_init(&writer->lock);
- lockdep_set_class(&writer->lock, &writer->lock_class);
- writer->count = 0;
- }
- return 0;
+#ifdef CONFIG_SMP
+ (*per_cpu_ptr(mnt->mnt_writers, smp_processor_id()))--;
+#else
+ mnt->mnt_writers--;
+#endif
}
-fs_initcall(init_mnt_writers);

-static void unlock_mnt_writers(void)
+static unsigned int count_mnt_writers(struct vfsmount *mnt)
{
+#ifdef CONFIG_SMP
+ unsigned int count = 0;
int cpu;
- struct mnt_writer *cpu_writer;

for_each_possible_cpu(cpu) {
- cpu_writer = &per_cpu(mnt_writers, cpu);
- spin_unlock(&cpu_writer->lock);
+ count += *per_cpu_ptr(mnt->mnt_writers, cpu);
}
-}

-static inline void __clear_mnt_count(struct mnt_writer *cpu_writer)
-{
- if (!cpu_writer->mnt)
- return;
- /*
- * This is in case anyone ever leaves an invalid,
- * old ->mnt and a count of 0.
- */
- if (!cpu_writer->count)
- return;
- atomic_add(cpu_writer->count, &cpu_writer->mnt->__mnt_writers);
- cpu_writer->count = 0;
-}
- /*
- * must hold cpu_writer->lock
- */
-static inline void use_cpu_writer_for_mount(struct mnt_writer *cpu_writer,
- struct vfsmount *mnt)
-{
- if (cpu_writer->mnt == mnt)
- return;
- __clear_mnt_count(cpu_writer);
- cpu_writer->mnt = mnt;
+ return count;
+#else
+ return mnt->mnt_writers;
+#endif
}

/*
@@ -252,75 +235,34 @@ static inline void use_cpu_writer_for_mo
int mnt_want_write(struct vfsmount *mnt)
{
int ret = 0;
- struct mnt_writer *cpu_writer;

- cpu_writer = &get_cpu_var(mnt_writers);
- spin_lock(&cpu_writer->lock);
+ preempt_disable();
+ inc_mnt_writers(mnt);
+ /*
+ * The store to inc_mnt_writers must be visible before we pass
+ * MNT_WRITE_HOLD loop below, so that the slowpath can see our
+ * incremented count after it has set MNT_WRITE_HOLD.
+ */
+ smp_mb();
+ while (mnt->mnt_flags & MNT_WRITE_HOLD)
+ cpu_relax();
+ /*
+ * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will
+ * be set to match its requirements. So we must not load that until
+ * MNT_WRITE_HOLD is cleared.
+ */
+ smp_rmb();
if (__mnt_is_readonly(mnt)) {
+ dec_mnt_writers(mnt);
ret = -EROFS;
goto out;
}
- use_cpu_writer_for_mount(cpu_writer, mnt);
- cpu_writer->count++;
out:
- spin_unlock(&cpu_writer->lock);
- put_cpu_var(mnt_writers);
+ preempt_enable();
return ret;
}
EXPORT_SYMBOL_GPL(mnt_want_write);

-static void lock_mnt_writers(void)
-{
- int cpu;
- struct mnt_writer *cpu_writer;
-
- for_each_possible_cpu(cpu) {
- cpu_writer = &per_cpu(mnt_writers, cpu);
- spin_lock(&cpu_writer->lock);
- __clear_mnt_count(cpu_writer);
- cpu_writer->mnt = NULL;
- }
-}
-
-/*
- * These per-cpu write counts are not guaranteed to have
- * matched increments and decrements on any given cpu.
- * A file open()ed for write on one cpu and close()d on
- * another cpu will imbalance this count. Make sure it
- * does not get too far out of whack.
- */
-static void handle_write_count_underflow(struct vfsmount *mnt)
-{
- if (atomic_read(&mnt->__mnt_writers) >=
- MNT_WRITER_UNDERFLOW_LIMIT)
- return;
- /*
- * It isn't necessary to hold all of the locks
- * at the same time, but doing it this way makes
- * us share a lot more code.
- */
- lock_mnt_writers();
- /*
- * vfsmount_lock is for mnt_flags.
- */
- spin_lock(&vfsmount_lock);
- /*
- * If coalescing the per-cpu writer counts did not
- * get us back to a positive writer count, we have
- * a bug.
- */
- if ((atomic_read(&mnt->__mnt_writers) < 0) &&
- !(mnt->mnt_flags & MNT_IMBALANCED_WRITE_COUNT)) {
- WARN(1, KERN_DEBUG "leak detected on mount(%p) writers "
- "count: %d\n",
- mnt, atomic_read(&mnt->__mnt_writers));
- /* use the flag to keep the dmesg spam down */
- mnt->mnt_flags |= MNT_IMBALANCED_WRITE_COUNT;
- }
- spin_unlock(&vfsmount_lock);
- unlock_mnt_writers();
-}
-
/**
* mnt_drop_write - give up write access to a mount
* @mnt: the mount on which to give up write access
@@ -331,37 +273,9 @@ static void handle_write_count_underflow
*/
void mnt_drop_write(struct vfsmount *mnt)
{
- int must_check_underflow = 0;
- struct mnt_writer *cpu_writer;
-
- cpu_writer = &get_cpu_var(mnt_writers);
- spin_lock(&cpu_writer->lock);
-
- use_cpu_writer_for_mount(cpu_writer, mnt);
- if (cpu_writer->count > 0) {
- cpu_writer->count--;
- } else {
- must_check_underflow = 1;
- atomic_dec(&mnt->__mnt_writers);
- }
-
- spin_unlock(&cpu_writer->lock);
- /*
- * Logically, we could call this each time,
- * but the __mnt_writers cacheline tends to
- * be cold, and makes this expensive.
- */
- if (must_check_underflow)
- handle_write_count_underflow(mnt);
- /*
- * This could be done right after the spinlock
- * is taken because the spinlock keeps us on
- * the cpu, and disables preemption. However,
- * putting it here bounds the amount that
- * __mnt_writers can underflow. Without it,
- * we could theoretically wrap __mnt_writers.
- */
- put_cpu_var(mnt_writers);
+ preempt_disable();
+ dec_mnt_writers(mnt);
+ preempt_enable();
}
EXPORT_SYMBOL_GPL(mnt_drop_write);

@@ -369,24 +283,34 @@ static int mnt_make_readonly(struct vfsm
{
int ret = 0;

- lock_mnt_writers();
+ spin_lock(&vfsmount_lock);
+ mnt->mnt_flags |= MNT_WRITE_HOLD;
/*
- * With all the locks held, this value is stable
+ * After storing MNT_WRITE_HOLD, we'll read the counters. This store
+ * should be visible before we do.
*/
- if (atomic_read(&mnt->__mnt_writers) > 0) {
+ smp_mb();
+
+ /*
+ * With writers on hold, if this value is zero, then there are definitely
+ * no active writers (although held writers may subsequently increment
+ * the count, they'll have to wait, and decrement it after seeing
+ * MNT_READONLY).
+ */
+ if (count_mnt_writers(mnt) > 0) {
ret = -EBUSY;
goto out;
}
- /*
- * nobody can do a successful mnt_want_write() with all
- * of the counts in MNT_DENIED_WRITE and the locks held.
- */
- spin_lock(&vfsmount_lock);
if (!ret)
mnt->mnt_flags |= MNT_READONLY;
- spin_unlock(&vfsmount_lock);
out:
- unlock_mnt_writers();
+ /*
+ * MNT_READONLY must become visible before ~MNT_WRITE_HOLD, so writers
+ * that become unheld will see MNT_READONLY.
+ */
+ smp_wmb();
+ mnt->mnt_flags &= ~MNT_WRITE_HOLD;
+ spin_unlock(&vfsmount_lock);
return ret;
}

@@ -410,6 +334,9 @@ void free_vfsmnt(struct vfsmount *mnt)
{
kfree(mnt->mnt_devname);
mnt_free_id(mnt);
+#ifdef CONFIG_SMP
+ free_percpu(mnt->mnt_writers);
+#endif
kmem_cache_free(mnt_cache, mnt);
}

@@ -604,36 +531,14 @@ static struct vfsmount *clone_mnt(struct

static inline void __mntput(struct vfsmount *mnt)
{
- int cpu;
struct super_block *sb = mnt->mnt_sb;
/*
- * We don't have to hold all of the locks at the
- * same time here because we know that we're the
- * last reference to mnt and that no new writers
- * can come in.
- */
- for_each_possible_cpu(cpu) {
- struct mnt_writer *cpu_writer = &per_cpu(mnt_writers, cpu);
- if (cpu_writer->mnt != mnt)
- continue;
- spin_lock(&cpu_writer->lock);
- atomic_add(cpu_writer->count, &mnt->__mnt_writers);
- cpu_writer->count = 0;
- /*
- * Might as well do this so that no one
- * ever sees the pointer and expects
- * it to be valid.
- */
- cpu_writer->mnt = NULL;
- spin_unlock(&cpu_writer->lock);
- }
- /*
* This probably indicates that somebody messed
* up a mnt_want/drop_write() pair. If this
* happens, the filesystem was probably unable
* to make r/w->r/o transitions.
*/
- WARN_ON(atomic_read(&mnt->__mnt_writers));
+ WARN_ON(count_mnt_writers(mnt));
dput(mnt->mnt_root);
free_vfsmnt(mnt);
deactivate_super(sb);
--- linux-2.6.27.orig/include/linux/mount.h
+++ linux-2.6.27/include/linux/mount.h
@@ -32,6 +32,7 @@ struct mnt_namespace;

#define MNT_SHRINKABLE 0x100
#define MNT_IMBALANCED_WRITE_COUNT 0x200 /* just for debugging */
+#define MNT_WRITE_HOLD 0x400

#define MNT_SHARED 0x1000 /* if the vfsmount is a shared mount */
#define MNT_UNBINDABLE 0x2000 /* if the vfsmount is a unbindable mount */
@@ -66,13 +67,30 @@ struct vfsmount {
int mnt_expiry_mark; /* true if marked for expiry */
int mnt_pinned;
int mnt_ghosts;
+#ifdef __GENKSYMS__
/*
* This value is not stable unless all of the mnt_writers[] spinlocks
* are held, and all mnt_writer[]s on this mount have 0 as their ->count
*/
atomic_t __mnt_writers;
+#else
+#ifdef CONFIG_SMP
+ int *mnt_writers;
+#else
+ int mnt_writers;
+#endif
+#endif
};

+static inline int *get_mnt_writers_ptr(struct vfsmount *mnt)
+{
+#ifdef CONFIG_SMP
+ return mnt->mnt_writers;
+#else
+ return &mnt->mnt_writers;
+#endif
+}
+
static inline struct vfsmount *mntget(struct vfsmount *mnt)
{
if (mnt)