From: Nick Piggin <npiggin@suse.de>
Subject: fs: mnt_want_write speedup
References: bnc#436953
Patch-upstream: no (could be submitted)

This patch speeds up the lmbench lat_mmap test by about 8%. lat_mmap is set
up basically to mmap a 64MB file on tmpfs, fault in its pages, then unmap it.
A microbenchmark, yes, but it exercises some important paths in the mm.
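
For reference, one lat_mmap iteration boils down to roughly the following
(an illustrative sketch, not lmbench source; the shared writable mapping is
an assumption about how the benchmark maps its file):

  #include <fcntl.h>
  #include <sys/mman.h>

  #define MAP_SIZE (64UL << 20)   /* 64MB file, created on tmpfs */

  static long lat_mmap_iteration(int fd)
  {
          volatile char *p;
          unsigned long off;
          long sum = 0;

          /* assumed PROT_WRITE|MAP_SHARED; error handling omitted */
          p = mmap(NULL, MAP_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
          for (off = 0; off < MAP_SIZE; off += 4096)
                  sum += p[off];          /* fault in every page */
          munmap((void *)p, MAP_SIZE);
          return sum;     /* keep the reads from being optimised away */
  }

The mmap/fault/unmap of a mapping like this is the sort of path where the
mnt writer accounting touched by this patch gets exercised.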

Before:
avg = 501.9
std = 14.7773

After:
avg = 462.286
std = 5.46106

(50 runs of each; the stddev gives reasonable confidence, but there is still
quite a bit of run-to-run variation)

It does this by removing the complex per-cpu locking and counter-cache and
replacing it with a percpu counter in struct vfsmount. This makes the code
much simpler and avoids spinlocks (although the smp_mb() is still pretty
costly, unfortunately). It results in about 900 bytes smaller code too. It
does increase the size of a vfsmount, however.
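
Condensed from the diff below, the new fast path and the r/o-remount slow
path now interlock via a new MNT_WRITE_HOLD flag rather than per-cpu
spinlocks (sketch only; error paths and the surrounding code are omitted):

  /* mnt_want_write() fast path */
  preempt_disable();
  inc_mnt_writers(mnt);                 /* per-cpu increment, no atomic, no lock */
  smp_mb();                             /* count visible before we test the flag */
  while (mnt->mnt_flags & MNT_WRITE_HOLD)
          cpu_relax();                  /* wait out a concurrent r/o transition */
  smp_rmb();                            /* don't load MNT_READONLY until HOLD clears */
  if (__mnt_is_readonly(mnt)) {
          dec_mnt_writers(mnt);         /* lost the race: back out */
          ret = -EROFS;
  }
  preempt_enable();

  /* mnt_make_readonly() slow path, under vfsmount_lock */
  mnt->mnt_flags |= MNT_WRITE_HOLD;     /* hold new writers in the loop above */
  smp_mb();
  if (count_mnt_writers(mnt) > 0)       /* sum of all cpus' counters */
          ret = -EBUSY;
  else
          mnt->mnt_flags |= MNT_READONLY;
  smp_wmb();                            /* READONLY visible before dropping HOLD */
  mnt->mnt_flags &= ~MNT_WRITE_HOLD;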

It should also give a speedup on large systems if CPUs are frequently
operating on different mounts, because the existing scheme has to operate on
an atomic in the struct vfsmount when switching between mounts. But I'm most
interested in the single-threaded path performance for the moment.

---
 fs/namespace.c        |  251 +++++++++++++++-----------------------------------
 include/linux/mount.h |   18 +++
 2 files changed, 96 insertions(+), 173 deletions(-)

--- linux-2.6.27.orig/fs/namespace.c
+++ linux-2.6.27/fs/namespace.c
@@ -130,10 +130,20 @@ struct vfsmount *alloc_vfsmnt(const char
 		INIT_LIST_HEAD(&mnt->mnt_share);
 		INIT_LIST_HEAD(&mnt->mnt_slave_list);
 		INIT_LIST_HEAD(&mnt->mnt_slave);
-		atomic_set(&mnt->__mnt_writers, 0);
+#ifdef CONFIG_SMP
+		mnt->mnt_writers = alloc_percpu(int);
+		if (!mnt->mnt_writers)
+			goto out_free_devname;
+#else
+		mnt->mnt_writers = 0;
+#endif
 	}
 	return mnt;

+#ifdef CONFIG_SMP
+out_free_devname:
+	kfree(mnt->mnt_devname);
+#endif
 out_free_id:
 	mnt_free_id(mnt);
 out_free_cache:
@@ -170,65 +180,38 @@ int __mnt_is_readonly(struct vfsmount *m
 }
 EXPORT_SYMBOL_GPL(__mnt_is_readonly);

-struct mnt_writer {
-	/*
-	 * If holding multiple instances of this lock, they
-	 * must be ordered by cpu number.
-	 */
-	spinlock_t lock;
-	struct lock_class_key lock_class; /* compiles out with !lockdep */
-	unsigned long count;
-	struct vfsmount *mnt;
-} ____cacheline_aligned_in_smp;
-static DEFINE_PER_CPU(struct mnt_writer, mnt_writers);
+static inline void inc_mnt_writers(struct vfsmount *mnt)
+{
+#ifdef CONFIG_SMP
+	(*per_cpu_ptr(mnt->mnt_writers, smp_processor_id()))++;
+#else
+	mnt->mnt_writers++;
+#endif
+}

-static int __init init_mnt_writers(void)
+static inline void dec_mnt_writers(struct vfsmount *mnt)
 {
-	int cpu;
-	for_each_possible_cpu(cpu) {
-		struct mnt_writer *writer = &per_cpu(mnt_writers, cpu);
-		spin_lock_init(&writer->lock);
-		lockdep_set_class(&writer->lock, &writer->lock_class);
-		writer->count = 0;
-	}
-	return 0;
+#ifdef CONFIG_SMP
+	(*per_cpu_ptr(mnt->mnt_writers, smp_processor_id()))--;
+#else
+	mnt->mnt_writers--;
+#endif
 }
-fs_initcall(init_mnt_writers);

-static void unlock_mnt_writers(void)
+static unsigned int count_mnt_writers(struct vfsmount *mnt)
 {
+#ifdef CONFIG_SMP
+	unsigned int count = 0;
 	int cpu;
-	struct mnt_writer *cpu_writer;

 	for_each_possible_cpu(cpu) {
-		cpu_writer = &per_cpu(mnt_writers, cpu);
-		spin_unlock(&cpu_writer->lock);
+		count += *per_cpu_ptr(mnt->mnt_writers, cpu);
 	}
-}

-static inline void __clear_mnt_count(struct mnt_writer *cpu_writer)
-{
-	if (!cpu_writer->mnt)
-		return;
-	/*
-	 * This is in case anyone ever leaves an invalid,
-	 * old ->mnt and a count of 0.
-	 */
-	if (!cpu_writer->count)
-		return;
-	atomic_add(cpu_writer->count, &cpu_writer->mnt->__mnt_writers);
-	cpu_writer->count = 0;
-}
-/*
- * must hold cpu_writer->lock
- */
-static inline void use_cpu_writer_for_mount(struct mnt_writer *cpu_writer,
-					struct vfsmount *mnt)
-{
-	if (cpu_writer->mnt == mnt)
-		return;
-	__clear_mnt_count(cpu_writer);
-	cpu_writer->mnt = mnt;
+	return count;
+#else
+	return mnt->mnt_writers;
+#endif
 }

 /*
@@ -252,75 +235,34 @@ static inline void use_cpu_writer_for_mo
 int mnt_want_write(struct vfsmount *mnt)
 {
 	int ret = 0;
-	struct mnt_writer *cpu_writer;

-	cpu_writer = &get_cpu_var(mnt_writers);
-	spin_lock(&cpu_writer->lock);
+	preempt_disable();
+	inc_mnt_writers(mnt);
+	/*
+	 * The store to inc_mnt_writers must be visible before we pass
+	 * MNT_WRITE_HOLD loop below, so that the slowpath can see our
+	 * incremented count after it has set MNT_WRITE_HOLD.
+	 */
+	smp_mb();
+	while (mnt->mnt_flags & MNT_WRITE_HOLD)
+		cpu_relax();
+	/*
+	 * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will
+	 * be set to match its requirements. So we must not load that until
+	 * MNT_WRITE_HOLD is cleared.
+	 */
+	smp_rmb();
 	if (__mnt_is_readonly(mnt)) {
+		dec_mnt_writers(mnt);
 		ret = -EROFS;
 		goto out;
 	}
-	use_cpu_writer_for_mount(cpu_writer, mnt);
-	cpu_writer->count++;
 out:
-	spin_unlock(&cpu_writer->lock);
-	put_cpu_var(mnt_writers);
+	preempt_enable();
 	return ret;
 }
 EXPORT_SYMBOL_GPL(mnt_want_write);

-static void lock_mnt_writers(void)
-{
-	int cpu;
-	struct mnt_writer *cpu_writer;
-
-	for_each_possible_cpu(cpu) {
-		cpu_writer = &per_cpu(mnt_writers, cpu);
-		spin_lock(&cpu_writer->lock);
-		__clear_mnt_count(cpu_writer);
-		cpu_writer->mnt = NULL;
-	}
-}
-
-/*
- * These per-cpu write counts are not guaranteed to have
- * matched increments and decrements on any given cpu.
- * A file open()ed for write on one cpu and close()d on
- * another cpu will imbalance this count. Make sure it
- * does not get too far out of whack.
- */
-static void handle_write_count_underflow(struct vfsmount *mnt)
-{
-	if (atomic_read(&mnt->__mnt_writers) >=
-	    MNT_WRITER_UNDERFLOW_LIMIT)
-		return;
-	/*
-	 * It isn't necessary to hold all of the locks
-	 * at the same time, but doing it this way makes
-	 * us share a lot more code.
-	 */
-	lock_mnt_writers();
-	/*
-	 * vfsmount_lock is for mnt_flags.
-	 */
-	spin_lock(&vfsmount_lock);
-	/*
-	 * If coalescing the per-cpu writer counts did not
-	 * get us back to a positive writer count, we have
-	 * a bug.
-	 */
-	if ((atomic_read(&mnt->__mnt_writers) < 0) &&
-	    !(mnt->mnt_flags & MNT_IMBALANCED_WRITE_COUNT)) {
-		WARN(1, KERN_DEBUG "leak detected on mount(%p) writers "
-				"count: %d\n",
-			mnt, atomic_read(&mnt->__mnt_writers));
-		/* use the flag to keep the dmesg spam down */
-		mnt->mnt_flags |= MNT_IMBALANCED_WRITE_COUNT;
-	}
-	spin_unlock(&vfsmount_lock);
-	unlock_mnt_writers();
-}
-
 /**
  * mnt_drop_write - give up write access to a mount
  * @mnt: the mount on which to give up write access
@@ -331,37 +273,9 @@ static void handle_write_count_underflow
  */
 void mnt_drop_write(struct vfsmount *mnt)
 {
-	int must_check_underflow = 0;
-	struct mnt_writer *cpu_writer;
-
-	cpu_writer = &get_cpu_var(mnt_writers);
-	spin_lock(&cpu_writer->lock);
-
-	use_cpu_writer_for_mount(cpu_writer, mnt);
-	if (cpu_writer->count > 0) {
-		cpu_writer->count--;
-	} else {
-		must_check_underflow = 1;
-		atomic_dec(&mnt->__mnt_writers);
-	}
-
-	spin_unlock(&cpu_writer->lock);
-	/*
-	 * Logically, we could call this each time,
-	 * but the __mnt_writers cacheline tends to
-	 * be cold, and makes this expensive.
-	 */
-	if (must_check_underflow)
-		handle_write_count_underflow(mnt);
-	/*
-	 * This could be done right after the spinlock
-	 * is taken because the spinlock keeps us on
-	 * the cpu, and disables preemption. However,
-	 * putting it here bounds the amount that
-	 * __mnt_writers can underflow. Without it,
-	 * we could theoretically wrap __mnt_writers.
-	 */
-	put_cpu_var(mnt_writers);
+	preempt_disable();
+	dec_mnt_writers(mnt);
+	preempt_enable();
 }
 EXPORT_SYMBOL_GPL(mnt_drop_write);

@@ -369,24 +283,34 @@ static int mnt_make_readonly(struct vfsm
 {
 	int ret = 0;

-	lock_mnt_writers();
+	spin_lock(&vfsmount_lock);
+	mnt->mnt_flags |= MNT_WRITE_HOLD;
 	/*
-	 * With all the locks held, this value is stable
+	 * After storing MNT_WRITE_HOLD, we'll read the counters. This store
+	 * should be visible before we do.
 	 */
-	if (atomic_read(&mnt->__mnt_writers) > 0) {
+	smp_mb();
+
+	/*
+	 * With writers on hold, if this value is zero, then there are definitely
+	 * no active writers (although held writers may subsequently increment
+	 * the count, they'll have to wait, and decrement it after seeing
+	 * MNT_READONLY).
+	 */
+	if (count_mnt_writers(mnt) > 0) {
 		ret = -EBUSY;
 		goto out;
 	}
-	/*
-	 * nobody can do a successful mnt_want_write() with all
-	 * of the counts in MNT_DENIED_WRITE and the locks held.
-	 */
-	spin_lock(&vfsmount_lock);
 	if (!ret)
 		mnt->mnt_flags |= MNT_READONLY;
-	spin_unlock(&vfsmount_lock);
 out:
-	unlock_mnt_writers();
+	/*
+	 * MNT_READONLY must become visible before ~MNT_WRITE_HOLD, so writers
+	 * that become unheld will see MNT_READONLY.
+	 */
+	smp_wmb();
+	mnt->mnt_flags &= ~MNT_WRITE_HOLD;
+	spin_unlock(&vfsmount_lock);
 	return ret;
 }

@@ -410,6 +334,9 @@ void free_vfsmnt(struct vfsmount *mnt)
 {
 	kfree(mnt->mnt_devname);
 	mnt_free_id(mnt);
+#ifdef CONFIG_SMP
+	free_percpu(mnt->mnt_writers);
+#endif
 	kmem_cache_free(mnt_cache, mnt);
 }

@@ -604,36 +531,14 @@ static struct vfsmount *clone_mnt(struct

 static inline void __mntput(struct vfsmount *mnt)
 {
-	int cpu;
 	struct super_block *sb = mnt->mnt_sb;
 	/*
-	 * We don't have to hold all of the locks at the
-	 * same time here because we know that we're the
-	 * last reference to mnt and that no new writers
-	 * can come in.
-	 */
-	for_each_possible_cpu(cpu) {
-		struct mnt_writer *cpu_writer = &per_cpu(mnt_writers, cpu);
-		if (cpu_writer->mnt != mnt)
-			continue;
-		spin_lock(&cpu_writer->lock);
-		atomic_add(cpu_writer->count, &mnt->__mnt_writers);
-		cpu_writer->count = 0;
-		/*
-		 * Might as well do this so that no one
-		 * ever sees the pointer and expects
-		 * it to be valid.
-		 */
-		cpu_writer->mnt = NULL;
-		spin_unlock(&cpu_writer->lock);
-	}
-	/*
 	 * This probably indicates that somebody messed
 	 * up a mnt_want/drop_write() pair. If this
 	 * happens, the filesystem was probably unable
 	 * to make r/w->r/o transitions.
 	 */
-	WARN_ON(atomic_read(&mnt->__mnt_writers));
+	WARN_ON(count_mnt_writers(mnt));
 	dput(mnt->mnt_root);
 	free_vfsmnt(mnt);
 	deactivate_super(sb);
--- linux-2.6.27.orig/include/linux/mount.h
+++ linux-2.6.27/include/linux/mount.h
@@ -32,6 +32,7 @@ struct mnt_namespace;

 #define MNT_SHRINKABLE	0x100
 #define MNT_IMBALANCED_WRITE_COUNT	0x200	/* just for debugging */
+#define MNT_WRITE_HOLD	0x400

 #define MNT_SHARED	0x1000	/* if the vfsmount is a shared mount */
 #define MNT_UNBINDABLE	0x2000	/* if the vfsmount is a unbindable mount */
@@ -66,13 +67,30 @@ struct vfsmount {
 	int mnt_expiry_mark;		/* true if marked for expiry */
 	int mnt_pinned;
 	int mnt_ghosts;
+#ifdef __GENKSYMS__
 	/*
 	 * This value is not stable unless all of the mnt_writers[] spinlocks
 	 * are held, and all mnt_writer[]s on this mount have 0 as their ->count
 	 */
 	atomic_t __mnt_writers;
+#else
+#ifdef CONFIG_SMP
+	int *mnt_writers;
+#else
+	int mnt_writers;
+#endif
+#endif
 };

+static inline int *get_mnt_writers_ptr(struct vfsmount *mnt)
+{
+#ifdef CONFIG_SMP
+	return mnt->mnt_writers;
+#else
+	return &mnt->mnt_writers;
+#endif
+}
+
 static inline struct vfsmount *mntget(struct vfsmount *mnt)
 {
 	if (mnt)