[people/teissler/ipfire-2.x.git] / src / patches / suse-2.6.27.25 / patches.fixes / ext3_false_EIO_fix.diff

From: Jan Kara <jack@suse.cz>
Subject: [PATCH] ext3: Avoid false EIO errors
References: bnc#479730

Sometimes block_write_begin() can map buffers in a page but later we fail to
copy data into those buffers (because the source page has been paged out in the
mean time). We then end up with !uptodate mapped buffers. To add a bit more to
the confusion, block_write_end() does not commit any data (and thus does not
mark any buffers as uptodate) if we didn't succeed with copying all the data.

Commit f4fc66a894546bdc88a775d0e83ad20a65210bcb (ext3: convert to new aops)
missed these cases and thus we were inserting non-uptodate buffers to
transaction's list which confuses JBD code and it reports IO errors, aborts
a transaction and generally makes users afraid about their data ;-P.

This patch fixes the problem by reorganizing ext3_..._write_end() code to
first call block_write_end() to mark buffers with valid data uptodate and
after that we file only uptodate buffers to transaction's lists. Also
fix a problem where we could leave blocks allocated beyond i_size (i_disksize
in fact).

Signed-off-by: Jan Kara <jack@suse.cz>

---
 fs/ext3/inode.c |   99 +++++++++++++++++++++++---------------------------------
 1 file changed, 42 insertions(+), 57 deletions(-)

--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -1195,6 +1195,18 @@ int ext3_journal_dirty_data(handle_t *ha
 	return err;
 }
 
+/* For ordered writepage and write_end functions */
+static int journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
+{
+	/*
+	 * Write could have mapped the buffer but it didn't copy the data in
+	 * yet. So avoid filing such buffer into a transaction.
+	 */
+	if (buffer_mapped(bh) && buffer_uptodate(bh))
+		return ext3_journal_dirty_data(handle, bh);
+	return 0;
+}
+
 /* For write_end() in data=journal mode */
 static int write_end_fn(handle_t *handle, struct buffer_head *bh)
 {
@@ -1205,26 +1217,29 @@ static int write_end_fn(handle_t *handle
 }
 
 /*
- * Generic write_end handler for ordered and writeback ext3 journal modes.
- * We can't use generic_write_end, because that unlocks the page and we need to
- * unlock the page after ext3_journal_stop, but ext3_journal_stop must run
- * after block_write_end.
+ * This is nasty and subtle: ext3_write_begin() could have allocated blocks
+ * for the whole page but later we failed to copy the data in. So the disk
+ * size we really have allocated is pos + len (block_write_end() has zeroed
+ * the freshly allocated buffers so we aren't going to write garbage). But we
+ * want to keep i_size at the place where data copying finished so that we
+ * don't confuse readers. The worst that can happen is that we expose a page
+ * of zeros at the end of file after a crash...
  */
-static int ext3_generic_write_end(struct file *file,
-				struct address_space *mapping,
-				loff_t pos, unsigned len, unsigned copied,
-				struct page *page, void *fsdata)
+static void update_file_sizes(struct inode *inode, loff_t pos, unsigned len,
+			      unsigned copied)
 {
-	struct inode *inode = file->f_mapping->host;
+	int mark_dirty = 0;
 
-	copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
-
-	if (pos+copied > inode->i_size) {
-		i_size_write(inode, pos+copied);
-		mark_inode_dirty(inode);
+	if (pos + len > EXT3_I(inode)->i_disksize) {
+		mark_dirty = 1;
+		EXT3_I(inode)->i_disksize = pos + len;
 	}
-
-	return copied;
+	if (pos + copied > inode->i_size) {
+		i_size_write(inode, pos + copied);
+		mark_dirty = 1;
+	}
+	if (mark_dirty)
+		mark_inode_dirty(inode);
 }
 
 /*
@@ -1244,29 +1259,17 @@ static int ext3_ordered_write_end(struct
 	unsigned from, to;
 	int ret = 0, ret2;
 
+	copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
+
+	/* See comment at update_file_sizes() for why we check buffers upto
+	 * from + len */
 	from = pos & (PAGE_CACHE_SIZE - 1);
 	to = from + len;
-
 	ret = walk_page_buffers(handle, page_buffers(page),
-		from, to, NULL, ext3_journal_dirty_data);
+		from, to, NULL, journal_dirty_data_fn);
 
-	if (ret == 0) {
-		/*
-		 * generic_write_end() will run mark_inode_dirty() if i_size
-		 * changes.  So let's piggyback the i_disksize mark_inode_dirty
-		 * into that.
-		 */
-		loff_t new_i_size;
-
-		new_i_size = pos + copied;
-		if (new_i_size > EXT3_I(inode)->i_disksize)
-			EXT3_I(inode)->i_disksize = new_i_size;
-		ret2 = ext3_generic_write_end(file, mapping, pos, len, copied,
-							page, fsdata);
-		copied = ret2;
-		if (ret2 < 0)
-			ret = ret2;
-	}
+	if (ret == 0)
+		update_file_sizes(inode, pos, len, copied);
 	ret2 = ext3_journal_stop(handle);
 	if (!ret)
 		ret = ret2;
@@ -1283,22 +1286,11 @@ static int ext3_writeback_write_end(stru
 {
 	handle_t *handle = ext3_journal_current_handle();
 	struct inode *inode = file->f_mapping->host;
-	int ret = 0, ret2;
-	loff_t new_i_size;
+	int ret;
 
-	new_i_size = pos + copied;
-	if (new_i_size > EXT3_I(inode)->i_disksize)
-		EXT3_I(inode)->i_disksize = new_i_size;
-
-	ret2 = ext3_generic_write_end(file, mapping, pos, len, copied,
-							page, fsdata);
-	copied = ret2;
-	if (ret2 < 0)
-		ret = ret2;
-
-	ret2 = ext3_journal_stop(handle);
-	if (!ret)
-		ret = ret2;
+	copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
+	update_file_sizes(inode, pos, len, copied);
+	ret = ext3_journal_stop(handle);
 	unlock_page(page);
 	page_cache_release(page);
 
@@ -1412,13 +1404,6 @@ static int bput_one(handle_t *handle, st
 	return 0;
 }
 
-static int journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
-{
-	if (buffer_mapped(bh))
-		return ext3_journal_dirty_data(handle, bh);
-	return 0;
-}
-
 /*
  * Note that we always start a transaction even if we're not journalling
  * data.  This is to preserve ordering: any hole instantiation within
Commit	Line	Data
8f69975d BS	1	From: Jan Kara <jack@suse.cz>
	2	Subject: [PATCH] ext3: Avoid false EIO errors
	3	References: bnc#479730
	4
	5	Sometimes block_write_begin() can map buffers in a page but later we fail to
	6	copy data into those buffers (because the source page has been paged out in the
	7	mean time). We then end up with !uptodate mapped buffers. To add a bit more to
	8	the confusion, block_write_end() does not commit any data (and thus does not
	9	mark any buffers as uptodate) if we didn't succeed with copying all the data.
	10
	11	Commit f4fc66a894546bdc88a775d0e83ad20a65210bcb (ext3: convert to new aops)
	12	missed these cases and thus we were inserting non-uptodate buffers to
	13	transaction's list which confuses JBD code and it reports IO errors, aborts
	14	a transaction and generally makes users afraid about their data ;-P.
	15
	16	This patch fixes the problem by reorganizing ext3_..._write_end() code to
	17	first call block_write_end() to mark buffers with valid data uptodate and
	18	after that we file only uptodate buffers to transaction's lists. Also
	19	fix a problem where we could leave blocks allocated beyond i_size (i_disksize
	20	in fact).
	21
	22	Signed-off-by: Jan Kara <jack@suse.cz>
	23
	24	---
	25	fs/ext3/inode.c \| 99 +++++++++++++++++++++++---------------------------------
	26	1 file changed, 42 insertions(+), 57 deletions(-)
	27
	28	--- a/fs/ext3/inode.c
	29	+++ b/fs/ext3/inode.c
	30	@@ -1195,6 +1195,18 @@ int ext3_journal_dirty_data(handle_t *ha
	31	return err;
	32	}
	33
	34	+/* For ordered writepage and write_end functions */
	35	+static int journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
	36	+{
	37	+ /*
	38	+ * Write could have mapped the buffer but it didn't copy the data in
	39	+ * yet. So avoid filing such buffer into a transaction.
	40	+ */
	41	+ if (buffer_mapped(bh) && buffer_uptodate(bh))
	42	+ return ext3_journal_dirty_data(handle, bh);
	43	+ return 0;
	44	+}
	45	+
	46	/* For write_end() in data=journal mode */
	47	static int write_end_fn(handle_t *handle, struct buffer_head *bh)
	48	{
	49	@@ -1205,26 +1217,29 @@ static int write_end_fn(handle_t *handle
	50	}
	51
	52	/*
	53	- * Generic write_end handler for ordered and writeback ext3 journal modes.
	54	- * We can't use generic_write_end, because that unlocks the page and we need to
	55	- * unlock the page after ext3_journal_stop, but ext3_journal_stop must run
	56	- * after block_write_end.
	57	+ * This is nasty and subtle: ext3_write_begin() could have allocated blocks
	58	+ * for the whole page but later we failed to copy the data in. So the disk
	59	+ * size we really have allocated is pos + len (block_write_end() has zeroed
	60	+ * the freshly allocated buffers so we aren't going to write garbage). But we
	61	+ * want to keep i_size at the place where data copying finished so that we
	62	+ * don't confuse readers. The worst that can happen is that we expose a page
	63	+ * of zeros at the end of file after a crash...
	64	*/
65	-static int ext3_generic_write_end(struct file *file,
66	- struct address_space *mapping,
67	- loff_t pos, unsigned len, unsigned copied,
68	- struct page *page, void *fsdata)
69	+static void update_file_sizes(struct inode *inode, loff_t pos, unsigned len,
70	+ unsigned copied)
71	{
72	- struct inode *inode = file->f_mapping->host;
73	+ int mark_dirty = 0;
74
75	- copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
76	-
77	- if (pos+copied > inode->i_size) {
78	- i_size_write(inode, pos+copied);
79	- mark_inode_dirty(inode);
80	+ if (pos + len > EXT3_I(inode)->i_disksize) {
81	+ mark_dirty = 1;
82	+ EXT3_I(inode)->i_disksize = pos + len;
83	}
84	-
85	- return copied;
86	+ if (pos + copied > inode->i_size) {
87	+ i_size_write(inode, pos + copied);
88	+ mark_dirty = 1;
89	+ }
90	+ if (mark_dirty)
91	+ mark_inode_dirty(inode);
92	}
93
94	/*
95	@@ -1244,29 +1259,17 @@ static int ext3_ordered_write_end(struct
96	unsigned from, to;
97	int ret = 0, ret2;
98
99	+ copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
100	+
101	+ /* See comment at update_file_sizes() for why we check buffers upto
102	+ * from + len */
103	from = pos & (PAGE_CACHE_SIZE - 1);
104	to = from + len;
105	-
106	ret = walk_page_buffers(handle, page_buffers(page),
107	- from, to, NULL, ext3_journal_dirty_data);
108	+ from, to, NULL, journal_dirty_data_fn);
109
110	- if (ret == 0) {
111	- /*
112	- * generic_write_end() will run mark_inode_dirty() if i_size
113	- * changes. So let's piggyback the i_disksize mark_inode_dirty
114	- * into that.
115	- */
116	- loff_t new_i_size;
117	-
118	- new_i_size = pos + copied;
119	- if (new_i_size > EXT3_I(inode)->i_disksize)
120	- EXT3_I(inode)->i_disksize = new_i_size;
121	- ret2 = ext3_generic_write_end(file, mapping, pos, len, copied,
122	- page, fsdata);
123	- copied = ret2;
124	- if (ret2 < 0)
125	- ret = ret2;
126	- }
127	+ if (ret == 0)
128	+ update_file_sizes(inode, pos, len, copied);
129	ret2 = ext3_journal_stop(handle);
130	if (!ret)
131	ret = ret2;
132	@@ -1283,22 +1286,11 @@ static int ext3_writeback_write_end(stru
133	{
134	handle_t *handle = ext3_journal_current_handle();
135	struct inode *inode = file->f_mapping->host;
136	- int ret = 0, ret2;
137	- loff_t new_i_size;
138	+ int ret;
139
140	- new_i_size = pos + copied;
141	- if (new_i_size > EXT3_I(inode)->i_disksize)
142	- EXT3_I(inode)->i_disksize = new_i_size;
143	-
144	- ret2 = ext3_generic_write_end(file, mapping, pos, len, copied,
145	- page, fsdata);
146	- copied = ret2;
147	- if (ret2 < 0)
148	- ret = ret2;
149	-
150	- ret2 = ext3_journal_stop(handle);
151	- if (!ret)
152	- ret = ret2;
153	+ copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
154	+ update_file_sizes(inode, pos, len, copied);
155	+ ret = ext3_journal_stop(handle);
156	unlock_page(page);
157	page_cache_release(page);
158
159	@@ -1412,13 +1404,6 @@ static int bput_one(handle_t *handle, st
160	return 0;
161	}
162
163	-static int journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
164	-{
165	- if (buffer_mapped(bh))
166	- return ext3_journal_dirty_data(handle, bh);
167	- return 0;
168	-}
169	-
170	/*
171	* Note that we always start a transaction even if we're not journalling
172	* data. This is to preserve ordering: any hole instantiation within