]>
Commit | Line | Data |
---|---|---|
8f69975d BS |
1 | From: Jan Kara <jack@suse.cz> |
2 | Subject: [PATCH] ext3: Avoid false EIO errors | |
3 | References: bnc#479730 | |
4 | ||
5 | Sometimes block_write_begin() can map buffers in a page but later we fail to | |
6 | copy data into those buffers (because the source page has been paged out in the | |
7 | mean time). We then end up with !uptodate mapped buffers. To add a bit more to | |
8 | the confusion, block_write_end() does not commit any data (and thus does not | |
9 | mark any buffers as uptodate) if we didn't succeed with copying all the data. | |
10 | ||
11 | Commit f4fc66a894546bdc88a775d0e83ad20a65210bcb (ext3: convert to new aops) | |
12 | missed these cases and thus we were inserting non-uptodate buffers to | |
13 | transaction's list which confuses JBD code and it reports IO errors, aborts | |
14 | a transaction and generally makes users afraid about their data ;-P. | |
15 | ||
16 | This patch fixes the problem by reorganizing ext3_..._write_end() code to | |
17 | first call block_write_end() to mark buffers with valid data uptodate and | |
18 | after that we file only uptodate buffers to transaction's lists. Also | |
19 | fix a problem where we could leave blocks allocated beyond i_size (i_disksize | |
20 | in fact). | |
21 | ||
22 | Signed-off-by: Jan Kara <jack@suse.cz> | |
23 | ||
24 | --- | |
25 | fs/ext3/inode.c | 99 +++++++++++++++++++++++--------------------------------- | |
26 | 1 file changed, 42 insertions(+), 57 deletions(-) | |
27 | ||
28 | --- a/fs/ext3/inode.c | |
29 | +++ b/fs/ext3/inode.c | |
30 | @@ -1195,6 +1195,18 @@ int ext3_journal_dirty_data(handle_t *ha | |
31 | return err; | |
32 | } | |
33 | ||
34 | +/* For ordered writepage and write_end functions */ | |
35 | +static int journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh) | |
36 | +{ | |
37 | + /* | |
38 | + * Write could have mapped the buffer but it didn't copy the data in | |
39 | + * yet. So avoid filing such buffer into a transaction. | |
40 | + */ | |
41 | + if (buffer_mapped(bh) && buffer_uptodate(bh)) | |
42 | + return ext3_journal_dirty_data(handle, bh); | |
43 | + return 0; | |
44 | +} | |
45 | + | |
46 | /* For write_end() in data=journal mode */ | |
47 | static int write_end_fn(handle_t *handle, struct buffer_head *bh) | |
48 | { | |
49 | @@ -1205,26 +1217,29 @@ static int write_end_fn(handle_t *handle | |
50 | } | |
51 | ||
52 | /* | |
53 | - * Generic write_end handler for ordered and writeback ext3 journal modes. | |
54 | - * We can't use generic_write_end, because that unlocks the page and we need to | |
55 | - * unlock the page after ext3_journal_stop, but ext3_journal_stop must run | |
56 | - * after block_write_end. | |
57 | + * This is nasty and subtle: ext3_write_begin() could have allocated blocks | |
58 | + * for the whole page but later we failed to copy the data in. So the disk | |
59 | + * size we really have allocated is pos + len (block_write_end() has zeroed | |
60 | + * the freshly allocated buffers so we aren't going to write garbage). But we | |
61 | + * want to keep i_size at the place where data copying finished so that we | |
62 | + * don't confuse readers. The worst that can happen is that we expose a page | |
63 | + * of zeros at the end of file after a crash... | |
64 | */ | |
65 | -static int ext3_generic_write_end(struct file *file, | |
66 | - struct address_space *mapping, | |
67 | - loff_t pos, unsigned len, unsigned copied, | |
68 | - struct page *page, void *fsdata) | |
69 | +static void update_file_sizes(struct inode *inode, loff_t pos, unsigned len, | |
70 | + unsigned copied) | |
71 | { | |
72 | - struct inode *inode = file->f_mapping->host; | |
73 | + int mark_dirty = 0; | |
74 | ||
75 | - copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); | |
76 | - | |
77 | - if (pos+copied > inode->i_size) { | |
78 | - i_size_write(inode, pos+copied); | |
79 | - mark_inode_dirty(inode); | |
80 | + if (pos + len > EXT3_I(inode)->i_disksize) { | |
81 | + mark_dirty = 1; | |
82 | + EXT3_I(inode)->i_disksize = pos + len; | |
83 | } | |
84 | - | |
85 | - return copied; | |
86 | + if (pos + copied > inode->i_size) { | |
87 | + i_size_write(inode, pos + copied); | |
88 | + mark_dirty = 1; | |
89 | + } | |
90 | + if (mark_dirty) | |
91 | + mark_inode_dirty(inode); | |
92 | } | |
93 | ||
94 | /* | |
95 | @@ -1244,29 +1259,17 @@ static int ext3_ordered_write_end(struct | |
96 | unsigned from, to; | |
97 | int ret = 0, ret2; | |
98 | ||
99 | + copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); | |
100 | + | |
101 | + /* See comment at update_file_sizes() for why we check buffers up to | |
102 | + * from + len */ | |
103 | from = pos & (PAGE_CACHE_SIZE - 1); | |
104 | to = from + len; | |
105 | - | |
106 | ret = walk_page_buffers(handle, page_buffers(page), | |
107 | - from, to, NULL, ext3_journal_dirty_data); | |
108 | + from, to, NULL, journal_dirty_data_fn); | |
109 | ||
110 | - if (ret == 0) { | |
111 | - /* | |
112 | - * generic_write_end() will run mark_inode_dirty() if i_size | |
113 | - * changes. So let's piggyback the i_disksize mark_inode_dirty | |
114 | - * into that. | |
115 | - */ | |
116 | - loff_t new_i_size; | |
117 | - | |
118 | - new_i_size = pos + copied; | |
119 | - if (new_i_size > EXT3_I(inode)->i_disksize) | |
120 | - EXT3_I(inode)->i_disksize = new_i_size; | |
121 | - ret2 = ext3_generic_write_end(file, mapping, pos, len, copied, | |
122 | - page, fsdata); | |
123 | - copied = ret2; | |
124 | - if (ret2 < 0) | |
125 | - ret = ret2; | |
126 | - } | |
127 | + if (ret == 0) | |
128 | + update_file_sizes(inode, pos, len, copied); | |
129 | ret2 = ext3_journal_stop(handle); | |
130 | if (!ret) | |
131 | ret = ret2; | |
132 | @@ -1283,22 +1286,11 @@ static int ext3_writeback_write_end(stru | |
133 | { | |
134 | handle_t *handle = ext3_journal_current_handle(); | |
135 | struct inode *inode = file->f_mapping->host; | |
136 | - int ret = 0, ret2; | |
137 | - loff_t new_i_size; | |
138 | + int ret; | |
139 | ||
140 | - new_i_size = pos + copied; | |
141 | - if (new_i_size > EXT3_I(inode)->i_disksize) | |
142 | - EXT3_I(inode)->i_disksize = new_i_size; | |
143 | - | |
144 | - ret2 = ext3_generic_write_end(file, mapping, pos, len, copied, | |
145 | - page, fsdata); | |
146 | - copied = ret2; | |
147 | - if (ret2 < 0) | |
148 | - ret = ret2; | |
149 | - | |
150 | - ret2 = ext3_journal_stop(handle); | |
151 | - if (!ret) | |
152 | - ret = ret2; | |
153 | + copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); | |
154 | + update_file_sizes(inode, pos, len, copied); | |
155 | + ret = ext3_journal_stop(handle); | |
156 | unlock_page(page); | |
157 | page_cache_release(page); | |
158 | ||
159 | @@ -1412,13 +1404,6 @@ static int bput_one(handle_t *handle, st | |
160 | return 0; | |
161 | } | |
162 | ||
163 | -static int journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh) | |
164 | -{ | |
165 | - if (buffer_mapped(bh)) | |
166 | - return ext3_journal_dirty_data(handle, bh); | |
167 | - return 0; | |
168 | -} | |
169 | - | |
170 | /* | |
171 | * Note that we always start a transaction even if we're not journalling | |
172 | * data. This is to preserve ordering: any hole instantiation within |