1 From 7f42ec3941560f0902fe3671e36f2c20ffd3af0a Mon Sep 17 00:00:00 2001
2 From: Vyacheslav Dubeyko <slava@dubeyko.com>
3 Date: Mon, 30 Sep 2013 13:45:12 -0700
4 Subject: nilfs2: fix issue with race condition of competition between segments for dirty blocks
6 From: Vyacheslav Dubeyko <slava@dubeyko.com>
8 commit 7f42ec3941560f0902fe3671e36f2c20ffd3af0a upstream.
10 Many NILFS2 users have reported strange file system corruption
13 NILFS: bad btree node (blocknr=185027): level = 0, flags = 0x0, nchildren = 768
14 NILFS error (device sda4): nilfs_bmap_last_key: broken bmap (inode number=11540)
16 But such error messages are a consequence of a file system issue that takes
17 place much earlier. Fortunately, Jerome Poulin <jeromepoulin@gmail.com>
18 and Anton Eliasson <devel@antoneliasson.se> reported another
19 issue not long ago. These reports describe the issue with segctor
22 BUG: unable to handle kernel paging request at 0000000000004c83
23 IP: nilfs_end_page_io+0x12/0xd0 [nilfs2]
26 nilfs_segctor_do_construct+0xf25/0x1b20 [nilfs2]
27 nilfs_segctor_construct+0x17b/0x290 [nilfs2]
28 nilfs_segctor_thread+0x122/0x3b0 [nilfs2]
30 ret_from_fork+0x7c/0xb0
32 These two issues have one cause. This cause can raise a third issue
33 too. The third issue results in hanging of the segctor thread with eating of
38 One of the possible ways of reproducing the issue was described by
39 Jerome Poulin <jeromepoulin@gmail.com>:
41 1. init S to get to single user mode.
42 2. sysrq+E to make sure only my shell is running
43 3. start network-manager to get my wifi connection up
44 4. login as root and launch "screen"
45 5. cd /boot/log/nilfs which is a ext3 mount point and can log when NILFS dies.
46 6. lscp | xz -9e > lscp.txt.xz
47 7. mount my snapshot using mount -o cp=3360839,ro /dev/vgUbuntu/root /mnt/nilfs
48 8. start a screen to dump /proc/kmsg to text file since rsyslog is killed
49 9. start a screen and launch strace -f -o find-cat.log -t find
50 /mnt/nilfs -type f -exec cat {} > /dev/null \;
51 10. start a screen and launch strace -f -o apt-get.log -t apt-get update
52 11. launch the last command again as it did not crash the first time
54 13. ps aux > ps-aux-crashed.log
56 14. sysrq+E wait for everything to terminate
59 A simplified way of reproducing the issue is to start a kernel compilation
60 task and "apt-get update" in parallel.
64 The issue does not reproduce reliably [60% - 80%]. It is very important to
65 have a proper environment for reproducing the issue. The critical
66 conditions for successful reproduction:
68 (1) There should be a big file modified by means of mmap().
70 (2) This file should have a count of dirty blocks that is greater than
71 several segments in size (for example, two or three) from time to time
74 (3) There should be intensive background activity of file modification
79 First of all, it is possible to see that the cause of the crash is an invalid
82 NILFS [nilfs_segctor_complete_write]:2100 bh->b_count 0, bh->b_blocknr 13895680, bh->b_size 13897727, bh->b_page 0000000000001a82
83 NILFS [nilfs_segctor_complete_write]:2101 segbuf->sb_segnum 6783
85 Moreover, the value of b_page (0x1a82) is 6786. This value looks like a segment
86 number. And the b_blocknr and b_size values look like block numbers. So,
87 the buffer_head pointer points to an improper address.
89 Detailed investigation of the issue revealed the following picture:
91 [-----------------------------SEGMENT 6783-------------------------------]
92 NILFS [nilfs_segctor_do_construct]:2310 nilfs_segctor_begin_construction
93 NILFS [nilfs_segctor_do_construct]:2321 nilfs_segctor_collect
94 NILFS [nilfs_segctor_do_construct]:2336 nilfs_segctor_assign
95 NILFS [nilfs_segctor_do_construct]:2367 nilfs_segctor_update_segusage
96 NILFS [nilfs_segctor_do_construct]:2371 nilfs_segctor_prepare_write
97 NILFS [nilfs_segctor_do_construct]:2376 nilfs_add_checksums_on_logs
98 NILFS [nilfs_segctor_do_construct]:2381 nilfs_segctor_write
99 NILFS [nilfs_segbuf_submit_bio]:464 bio->bi_sector 111149024, segbuf->sb_segnum 6783
101 [-----------------------------SEGMENT 6784-------------------------------]
102 NILFS [nilfs_segctor_do_construct]:2310 nilfs_segctor_begin_construction
103 NILFS [nilfs_segctor_do_construct]:2321 nilfs_segctor_collect
104 NILFS [nilfs_lookup_dirty_data_buffers]:782 bh->b_count 1, bh->b_page ffffea000709b000, page->index 0, i_ino 1033103, i_size 25165824
105 NILFS [nilfs_lookup_dirty_data_buffers]:783 bh->b_assoc_buffers.next ffff8802174a6798, bh->b_assoc_buffers.prev ffff880221cffee8
106 NILFS [nilfs_segctor_do_construct]:2336 nilfs_segctor_assign
107 NILFS [nilfs_segctor_do_construct]:2367 nilfs_segctor_update_segusage
108 NILFS [nilfs_segctor_do_construct]:2371 nilfs_segctor_prepare_write
109 NILFS [nilfs_segctor_do_construct]:2376 nilfs_add_checksums_on_logs
110 NILFS [nilfs_segctor_do_construct]:2381 nilfs_segctor_write
111 NILFS [nilfs_segbuf_submit_bh]:575 bh->b_count 1, bh->b_page ffffea000709b000, page->index 0, i_ino 1033103, i_size 25165824
112 NILFS [nilfs_segbuf_submit_bh]:576 segbuf->sb_segnum 6784
113 NILFS [nilfs_segbuf_submit_bh]:577 bh->b_assoc_buffers.next ffff880218a0d5f8, bh->b_assoc_buffers.prev ffff880218bcdf50
114 NILFS [nilfs_segbuf_submit_bio]:464 bio->bi_sector 111150080, segbuf->sb_segnum 6784, segbuf->sb_nbio 0
116 NILFS [nilfs_segbuf_submit_bio]:464 bio->bi_sector 111164416, segbuf->sb_segnum 6784, segbuf->sb_nbio 15
118 [-----------------------------SEGMENT 6785-------------------------------]
119 NILFS [nilfs_segctor_do_construct]:2310 nilfs_segctor_begin_construction
120 NILFS [nilfs_segctor_do_construct]:2321 nilfs_segctor_collect
121 NILFS [nilfs_lookup_dirty_data_buffers]:782 bh->b_count 2, bh->b_page ffffea000709b000, page->index 0, i_ino 1033103, i_size 25165824
122 NILFS [nilfs_lookup_dirty_data_buffers]:783 bh->b_assoc_buffers.next ffff880219277e80, bh->b_assoc_buffers.prev ffff880221cffc88
123 NILFS [nilfs_segctor_do_construct]:2367 nilfs_segctor_update_segusage
124 NILFS [nilfs_segctor_do_construct]:2371 nilfs_segctor_prepare_write
125 NILFS [nilfs_segctor_do_construct]:2376 nilfs_add_checksums_on_logs
126 NILFS [nilfs_segctor_do_construct]:2381 nilfs_segctor_write
127 NILFS [nilfs_segbuf_submit_bh]:575 bh->b_count 2, bh->b_page ffffea000709b000, page->index 0, i_ino 1033103, i_size 25165824
128 NILFS [nilfs_segbuf_submit_bh]:576 segbuf->sb_segnum 6785
129 NILFS [nilfs_segbuf_submit_bh]:577 bh->b_assoc_buffers.next ffff880218a0d5f8, bh->b_assoc_buffers.prev ffff880222cc7ee8
130 NILFS [nilfs_segbuf_submit_bio]:464 bio->bi_sector 111165440, segbuf->sb_segnum 6785, segbuf->sb_nbio 0
132 NILFS [nilfs_segbuf_submit_bio]:464 bio->bi_sector 111177728, segbuf->sb_segnum 6785, segbuf->sb_nbio 12
134 NILFS [nilfs_segctor_do_construct]:2399 nilfs_segctor_wait
135 NILFS [nilfs_segbuf_wait]:676 segbuf->sb_segnum 6783
136 NILFS [nilfs_segbuf_wait]:676 segbuf->sb_segnum 6784
137 NILFS [nilfs_segbuf_wait]:676 segbuf->sb_segnum 6785
139 NILFS [nilfs_segctor_complete_write]:2100 bh->b_count 0, bh->b_blocknr 13895680, bh->b_size 13897727, bh->b_page 0000000000001a82
141 BUG: unable to handle kernel paging request at 0000000000001a82
142 IP: [<ffffffffa024d0f2>] nilfs_end_page_io+0x12/0xd0 [nilfs2]
144 Usually, for every segment we collect dirty files in a list. Then, dirty
145 blocks are gathered for every dirty file, prepared for write and
146 submitted by means of a nilfs_segbuf_submit_bh() call. Finally, the
147 complete write phase takes place after nilfs_end_bio_write() is called by the
148 block layer. Buffers/pages are marked as not dirty in the final phase and
149 processed files are removed from the list of dirty files.
151 It is possible to see that we had three prepare_write and submit_bio
152 phases before the segbuf_wait and complete_write phases. Moreover, segments
153 compete with each other for dirty blocks because on every iteration
154 of segment processing, dirty buffer_heads are added to several lists of
157 [SEGMENT 6784]: bh->b_assoc_buffers.next ffff880218a0d5f8, bh->b_assoc_buffers.prev ffff880218bcdf50
158 [SEGMENT 6785]: bh->b_assoc_buffers.next ffff880218a0d5f8, bh->b_assoc_buffers.prev ffff880222cc7ee8
160 The next pointer is the same but the prev pointer has changed. It means
161 that the buffer_head has a next pointer from one list but a prev pointer from
162 another. Such a modification can be made several times. And, finally, it
163 can result in various issues: (1) segctor hanging, (2) segctor
164 crashing, (3) file system metadata corruption.
169 (1) setting of BH_Async_Write flag in nilfs_segctor_prepare_write()
170 for every processed dirty block;
172 (2) checking of BH_Async_Write flag in
173 nilfs_lookup_dirty_data_buffers() and
174 nilfs_lookup_dirty_node_buffers();
176 (3) clearing of BH_Async_Write flag in nilfs_segctor_complete_write(),
177 nilfs_abort_logs(), nilfs_forget_buffer(), nilfs_clear_dirty_page().
179 Reported-by: Jerome Poulin <jeromepoulin@gmail.com>
180 Reported-by: Anton Eliasson <devel@antoneliasson.se>
181 Cc: Paul Fertser <fercerpav@gmail.com>
182 Cc: ARAI Shun-ichi <hermes@ceres.dti.ne.jp>
183 Cc: Piotr Szymaniak <szarpaj@grubelek.pl>
184 Cc: Juan Barry Manuel Canham <Linux@riotingpacifist.net>
185 Cc: Zahid Chowdhury <zahid.chowdhury@starsolutions.com>
186 Cc: Elmer Zhang <freeboy6716@gmail.com>
187 Cc: Kenneth Langga <klangga@gmail.com>
188 Signed-off-by: Vyacheslav Dubeyko <slava@dubeyko.com>
189 Acked-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
190 Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
191 Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
192 Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
195 fs/nilfs2/page.c | 2 ++
196 fs/nilfs2/segment.c | 11 +++++++++--
197 2 files changed, 11 insertions(+), 2 deletions(-)
199 --- a/fs/nilfs2/page.c
200 +++ b/fs/nilfs2/page.c
201 @@ -94,6 +94,7 @@ void nilfs_forget_buffer(struct buffer_h
202 clear_buffer_nilfs_volatile(bh);
203 clear_buffer_nilfs_checked(bh);
204 clear_buffer_nilfs_redirected(bh);
205 + clear_buffer_async_write(bh);
206 clear_buffer_dirty(bh);
207 if (nilfs_page_buffers_clean(page))
208 __nilfs_clear_page_dirty(page);
209 @@ -429,6 +430,7 @@ void nilfs_clear_dirty_page(struct page
210 "discard block %llu, size %zu",
211 (u64)bh->b_blocknr, bh->b_size);
213 + clear_buffer_async_write(bh);
214 clear_buffer_dirty(bh);
215 clear_buffer_nilfs_volatile(bh);
216 clear_buffer_nilfs_checked(bh);
217 --- a/fs/nilfs2/segment.c
218 +++ b/fs/nilfs2/segment.c
219 @@ -665,7 +665,7 @@ static size_t nilfs_lookup_dirty_data_bu
221 bh = head = page_buffers(page);
223 - if (!buffer_dirty(bh))
224 + if (!buffer_dirty(bh) || buffer_async_write(bh))
227 list_add_tail(&bh->b_assoc_buffers, listp);
228 @@ -699,7 +699,8 @@ static void nilfs_lookup_dirty_node_buff
229 for (i = 0; i < pagevec_count(&pvec); i++) {
230 bh = head = page_buffers(pvec.pages[i]);
232 - if (buffer_dirty(bh)) {
233 + if (buffer_dirty(bh) &&
234 + !buffer_async_write(bh)) {
236 list_add_tail(&bh->b_assoc_buffers,
238 @@ -1579,6 +1580,7 @@ static void nilfs_segctor_prepare_write(
240 list_for_each_entry(bh, &segbuf->sb_segsum_buffers,
242 + set_buffer_async_write(bh);
243 if (bh->b_page != bd_page) {
246 @@ -1592,6 +1594,7 @@ static void nilfs_segctor_prepare_write(
248 list_for_each_entry(bh, &segbuf->sb_payload_buffers,
250 + set_buffer_async_write(bh);
251 if (bh == segbuf->sb_super_root) {
252 if (bh->b_page != bd_page) {
254 @@ -1677,6 +1680,7 @@ static void nilfs_abort_logs(struct list
255 list_for_each_entry(segbuf, logs, sb_list) {
256 list_for_each_entry(bh, &segbuf->sb_segsum_buffers,
258 + clear_buffer_async_write(bh);
259 if (bh->b_page != bd_page) {
261 end_page_writeback(bd_page);
262 @@ -1686,6 +1690,7 @@ static void nilfs_abort_logs(struct list
264 list_for_each_entry(bh, &segbuf->sb_payload_buffers,
266 + clear_buffer_async_write(bh);
267 if (bh == segbuf->sb_super_root) {
268 if (bh->b_page != bd_page) {
269 end_page_writeback(bd_page);
270 @@ -1755,6 +1760,7 @@ static void nilfs_segctor_complete_write
272 set_buffer_uptodate(bh);
273 clear_buffer_dirty(bh);
274 + clear_buffer_async_write(bh);
275 if (bh->b_page != bd_page) {
277 end_page_writeback(bd_page);
278 @@ -1776,6 +1782,7 @@ static void nilfs_segctor_complete_write
280 set_buffer_uptodate(bh);
281 clear_buffer_dirty(bh);
282 + clear_buffer_async_write(bh);
283 clear_buffer_delay(bh);
284 clear_buffer_nilfs_volatile(bh);
285 clear_buffer_nilfs_redirected(bh);