]> git.ipfire.org Git - thirdparty/kernel/stable-queue.git/blob - releases/3.14.33/nilfs2-fix-deadlock-of-segment-constructor-over-i_sync-flag.patch
4.14-stable patches
[thirdparty/kernel/stable-queue.git] / releases / 3.14.33 / nilfs2-fix-deadlock-of-segment-constructor-over-i_sync-flag.patch
1 From 7ef3ff2fea8bf5e4a21cef47ad87710a3d0fdb52 Mon Sep 17 00:00:00 2001
2 From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
3 Date: Thu, 5 Feb 2015 12:25:20 -0800
4 Subject: nilfs2: fix deadlock of segment constructor over I_SYNC flag
5
6 From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
7
8 commit 7ef3ff2fea8bf5e4a21cef47ad87710a3d0fdb52 upstream.
9
10 Nilfs2 eventually hangs in a stress test with the fsstress program. This
11 issue was caused by the following deadlock over the I_SYNC flag between
12 nilfs_segctor_thread() and writeback_sb_inodes():
13
14 nilfs_segctor_thread()
15 nilfs_segctor_thread_construct()
16 nilfs_segctor_unlock()
17 nilfs_dispose_list()
18 iput()
19 iput_final()
20 evict()
21 inode_wait_for_writeback() * wait for I_SYNC flag
22
23 writeback_sb_inodes()
24 * set I_SYNC flag on inode->i_state
25 __writeback_single_inode()
26 do_writepages()
27 nilfs_writepages()
28 nilfs_construct_dsync_segment()
29 nilfs_segctor_sync()
30 * wait for completion of segment constructor
31 inode_sync_complete()
32 * clear I_SYNC flag after __writeback_single_inode() completed
33
34 writeback_sb_inodes() calls do_writepages() for dirty inodes after
35 setting the I_SYNC flag on inode->i_state. do_writepages() in turn calls
36 nilfs_writepages(), which can run the segment constructor and wait for its
37 completion. On the other hand, the segment constructor calls iput(), which
38 can call evict() and wait for the I_SYNC flag in
39 inode_wait_for_writeback().
40
41 Since the segment constructor doesn't know when I_SYNC will be set, it
42 cannot know whether iput() will block or not unless inode->i_nlink has a
43 non-zero count. We can prevent evict() from being called in iput() by
44 implementing sop->drop_inode(), but it's not preferable to leave inodes
45 with i_nlink == 0 for long periods because it even defers file
46 truncation and inode deallocation. So, this patch instead resolves the
47 deadlock by calling iput() asynchronously with a workqueue for inodes
48 with i_nlink == 0.
49
50 Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
51 Cc: Al Viro <viro@zeniv.linux.org.uk>
52 Tested-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
53 Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
54 Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
55 Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
56
57 ---
58 fs/nilfs2/nilfs.h | 2 --
59 fs/nilfs2/segment.c | 44 +++++++++++++++++++++++++++++++++++++++-----
60 fs/nilfs2/segment.h | 5 +++++
61 3 files changed, 44 insertions(+), 7 deletions(-)
62
63 --- a/fs/nilfs2/nilfs.h
64 +++ b/fs/nilfs2/nilfs.h
65 @@ -141,7 +141,6 @@ enum {
66 * @ti_save: Backup of journal_info field of task_struct
67 * @ti_flags: Flags
68 * @ti_count: Nest level
69 - * @ti_garbage: List of inode to be put when releasing semaphore
70 */
71 struct nilfs_transaction_info {
72 u32 ti_magic;
73 @@ -150,7 +149,6 @@ struct nilfs_transaction_info {
74 one of other filesystems has a bug. */
75 unsigned short ti_flags;
76 unsigned short ti_count;
77 - struct list_head ti_garbage;
78 };
79
80 /* ti_magic */
81 --- a/fs/nilfs2/segment.c
82 +++ b/fs/nilfs2/segment.c
83 @@ -305,7 +305,6 @@ static void nilfs_transaction_lock(struc
84 ti->ti_count = 0;
85 ti->ti_save = cur_ti;
86 ti->ti_magic = NILFS_TI_MAGIC;
87 - INIT_LIST_HEAD(&ti->ti_garbage);
88 current->journal_info = ti;
89
90 for (;;) {
91 @@ -332,8 +331,6 @@ static void nilfs_transaction_unlock(str
92
93 up_write(&nilfs->ns_segctor_sem);
94 current->journal_info = ti->ti_save;
95 - if (!list_empty(&ti->ti_garbage))
96 - nilfs_dispose_list(nilfs, &ti->ti_garbage, 0);
97 }
98
99 static void *nilfs_segctor_map_segsum_entry(struct nilfs_sc_info *sci,
100 @@ -746,6 +743,15 @@ static void nilfs_dispose_list(struct th
101 }
102 }
103
104 +static void nilfs_iput_work_func(struct work_struct *work)
105 +{
106 + struct nilfs_sc_info *sci = container_of(work, struct nilfs_sc_info,
107 + sc_iput_work);
108 + struct the_nilfs *nilfs = sci->sc_super->s_fs_info;
109 +
110 + nilfs_dispose_list(nilfs, &sci->sc_iput_queue, 0);
111 +}
112 +
113 static int nilfs_test_metadata_dirty(struct the_nilfs *nilfs,
114 struct nilfs_root *root)
115 {
116 @@ -1899,8 +1905,8 @@ static int nilfs_segctor_collect_dirty_f
117 static void nilfs_segctor_drop_written_files(struct nilfs_sc_info *sci,
118 struct the_nilfs *nilfs)
119 {
120 - struct nilfs_transaction_info *ti = current->journal_info;
121 struct nilfs_inode_info *ii, *n;
122 + int defer_iput = false;
123
124 spin_lock(&nilfs->ns_inode_lock);
125 list_for_each_entry_safe(ii, n, &sci->sc_dirty_files, i_dirty) {
126 @@ -1911,9 +1917,24 @@ static void nilfs_segctor_drop_written_f
127 clear_bit(NILFS_I_BUSY, &ii->i_state);
128 brelse(ii->i_bh);
129 ii->i_bh = NULL;
130 - list_move_tail(&ii->i_dirty, &ti->ti_garbage);
131 + list_del_init(&ii->i_dirty);
132 + if (!ii->vfs_inode.i_nlink) {
133 + /*
134 + * Defer calling iput() to avoid a deadlock
135 + * over I_SYNC flag for inodes with i_nlink == 0
136 + */
137 + list_add_tail(&ii->i_dirty, &sci->sc_iput_queue);
138 + defer_iput = true;
139 + } else {
140 + spin_unlock(&nilfs->ns_inode_lock);
141 + iput(&ii->vfs_inode);
142 + spin_lock(&nilfs->ns_inode_lock);
143 + }
144 }
145 spin_unlock(&nilfs->ns_inode_lock);
146 +
147 + if (defer_iput)
148 + schedule_work(&sci->sc_iput_work);
149 }
150
151 /*
152 @@ -2580,6 +2601,8 @@ static struct nilfs_sc_info *nilfs_segct
153 INIT_LIST_HEAD(&sci->sc_segbufs);
154 INIT_LIST_HEAD(&sci->sc_write_logs);
155 INIT_LIST_HEAD(&sci->sc_gc_inodes);
156 + INIT_LIST_HEAD(&sci->sc_iput_queue);
157 + INIT_WORK(&sci->sc_iput_work, nilfs_iput_work_func);
158 init_timer(&sci->sc_timer);
159
160 sci->sc_interval = HZ * NILFS_SC_DEFAULT_TIMEOUT;
161 @@ -2606,6 +2629,8 @@ static void nilfs_segctor_write_out(stru
162 ret = nilfs_segctor_construct(sci, SC_LSEG_SR);
163 nilfs_transaction_unlock(sci->sc_super);
164
165 + flush_work(&sci->sc_iput_work);
166 +
167 } while (ret && retrycount-- > 0);
168 }
169
170 @@ -2630,6 +2655,9 @@ static void nilfs_segctor_destroy(struct
171 || sci->sc_seq_request != sci->sc_seq_done);
172 spin_unlock(&sci->sc_state_lock);
173
174 + if (flush_work(&sci->sc_iput_work))
175 + flag = true;
176 +
177 if (flag || !nilfs_segctor_confirm(sci))
178 nilfs_segctor_write_out(sci);
179
180 @@ -2639,6 +2667,12 @@ static void nilfs_segctor_destroy(struct
181 nilfs_dispose_list(nilfs, &sci->sc_dirty_files, 1);
182 }
183
184 + if (!list_empty(&sci->sc_iput_queue)) {
185 + nilfs_warning(sci->sc_super, __func__,
186 + "iput queue is not empty\n");
187 + nilfs_dispose_list(nilfs, &sci->sc_iput_queue, 1);
188 + }
189 +
190 WARN_ON(!list_empty(&sci->sc_segbufs));
191 WARN_ON(!list_empty(&sci->sc_write_logs));
192
193 --- a/fs/nilfs2/segment.h
194 +++ b/fs/nilfs2/segment.h
195 @@ -26,6 +26,7 @@
196 #include <linux/types.h>
197 #include <linux/fs.h>
198 #include <linux/buffer_head.h>
199 +#include <linux/workqueue.h>
200 #include <linux/nilfs2_fs.h>
201 #include "nilfs.h"
202
203 @@ -92,6 +93,8 @@ struct nilfs_segsum_pointer {
204 * @sc_nblk_inc: Block count of current generation
205 * @sc_dirty_files: List of files to be written
206 * @sc_gc_inodes: List of GC inodes having blocks to be written
207 + * @sc_iput_queue: list of inodes for which iput should be done
208 + * @sc_iput_work: work struct to defer iput call
209 * @sc_freesegs: array of segment numbers to be freed
210 * @sc_nfreesegs: number of segments on @sc_freesegs
211 * @sc_dsync_inode: inode whose data pages are written for a sync operation
212 @@ -135,6 +138,8 @@ struct nilfs_sc_info {
213
214 struct list_head sc_dirty_files;
215 struct list_head sc_gc_inodes;
216 + struct list_head sc_iput_queue;
217 + struct work_struct sc_iput_work;
218
219 __u64 *sc_freesegs;
220 size_t sc_nfreesegs;