]>
Commit | Line | Data |
---|---|---|
c2a518a0 GKH |
1 | From foo@baz Fri Jan 18 09:16:11 CET 2019 |
2 | From: Chao Yu <yuchao0@huawei.com> | |
3 | Date: Sat, 7 May 2016 16:15:05 +0800 | |
4 | Subject: f2fs: fix inode cache leak | |
5 | ||
6 | From: Chao Yu <yuchao0@huawei.com> | |
7 | ||
8 | commit f61cce5b81f91ba336184008b24baec84afbb3dd upstream. | |
9 | ||
10 | When testing f2fs with inline_dentry option, generic/342 reports: | |
11 | VFS: Busy inodes after unmount of dm-0. Self-destruct in 5 seconds. Have a nice day... | |
12 | ||
13 | After rmmod f2fs module, kernel shows the following dmesg: | |
14 | ============================================================================= | |
15 | BUG f2fs_inode_cache (Tainted: G O ): Objects remaining in f2fs_inode_cache on __kmem_cache_shutdown() | |
16 | ----------------------------------------------------------------------------- | |
17 | ||
18 | Disabling lock debugging due to kernel taint | |
19 | INFO: Slab 0xf51ca0e0 objects=22 used=1 fp=0xd1e6fc60 flags=0x40004080 | |
20 | CPU: 3 PID: 7455 Comm: rmmod Tainted: G B O 4.6.0-rc4+ #16 | |
21 | Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006 | |
22 | 00000086 00000086 d062fe18 c13a83a0 f51ca0e0 d062fe38 d062fea4 c11c7276 | |
23 | c1981040 f51ca0e0 00000016 00000001 d1e6fc60 40004080 656a624f 20737463 | |
24 | 616d6572 6e696e69 6e692067 66326620 6e695f73 5f65646f 68636163 6e6f2065 | |
25 | Call Trace: | |
26 | [<c13a83a0>] dump_stack+0x5f/0x8f | |
27 | [<c11c7276>] slab_err+0x76/0x80 | |
28 | [<c11cbfc0>] ? __kmem_cache_shutdown+0x100/0x2f0 | |
29 | [<c11cbfc0>] ? __kmem_cache_shutdown+0x100/0x2f0 | |
30 | [<c11cbfe5>] __kmem_cache_shutdown+0x125/0x2f0 | |
31 | [<c1198a38>] kmem_cache_destroy+0x158/0x1f0 | |
32 | [<c176b43d>] ? mutex_unlock+0xd/0x10 | |
33 | [<f8f15aa3>] exit_f2fs_fs+0x4b/0x5a8 [f2fs] | |
34 | [<c10f596c>] SyS_delete_module+0x16c/0x1d0 | |
35 | [<c1001b10>] ? do_fast_syscall_32+0x30/0x1c0 | |
36 | [<c13c59bf>] ? __this_cpu_preempt_check+0xf/0x20 | |
37 | [<c10afa7d>] ? trace_hardirqs_on_caller+0xdd/0x210 | |
38 | [<c10ad50b>] ? trace_hardirqs_off+0xb/0x10 | |
39 | [<c1001b81>] do_fast_syscall_32+0xa1/0x1c0 | |
40 | [<c176d888>] sysenter_past_esp+0x45/0x74 | |
41 | INFO: Object 0xd1e6d9e0 @offset=6624 | |
42 | kmem_cache_destroy f2fs_inode_cache: Slab cache still has objects | |
43 | CPU: 3 PID: 7455 Comm: rmmod Tainted: G B O 4.6.0-rc4+ #16 | |
44 | Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006 | |
45 | 00000286 00000286 d062fef4 c13a83a0 f174b000 d062ff14 d062ff28 c1198ac7 | |
46 | c197fe18 f3c5b980 d062ff20 000d04f2 d062ff0c d062ff0c d062ff14 d062ff14 | |
47 | f8f20dc0 fffffff5 d062e000 d062ff30 f8f15aa3 d062ff7c c10f596c 73663266 | |
48 | Call Trace: | |
49 | [<c13a83a0>] dump_stack+0x5f/0x8f | |
50 | [<c1198ac7>] kmem_cache_destroy+0x1e7/0x1f0 | |
51 | [<f8f15aa3>] exit_f2fs_fs+0x4b/0x5a8 [f2fs] | |
52 | [<c10f596c>] SyS_delete_module+0x16c/0x1d0 | |
53 | [<c1001b10>] ? do_fast_syscall_32+0x30/0x1c0 | |
54 | [<c13c59bf>] ? __this_cpu_preempt_check+0xf/0x20 | |
55 | [<c10afa7d>] ? trace_hardirqs_on_caller+0xdd/0x210 | |
56 | [<c10ad50b>] ? trace_hardirqs_off+0xb/0x10 | |
57 | [<c1001b81>] do_fast_syscall_32+0xa1/0x1c0 | |
58 | [<c176d888>] sysenter_past_esp+0x45/0x74 | |
59 | ||
60 | The reason is: in recovery flow, we use a delayed iput mechanism for directories | |
61 | which have recovered dentry blocks. It means the inode's reference will be | |
62 | held until the last dirty dentry page has been written back. | |
63 | ||
64 | But when we mount f2fs with the inline_dentry option, during recovery, a dirent | |
65 | may only be recovered into the dir inode page rather than a dentry page, so there | |
66 | is no chance for us to release the inode reference in ->writepage when | |
67 | writing back the last dentry page. | |
68 | ||
69 | We can call paired iget/iput explicitly for the inline_dentry case, but for | |
70 | the non-inline_dentry case, iput will call writeback_single_inode to write all | |
71 | data pages synchronously, but during recovery, ->writepages of f2fs skips | |
72 | writing all pages, resulting in losing dirents. | |
73 | ||
74 | This patch fixes this issue by obsoleting the old mechanism, and introduces a | |
75 | new dir_list to hold all directory inodes which have recovered data until | |
76 | recovery finishes. | |
77 | ||
78 | Signed-off-by: Chao Yu <yuchao0@huawei.com> | |
79 | Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org> | |
80 | [bwh: Backported to 4.4: | |
81 | - Deleted add_dirty_dir_inode() function is different | |
82 | - Adjust context] | |
83 | Signed-off-by: Ben Hutchings <ben.hutchings@codethink.co.uk> | |
84 | Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> | |
85 | --- | |
86 | fs/f2fs/checkpoint.c | 24 --------------------- | |
87 | fs/f2fs/f2fs.h | 2 - | |
88 | fs/f2fs/recovery.c | 56 ++++++++++++++++++++++++++++----------------------- | |
89 | 3 files changed, 31 insertions(+), 51 deletions(-) | |
90 | ||
91 | --- a/fs/f2fs/checkpoint.c | |
92 | +++ b/fs/f2fs/checkpoint.c | |
93 | @@ -771,24 +771,6 @@ out: | |
94 | f2fs_trace_pid(page); | |
95 | } | |
96 | ||
97 | -void add_dirty_dir_inode(struct inode *inode) | |
98 | -{ | |
99 | - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); | |
100 | - struct inode_entry *new = | |
101 | - f2fs_kmem_cache_alloc(inode_entry_slab, GFP_NOFS); | |
102 | - int ret = 0; | |
103 | - | |
104 | - new->inode = inode; | |
105 | - INIT_LIST_HEAD(&new->list); | |
106 | - | |
107 | - spin_lock(&sbi->dir_inode_lock); | |
108 | - ret = __add_dirty_inode(inode, new); | |
109 | - spin_unlock(&sbi->dir_inode_lock); | |
110 | - | |
111 | - if (ret) | |
112 | - kmem_cache_free(inode_entry_slab, new); | |
113 | -} | |
114 | - | |
115 | void remove_dirty_dir_inode(struct inode *inode) | |
116 | { | |
117 | struct f2fs_sb_info *sbi = F2FS_I_SB(inode); | |
118 | @@ -811,12 +793,6 @@ void remove_dirty_dir_inode(struct inode | |
119 | stat_dec_dirty_dir(sbi); | |
120 | spin_unlock(&sbi->dir_inode_lock); | |
121 | kmem_cache_free(inode_entry_slab, entry); | |
122 | - | |
123 | - /* Only from the recovery routine */ | |
124 | - if (is_inode_flag_set(F2FS_I(inode), FI_DELAY_IPUT)) { | |
125 | - clear_inode_flag(F2FS_I(inode), FI_DELAY_IPUT); | |
126 | - iput(inode); | |
127 | - } | |
128 | } | |
129 | ||
130 | void sync_dirty_dir_inodes(struct f2fs_sb_info *sbi) | |
131 | --- a/fs/f2fs/f2fs.h | |
132 | +++ b/fs/f2fs/f2fs.h | |
133 | @@ -1402,7 +1402,6 @@ enum { | |
134 | FI_NO_ALLOC, /* should not allocate any blocks */ | |
135 | FI_FREE_NID, /* free allocated nide */ | |
136 | FI_UPDATE_DIR, /* should update inode block for consistency */ | |
137 | - FI_DELAY_IPUT, /* used for the recovery */ | |
138 | FI_NO_EXTENT, /* not to use the extent cache */ | |
139 | FI_INLINE_XATTR, /* used for inline xattr */ | |
140 | FI_INLINE_DATA, /* used for inline data*/ | |
141 | @@ -1828,7 +1827,6 @@ void remove_orphan_inode(struct f2fs_sb_ | |
142 | int recover_orphan_inodes(struct f2fs_sb_info *); | |
143 | int get_valid_checkpoint(struct f2fs_sb_info *); | |
144 | void update_dirty_page(struct inode *, struct page *); | |
145 | -void add_dirty_dir_inode(struct inode *); | |
146 | void remove_dirty_dir_inode(struct inode *); | |
147 | void sync_dirty_dir_inodes(struct f2fs_sb_info *); | |
148 | void write_checkpoint(struct f2fs_sb_info *, struct cp_control *); | |
149 | --- a/fs/f2fs/recovery.c | |
150 | +++ b/fs/f2fs/recovery.c | |
151 | @@ -89,7 +89,8 @@ static void del_fsync_inode(struct fsync | |
152 | kmem_cache_free(fsync_entry_slab, entry); | |
153 | } | |
154 | ||
155 | -static int recover_dentry(struct inode *inode, struct page *ipage) | |
156 | +static int recover_dentry(struct inode *inode, struct page *ipage, | |
157 | + struct list_head *dir_list) | |
158 | { | |
159 | struct f2fs_inode *raw_inode = F2FS_INODE(ipage); | |
160 | nid_t pino = le32_to_cpu(raw_inode->i_pino); | |
161 | @@ -97,18 +98,29 @@ static int recover_dentry(struct inode * | |
162 | struct qstr name; | |
163 | struct page *page; | |
164 | struct inode *dir, *einode; | |
165 | + struct fsync_inode_entry *entry; | |
166 | int err = 0; | |
167 | ||
168 | - dir = f2fs_iget(inode->i_sb, pino); | |
169 | - if (IS_ERR(dir)) { | |
170 | - err = PTR_ERR(dir); | |
171 | - goto out; | |
172 | + entry = get_fsync_inode(dir_list, pino); | |
173 | + if (!entry) { | |
174 | + dir = f2fs_iget(inode->i_sb, pino); | |
175 | + if (IS_ERR(dir)) { | |
176 | + err = PTR_ERR(dir); | |
177 | + goto out; | |
178 | + } | |
179 | + | |
180 | + entry = add_fsync_inode(dir_list, dir); | |
181 | + if (!entry) { | |
182 | + err = -ENOMEM; | |
183 | + iput(dir); | |
184 | + goto out; | |
185 | + } | |
186 | } | |
187 | ||
188 | - if (file_enc_name(inode)) { | |
189 | - iput(dir); | |
190 | + dir = entry->inode; | |
191 | + | |
192 | + if (file_enc_name(inode)) | |
193 | return 0; | |
194 | - } | |
195 | ||
196 | name.len = le32_to_cpu(raw_inode->i_namelen); | |
197 | name.name = raw_inode->i_name; | |
198 | @@ -116,7 +128,7 @@ static int recover_dentry(struct inode * | |
199 | if (unlikely(name.len > F2FS_NAME_LEN)) { | |
200 | WARN_ON(1); | |
201 | err = -ENAMETOOLONG; | |
202 | - goto out_err; | |
203 | + goto out; | |
204 | } | |
205 | retry: | |
206 | de = f2fs_find_entry(dir, &name, &page); | |
207 | @@ -142,23 +154,12 @@ retry: | |
208 | goto retry; | |
209 | } | |
210 | err = __f2fs_add_link(dir, &name, inode, inode->i_ino, inode->i_mode); | |
211 | - if (err) | |
212 | - goto out_err; | |
213 | - | |
214 | - if (is_inode_flag_set(F2FS_I(dir), FI_DELAY_IPUT)) { | |
215 | - iput(dir); | |
216 | - } else { | |
217 | - add_dirty_dir_inode(dir); | |
218 | - set_inode_flag(F2FS_I(dir), FI_DELAY_IPUT); | |
219 | - } | |
220 | ||
221 | goto out; | |
222 | ||
223 | out_unmap_put: | |
224 | f2fs_dentry_kunmap(dir, page); | |
225 | f2fs_put_page(page, 0); | |
226 | -out_err: | |
227 | - iput(dir); | |
228 | out: | |
229 | f2fs_msg(inode->i_sb, KERN_NOTICE, | |
230 | "%s: ino = %x, name = %s, dir = %lx, err = %d", | |
231 | @@ -479,7 +480,8 @@ out: | |
232 | return err; | |
233 | } | |
234 | ||
235 | -static int recover_data(struct f2fs_sb_info *sbi, struct list_head *head) | |
236 | +static int recover_data(struct f2fs_sb_info *sbi, struct list_head *inode_list, | |
237 | + struct list_head *dir_list) | |
238 | { | |
239 | unsigned long long cp_ver = cur_cp_version(F2FS_CKPT(sbi)); | |
240 | struct curseg_info *curseg; | |
241 | @@ -506,7 +508,7 @@ static int recover_data(struct f2fs_sb_i | |
242 | break; | |
243 | } | |
244 | ||
245 | - entry = get_fsync_inode(head, ino_of_node(page)); | |
246 | + entry = get_fsync_inode(inode_list, ino_of_node(page)); | |
247 | if (!entry) | |
248 | goto next; | |
249 | /* | |
250 | @@ -517,7 +519,7 @@ static int recover_data(struct f2fs_sb_i | |
251 | if (entry->last_inode == blkaddr) | |
252 | recover_inode(entry->inode, page); | |
253 | if (entry->last_dentry == blkaddr) { | |
254 | - err = recover_dentry(entry->inode, page); | |
255 | + err = recover_dentry(entry->inode, page, dir_list); | |
256 | if (err) { | |
257 | f2fs_put_page(page, 1); | |
258 | break; | |
259 | @@ -545,6 +547,7 @@ int recover_fsync_data(struct f2fs_sb_in | |
260 | { | |
261 | struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_WARM_NODE); | |
262 | struct list_head inode_list; | |
263 | + struct list_head dir_list; | |
264 | block_t blkaddr; | |
265 | int err; | |
266 | int ret = 0; | |
267 | @@ -556,6 +559,7 @@ int recover_fsync_data(struct f2fs_sb_in | |
268 | return -ENOMEM; | |
269 | ||
270 | INIT_LIST_HEAD(&inode_list); | |
271 | + INIT_LIST_HEAD(&dir_list); | |
272 | ||
273 | /* prevent checkpoint */ | |
274 | mutex_lock(&sbi->cp_mutex); | |
275 | @@ -575,12 +579,11 @@ int recover_fsync_data(struct f2fs_sb_in | |
276 | need_writecp = true; | |
277 | ||
278 | /* step #2: recover data */ | |
279 | - err = recover_data(sbi, &inode_list); | |
280 | + err = recover_data(sbi, &inode_list, &dir_list); | |
281 | if (!err) | |
282 | f2fs_bug_on(sbi, !list_empty(&inode_list)); | |
283 | out: | |
284 | destroy_fsync_dnodes(&inode_list); | |
285 | - kmem_cache_destroy(fsync_entry_slab); | |
286 | ||
287 | /* truncate meta pages to be used by the recovery */ | |
288 | truncate_inode_pages_range(META_MAPPING(sbi), | |
289 | @@ -618,5 +621,8 @@ out: | |
290 | } else { | |
291 | mutex_unlock(&sbi->cp_mutex); | |
292 | } | |
293 | + | |
294 | + destroy_fsync_dnodes(&dir_list); | |
295 | + kmem_cache_destroy(fsync_entry_slab); | |
296 | return ret ? ret: err; | |
297 | } |