]>
Commit | Line | Data |
---|---|---|
7d777456 GKH |
1 | From 74ded2cc0427839ccdda41f2738130f0eea77fde Mon Sep 17 00:00:00 2001 |
2 | From: Theodore Ts'o <tytso@mit.edu> | |
3 | Date: Sun, 30 May 2010 22:49:21 -0400 | |
4 | Subject: ext4: Patch up how we claim metadata blocks for quota purposes | |
5 | ||
6 | commit 0637c6f4135f592f094207c7c21e7c0fc5557834 upstream (as of v2.6.33-rc3) | |
7 | ||
8 | As reported in Kernel Bugzilla #14936, commit d21cd8f triggered a BUG | |
9 | in the function ext4_da_update_reserve_space() found in | |
10 | fs/ext4/inode.c. The root cause of this BUG() was caused by the fact | |
11 | that ext4_calc_metadata_amount() can severely over-estimate how many | |
12 | metadata blocks will be needed, especially when using direct | |
13 | block-mapped files. | |
14 | ||
15 | In addition, it can also badly *under* estimate how much space is | |
16 | needed, since ext4_calc_metadata_amount() assumes that the blocks are | |
17 | contiguous, and this is not always true. If the application is | |
18 | writing blocks to a sparse file, the number of metadata blocks | |
19 | necessary can be severely underestimated by the functions | |
20 | ext4_da_reserve_space(), ext4_da_update_reserve_space() and | |
21 | ext4_da_release_space(). This was the cause of the dq_claim_space | |
22 | reports found on kerneloops.org. | |
23 | ||
24 | Unfortunately, doing this right means that we need to massively | |
25 | over-estimate the amount of free space needed. So in some cases we | |
26 | may need to force the inode to be written to disk synchronously | |
27 | to avoid spurious quota failures. | |
28 | ||
29 | http://bugzilla.kernel.org/show_bug.cgi?id=14936 | |
30 | ||
31 | Signed-off-by: "Theodore Ts'o" <tytso@mit.edu> | |
32 | Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de> | |
33 | --- | |
34 | fs/ext4/inode.c | 153 ++++++++++++++++++++++++++++++-------------------------- | |
35 | 1 file changed, 82 insertions(+), 71 deletions(-) | |
36 | ||
37 | --- a/fs/ext4/inode.c | |
38 | +++ b/fs/ext4/inode.c | |
39 | @@ -1085,43 +1085,47 @@ static int ext4_calc_metadata_amount(str | |
40 | return ext4_indirect_calc_metadata_amount(inode, blocks); | |
41 | } | |
42 | ||
43 | +/* | |
44 | + * Called with i_data_sem down, which is important since we can call | |
45 | + * ext4_discard_preallocations() from here. | |
46 | + */ | |
47 | static void ext4_da_update_reserve_space(struct inode *inode, int used) | |
48 | { | |
49 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); | |
50 | - int total, mdb, mdb_free, mdb_claim = 0; | |
51 | + struct ext4_inode_info *ei = EXT4_I(inode); | |
52 | + int mdb_free = 0; | |
53 | ||
54 | - spin_lock(&EXT4_I(inode)->i_block_reservation_lock); | |
55 | - /* recalculate the number of metablocks still need to be reserved */ | |
56 | - total = EXT4_I(inode)->i_reserved_data_blocks - used; | |
57 | - mdb = ext4_calc_metadata_amount(inode, total); | |
58 | - | |
59 | - /* figure out how many metablocks to release */ | |
60 | - BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks); | |
61 | - mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb; | |
62 | - | |
63 | - if (mdb_free) { | |
64 | - /* Account for allocated meta_blocks */ | |
65 | - mdb_claim = EXT4_I(inode)->i_allocated_meta_blocks; | |
66 | - BUG_ON(mdb_free < mdb_claim); | |
67 | - mdb_free -= mdb_claim; | |
68 | + spin_lock(&ei->i_block_reservation_lock); | |
69 | + if (unlikely(used > ei->i_reserved_data_blocks)) { | |
70 | + ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, used %d " | |
71 | + "with only %d reserved data blocks\n", | |
72 | + __func__, inode->i_ino, used, | |
73 | + ei->i_reserved_data_blocks); | |
74 | + WARN_ON(1); | |
75 | + used = ei->i_reserved_data_blocks; | |
76 | + } | |
77 | + | |
78 | + /* Update per-inode reservations */ | |
79 | + ei->i_reserved_data_blocks -= used; | |
80 | + used += ei->i_allocated_meta_blocks; | |
81 | + ei->i_reserved_meta_blocks -= ei->i_allocated_meta_blocks; | |
82 | + ei->i_allocated_meta_blocks = 0; | |
83 | + percpu_counter_sub(&sbi->s_dirtyblocks_counter, used); | |
84 | ||
85 | - /* update fs dirty blocks counter */ | |
86 | + if (ei->i_reserved_data_blocks == 0) { | |
87 | + /* | |
88 | + * We can release all of the reserved metadata blocks | |
89 | + * only when we have written all of the delayed | |
90 | + * allocation blocks. | |
91 | + */ | |
92 | + mdb_free = ei->i_allocated_meta_blocks; | |
93 | percpu_counter_sub(&sbi->s_dirtyblocks_counter, mdb_free); | |
94 | - EXT4_I(inode)->i_allocated_meta_blocks = 0; | |
95 | - EXT4_I(inode)->i_reserved_meta_blocks = mdb; | |
96 | + ei->i_allocated_meta_blocks = 0; | |
97 | } | |
98 | - | |
99 | - /* update per-inode reservations */ | |
100 | - BUG_ON(used > EXT4_I(inode)->i_reserved_data_blocks); | |
101 | - EXT4_I(inode)->i_reserved_data_blocks -= used; | |
102 | - percpu_counter_sub(&sbi->s_dirtyblocks_counter, used + mdb_claim); | |
103 | spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); | |
104 | ||
105 | - vfs_dq_claim_block(inode, used + mdb_claim); | |
106 | - | |
107 | - /* | |
108 | - * free those over-booking quota for metadata blocks | |
109 | - */ | |
110 | + /* Update quota subsystem */ | |
111 | + vfs_dq_claim_block(inode, used); | |
112 | if (mdb_free) | |
113 | vfs_dq_release_reservation_block(inode, mdb_free); | |
114 | ||
115 | @@ -1130,7 +1134,8 @@ static void ext4_da_update_reserve_space | |
116 | * there aren't any writers on the inode, we can discard the | |
117 | * inode's preallocations. | |
118 | */ | |
119 | - if (!total && (atomic_read(&inode->i_writecount) == 0)) | |
120 | + if ((ei->i_reserved_data_blocks == 0) && | |
121 | + (atomic_read(&inode->i_writecount) == 0)) | |
122 | ext4_discard_preallocations(inode); | |
123 | } | |
124 | ||
125 | @@ -1843,7 +1848,8 @@ static int ext4_da_reserve_space(struct | |
126 | { | |
127 | int retries = 0; | |
128 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); | |
129 | - unsigned long md_needed, mdblocks, total = 0; | |
130 | + struct ext4_inode_info *ei = EXT4_I(inode); | |
131 | + unsigned long md_needed, md_reserved, total = 0; | |
132 | ||
133 | /* | |
134 | * recalculate the amount of metadata blocks to reserve | |
135 | @@ -1851,35 +1857,44 @@ static int ext4_da_reserve_space(struct | |
136 | * worse case is one extent per block | |
137 | */ | |
138 | repeat: | |
139 | - spin_lock(&EXT4_I(inode)->i_block_reservation_lock); | |
140 | - total = EXT4_I(inode)->i_reserved_data_blocks + nrblocks; | |
141 | - mdblocks = ext4_calc_metadata_amount(inode, total); | |
142 | - BUG_ON(mdblocks < EXT4_I(inode)->i_reserved_meta_blocks); | |
143 | - | |
144 | - md_needed = mdblocks - EXT4_I(inode)->i_reserved_meta_blocks; | |
145 | + spin_lock(&ei->i_block_reservation_lock); | |
146 | + md_reserved = ei->i_reserved_meta_blocks; | |
147 | + md_needed = ext4_calc_metadata_amount(inode, nrblocks); | |
148 | total = md_needed + nrblocks; | |
149 | - spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); | |
150 | + spin_unlock(&ei->i_block_reservation_lock); | |
151 | ||
152 | /* | |
153 | * Make quota reservation here to prevent quota overflow | |
154 | * later. Real quota accounting is done at pages writeout | |
155 | * time. | |
156 | */ | |
157 | - if (vfs_dq_reserve_block(inode, total)) | |
158 | + if (vfs_dq_reserve_block(inode, total)) { | |
159 | + /* | |
160 | + * We tend to badly over-estimate the amount of | |
161 | + * metadata blocks which are needed, so if we have | |
162 | + * reserved any metadata blocks, try to force out the | |
163 | + * inode and see if we have any better luck. | |
164 | + */ | |
165 | + if (md_reserved && retries++ <= 3) | |
166 | + goto retry; | |
167 | return -EDQUOT; | |
168 | + } | |
169 | ||
170 | if (ext4_claim_free_blocks(sbi, total)) { | |
171 | vfs_dq_release_reservation_block(inode, total); | |
172 | if (ext4_should_retry_alloc(inode->i_sb, &retries)) { | |
173 | + retry: | |
174 | + if (md_reserved) | |
175 | + write_inode_now(inode, (retries == 3)); | |
176 | yield(); | |
177 | goto repeat; | |
178 | } | |
179 | return -ENOSPC; | |
180 | } | |
181 | - spin_lock(&EXT4_I(inode)->i_block_reservation_lock); | |
182 | - EXT4_I(inode)->i_reserved_data_blocks += nrblocks; | |
183 | - EXT4_I(inode)->i_reserved_meta_blocks += md_needed; | |
184 | - spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); | |
185 | + spin_lock(&ei->i_block_reservation_lock); | |
186 | + ei->i_reserved_data_blocks += nrblocks; | |
187 | + ei->i_reserved_meta_blocks += md_needed; | |
188 | + spin_unlock(&ei->i_block_reservation_lock); | |
189 | ||
190 | return 0; /* success */ | |
191 | } | |
192 | @@ -1887,49 +1902,45 @@ repeat: | |
193 | static void ext4_da_release_space(struct inode *inode, int to_free) | |
194 | { | |
195 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); | |
196 | - int total, mdb, mdb_free, release; | |
197 | + struct ext4_inode_info *ei = EXT4_I(inode); | |
198 | ||
199 | if (!to_free) | |
200 | return; /* Nothing to release, exit */ | |
201 | ||
202 | spin_lock(&EXT4_I(inode)->i_block_reservation_lock); | |
203 | ||
204 | - if (!EXT4_I(inode)->i_reserved_data_blocks) { | |
205 | + if (unlikely(to_free > ei->i_reserved_data_blocks)) { | |
206 | /* | |
207 | - * if there is no reserved blocks, but we try to free some | |
208 | - * then the counter is messed up somewhere. | |
209 | - * but since this function is called from invalidate | |
210 | - * page, it's harmless to return without any action | |
211 | + * if there aren't enough reserved blocks, then the | |
212 | + * counter is messed up somewhere. Since this | |
213 | + * function is called from invalidate page, it's | |
214 | + * harmless to return without any action. | |
215 | */ | |
216 | - printk(KERN_INFO "ext4 delalloc try to release %d reserved " | |
217 | - "blocks for inode %lu, but there is no reserved " | |
218 | - "data blocks\n", to_free, inode->i_ino); | |
219 | - spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); | |
220 | - return; | |
221 | + ext4_msg(inode->i_sb, KERN_NOTICE, "ext4_da_release_space: " | |
222 | + "ino %lu, to_free %d with only %d reserved " | |
223 | + "data blocks\n", inode->i_ino, to_free, | |
224 | + ei->i_reserved_data_blocks); | |
225 | + WARN_ON(1); | |
226 | + to_free = ei->i_reserved_data_blocks; | |
227 | } | |
228 | + ei->i_reserved_data_blocks -= to_free; | |
229 | ||
230 | - /* recalculate the number of metablocks still need to be reserved */ | |
231 | - total = EXT4_I(inode)->i_reserved_data_blocks - to_free; | |
232 | - mdb = ext4_calc_metadata_amount(inode, total); | |
233 | - | |
234 | - /* figure out how many metablocks to release */ | |
235 | - BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks); | |
236 | - mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb; | |
237 | - | |
238 | - release = to_free + mdb_free; | |
239 | - | |
240 | - /* update fs dirty blocks counter for truncate case */ | |
241 | - percpu_counter_sub(&sbi->s_dirtyblocks_counter, release); | |
242 | + if (ei->i_reserved_data_blocks == 0) { | |
243 | + /* | |
244 | + * We can release all of the reserved metadata blocks | |
245 | + * only when we have written all of the delayed | |
246 | + * allocation blocks. | |
247 | + */ | |
248 | + to_free += ei->i_allocated_meta_blocks; | |
249 | + ei->i_allocated_meta_blocks = 0; | |
250 | + } | |
251 | ||
252 | - /* update per-inode reservations */ | |
253 | - BUG_ON(to_free > EXT4_I(inode)->i_reserved_data_blocks); | |
254 | - EXT4_I(inode)->i_reserved_data_blocks -= to_free; | |
255 | + /* update fs dirty blocks counter */ | |
256 | + percpu_counter_sub(&sbi->s_dirtyblocks_counter, to_free); | |
257 | ||
258 | - BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks); | |
259 | - EXT4_I(inode)->i_reserved_meta_blocks = mdb; | |
260 | spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); | |
261 | ||
262 | - vfs_dq_release_reservation_block(inode, release); | |
263 | + vfs_dq_release_reservation_block(inode, to_free); | |
264 | } | |
265 | ||
266 | static void ext4_da_page_release_reservation(struct page *page, |