// SPDX-License-Identifier: GPL-2.0

#include "ctree.h"
#include "delalloc-space.h"
#include "block-rsv.h"
#include "btrfs_inode.h"
#include "space-info.h"
#include "transaction.h"
#include "qgroup.h"
#include "block-group.h"

int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes)
{
	struct btrfs_root *root = inode->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_space_info *data_sinfo = fs_info->data_sinfo;
	u64 used;
	int ret = 0;
	int need_commit = 2;
	int have_pinned_space;

	/* Make sure bytes are sectorsize aligned */
	bytes = ALIGN(bytes, fs_info->sectorsize);

	if (btrfs_is_free_space_inode(inode)) {
		need_commit = 0;
		ASSERT(current->journal_info);
	}

again:
	/* Make sure we have enough space to handle the data first */
	spin_lock(&data_sinfo->lock);
	used = btrfs_space_info_used(data_sinfo, true);

	if (used + bytes > data_sinfo->total_bytes) {
		struct btrfs_trans_handle *trans;

		/*
		 * If we don't have enough free bytes in this space then we need
		 * to alloc a new chunk.
		 */
		if (!data_sinfo->full) {
			u64 alloc_target;

			data_sinfo->force_alloc = CHUNK_ALLOC_FORCE;
			spin_unlock(&data_sinfo->lock);

			alloc_target = btrfs_data_alloc_profile(fs_info);
			/*
			 * It is ugly that we don't call a nolock join
			 * transaction for the free space inode case here,
			 * but it is safe: we only do the data space
			 * reservation for the free space cache in a
			 * transaction context, and the common join merely
			 * increases the use count of the current transaction
			 * handle without trying to acquire the trans_lock of
			 * the fs.
			 */
			trans = btrfs_join_transaction(root);
			if (IS_ERR(trans))
				return PTR_ERR(trans);

			ret = btrfs_chunk_alloc(trans, alloc_target,
						CHUNK_ALLOC_NO_FORCE);
			btrfs_end_transaction(trans);
			if (ret < 0) {
				if (ret != -ENOSPC)
					return ret;
				else {
					have_pinned_space = 1;
					goto commit_trans;
				}
			}

			goto again;
		}

		/*
		 * If we don't have enough pinned space to satisfy this
		 * allocation, and no chunk was removed in the current
		 * transaction, don't bother committing the transaction.
		 */
		have_pinned_space = __percpu_counter_compare(
			&data_sinfo->total_bytes_pinned,
			used + bytes - data_sinfo->total_bytes,
			BTRFS_TOTAL_BYTES_PINNED_BATCH);
		spin_unlock(&data_sinfo->lock);

		/* Commit the current transaction and try again */
commit_trans:
		if (need_commit) {
			need_commit--;

			if (need_commit > 0) {
				btrfs_start_delalloc_roots(fs_info, -1);
				btrfs_wait_ordered_roots(fs_info, U64_MAX, 0,
							 (u64)-1);
			}

			trans = btrfs_join_transaction(root);
			if (IS_ERR(trans))
				return PTR_ERR(trans);
			if (have_pinned_space >= 0 ||
			    test_bit(BTRFS_TRANS_HAVE_FREE_BGS,
				     &trans->transaction->flags) ||
			    need_commit > 0) {
				ret = btrfs_commit_transaction(trans);
				if (ret)
					return ret;
				/*
				 * The cleaner kthread might still be doing iput
				 * operations. Wait for it to finish so that
				 * more space is released. We don't need to
				 * explicitly run the delayed iputs here because
				 * the commit_transaction would have woken up
				 * the cleaner.
				 */
				ret = btrfs_wait_on_delayed_iputs(fs_info);
				if (ret)
					return ret;
				goto again;
			} else {
				btrfs_end_transaction(trans);
			}
		}

		trace_btrfs_space_reservation(fs_info,
					      "space_info:enospc",
					      data_sinfo->flags, bytes, 1);
		return -ENOSPC;
	}
	btrfs_space_info_update_bytes_may_use(fs_info, data_sinfo, bytes);
	spin_unlock(&data_sinfo->lock);

	return 0;
}
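
/*
 * Illustrative sketch, not part of the original file: a hypothetical
 * caller that needs data space for an upcoming write would use
 * btrfs_alloc_data_chunk_ondemand() roughly as below. The byte count
 * does not need to be pre-aligned; the function rounds it up to the
 * sectorsize internally, then retries through chunk allocation and, if
 * needed, a transaction commit before giving up with -ENOSPC.
 */
#if 0
static int example_reserve_data(struct btrfs_inode *inode)
{
	/*
	 * Reserving a single byte still consumes a whole sector of data
	 * space, since the function rounds up with
	 * ALIGN(bytes, fs_info->sectorsize).
	 */
	return btrfs_alloc_data_chunk_ondemand(inode, 1);
}
#endif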

int btrfs_check_data_free_space(struct inode *inode,
			struct extent_changeset **reserved, u64 start, u64 len)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	int ret;

	/* Align the range to sectorsize */
	len = round_up(start + len, fs_info->sectorsize) -
	      round_down(start, fs_info->sectorsize);
	start = round_down(start, fs_info->sectorsize);

	ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode), len);
	if (ret < 0)
		return ret;

	/* Use btrfs_qgroup_reserve_data() to reserve the data space precisely. */
	ret = btrfs_qgroup_reserve_data(inode, reserved, start, len);
	if (ret < 0)
		btrfs_free_reserved_data_space_noquota(inode, start, len);
	else
		ret = 0;
	return ret;
}
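
/*
 * Worked example (illustrative, assuming a 4KiB sectorsize): for a write
 * at start=6000 with len=3000, the rounding above expands the range to
 * whole sectors:
 *
 *	len   = round_up(9000, 4096) - round_down(6000, 4096)
 *	      = 12288 - 4096 = 8192
 *	start = round_down(6000, 4096) = 4096
 *
 * i.e. the two sectors covering [4096, 12288) are reserved, even though
 * the write itself touches only part of each.
 */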

/*
 * Called if we need to clear a data reservation for this inode,
 * typically in an error case.
 *
 * This one will *NOT* use the accurate qgroup reserved space API, just
 * for the case where we can't sleep and are sure it won't affect qgroup
 * reserved space, like clear_bit_hook().
 */
void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
					    u64 len)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct btrfs_space_info *data_sinfo;

	/* Make sure the range is aligned to sectorsize */
	len = round_up(start + len, fs_info->sectorsize) -
	      round_down(start, fs_info->sectorsize);
	start = round_down(start, fs_info->sectorsize);

	data_sinfo = fs_info->data_sinfo;
	spin_lock(&data_sinfo->lock);
	btrfs_space_info_update_bytes_may_use(fs_info, data_sinfo, -len);
	spin_unlock(&data_sinfo->lock);
}

/*
 * Called if we need to clear a data reservation for this inode,
 * typically in an error case.
 *
 * This one will handle the per-inode data rsv map for the accurate
 * reserved space framework.
 */
void btrfs_free_reserved_data_space(struct inode *inode,
			struct extent_changeset *reserved, u64 start, u64 len)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;

	/* Make sure the range is aligned to sectorsize */
	len = round_up(start + len, root->fs_info->sectorsize) -
	      round_down(start, root->fs_info->sectorsize);
	start = round_down(start, root->fs_info->sectorsize);

	btrfs_free_reserved_data_space_noquota(inode, start, len);
	btrfs_qgroup_free_data(inode, reserved, start, len);
}
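
/*
 * Illustrative sketch, not part of the original file: the reserve/free
 * pair above brackets operations that may fail after data space has been
 * reserved. A hypothetical caller would look roughly like this;
 * do_write_step() is a made-up stand-in for whatever work consumes the
 * reservation.
 */
#if 0
static int example_data_reservation(struct inode *inode, u64 start, u64 len)
{
	struct extent_changeset *reserved = NULL;
	int ret;

	ret = btrfs_check_data_free_space(inode, &reserved, start, len);
	if (ret < 0)
		return ret;

	ret = do_write_step(inode, start, len);	/* hypothetical */
	if (ret < 0)
		btrfs_free_reserved_data_space(inode, reserved, start, len);

	extent_changeset_free(reserved);
	return ret;
}
#endif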

/**
 * btrfs_inode_rsv_release - release any excessive reservation.
 * @inode: the inode we need to release from.
 * @qgroup_free: free or convert qgroup meta.
 *   Unlike normal operation, qgroup meta reservation needs to know if we are
 *   freeing qgroup reservation or just converting it into per-trans. Normally
 *   @qgroup_free is true for error handling, and false for normal release.
 *
 * This is the same as btrfs_block_rsv_release, except that it handles the
 * tracepoint for the reservation.
 */
static void btrfs_inode_rsv_release(struct btrfs_inode *inode, bool qgroup_free)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
	u64 released = 0;
	u64 qgroup_to_release = 0;

	/*
	 * Since we statically set the block_rsv->size we just want to say we
	 * are releasing 0 bytes, and then we'll just get the reservation over
	 * the size freed.
	 */
	released = __btrfs_block_rsv_release(fs_info, block_rsv, 0,
					     &qgroup_to_release);
	if (released > 0)
		trace_btrfs_space_reservation(fs_info, "delalloc",
					      btrfs_ino(inode), released, 0);
	if (qgroup_free)
		btrfs_qgroup_free_meta_prealloc(inode->root, qgroup_to_release);
	else
		btrfs_qgroup_convert_reserved_meta(inode->root,
						   qgroup_to_release);
}

static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info,
						 struct btrfs_inode *inode)
{
	struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
	u64 reserve_size = 0;
	u64 qgroup_rsv_size = 0;
	u64 csum_leaves;
	unsigned outstanding_extents;

	lockdep_assert_held(&inode->lock);
	outstanding_extents = inode->outstanding_extents;

	/*
	 * Insert size for the number of outstanding extents, plus one normal
	 * size for updating the inode.
	 */
	if (outstanding_extents) {
		reserve_size = btrfs_calc_insert_metadata_size(fs_info,
						outstanding_extents);
		reserve_size += btrfs_calc_metadata_size(fs_info, 1);
	}
	csum_leaves = btrfs_csum_bytes_to_leaves(fs_info,
						 inode->csum_bytes);
	reserve_size += btrfs_calc_insert_metadata_size(fs_info,
							csum_leaves);
	/*
	 * For the qgroup rsv the calculation is very simple: account one
	 * nodesize for each outstanding extent. This overestimates in most
	 * cases.
	 */
	qgroup_rsv_size = (u64)outstanding_extents * fs_info->nodesize;

	spin_lock(&block_rsv->lock);
	block_rsv->size = reserve_size;
	block_rsv->qgroup_rsv_size = qgroup_rsv_size;
	spin_unlock(&block_rsv->lock);
}
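
/*
 * Worked example (illustrative; assumes btrfs_calc_insert_metadata_size()
 * charges 2 * BTRFS_MAX_LEVEL tree blocks per item and
 * btrfs_calc_metadata_size() charges BTRFS_MAX_LEVEL, as in this kernel's
 * space-info helpers, with BTRFS_MAX_LEVEL == 8): with a 16KiB nodesize,
 * one outstanding extent, and csum bytes amounting to one csum leaf, the
 * function above computes
 *
 *	reserve_size = 2 * 8 * 16KiB	(extent insert)
 *		     +     8 * 16KiB	(inode update)
 *		     + 2 * 8 * 16KiB	(csum leaf insert)
 *		     = 640KiB
 *
 * while qgroup_rsv_size is just 1 * 16KiB. The metadata reservation is
 * deliberately pessimistic; any unused excess is returned on release.
 */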

static void calc_inode_reservations(struct btrfs_fs_info *fs_info,
				    u64 num_bytes, u64 *meta_reserve,
				    u64 *qgroup_reserve)
{
	u64 nr_extents = count_max_extents(num_bytes);
	u64 csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, num_bytes);
	u64 inode_update = btrfs_calc_metadata_size(fs_info, 1);

	*meta_reserve = btrfs_calc_insert_metadata_size(fs_info,
						nr_extents + csum_leaves);

	/*
	 * finish_ordered_io has to update the inode, so add the space required
	 * for an inode update.
	 */
	*meta_reserve += inode_update;
	*qgroup_reserve = nr_extents * fs_info->nodesize;
}
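
/*
 * Worked example (illustrative; assumes count_max_extents() divides
 * num_bytes by BTRFS_MAX_EXTENT_SIZE, 128MiB, rounding up): a 1MiB
 * delalloc reservation yields nr_extents = 1, so with a 16KiB nodesize
 * the qgroup reserve is 16KiB, and the metadata reserve covers one
 * extent item insert, the csum leaves for 1MiB of data, and one inode
 * update. A 300MiB reservation would instead count
 * DIV_ROUND_UP(300MiB, 128MiB) = 3 extents, since a single delalloc
 * range may later be split into that many extents.
 */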

int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
{
	struct btrfs_root *root = inode->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
	u64 meta_reserve, qgroup_reserve;
	unsigned nr_extents;
	enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL;
	int ret = 0;
	bool delalloc_lock = true;

	/*
	 * If we are a free space inode we need to not flush since we will be in
	 * the middle of a transaction commit. We also don't need the delalloc
	 * mutex since we won't race with anybody. We need this mostly to make
	 * lockdep shut its filthy mouth.
	 *
	 * If we have a transaction open (can happen if we call truncate_block
	 * from truncate), then we need FLUSH_LIMIT so we don't deadlock.
	 */
	if (btrfs_is_free_space_inode(inode)) {
		flush = BTRFS_RESERVE_NO_FLUSH;
		delalloc_lock = false;
	} else {
		if (current->journal_info)
			flush = BTRFS_RESERVE_FLUSH_LIMIT;

		if (btrfs_transaction_in_commit(fs_info))
			schedule_timeout(1);
	}

	if (delalloc_lock)
		mutex_lock(&inode->delalloc_mutex);

	num_bytes = ALIGN(num_bytes, fs_info->sectorsize);

	/*
	 * We always want to do it this way, every other way is wrong and ends
	 * in tears. Pre-reserving the amount we are going to add will always
	 * be the right way, because otherwise if we have enough parallelism we
	 * could end up with thousands of inodes all holding little bits of
	 * reservations they were able to make previously and the only way to
	 * reclaim that space is to ENOSPC out the operations and clear
	 * everything out and try again, which is bad. This way we just
	 * over-reserve slightly, and clean up the mess when we are done.
	 */
	calc_inode_reservations(fs_info, num_bytes, &meta_reserve,
				&qgroup_reserve);
	ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_reserve, true);
	if (ret)
		goto out_fail;
	ret = btrfs_reserve_metadata_bytes(root, block_rsv, meta_reserve, flush);
	if (ret)
		goto out_qgroup;

	/*
	 * Now we need to update our outstanding extents and csum bytes _first_
	 * and then add the reservation to the block_rsv. This keeps us from
	 * racing with an ordered completion or some such that would think it
	 * needs to free the reservation we just made.
	 */
	spin_lock(&inode->lock);
	nr_extents = count_max_extents(num_bytes);
	btrfs_mod_outstanding_extents(inode, nr_extents);
	inode->csum_bytes += num_bytes;
	btrfs_calculate_inode_block_rsv_size(fs_info, inode);
	spin_unlock(&inode->lock);

	/* Now we can safely add our space to our block rsv */
	btrfs_block_rsv_add_bytes(block_rsv, meta_reserve, false);
	trace_btrfs_space_reservation(root->fs_info, "delalloc",
				      btrfs_ino(inode), meta_reserve, 1);

	spin_lock(&block_rsv->lock);
	block_rsv->qgroup_rsv_reserved += qgroup_reserve;
	spin_unlock(&block_rsv->lock);

	if (delalloc_lock)
		mutex_unlock(&inode->delalloc_mutex);
	return 0;
out_qgroup:
	btrfs_qgroup_free_meta_prealloc(root, qgroup_reserve);
out_fail:
	btrfs_inode_rsv_release(inode, true);
	if (delalloc_lock)
		mutex_unlock(&inode->delalloc_mutex);
	return ret;
}

/**
 * btrfs_delalloc_release_metadata - release a metadata reservation for an inode
 * @inode: the inode to release the reservation for.
 * @num_bytes: the number of bytes we are releasing.
 * @qgroup_free: free qgroup reservation or convert it to per-trans reservation
 *
 * This will release the metadata reservation for an inode. This can be called
 * once we complete IO for a given set of bytes to release their metadata
 * reservations, or on error for the same reason.
 */
void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes,
				     bool qgroup_free)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;

	num_bytes = ALIGN(num_bytes, fs_info->sectorsize);
	spin_lock(&inode->lock);
	inode->csum_bytes -= num_bytes;
	btrfs_calculate_inode_block_rsv_size(fs_info, inode);
	spin_unlock(&inode->lock);

	if (btrfs_is_testing(fs_info))
		return;

	btrfs_inode_rsv_release(inode, qgroup_free);
}

/**
 * btrfs_delalloc_release_extents - release our outstanding_extents
 * @inode: the inode to balance the reservation for.
 * @num_bytes: the number of bytes we originally reserved
 * @qgroup_free: do we need to free the qgroup meta reservation or convert it.
 *
 * When we reserve space we increase outstanding_extents for the extents we may
 * add. Once we've set the range as delalloc or created our ordered extents we
 * have outstanding_extents to track the real usage, so we use this to free our
 * temporarily tracked outstanding_extents. This _must_ be used in conjunction
 * with btrfs_delalloc_reserve_metadata.
 */
void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes,
				    bool qgroup_free)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	unsigned num_extents;

	spin_lock(&inode->lock);
	num_extents = count_max_extents(num_bytes);
	btrfs_mod_outstanding_extents(inode, -num_extents);
	btrfs_calculate_inode_block_rsv_size(fs_info, inode);
	spin_unlock(&inode->lock);

	if (btrfs_is_testing(fs_info))
		return;

	btrfs_inode_rsv_release(inode, qgroup_free);
}

/**
 * btrfs_delalloc_reserve_space - reserve data and metadata space for delalloc
 * @inode: inode we're writing to
 * @start: start range we are writing to
 * @len: the length of the range we are writing
 * @reserved: mandatory parameter, record actually reserved qgroup ranges of
 *	      current reservation.
 *
 * This will do the following things
 *
 * - reserve space in the data space info for num bytes
 *   and reserve precisely the corresponding qgroup space
 *   (Done in check_data_free_space)
 *
 * - reserve space for metadata space, based on the number of outstanding
 *   extents and how many csums will be needed;
 *   also reserve metadata space in a per root over-reserve method.
 * - add to the inode's ->delalloc_bytes
 * - add it to the fs_info's delalloc inodes list.
 *   (Above 3 all done in delalloc_reserve_metadata)
 *
 * Return 0 for success
 * Return <0 for error (-ENOSPC or -EDQUOT)
 */
int btrfs_delalloc_reserve_space(struct inode *inode,
			struct extent_changeset **reserved, u64 start, u64 len)
{
	int ret;

	ret = btrfs_check_data_free_space(inode, reserved, start, len);
	if (ret < 0)
		return ret;
	ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len);
	if (ret < 0)
		btrfs_free_reserved_data_space(inode, *reserved, start, len);
	return ret;
}
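
/*
 * Illustrative sketch, not part of the original file: a typical
 * buffered-write lifecycle pairs btrfs_delalloc_reserve_space() with
 * btrfs_delalloc_release_extents() once the range has been marked
 * delalloc, and additionally with btrfs_delalloc_release_space() if the
 * write fails outright. mark_range_delalloc() is a hypothetical stand-in
 * for the real dirtying step.
 */
#if 0
static int example_buffered_write(struct inode *inode, u64 start, u64 len)
{
	struct extent_changeset *reserved = NULL;
	int ret;

	ret = btrfs_delalloc_reserve_space(inode, &reserved, start, len);
	if (ret < 0)
		return ret;

	ret = mark_range_delalloc(inode, start, len);	/* hypothetical */
	if (ret < 0)
		/* Undo both the data and metadata reservations. */
		btrfs_delalloc_release_space(inode, reserved, start, len, true);

	/* Drop the temporary outstanding_extents bump in either case. */
	btrfs_delalloc_release_extents(BTRFS_I(inode), len, ret < 0);

	extent_changeset_free(reserved);
	return ret;
}
#endif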

/**
 * btrfs_delalloc_release_space - release data and metadata space for delalloc
 * @inode: inode we're releasing space for
 * @reserved: the qgroup ranges recorded when the space was reserved
 * @start: start position of the space already reserved
 * @len: length of the space already reserved
 * @qgroup_free: free the qgroup reservation or convert it to per-trans
 *
 * This function will release the metadata space that was not used and will
 * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes
 * list if there are no delalloc bytes left.
 * Also it will handle the qgroup reserved space.
 */
void btrfs_delalloc_release_space(struct inode *inode,
				  struct extent_changeset *reserved,
				  u64 start, u64 len, bool qgroup_free)
{
	btrfs_delalloc_release_metadata(BTRFS_I(inode), len, qgroup_free);
	btrfs_free_reserved_data_space(inode, reserved, start, len);
}