// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2008 Red Hat. All rights reserved.
 */

#include <linux/pagemap.h>
#include <linux/sched.h>
#include <linux/sched/signal.h>
#include <linux/slab.h>
#include <linux/math64.h>
#include <linux/ratelimit.h>
#include <linux/error-injection.h>
#include <linux/sched/mm.h>
#include "misc.h"
#include "ctree.h"
#include "free-space-cache.h"
#include "transaction.h"
#include "disk-io.h"
#include "extent_io.h"
#include "volumes.h"
#include "space-info.h"
#include "delalloc-space.h"
#include "block-group.h"
#include "discard.h"
#include "subpage.h"
#include "inode-item.h"

#define BITS_PER_BITMAP		(PAGE_SIZE * 8UL)
#define MAX_CACHE_BYTES_PER_GIG	SZ_64K
#define FORCE_EXTENT_THRESHOLD	SZ_1M

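/*
 * A range that has been removed from the free space rbtree while it is being
 * trimmed.  It is kept on ctl->trimming_ranges so that cache writeout still
 * records the range as free space.
 */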
struct btrfs_trim_range {
	u64 start;
	u64 bytes;
	struct list_head list;
};

static int link_free_space(struct btrfs_free_space_ctl *ctl,
			   struct btrfs_free_space *info);
static void unlink_free_space(struct btrfs_free_space_ctl *ctl,
			      struct btrfs_free_space *info, bool update_stat);
static int search_bitmap(struct btrfs_free_space_ctl *ctl,
			 struct btrfs_free_space *bitmap_info, u64 *offset,
			 u64 *bytes, bool for_alloc);
static void free_bitmap(struct btrfs_free_space_ctl *ctl,
			struct btrfs_free_space *bitmap_info);
static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
			      struct btrfs_free_space *info, u64 offset,
			      u64 bytes, bool update_stats);

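/*
 * Look up the free space header item keyed by
 * (BTRFS_FREE_SPACE_OBJECTID, 0, offset) in @root and iget the cache inode
 * that it points at.
 */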
static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
					       struct btrfs_path *path,
					       u64 offset)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_key key;
	struct btrfs_key location;
	struct btrfs_disk_key disk_key;
	struct btrfs_free_space_header *header;
	struct extent_buffer *leaf;
	struct inode *inode = NULL;
	unsigned nofs_flag;
	int ret;

	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
	key.offset = offset;
	key.type = 0;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		return ERR_PTR(ret);
	if (ret > 0) {
		btrfs_release_path(path);
		return ERR_PTR(-ENOENT);
	}

	leaf = path->nodes[0];
	header = btrfs_item_ptr(leaf, path->slots[0],
				struct btrfs_free_space_header);
	btrfs_free_space_key(leaf, header, &disk_key);
	btrfs_disk_key_to_cpu(&location, &disk_key);
	btrfs_release_path(path);

	/*
	 * We are often under a trans handle at this point, so we need to make
	 * sure NOFS is set to keep us from deadlocking.
	 */
	nofs_flag = memalloc_nofs_save();
	inode = btrfs_iget_path(fs_info->sb, location.objectid, root, path);
	btrfs_release_path(path);
	memalloc_nofs_restore(nofs_flag);
	if (IS_ERR(inode))
		return inode;

	mapping_set_gfp_mask(inode->i_mapping,
			mapping_gfp_constraint(inode->i_mapping,
			~(__GFP_FS | __GFP_HIGHMEM)));

	return inode;
}

struct inode *lookup_free_space_inode(struct btrfs_block_group *block_group,
		struct btrfs_path *path)
{
	struct btrfs_fs_info *fs_info = block_group->fs_info;
	struct inode *inode = NULL;
	u32 flags = BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW;

	spin_lock(&block_group->lock);
	if (block_group->inode)
		inode = igrab(block_group->inode);
	spin_unlock(&block_group->lock);
	if (inode)
		return inode;

	inode = __lookup_free_space_inode(fs_info->tree_root, path,
					  block_group->start);
	if (IS_ERR(inode))
		return inode;

	spin_lock(&block_group->lock);
	if (!((BTRFS_I(inode)->flags & flags) == flags)) {
		btrfs_info(fs_info, "Old style space inode found, converting.");
		BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM |
			BTRFS_INODE_NODATACOW;
		block_group->disk_cache_state = BTRFS_DC_CLEAR;
	}

	if (!block_group->iref) {
		block_group->inode = igrab(inode);
		block_group->iref = 1;
	}
	spin_unlock(&block_group->lock);

	return inode;
}

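/*
 * Create the inode item and the free space header item that back a new free
 * space cache file for the block group starting at @offset.
 */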
static int __create_free_space_inode(struct btrfs_root *root,
				     struct btrfs_trans_handle *trans,
				     struct btrfs_path *path,
				     u64 ino, u64 offset)
{
	struct btrfs_key key;
	struct btrfs_disk_key disk_key;
	struct btrfs_free_space_header *header;
	struct btrfs_inode_item *inode_item;
	struct extent_buffer *leaf;
	/* We inline CRCs for the free disk space cache */
	const u64 flags = BTRFS_INODE_NOCOMPRESS | BTRFS_INODE_PREALLOC |
			  BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW;
	int ret;

	ret = btrfs_insert_empty_inode(trans, root, path, ino);
	if (ret)
		return ret;

	leaf = path->nodes[0];
	inode_item = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_inode_item);
	btrfs_item_key(leaf, &disk_key, path->slots[0]);
	memzero_extent_buffer(leaf, (unsigned long)inode_item,
			      sizeof(*inode_item));
	btrfs_set_inode_generation(leaf, inode_item, trans->transid);
	btrfs_set_inode_size(leaf, inode_item, 0);
	btrfs_set_inode_nbytes(leaf, inode_item, 0);
	btrfs_set_inode_uid(leaf, inode_item, 0);
	btrfs_set_inode_gid(leaf, inode_item, 0);
	btrfs_set_inode_mode(leaf, inode_item, S_IFREG | 0600);
	btrfs_set_inode_flags(leaf, inode_item, flags);
	btrfs_set_inode_nlink(leaf, inode_item, 1);
	btrfs_set_inode_transid(leaf, inode_item, trans->transid);
	btrfs_set_inode_block_group(leaf, inode_item, offset);
	btrfs_mark_buffer_dirty(leaf);
	btrfs_release_path(path);

	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
	key.offset = offset;
	key.type = 0;
	ret = btrfs_insert_empty_item(trans, root, path, &key,
				      sizeof(struct btrfs_free_space_header));
	if (ret < 0) {
		btrfs_release_path(path);
		return ret;
	}

	leaf = path->nodes[0];
	header = btrfs_item_ptr(leaf, path->slots[0],
				struct btrfs_free_space_header);
	memzero_extent_buffer(leaf, (unsigned long)header, sizeof(*header));
	btrfs_set_free_space_key(leaf, header, &disk_key);
	btrfs_mark_buffer_dirty(leaf);
	btrfs_release_path(path);

	return 0;
}

int create_free_space_inode(struct btrfs_trans_handle *trans,
			    struct btrfs_block_group *block_group,
			    struct btrfs_path *path)
{
	int ret;
	u64 ino;

	ret = btrfs_get_free_objectid(trans->fs_info->tree_root, &ino);
	if (ret < 0)
		return ret;

	return __create_free_space_inode(trans->fs_info->tree_root, trans, path,
					 ino, block_group->start);
}

/*
 * inode is an optional sink: if it is NULL, btrfs_remove_free_space_inode
 * handles lookup, otherwise it takes ownership and iputs the inode.
 * Don't reuse an inode pointer after passing it into this function.
 */
int btrfs_remove_free_space_inode(struct btrfs_trans_handle *trans,
				  struct inode *inode,
				  struct btrfs_block_group *block_group)
{
	struct btrfs_path *path;
	struct btrfs_key key;
	int ret = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	if (!inode)
		inode = lookup_free_space_inode(block_group, path);
	if (IS_ERR(inode)) {
		if (PTR_ERR(inode) != -ENOENT)
			ret = PTR_ERR(inode);
		goto out;
	}
	ret = btrfs_orphan_add(trans, BTRFS_I(inode));
	if (ret) {
		btrfs_add_delayed_iput(inode);
		goto out;
	}
	clear_nlink(inode);
	/* One for the block group's ref */
	spin_lock(&block_group->lock);
	if (block_group->iref) {
		block_group->iref = 0;
		block_group->inode = NULL;
		spin_unlock(&block_group->lock);
		iput(inode);
	} else {
		spin_unlock(&block_group->lock);
	}
	/* One for the lookup ref */
	btrfs_add_delayed_iput(inode);

	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
	key.type = 0;
	key.offset = block_group->start;
	ret = btrfs_search_slot(trans, trans->fs_info->tree_root, &key, path,
				-1, 1);
	if (ret) {
		if (ret > 0)
			ret = 0;
		goto out;
	}
	ret = btrfs_del_item(trans, trans->fs_info->tree_root, path);
out:
	btrfs_free_path(path);
	return ret;
}

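/*
 * Check that @rsv has enough space reserved to truncate the free space cache
 * inode: one metadata insertion worth of slack plus one inode update.
 */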
int btrfs_check_trunc_cache_free_space(struct btrfs_fs_info *fs_info,
				       struct btrfs_block_rsv *rsv)
{
	u64 needed_bytes;
	int ret;

	/* 1 for slack space, 1 for updating the inode */
	needed_bytes = btrfs_calc_insert_metadata_size(fs_info, 1) +
		       btrfs_calc_metadata_size(fs_info, 1);

	spin_lock(&rsv->lock);
	if (rsv->reserved < needed_bytes)
		ret = -ENOSPC;
	else
		ret = 0;
	spin_unlock(&rsv->lock);
	return ret;
}

int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans,
				    struct btrfs_block_group *block_group,
				    struct inode *vfs_inode)
{
	struct btrfs_truncate_control control = {
		.inode = BTRFS_I(vfs_inode),
		.new_size = 0,
		.ino = btrfs_ino(BTRFS_I(vfs_inode)),
		.min_type = BTRFS_EXTENT_DATA_KEY,
		.clear_extent_range = true,
	};
	struct btrfs_inode *inode = BTRFS_I(vfs_inode);
	struct btrfs_root *root = inode->root;
	struct extent_state *cached_state = NULL;
	int ret = 0;
	bool locked = false;

	if (block_group) {
		struct btrfs_path *path = btrfs_alloc_path();

		if (!path) {
			ret = -ENOMEM;
			goto fail;
		}
		locked = true;
		mutex_lock(&trans->transaction->cache_write_mutex);
		if (!list_empty(&block_group->io_list)) {
			list_del_init(&block_group->io_list);

			btrfs_wait_cache_io(trans, block_group, path);
			btrfs_put_block_group(block_group);
		}

		/*
		 * Now that we've truncated the cache away, it's no longer
		 * setup or written.
		 */
		spin_lock(&block_group->lock);
		block_group->disk_cache_state = BTRFS_DC_CLEAR;
		spin_unlock(&block_group->lock);
		btrfs_free_path(path);
	}

	btrfs_i_size_write(inode, 0);
	truncate_pagecache(vfs_inode, 0);

	lock_extent_bits(&inode->io_tree, 0, (u64)-1, &cached_state);
	btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);

	/*
	 * We skip the throttling logic for free space cache inodes, so we don't
	 * need to check for -EAGAIN.
	 */
	ret = btrfs_truncate_inode_items(trans, root, &control);

	inode_sub_bytes(&inode->vfs_inode, control.sub_bytes);
	btrfs_inode_safe_disk_i_size_write(inode, control.last_size);

	unlock_extent_cached(&inode->io_tree, 0, (u64)-1, &cached_state);
	if (ret)
		goto fail;

	ret = btrfs_update_inode(trans, root, inode);

fail:
	if (locked)
		mutex_unlock(&trans->transaction->cache_write_mutex);
	if (ret)
		btrfs_abort_transaction(trans, ret);

	return ret;
}

static void readahead_cache(struct inode *inode)
{
	struct file_ra_state ra;
	unsigned long last_index;

	file_ra_state_init(&ra, inode->i_mapping);
	last_index = (i_size_read(inode) - 1) >> PAGE_SHIFT;

	page_cache_sync_readahead(inode->i_mapping, &ra, NULL, 0, last_index);
}

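/*
 * struct btrfs_io_ctl carries the state needed to read or write the cache
 * file: the array of page cache pages and a cursor (io_ctl->cur) into the
 * currently mapped page.  The helpers below manage that state.
 */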
static int io_ctl_init(struct btrfs_io_ctl *io_ctl, struct inode *inode,
		       int write)
{
	int num_pages;

	num_pages = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);

	/* Make sure we can fit our crcs and generation into the first page */
	if (write && (num_pages * sizeof(u32) + sizeof(u64)) > PAGE_SIZE)
		return -ENOSPC;

	memset(io_ctl, 0, sizeof(struct btrfs_io_ctl));

	io_ctl->pages = kcalloc(num_pages, sizeof(struct page *), GFP_NOFS);
	if (!io_ctl->pages)
		return -ENOMEM;

	io_ctl->num_pages = num_pages;
	io_ctl->fs_info = btrfs_sb(inode->i_sb);
	io_ctl->inode = inode;

	return 0;
}
ALLOW_ERROR_INJECTION(io_ctl_init, ERRNO);

static void io_ctl_free(struct btrfs_io_ctl *io_ctl)
{
	kfree(io_ctl->pages);
	io_ctl->pages = NULL;
}

static void io_ctl_unmap_page(struct btrfs_io_ctl *io_ctl)
{
	if (io_ctl->cur) {
		io_ctl->cur = NULL;
		io_ctl->orig = NULL;
	}
}

static void io_ctl_map_page(struct btrfs_io_ctl *io_ctl, int clear)
{
	ASSERT(io_ctl->index < io_ctl->num_pages);
	io_ctl->page = io_ctl->pages[io_ctl->index++];
	io_ctl->cur = page_address(io_ctl->page);
	io_ctl->orig = io_ctl->cur;
	io_ctl->size = PAGE_SIZE;
	if (clear)
		clear_page(io_ctl->cur);
}

static void io_ctl_drop_pages(struct btrfs_io_ctl *io_ctl)
{
	int i;

	io_ctl_unmap_page(io_ctl);

	for (i = 0; i < io_ctl->num_pages; i++) {
		if (io_ctl->pages[i]) {
			btrfs_page_clear_checked(io_ctl->fs_info,
						 io_ctl->pages[i],
						 page_offset(io_ctl->pages[i]),
						 PAGE_SIZE);
			unlock_page(io_ctl->pages[i]);
			put_page(io_ctl->pages[i]);
		}
	}
}

static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, bool uptodate)
{
	struct page *page;
	struct inode *inode = io_ctl->inode;
	gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
	int i;

	for (i = 0; i < io_ctl->num_pages; i++) {
		int ret;

		page = find_or_create_page(inode->i_mapping, i, mask);
		if (!page) {
			io_ctl_drop_pages(io_ctl);
			return -ENOMEM;
		}

		ret = set_page_extent_mapped(page);
		if (ret < 0) {
			unlock_page(page);
			put_page(page);
			io_ctl_drop_pages(io_ctl);
			return ret;
		}

		io_ctl->pages[i] = page;
		if (uptodate && !PageUptodate(page)) {
			btrfs_read_folio(NULL, page_folio(page));
			lock_page(page);
			if (page->mapping != inode->i_mapping) {
				btrfs_err(BTRFS_I(inode)->root->fs_info,
					  "free space cache page truncated");
				io_ctl_drop_pages(io_ctl);
				return -EIO;
			}
			if (!PageUptodate(page)) {
				btrfs_err(BTRFS_I(inode)->root->fs_info,
					  "error reading free space cache");
				io_ctl_drop_pages(io_ctl);
				return -EIO;
			}
		}
	}

	for (i = 0; i < io_ctl->num_pages; i++)
		clear_page_dirty_for_io(io_ctl->pages[i]);

	return 0;
}

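/*
 * On disk the first page of the cache file begins with one crc32c per page
 * followed by the cache generation; the free space entries and bitmaps come
 * after that.  The helpers below skip over the crc area before writing or
 * checking the generation.
 */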
static void io_ctl_set_generation(struct btrfs_io_ctl *io_ctl, u64 generation)
{
	io_ctl_map_page(io_ctl, 1);

	/*
	 * Skip the csum areas.  If we don't check crcs then we just have a
	 * 64bit chunk at the front of the first page.
	 */
	io_ctl->cur += (sizeof(u32) * io_ctl->num_pages);
	io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages);

	put_unaligned_le64(generation, io_ctl->cur);
	io_ctl->cur += sizeof(u64);
}

static int io_ctl_check_generation(struct btrfs_io_ctl *io_ctl, u64 generation)
{
	u64 cache_gen;

	/*
	 * Skip the crc area.  If we don't check crcs then we just have a 64bit
	 * chunk at the front of the first page.
	 */
	io_ctl->cur += sizeof(u32) * io_ctl->num_pages;
	io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages);

	cache_gen = get_unaligned_le64(io_ctl->cur);
	if (cache_gen != generation) {
		btrfs_err_rl(io_ctl->fs_info,
			     "space cache generation (%llu) does not match inode (%llu)",
			     cache_gen, generation);
		io_ctl_unmap_page(io_ctl);
		return -EIO;
	}
	io_ctl->cur += sizeof(u64);
	return 0;
}

static void io_ctl_set_crc(struct btrfs_io_ctl *io_ctl, int index)
{
	u32 *tmp;
	u32 crc = ~(u32)0;
	unsigned offset = 0;

	if (index == 0)
		offset = sizeof(u32) * io_ctl->num_pages;

	crc = btrfs_crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset);
	btrfs_crc32c_final(crc, (u8 *)&crc);
	io_ctl_unmap_page(io_ctl);
	tmp = page_address(io_ctl->pages[0]);
	tmp += index;
	*tmp = crc;
}

static int io_ctl_check_crc(struct btrfs_io_ctl *io_ctl, int index)
{
	u32 *tmp, val;
	u32 crc = ~(u32)0;
	unsigned offset = 0;

	if (index == 0)
		offset = sizeof(u32) * io_ctl->num_pages;

	tmp = page_address(io_ctl->pages[0]);
	tmp += index;
	val = *tmp;

	io_ctl_map_page(io_ctl, 0);
	crc = btrfs_crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset);
	btrfs_crc32c_final(crc, (u8 *)&crc);
	if (val != crc) {
		btrfs_err_rl(io_ctl->fs_info,
			     "csum mismatch on free space cache");
		io_ctl_unmap_page(io_ctl);
		return -EIO;
	}

	return 0;
}

static int io_ctl_add_entry(struct btrfs_io_ctl *io_ctl, u64 offset, u64 bytes,
			    void *bitmap)
{
	struct btrfs_free_space_entry *entry;

	if (!io_ctl->cur)
		return -ENOSPC;

	entry = io_ctl->cur;
	put_unaligned_le64(offset, &entry->offset);
	put_unaligned_le64(bytes, &entry->bytes);
	entry->type = (bitmap) ? BTRFS_FREE_SPACE_BITMAP :
		BTRFS_FREE_SPACE_EXTENT;
	io_ctl->cur += sizeof(struct btrfs_free_space_entry);
	io_ctl->size -= sizeof(struct btrfs_free_space_entry);

	if (io_ctl->size >= sizeof(struct btrfs_free_space_entry))
		return 0;

	io_ctl_set_crc(io_ctl, io_ctl->index - 1);

	/* No more pages to map */
	if (io_ctl->index >= io_ctl->num_pages)
		return 0;

	/* map the next page */
	io_ctl_map_page(io_ctl, 1);
	return 0;
}

static int io_ctl_add_bitmap(struct btrfs_io_ctl *io_ctl, void *bitmap)
{
	if (!io_ctl->cur)
		return -ENOSPC;

	/*
	 * If we aren't at the start of the current page, unmap this one and
	 * map the next one if there is any left.
	 */
	if (io_ctl->cur != io_ctl->orig) {
		io_ctl_set_crc(io_ctl, io_ctl->index - 1);
		if (io_ctl->index >= io_ctl->num_pages)
			return -ENOSPC;
		io_ctl_map_page(io_ctl, 0);
	}

	copy_page(io_ctl->cur, bitmap);
	io_ctl_set_crc(io_ctl, io_ctl->index - 1);
	if (io_ctl->index < io_ctl->num_pages)
		io_ctl_map_page(io_ctl, 0);
	return 0;
}

static void io_ctl_zero_remaining_pages(struct btrfs_io_ctl *io_ctl)
{
	/*
	 * If we're not on the boundary we know we've modified the page and we
	 * need to crc the page.
	 */
	if (io_ctl->cur != io_ctl->orig)
		io_ctl_set_crc(io_ctl, io_ctl->index - 1);
	else
		io_ctl_unmap_page(io_ctl);

	while (io_ctl->index < io_ctl->num_pages) {
		io_ctl_map_page(io_ctl, 1);
		io_ctl_set_crc(io_ctl, io_ctl->index - 1);
	}
}

static int io_ctl_read_entry(struct btrfs_io_ctl *io_ctl,
			     struct btrfs_free_space *entry, u8 *type)
{
	struct btrfs_free_space_entry *e;
	int ret;

	if (!io_ctl->cur) {
		ret = io_ctl_check_crc(io_ctl, io_ctl->index);
		if (ret)
			return ret;
	}

	e = io_ctl->cur;
	entry->offset = get_unaligned_le64(&e->offset);
	entry->bytes = get_unaligned_le64(&e->bytes);
	*type = e->type;
	io_ctl->cur += sizeof(struct btrfs_free_space_entry);
	io_ctl->size -= sizeof(struct btrfs_free_space_entry);

	if (io_ctl->size >= sizeof(struct btrfs_free_space_entry))
		return 0;

	io_ctl_unmap_page(io_ctl);

	return 0;
}

static int io_ctl_read_bitmap(struct btrfs_io_ctl *io_ctl,
			      struct btrfs_free_space *entry)
{
	int ret;

	ret = io_ctl_check_crc(io_ctl, io_ctl->index);
	if (ret)
		return ret;

	copy_page(entry->bitmap, io_ctl->cur);
	io_ctl_unmap_page(io_ctl);

	return 0;
}

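/*
 * Recalculate ctl->extents_thresh, the cap on the number of extent entries,
 * so that the memory they use stays within the per-GiB budget described in
 * the comment below.
 */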
static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
{
	struct btrfs_block_group *block_group = ctl->block_group;
	u64 max_bytes;
	u64 bitmap_bytes;
	u64 extent_bytes;
	u64 size = block_group->length;
	u64 bytes_per_bg = BITS_PER_BITMAP * ctl->unit;
	u64 max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg);

	max_bitmaps = max_t(u64, max_bitmaps, 1);

	ASSERT(ctl->total_bitmaps <= max_bitmaps);

	/*
	 * We are trying to keep the total amount of memory used per 1GiB of
	 * space to be MAX_CACHE_BYTES_PER_GIG.  However, with a reclamation
	 * mechanism of pulling extents >= FORCE_EXTENT_THRESHOLD out of
	 * bitmaps, we may end up using more memory than this.
	 */
	if (size < SZ_1G)
		max_bytes = MAX_CACHE_BYTES_PER_GIG;
	else
		max_bytes = MAX_CACHE_BYTES_PER_GIG * div_u64(size, SZ_1G);

	bitmap_bytes = ctl->total_bitmaps * ctl->unit;

	/*
	 * we want the extent entry threshold to always be at most 1/2 the max
	 * bytes we can have, or whatever is less than that.
	 */
	extent_bytes = max_bytes - bitmap_bytes;
	extent_bytes = min_t(u64, extent_bytes, max_bytes >> 1);

	ctl->extents_thresh =
		div_u64(extent_bytes, sizeof(struct btrfs_free_space));
}

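/*
 * Read the on-disk cache keyed by @offset into @ctl: check the header
 * generation against the inode, then read all the extent entries followed by
 * the bitmap pages at the end of the file.  Returns 1 if the cache was
 * loaded, 0 if it should simply be ignored and a negative value on error.
 */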
static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
				   struct btrfs_free_space_ctl *ctl,
				   struct btrfs_path *path, u64 offset)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_free_space_header *header;
	struct extent_buffer *leaf;
	struct btrfs_io_ctl io_ctl;
	struct btrfs_key key;
	struct btrfs_free_space *e, *n;
	LIST_HEAD(bitmaps);
	u64 num_entries;
	u64 num_bitmaps;
	u64 generation;
	u8 type;
	int ret = 0;

	/* Nothing in the space cache, goodbye */
	if (!i_size_read(inode))
		return 0;

	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
	key.offset = offset;
	key.type = 0;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		return 0;
	else if (ret > 0) {
		btrfs_release_path(path);
		return 0;
	}

	ret = -1;

	leaf = path->nodes[0];
	header = btrfs_item_ptr(leaf, path->slots[0],
				struct btrfs_free_space_header);
	num_entries = btrfs_free_space_entries(leaf, header);
	num_bitmaps = btrfs_free_space_bitmaps(leaf, header);
	generation = btrfs_free_space_generation(leaf, header);
	btrfs_release_path(path);

	if (!BTRFS_I(inode)->generation) {
		btrfs_info(fs_info,
			   "the free space cache file (%llu) is invalid, skip it",
			   offset);
		return 0;
	}

	if (BTRFS_I(inode)->generation != generation) {
		btrfs_err(fs_info,
			  "free space inode generation (%llu) did not match free space cache generation (%llu)",
			  BTRFS_I(inode)->generation, generation);
		return 0;
	}

	if (!num_entries)
		return 0;

	ret = io_ctl_init(&io_ctl, inode, 0);
	if (ret)
		return ret;

	readahead_cache(inode);

	ret = io_ctl_prepare_pages(&io_ctl, true);
	if (ret)
		goto out;

	ret = io_ctl_check_crc(&io_ctl, 0);
	if (ret)
		goto free_cache;

	ret = io_ctl_check_generation(&io_ctl, generation);
	if (ret)
		goto free_cache;

	while (num_entries) {
		e = kmem_cache_zalloc(btrfs_free_space_cachep,
				      GFP_NOFS);
		if (!e) {
			ret = -ENOMEM;
			goto free_cache;
		}

		ret = io_ctl_read_entry(&io_ctl, e, &type);
		if (ret) {
			kmem_cache_free(btrfs_free_space_cachep, e);
			goto free_cache;
		}

		if (!e->bytes) {
			ret = -1;
			kmem_cache_free(btrfs_free_space_cachep, e);
			goto free_cache;
		}

		if (type == BTRFS_FREE_SPACE_EXTENT) {
			spin_lock(&ctl->tree_lock);
			ret = link_free_space(ctl, e);
			spin_unlock(&ctl->tree_lock);
			if (ret) {
				btrfs_err(fs_info,
					  "Duplicate entries in free space cache, dumping");
				kmem_cache_free(btrfs_free_space_cachep, e);
				goto free_cache;
			}
		} else {
			ASSERT(num_bitmaps);
			num_bitmaps--;
			e->bitmap = kmem_cache_zalloc(
					btrfs_free_space_bitmap_cachep, GFP_NOFS);
			if (!e->bitmap) {
				ret = -ENOMEM;
				kmem_cache_free(
					btrfs_free_space_cachep, e);
				goto free_cache;
			}
			spin_lock(&ctl->tree_lock);
			ret = link_free_space(ctl, e);
			ctl->total_bitmaps++;
			recalculate_thresholds(ctl);
			spin_unlock(&ctl->tree_lock);
			if (ret) {
				btrfs_err(fs_info,
					  "Duplicate entries in free space cache, dumping");
				kmem_cache_free(btrfs_free_space_cachep, e);
				goto free_cache;
			}
			list_add_tail(&e->list, &bitmaps);
		}

		num_entries--;
	}

	io_ctl_unmap_page(&io_ctl);

	/*
	 * The bitmap pages are stored after all of the entries in the cache
	 * file, so only now read the bitmap data for the entries queued up
	 * above.
	 */
	list_for_each_entry_safe(e, n, &bitmaps, list) {
		list_del_init(&e->list);
		ret = io_ctl_read_bitmap(&io_ctl, e);
		if (ret)
			goto free_cache;
	}

	io_ctl_drop_pages(&io_ctl);
	ret = 1;
out:
	io_ctl_free(&io_ctl);
	return ret;
free_cache:
	io_ctl_drop_pages(&io_ctl);
	__btrfs_remove_free_space_cache(ctl);
	goto out;
}

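/*
 * Move every entry from the temporary ctl into the block group's live free
 * space tree via btrfs_add_free_space(), expanding bitmap entries into the
 * individual ranges they describe.
 */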
static int copy_free_space_cache(struct btrfs_block_group *block_group,
				 struct btrfs_free_space_ctl *ctl)
{
	struct btrfs_free_space *info;
	struct rb_node *n;
	int ret = 0;

	while (!ret && (n = rb_first(&ctl->free_space_offset)) != NULL) {
		info = rb_entry(n, struct btrfs_free_space, offset_index);
		if (!info->bitmap) {
			unlink_free_space(ctl, info, true);
			ret = btrfs_add_free_space(block_group, info->offset,
						   info->bytes);
			kmem_cache_free(btrfs_free_space_cachep, info);
		} else {
			u64 offset = info->offset;
			u64 bytes = ctl->unit;

			while (search_bitmap(ctl, info, &offset, &bytes,
					     false) == 0) {
				ret = btrfs_add_free_space(block_group, offset,
							   bytes);
				if (ret)
					break;
				bitmap_clear_bits(ctl, info, offset, bytes, true);
				offset = info->offset;
				bytes = ctl->unit;
			}
			free_bitmap(ctl, info);
		}
		cond_resched();
	}
	return ret;
}

int load_free_space_cache(struct btrfs_block_group *block_group)
{
	struct btrfs_fs_info *fs_info = block_group->fs_info;
	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
	struct btrfs_free_space_ctl tmp_ctl = {};
	struct inode *inode;
	struct btrfs_path *path;
	int ret = 0;
	bool matched;
	u64 used = block_group->used;

	/*
	 * Because we could potentially discard our loaded free space, we want
	 * to load everything into a temporary structure first, and then if it's
	 * valid copy it all into the actual free space ctl.
	 */
	btrfs_init_free_space_ctl(block_group, &tmp_ctl);

	/*
	 * If this block group has been marked to be cleared for one reason or
	 * another then we can't trust the on disk cache, so just return.
	 */
	spin_lock(&block_group->lock);
	if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) {
		spin_unlock(&block_group->lock);
		return 0;
	}
	spin_unlock(&block_group->lock);

	path = btrfs_alloc_path();
	if (!path)
		return 0;
	path->search_commit_root = 1;
	path->skip_locking = 1;

	/*
	 * We must pass a path with search_commit_root set to btrfs_iget in
	 * order to avoid a deadlock when allocating extents for the tree root.
	 *
	 * When we are COWing an extent buffer from the tree root, when looking
	 * for a free extent, at extent-tree.c:find_free_extent(), we can find a
	 * block group without its free space cache loaded. When we find one
	 * we must load its space cache which requires reading its free space
	 * cache's inode item from the root tree. If this inode item is located
	 * in the same leaf that we started COWing before, then we end up in
	 * deadlock on the extent buffer (trying to read lock it when we
	 * previously write locked it).
	 *
	 * It's safe to read the inode item using the commit root because
	 * block groups, once loaded, stay in memory forever (until they are
	 * removed) as well as their space caches once loaded. New block groups
	 * once created get their ->cached field set to BTRFS_CACHE_FINISHED so
	 * we will never try to read their inode item while the fs is mounted.
	 */
	inode = lookup_free_space_inode(block_group, path);
	if (IS_ERR(inode)) {
		btrfs_free_path(path);
		return 0;
	}

	/* We may have converted the inode and made the cache invalid. */
	spin_lock(&block_group->lock);
	if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) {
		spin_unlock(&block_group->lock);
		btrfs_free_path(path);
		goto out;
	}
	spin_unlock(&block_group->lock);

	ret = __load_free_space_cache(fs_info->tree_root, inode, &tmp_ctl,
				      path, block_group->start);
	btrfs_free_path(path);
	if (ret <= 0)
		goto out;

	matched = (tmp_ctl.free_space == (block_group->length - used -
					  block_group->bytes_super));

	if (matched) {
		ret = copy_free_space_cache(block_group, &tmp_ctl);
		/*
		 * ret == 1 means we successfully loaded the free space cache,
		 * so we need to re-set it here.
		 */
		if (ret == 0)
			ret = 1;
	} else {
		__btrfs_remove_free_space_cache(&tmp_ctl);
		btrfs_warn(fs_info,
			   "block group %llu has wrong amount of free space",
			   block_group->start);
		ret = -1;
	}
out:
	if (ret < 0) {
		/* This cache is bogus, make sure it gets cleared */
		spin_lock(&block_group->lock);
		block_group->disk_cache_state = BTRFS_DC_CLEAR;
		spin_unlock(&block_group->lock);
		ret = 0;

		btrfs_warn(fs_info,
			   "failed to load free space cache for block group %llu, rebuilding it now",
			   block_group->start);
	}

	spin_lock(&ctl->tree_lock);
	btrfs_discard_update_discardable(block_group);
	spin_unlock(&ctl->tree_lock);
	iput(inode);
	return ret;
}

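/*
 * Write every extent and bitmap entry in the free space tree (plus any
 * cluster attached to the block group and any range currently being trimmed)
 * into the io_ctl pages.  Bitmap entries are queued on @bitmap_list so their
 * bitmap pages can be written out afterwards.
 */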
static noinline_for_stack
int write_cache_extent_entries(struct btrfs_io_ctl *io_ctl,
			       struct btrfs_free_space_ctl *ctl,
			       struct btrfs_block_group *block_group,
			       int *entries, int *bitmaps,
			       struct list_head *bitmap_list)
{
	int ret;
	struct btrfs_free_cluster *cluster = NULL;
	struct btrfs_free_cluster *cluster_locked = NULL;
	struct rb_node *node = rb_first(&ctl->free_space_offset);
	struct btrfs_trim_range *trim_entry;

	/* Get the cluster for this block_group if it exists */
	if (block_group && !list_empty(&block_group->cluster_list)) {
		cluster = list_entry(block_group->cluster_list.next,
				     struct btrfs_free_cluster,
				     block_group_list);
	}

	if (!node && cluster) {
		cluster_locked = cluster;
		spin_lock(&cluster_locked->lock);
		node = rb_first(&cluster->root);
		cluster = NULL;
	}

	/* Write out the extent entries */
	while (node) {
		struct btrfs_free_space *e;

		e = rb_entry(node, struct btrfs_free_space, offset_index);
		*entries += 1;

		ret = io_ctl_add_entry(io_ctl, e->offset, e->bytes,
				       e->bitmap);
		if (ret)
			goto fail;

		if (e->bitmap) {
			list_add_tail(&e->list, bitmap_list);
			*bitmaps += 1;
		}
		node = rb_next(node);
		if (!node && cluster) {
			node = rb_first(&cluster->root);
			cluster_locked = cluster;
			spin_lock(&cluster_locked->lock);
			cluster = NULL;
		}
	}
	if (cluster_locked) {
		spin_unlock(&cluster_locked->lock);
		cluster_locked = NULL;
	}

	/*
	 * Make sure we don't miss any range that was removed from our rbtree
	 * because trimming is running. Otherwise after a umount+mount (or crash
	 * after committing the transaction) we would leak free space and get
	 * an inconsistent free space cache report from fsck.
	 */
	list_for_each_entry(trim_entry, &ctl->trimming_ranges, list) {
		ret = io_ctl_add_entry(io_ctl, trim_entry->start,
				       trim_entry->bytes, NULL);
		if (ret)
			goto fail;
		*entries += 1;
	}

	return 0;
fail:
	if (cluster_locked)
		spin_unlock(&cluster_locked->lock);
	return -ENOSPC;
}

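/*
 * Update the free space header item for this cache file with the entry and
 * bitmap counts that were written and the current transaction id.  On failure
 * the EXTENT_DELALLOC bits set on the cache pages are cleared and -1 is
 * returned.
 */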
static noinline_for_stack int
update_cache_item(struct btrfs_trans_handle *trans,
		  struct btrfs_root *root,
		  struct inode *inode,
		  struct btrfs_path *path, u64 offset,
		  int entries, int bitmaps)
{
	struct btrfs_key key;
	struct btrfs_free_space_header *header;
	struct extent_buffer *leaf;
	int ret;

	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
	key.offset = offset;
	key.type = 0;

	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
	if (ret < 0) {
		clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
				 EXTENT_DELALLOC, 0, 0, NULL);
		goto fail;
	}
	leaf = path->nodes[0];
	if (ret > 0) {
		struct btrfs_key found_key;
		ASSERT(path->slots[0]);
		path->slots[0]--;
		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
		if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID ||
		    found_key.offset != offset) {
			clear_extent_bit(&BTRFS_I(inode)->io_tree, 0,
					 inode->i_size - 1, EXTENT_DELALLOC, 0,
					 0, NULL);
			btrfs_release_path(path);
			goto fail;
		}
	}

	BTRFS_I(inode)->generation = trans->transid;
	header = btrfs_item_ptr(leaf, path->slots[0],
				struct btrfs_free_space_header);
	btrfs_set_free_space_entries(leaf, header, entries);
	btrfs_set_free_space_bitmaps(leaf, header, bitmaps);
	btrfs_set_free_space_generation(leaf, header, trans->transid);
	btrfs_mark_buffer_dirty(leaf);
	btrfs_release_path(path);

	return 0;

fail:
	return -1;
}

static noinline_for_stack int write_pinned_extent_entries(
			    struct btrfs_trans_handle *trans,
			    struct btrfs_block_group *block_group,
			    struct btrfs_io_ctl *io_ctl,
			    int *entries)
{
	u64 start, extent_start, extent_end, len;
	struct extent_io_tree *unpin = NULL;
	int ret;

	if (!block_group)
		return 0;

	/*
	 * We want to add any pinned extents to our free space cache
	 * so we don't leak the space
	 *
	 * We shouldn't have switched the pinned extents yet so this is the
	 * right one
	 */
	unpin = &trans->transaction->pinned_extents;

	start = block_group->start;

	while (start < block_group->start + block_group->length) {
		ret = find_first_extent_bit(unpin, start,
					    &extent_start, &extent_end,
					    EXTENT_DIRTY, NULL);
		if (ret)
			return 0;

		/* This pinned extent is out of our range */
		if (extent_start >= block_group->start + block_group->length)
			return 0;

		extent_start = max(extent_start, start);
		extent_end = min(block_group->start + block_group->length,
				 extent_end + 1);
		len = extent_end - extent_start;

		*entries += 1;
		ret = io_ctl_add_entry(io_ctl, extent_start, len, NULL);
		if (ret)
			return -ENOSPC;

		start = extent_end;
	}

	return 0;
}

static noinline_for_stack int
write_bitmap_entries(struct btrfs_io_ctl *io_ctl, struct list_head *bitmap_list)
{
	struct btrfs_free_space *entry, *next;
	int ret;

	/* Write out the bitmaps */
	list_for_each_entry_safe(entry, next, bitmap_list, list) {
		ret = io_ctl_add_bitmap(io_ctl, entry->bitmap);
		if (ret)
			return -ENOSPC;
		list_del_init(&entry->list);
	}

	return 0;
}

static int flush_dirty_cache(struct inode *inode)
{
	int ret;

	ret = btrfs_wait_ordered_range(inode, 0, (u64)-1);
	if (ret)
		clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
				 EXTENT_DELALLOC, 0, 0, NULL);

	return ret;
}

static void noinline_for_stack
cleanup_bitmap_list(struct list_head *bitmap_list)
{
	struct btrfs_free_space *entry, *next;

	list_for_each_entry_safe(entry, next, bitmap_list, list)
		list_del_init(&entry->list);
}

static void noinline_for_stack
cleanup_write_cache_enospc(struct inode *inode,
			   struct btrfs_io_ctl *io_ctl,
			   struct extent_state **cached_state)
{
	io_ctl_drop_pages(io_ctl);
	unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
			     i_size_read(inode) - 1, cached_state);
}

static int __btrfs_wait_cache_io(struct btrfs_root *root,
				 struct btrfs_trans_handle *trans,
				 struct btrfs_block_group *block_group,
				 struct btrfs_io_ctl *io_ctl,
				 struct btrfs_path *path, u64 offset)
{
	int ret;
	struct inode *inode = io_ctl->inode;

	if (!inode)
		return 0;

	/* Flush the dirty pages in the cache file. */
	ret = flush_dirty_cache(inode);
	if (ret)
		goto out;

	/* Update the cache item to tell everyone this cache file is valid. */
	ret = update_cache_item(trans, root, inode, path, offset,
				io_ctl->entries, io_ctl->bitmaps);
out:
	if (ret) {
		invalidate_inode_pages2(inode->i_mapping);
		BTRFS_I(inode)->generation = 0;
		if (block_group)
			btrfs_debug(root->fs_info,
				    "failed to write free space cache for block group %llu error %d",
				    block_group->start, ret);
	}
	btrfs_update_inode(trans, root, BTRFS_I(inode));

	if (block_group) {
		/* the dirty list is protected by the dirty_bgs_lock */
		spin_lock(&trans->transaction->dirty_bgs_lock);

		/* the disk_cache_state is protected by the block group lock */
		spin_lock(&block_group->lock);

		/*
		 * only mark this as written if we didn't get put back on
		 * the dirty list while waiting for IO.  Otherwise our
		 * cache state won't be right, and we won't get written again.
		 */
		if (!ret && list_empty(&block_group->dirty_list))
			block_group->disk_cache_state = BTRFS_DC_WRITTEN;
		else if (ret)
			block_group->disk_cache_state = BTRFS_DC_ERROR;

		spin_unlock(&block_group->lock);
		spin_unlock(&trans->transaction->dirty_bgs_lock);
		io_ctl->inode = NULL;
		iput(inode);
	}

	return ret;
}

int btrfs_wait_cache_io(struct btrfs_trans_handle *trans,
			struct btrfs_block_group *block_group,
			struct btrfs_path *path)
{
	return __btrfs_wait_cache_io(block_group->fs_info->tree_root, trans,
				     block_group, &block_group->io_ctl,
				     path, block_group->start);
}

/**
 * Write out cached info to an inode
 *
 * @root:        root the inode belongs to
 * @inode:       freespace inode we are writing out
 * @ctl:         free space cache we are going to write out
 * @block_group: block_group for this cache if it belongs to a block_group
 * @io_ctl:      holds context for the io
 * @trans:       the trans handle
 *
 * This function writes out a free space cache struct to disk for quick recovery
 * on mount.  This will return 0 if it was successful in writing the cache out,
 * or an errno if it was not.
 */
static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
				   struct btrfs_free_space_ctl *ctl,
				   struct btrfs_block_group *block_group,
				   struct btrfs_io_ctl *io_ctl,
				   struct btrfs_trans_handle *trans)
{
	struct extent_state *cached_state = NULL;
	LIST_HEAD(bitmap_list);
	int entries = 0;
	int bitmaps = 0;
	int ret;
	int must_iput = 0;

	if (!i_size_read(inode))
		return -EIO;

	WARN_ON(io_ctl->pages);
	ret = io_ctl_init(io_ctl, inode, 1);
	if (ret)
		return ret;

	if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) {
		down_write(&block_group->data_rwsem);
		spin_lock(&block_group->lock);
		if (block_group->delalloc_bytes) {
			block_group->disk_cache_state = BTRFS_DC_WRITTEN;
			spin_unlock(&block_group->lock);
			up_write(&block_group->data_rwsem);
			BTRFS_I(inode)->generation = 0;
			ret = 0;
			must_iput = 1;
			goto out;
		}
		spin_unlock(&block_group->lock);
	}

	/* Lock all pages first so we can lock the extent safely. */
	ret = io_ctl_prepare_pages(io_ctl, false);
	if (ret)
		goto out_unlock;

	lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
			 &cached_state);

	io_ctl_set_generation(io_ctl, trans->transid);

	mutex_lock(&ctl->cache_writeout_mutex);
	/* Write out the extent entries in the free space cache */
	spin_lock(&ctl->tree_lock);
	ret = write_cache_extent_entries(io_ctl, ctl,
					 block_group, &entries, &bitmaps,
					 &bitmap_list);
	if (ret)
		goto out_nospc_locked;

	/*
	 * Some spaces that are freed in the current transaction are pinned,
	 * they will be added into free space cache after the transaction is
	 * committed, we shouldn't lose them.
	 *
	 * If this changes while we are working we'll get added back to
	 * the dirty list and redo it.  No locking needed.
	 */
	ret = write_pinned_extent_entries(trans, block_group, io_ctl, &entries);
	if (ret)
		goto out_nospc_locked;

	/*
	 * At last, we write out all the bitmaps and keep cache_writeout_mutex
	 * locked while doing it because a concurrent trim can be manipulating
	 * or freeing the bitmap.
	 */
	ret = write_bitmap_entries(io_ctl, &bitmap_list);
	spin_unlock(&ctl->tree_lock);
	mutex_unlock(&ctl->cache_writeout_mutex);
	if (ret)
		goto out_nospc;

	/* Zero out the rest of the pages just to make sure */
	io_ctl_zero_remaining_pages(io_ctl);

	/* Everything is written out, now we dirty the pages in the file. */
	ret = btrfs_dirty_pages(BTRFS_I(inode), io_ctl->pages,
				io_ctl->num_pages, 0, i_size_read(inode),
				&cached_state, false);
	if (ret)
		goto out_nospc;

	if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA))
		up_write(&block_group->data_rwsem);
	/*
	 * Release the pages and unlock the extent, we will flush
	 * them out later
	 */
	io_ctl_drop_pages(io_ctl);
	io_ctl_free(io_ctl);

	unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
			     i_size_read(inode) - 1, &cached_state);

	/*
	 * At this point the pages are under IO and we're happy.  The caller is
	 * responsible for waiting on them and updating the cache and the inode.
	 */
	io_ctl->entries = entries;
	io_ctl->bitmaps = bitmaps;

	ret = btrfs_fdatawrite_range(inode, 0, (u64)-1);
	if (ret)
		goto out;

	return 0;

out_nospc_locked:
	cleanup_bitmap_list(&bitmap_list);
	spin_unlock(&ctl->tree_lock);
	mutex_unlock(&ctl->cache_writeout_mutex);

out_nospc:
	cleanup_write_cache_enospc(inode, io_ctl, &cached_state);

out_unlock:
	if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA))
		up_write(&block_group->data_rwsem);

out:
	io_ctl->inode = NULL;
	io_ctl_free(io_ctl);
	if (ret) {
		invalidate_inode_pages2(inode->i_mapping);
		BTRFS_I(inode)->generation = 0;
	}
	btrfs_update_inode(trans, root, BTRFS_I(inode));
	if (must_iput)
		iput(inode);
	return ret;
}

int btrfs_write_out_cache(struct btrfs_trans_handle *trans,
			  struct btrfs_block_group *block_group,
			  struct btrfs_path *path)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
	struct inode *inode;
	int ret = 0;

	spin_lock(&block_group->lock);
	if (block_group->disk_cache_state < BTRFS_DC_SETUP) {
		spin_unlock(&block_group->lock);
		return 0;
	}
	spin_unlock(&block_group->lock);

	inode = lookup_free_space_inode(block_group, path);
	if (IS_ERR(inode))
		return 0;

	ret = __btrfs_write_out_cache(fs_info->tree_root, inode, ctl,
				      block_group, &block_group->io_ctl, trans);
	if (ret) {
		btrfs_debug(fs_info,
			    "failed to write free space cache for block group %llu error %d",
			    block_group->start, ret);
		spin_lock(&block_group->lock);
		block_group->disk_cache_state = BTRFS_DC_ERROR;
		spin_unlock(&block_group->lock);

		block_group->io_ctl.inode = NULL;
		iput(inode);
	}

	/*
	 * if ret == 0 the caller is expected to call btrfs_wait_cache_io
	 * to wait for IO and put the inode
	 */

	return ret;
}

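/*
 * Each bitmap entry covers BITS_PER_BITMAP * ctl->unit bytes of the block
 * group.  The helpers below convert between byte offsets within the block
 * group and bit indices / bitmap start offsets.
 */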
static inline unsigned long offset_to_bit(u64 bitmap_start, u32 unit,
					  u64 offset)
{
	ASSERT(offset >= bitmap_start);
	offset -= bitmap_start;
	return (unsigned long)(div_u64(offset, unit));
}

static inline unsigned long bytes_to_bits(u64 bytes, u32 unit)
{
	return (unsigned long)(div_u64(bytes, unit));
}

static inline u64 offset_to_bitmap(struct btrfs_free_space_ctl *ctl,
				   u64 offset)
{
	u64 bitmap_start;
	u64 bytes_per_bitmap;

	bytes_per_bitmap = BITS_PER_BITMAP * ctl->unit;
	bitmap_start = offset - ctl->start;
	bitmap_start = div64_u64(bitmap_start, bytes_per_bitmap);
	bitmap_start *= bytes_per_bitmap;
	bitmap_start += ctl->start;

	return bitmap_start;
}

static int tree_insert_offset(struct rb_root *root, u64 offset,
			      struct rb_node *node, int bitmap)
{
	struct rb_node **p = &root->rb_node;
	struct rb_node *parent = NULL;
	struct btrfs_free_space *info;

	while (*p) {
		parent = *p;
		info = rb_entry(parent, struct btrfs_free_space, offset_index);

		if (offset < info->offset) {
			p = &(*p)->rb_left;
		} else if (offset > info->offset) {
			p = &(*p)->rb_right;
		} else {
			/*
			 * we could have a bitmap entry and an extent entry
			 * share the same offset.  If this is the case, we want
			 * the extent entry to always be found first if we do a
			 * linear search through the tree, since we want to have
			 * the quickest allocation time, and allocating from an
			 * extent is faster than allocating from a bitmap.  So
			 * if we're inserting a bitmap and we find an entry at
			 * this offset, we want to go right, or after this entry
			 * logically.  If we are inserting an extent and we've
			 * found a bitmap, we want to go left, or before
			 * logically.
			 */
			if (bitmap) {
				if (info->bitmap) {
					WARN_ON_ONCE(1);
					return -EEXIST;
				}
				p = &(*p)->rb_right;
			} else {
				if (!info->bitmap) {
					WARN_ON_ONCE(1);
					return -EEXIST;
				}
				p = &(*p)->rb_left;
			}
		}
	}

	rb_link_node(node, parent, p);
	rb_insert_color(node, root);

	return 0;
}

/*
 * This is a little subtle. We *only* have ->max_extent_size set if we actually
 * searched through the bitmap and figured out the largest ->max_extent_size,
 * otherwise it's 0. In the case that it's 0 we don't want to tell the
 * allocator the wrong thing, we want to use the actual real max_extent_size
 * we've found already if it's larger, or we want to use ->bytes.
 *
 * This matters because find_free_space() will skip entries whose ->bytes is
 * less than the required bytes. So if we didn't search down this bitmap, we
 * may pick some previous entry that has a smaller ->max_extent_size than we
 * have. For example, assume we have two entries, one that has
 * ->max_extent_size set to 4K and ->bytes set to 1M. A second entry hasn't set
 * ->max_extent_size yet, has ->bytes set to 8K and it's contiguous. We will
 * call into find_free_space(), and return with max_extent_size == 4K, because
 * that first bitmap entry had ->max_extent_size set, but the second one did
 * not. If instead we returned 8K we'd come in searching for 8K, and find the
 * 8K contiguous range.
 *
 * Consider the other case, we have 2 8K chunks in that second entry and still
 * don't have ->max_extent_size set. We'll return 16K, and the next time the
 * allocator comes in it'll fully search our second bitmap, and this time it'll
 * get an uptodate value of 8K as the maximum chunk size. Then we'll get the
 * right allocation the next loop through.
 */
static inline u64 get_max_extent_size(const struct btrfs_free_space *entry)
{
	if (entry->bitmap && entry->max_extent_size)
		return entry->max_extent_size;
	return entry->bytes;
}

/*
 * We want the largest entry to be leftmost, so this is inverted from what you'd
 * normally expect.
 */
static bool entry_less(struct rb_node *node, const struct rb_node *parent)
{
	const struct btrfs_free_space *entry, *exist;

	entry = rb_entry(node, struct btrfs_free_space, bytes_index);
	exist = rb_entry(parent, struct btrfs_free_space, bytes_index);
	return get_max_extent_size(exist) < get_max_extent_size(entry);
}

0f9dd46c 1644/*
70cb0743
JB
1645 * searches the tree for the given offset.
1646 *
96303081
JB
1647 * fuzzy - If this is set, then we are trying to make an allocation, and we just
1648 * want a section that has at least bytes size and comes at or after the given
1649 * offset.
0f9dd46c 1650 */
96303081 1651static struct btrfs_free_space *
34d52cb6 1652tree_search_offset(struct btrfs_free_space_ctl *ctl,
96303081 1653 u64 offset, int bitmap_only, int fuzzy)
0f9dd46c 1654{
34d52cb6 1655 struct rb_node *n = ctl->free_space_offset.rb_node;
f1a8fc62 1656 struct btrfs_free_space *entry = NULL, *prev = NULL;
96303081
JB
1657
1658 /* find entry that is closest to the 'offset' */
f1a8fc62 1659 while (n) {
0f9dd46c 1660 entry = rb_entry(n, struct btrfs_free_space, offset_index);
96303081 1661 prev = entry;
0f9dd46c 1662
96303081 1663 if (offset < entry->offset)
0f9dd46c 1664 n = n->rb_left;
96303081 1665 else if (offset > entry->offset)
0f9dd46c 1666 n = n->rb_right;
96303081 1667 else
0f9dd46c 1668 break;
f1a8fc62
NB
1669
1670 entry = NULL;
0f9dd46c
JB
1671 }
1672
96303081
JB
1673 if (bitmap_only) {
1674 if (!entry)
1675 return NULL;
1676 if (entry->bitmap)
1677 return entry;
0f9dd46c 1678
96303081
JB
1679 /*
1680 * bitmap entry and extent entry may share same offset,
1681 * in that case, bitmap entry comes after extent entry.
1682 */
1683 n = rb_next(n);
1684 if (!n)
1685 return NULL;
1686 entry = rb_entry(n, struct btrfs_free_space, offset_index);
1687 if (entry->offset != offset)
1688 return NULL;
0f9dd46c 1689
96303081
JB
1690 WARN_ON(!entry->bitmap);
1691 return entry;
1692 } else if (entry) {
1693 if (entry->bitmap) {
0f9dd46c 1694 /*
96303081
JB
1695 * if previous extent entry covers the offset,
1696 * we should return it instead of the bitmap entry
0f9dd46c 1697 */
de6c4115
MX
1698 n = rb_prev(&entry->offset_index);
1699 if (n) {
96303081
JB
1700 prev = rb_entry(n, struct btrfs_free_space,
1701 offset_index);
de6c4115
MX
1702 if (!prev->bitmap &&
1703 prev->offset + prev->bytes > offset)
1704 entry = prev;
0f9dd46c 1705 }
96303081
JB
1706 }
1707 return entry;
1708 }
1709
1710 if (!prev)
1711 return NULL;
1712
1713 /* find last entry before the 'offset' */
1714 entry = prev;
1715 if (entry->offset > offset) {
1716 n = rb_prev(&entry->offset_index);
1717 if (n) {
1718 entry = rb_entry(n, struct btrfs_free_space,
1719 offset_index);
b12d6869 1720 ASSERT(entry->offset <= offset);
0f9dd46c 1721 } else {
96303081
JB
1722 if (fuzzy)
1723 return entry;
1724 else
1725 return NULL;
0f9dd46c
JB
1726 }
1727 }
1728
96303081 1729 if (entry->bitmap) {
de6c4115
MX
1730 n = rb_prev(&entry->offset_index);
1731 if (n) {
96303081
JB
1732 prev = rb_entry(n, struct btrfs_free_space,
1733 offset_index);
de6c4115
MX
1734 if (!prev->bitmap &&
1735 prev->offset + prev->bytes > offset)
1736 return prev;
96303081 1737 }
34d52cb6 1738 if (entry->offset + BITS_PER_BITMAP * ctl->unit > offset)
96303081
JB
1739 return entry;
1740 } else if (entry->offset + entry->bytes > offset)
1741 return entry;
1742
1743 if (!fuzzy)
1744 return NULL;
1745
1746 while (1) {
167c0bd3
NB
1747 n = rb_next(&entry->offset_index);
1748 if (!n)
1749 return NULL;
1750 entry = rb_entry(n, struct btrfs_free_space, offset_index);
96303081
JB
1751 if (entry->bitmap) {
1752 if (entry->offset + BITS_PER_BITMAP *
34d52cb6 1753 ctl->unit > offset)
96303081
JB
1754 break;
1755 } else {
1756 if (entry->offset + entry->bytes > offset)
1757 break;
1758 }
96303081
JB
1759 }
1760 return entry;
0f9dd46c
JB
1761}
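/*
 * Illustrative sketch, not part of the kernel source: a simplified model of
 * the extent-entry, fuzzy=1 case of tree_search_offset() on a plain sorted
 * array instead of an rb-tree - return the entry covering the offset if one
 * exists, otherwise fall forward to the next entry that still has space past
 * it. Bitmap entries and the bitmap_only path are left out.
 */
#include <stdio.h>

struct toy_extent { unsigned long long offset, bytes; };

static const struct toy_extent *toy_search(const struct toy_extent *e, int n,
					   unsigned long long offset)
{
	const struct toy_extent *prev = NULL;

	for (int i = 0; i < n; i++) {
		if (e[i].offset <= offset)
			prev = &e[i];		/* last entry at or before offset */
		else
			break;
	}
	if (prev && prev->offset + prev->bytes > offset)
		return prev;			/* offset falls inside this extent */
	for (int i = 0; i < n; i++)		/* fuzzy: closest usable entry after */
		if (e[i].offset + e[i].bytes > offset)
			return &e[i];
	return NULL;
}

int main(void)
{
	struct toy_extent map[] = { { 0, 4096 }, { 16384, 8192 } };
	const struct toy_extent *hit = toy_search(map, 2, 8192);

	printf("next free space at %llu\n", hit ? hit->offset : 0);	/* 16384 */
	return 0;
}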
1762
32e1649b
NB
1763static inline void unlink_free_space(struct btrfs_free_space_ctl *ctl,
1764 struct btrfs_free_space *info,
1765 bool update_stat)
0f9dd46c 1766{
34d52cb6 1767 rb_erase(&info->offset_index, &ctl->free_space_offset);
59c7b566 1768 rb_erase_cached(&info->bytes_index, &ctl->free_space_bytes);
34d52cb6 1769 ctl->free_extents--;
dfb79ddb 1770
5dc7c10b 1771 if (!info->bitmap && !btrfs_free_space_trimmed(info)) {
dfb79ddb 1772 ctl->discardable_extents[BTRFS_STAT_CURR]--;
5dc7c10b
DZ
1773 ctl->discardable_bytes[BTRFS_STAT_CURR] -= info->bytes;
1774 }
f333adb5 1775
32e1649b
NB
1776 if (update_stat)
1777 ctl->free_space -= info->bytes;
0f9dd46c
JB
1778}
1779
34d52cb6 1780static int link_free_space(struct btrfs_free_space_ctl *ctl,
0f9dd46c
JB
1781 struct btrfs_free_space *info)
1782{
1783 int ret = 0;
1784
b12d6869 1785 ASSERT(info->bytes || info->bitmap);
34d52cb6 1786 ret = tree_insert_offset(&ctl->free_space_offset, info->offset,
96303081 1787 &info->offset_index, (info->bitmap != NULL));
0f9dd46c
JB
1788 if (ret)
1789 return ret;
1790
59c7b566
JB
1791 rb_add_cached(&info->bytes_index, &ctl->free_space_bytes, entry_less);
1792
5dc7c10b 1793 if (!info->bitmap && !btrfs_free_space_trimmed(info)) {
dfb79ddb 1794 ctl->discardable_extents[BTRFS_STAT_CURR]++;
5dc7c10b
DZ
1795 ctl->discardable_bytes[BTRFS_STAT_CURR] += info->bytes;
1796 }
dfb79ddb 1797
34d52cb6
LZ
1798 ctl->free_space += info->bytes;
1799 ctl->free_extents++;
96303081
JB
1800 return ret;
1801}
1802
59c7b566
JB
1803static void relink_bitmap_entry(struct btrfs_free_space_ctl *ctl,
1804 struct btrfs_free_space *info)
1805{
1806 ASSERT(info->bitmap);
1807
1808 /*
1809 * If our entry is empty it's because we're on a cluster and we don't
1810 * want to re-link it into our ctl bytes index.
1811 */
1812 if (RB_EMPTY_NODE(&info->bytes_index))
1813 return;
1814
1815 rb_erase_cached(&info->bytes_index, &ctl->free_space_bytes);
1816 rb_add_cached(&info->bytes_index, &ctl->free_space_bytes, entry_less);
1817}
1818
f594f13c
NB
1819static inline void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
1820 struct btrfs_free_space *info,
1821 u64 offset, u64 bytes, bool update_stat)
96303081 1822{
dfb79ddb
DZ
1823 unsigned long start, count, end;
1824 int extent_delta = -1;
96303081 1825
34d52cb6
LZ
1826 start = offset_to_bit(info->offset, ctl->unit, offset);
1827 count = bytes_to_bits(bytes, ctl->unit);
dfb79ddb
DZ
1828 end = start + count;
1829 ASSERT(end <= BITS_PER_BITMAP);
96303081 1830
f38b6e75 1831 bitmap_clear(info->bitmap, start, count);
96303081
JB
1832
1833 info->bytes -= bytes;
553cceb4
JB
1834 if (info->max_extent_size > ctl->unit)
1835 info->max_extent_size = 0;
dfb79ddb 1836
59c7b566
JB
1837 relink_bitmap_entry(ctl, info);
1838
dfb79ddb
DZ
1839 if (start && test_bit(start - 1, info->bitmap))
1840 extent_delta++;
1841
1842 if (end < BITS_PER_BITMAP && test_bit(end, info->bitmap))
1843 extent_delta++;
1844
1845 info->bitmap_extents += extent_delta;
5dc7c10b 1846 if (!btrfs_free_space_trimmed(info)) {
dfb79ddb 1847 ctl->discardable_extents[BTRFS_STAT_CURR] += extent_delta;
5dc7c10b
DZ
1848 ctl->discardable_bytes[BTRFS_STAT_CURR] -= bytes;
1849 }
bb3ac5a4 1850
f594f13c
NB
1851 if (update_stat)
1852 ctl->free_space -= bytes;
96303081
JB
1853}
1854
34d52cb6 1855static void bitmap_set_bits(struct btrfs_free_space_ctl *ctl,
817d52f8
JB
1856 struct btrfs_free_space *info, u64 offset,
1857 u64 bytes)
96303081 1858{
dfb79ddb
DZ
1859 unsigned long start, count, end;
1860 int extent_delta = 1;
96303081 1861
34d52cb6
LZ
1862 start = offset_to_bit(info->offset, ctl->unit, offset);
1863 count = bytes_to_bits(bytes, ctl->unit);
dfb79ddb
DZ
1864 end = start + count;
1865 ASSERT(end <= BITS_PER_BITMAP);
96303081 1866
f38b6e75 1867 bitmap_set(info->bitmap, start, count);
96303081 1868
59c7b566
JB
1869 /*
1870 * We set some bytes, so we have no idea what the max extent size is
1871 * anymore.
1872 */
1873 info->max_extent_size = 0;
96303081 1874 info->bytes += bytes;
34d52cb6 1875 ctl->free_space += bytes;
dfb79ddb 1876
59c7b566
JB
1877 relink_bitmap_entry(ctl, info);
1878
dfb79ddb
DZ
1879 if (start && test_bit(start - 1, info->bitmap))
1880 extent_delta--;
1881
1882 if (end < BITS_PER_BITMAP && test_bit(end, info->bitmap))
1883 extent_delta--;
1884
1885 info->bitmap_extents += extent_delta;
5dc7c10b 1886 if (!btrfs_free_space_trimmed(info)) {
dfb79ddb 1887 ctl->discardable_extents[BTRFS_STAT_CURR] += extent_delta;
5dc7c10b
DZ
1888 ctl->discardable_bytes[BTRFS_STAT_CURR] += bytes;
1889 }
96303081
JB
1890}
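/*
 * Illustrative sketch, not part of the kernel source: the offset <-> bit
 * arithmetic behind bitmap_set_bits()/bitmap_clear_bits(), assuming a 4KiB
 * ->unit (the sectorsize). One bit covers one ->unit sized chunk, so a single
 * bitmap spans BITS_PER_BITMAP * unit bytes (128MiB with 4KiB pages).
 */
#include <stdio.h>

#define TOY_UNIT	4096ULL

static unsigned long toy_offset_to_bit(unsigned long long bitmap_start,
				       unsigned long long unit,
				       unsigned long long offset)
{
	return (offset - bitmap_start) / unit;	/* which bit covers offset */
}

static unsigned long toy_bytes_to_bits(unsigned long long bytes,
				       unsigned long long unit)
{
	return bytes / unit;			/* how many bits cover bytes */
}

int main(void)
{
	unsigned long long start = 1ULL << 30;	/* bitmap starting at 1GiB */

	printf("start bit %lu, bit count %lu\n",
	       toy_offset_to_bit(start, TOY_UNIT, start + 8192),	/* 2 */
	       toy_bytes_to_bits(16384, TOY_UNIT));			/* 4 */
	return 0;
}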
1891
a4820398
MX
1892/*
1893 * If we cannot find a suitable extent, we will use bytes to record
1894 * the size of the max extent.
1895 */
34d52cb6 1896static int search_bitmap(struct btrfs_free_space_ctl *ctl,
96303081 1897 struct btrfs_free_space *bitmap_info, u64 *offset,
0584f718 1898 u64 *bytes, bool for_alloc)
96303081
JB
1899{
1900 unsigned long found_bits = 0;
a4820398 1901 unsigned long max_bits = 0;
96303081
JB
1902 unsigned long bits, i;
1903 unsigned long next_zero;
a4820398 1904 unsigned long extent_bits;
96303081 1905
cef40483
JB
1906 /*
1907 * Skip searching the bitmap if we don't have a contiguous section that
1908 * is large enough for this allocation.
1909 */
0584f718
JB
1910 if (for_alloc &&
1911 bitmap_info->max_extent_size &&
cef40483
JB
1912 bitmap_info->max_extent_size < *bytes) {
1913 *bytes = bitmap_info->max_extent_size;
1914 return -1;
1915 }
1916
34d52cb6 1917 i = offset_to_bit(bitmap_info->offset, ctl->unit,
96303081 1918 max_t(u64, *offset, bitmap_info->offset));
34d52cb6 1919 bits = bytes_to_bits(*bytes, ctl->unit);
96303081 1920
ebb3dad4 1921 for_each_set_bit_from(i, bitmap_info->bitmap, BITS_PER_BITMAP) {
0584f718
JB
1922 if (for_alloc && bits == 1) {
1923 found_bits = 1;
1924 break;
1925 }
96303081
JB
1926 next_zero = find_next_zero_bit(bitmap_info->bitmap,
1927 BITS_PER_BITMAP, i);
a4820398
MX
1928 extent_bits = next_zero - i;
1929 if (extent_bits >= bits) {
1930 found_bits = extent_bits;
96303081 1931 break;
a4820398
MX
1932 } else if (extent_bits > max_bits) {
1933 max_bits = extent_bits;
96303081
JB
1934 }
1935 i = next_zero;
1936 }
1937
1938 if (found_bits) {
34d52cb6
LZ
1939 *offset = (u64)(i * ctl->unit) + bitmap_info->offset;
1940 *bytes = (u64)(found_bits) * ctl->unit;
96303081
JB
1941 return 0;
1942 }
1943
a4820398 1944 *bytes = (u64)(max_bits) * ctl->unit;
cef40483 1945 bitmap_info->max_extent_size = *bytes;
59c7b566 1946 relink_bitmap_entry(ctl, bitmap_info);
96303081
JB
1947 return -1;
1948}
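/*
 * Illustrative sketch, not part of the kernel source: the search pattern of
 * search_bitmap() on a toy bitmap - look for a run of set (free) bits at least
 * as long as requested, and when nothing fits report the largest run seen,
 * which is what gets cached in ->max_extent_size. Toy helpers stand in for
 * for_each_set_bit_from()/find_next_zero_bit().
 */
#include <stdio.h>

#define TOY_BITS 32

static int toy_test(unsigned int map, int i) { return (map >> i) & 1; }

static int toy_search(unsigned int map, int want, int *found_len)
{
	int best = 0;

	for (int i = 0; i < TOY_BITS; i++) {
		if (!toy_test(map, i))
			continue;
		int run = 0;
		while (i + run < TOY_BITS && toy_test(map, i + run))
			run++;
		if (run >= want) {
			*found_len = run;
			return i;		/* first fitting run, in bits */
		}
		if (run > best)
			best = run;
		i += run;			/* skip past this too-small run */
	}
	*found_len = best;			/* caller caches this as the max */
	return -1;
}

int main(void)
{
	int len;
	unsigned int map = (0x3u << 1) | (0x7u << 8);	/* free runs of 2 and 3 bits */

	printf("start bit %d, run %d\n", toy_search(map, 3, &len), len);	/* 8, 3 */
	return 0;
}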
1949
a4820398 1950/* Cache the size of the max extent in bytes */
34d52cb6 1951static struct btrfs_free_space *
53b381b3 1952find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes,
59c7b566 1953 unsigned long align, u64 *max_extent_size, bool use_bytes_index)
96303081
JB
1954{
1955 struct btrfs_free_space *entry;
1956 struct rb_node *node;
53b381b3
DW
1957 u64 tmp;
1958 u64 align_off;
96303081
JB
1959 int ret;
1960
34d52cb6 1961 if (!ctl->free_space_offset.rb_node)
a4820398 1962 goto out;
59c7b566
JB
1963again:
1964 if (use_bytes_index) {
1965 node = rb_first_cached(&ctl->free_space_bytes);
1966 } else {
1967 entry = tree_search_offset(ctl, offset_to_bitmap(ctl, *offset),
1968 0, 1);
1969 if (!entry)
1970 goto out;
1971 node = &entry->offset_index;
1972 }
96303081 1973
59c7b566
JB
1974 for (; node; node = rb_next(node)) {
1975 if (use_bytes_index)
1976 entry = rb_entry(node, struct btrfs_free_space,
1977 bytes_index);
1978 else
1979 entry = rb_entry(node, struct btrfs_free_space,
1980 offset_index);
96303081 1981
59c7b566
JB
1982 /*
1983 * If we are using the bytes index then all subsequent entries
1984 * in this tree are going to be < bytes, so simply set the max
1985 * extent size and exit the loop.
1986 *
1987 * If we're using the offset index then we need to keep going
1988 * through the rest of the tree.
1989 */
a4820398 1990 if (entry->bytes < *bytes) {
ad22cf6e
JB
1991 *max_extent_size = max(get_max_extent_size(entry),
1992 *max_extent_size);
59c7b566
JB
1993 if (use_bytes_index)
1994 break;
96303081 1995 continue;
a4820398 1996 }
96303081 1997
53b381b3
DW
1998 /* make sure the space returned is big enough
1999 * to match our requested alignment
2000 */
2001 if (*bytes >= align) {
a4820398 2002 tmp = entry->offset - ctl->start + align - 1;
47c5713f 2003 tmp = div64_u64(tmp, align);
53b381b3
DW
2004 tmp = tmp * align + ctl->start;
2005 align_off = tmp - entry->offset;
2006 } else {
2007 align_off = 0;
2008 tmp = entry->offset;
2009 }
2010
59c7b566
JB
2011 /*
2012 * We don't break here if we're using the bytes index because we
2013 * may have another entry that has the correct alignment that is
2014 * the right size, so we don't want to miss that possibility.
2015 * At worst this adds another loop through the logic, but if we
2016 * broke here we could prematurely ENOSPC.
2017 */
a4820398 2018 if (entry->bytes < *bytes + align_off) {
ad22cf6e
JB
2019 *max_extent_size = max(get_max_extent_size(entry),
2020 *max_extent_size);
53b381b3 2021 continue;
a4820398 2022 }
53b381b3 2023
96303081 2024 if (entry->bitmap) {
59c7b566 2025 struct rb_node *old_next = rb_next(node);
a4820398
MX
2026 u64 size = *bytes;
2027
0584f718 2028 ret = search_bitmap(ctl, entry, &tmp, &size, true);
53b381b3
DW
2029 if (!ret) {
2030 *offset = tmp;
a4820398 2031 *bytes = size;
96303081 2032 return entry;
ad22cf6e
JB
2033 } else {
2034 *max_extent_size =
2035 max(get_max_extent_size(entry),
2036 *max_extent_size);
53b381b3 2037 }
59c7b566
JB
2038
2039 /*
2040 * The bitmap may have gotten re-arranged in the space
2041 * index here because the max_extent_size may have been
2042 * updated. Start from the beginning again if this
2043 * happened.
2044 */
2045 if (use_bytes_index && old_next != rb_next(node))
2046 goto again;
96303081
JB
2047 continue;
2048 }
2049
53b381b3
DW
2050 *offset = tmp;
2051 *bytes = entry->bytes - align_off;
96303081
JB
2052 return entry;
2053 }
a4820398 2054out:
96303081
JB
2055 return NULL;
2056}
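/*
 * Illustrative sketch, not part of the kernel source: the alignment math used
 * by find_free_space() above, rounding an entry's offset up to the next
 * multiple of the requested alignment measured from the block group start.
 * The numbers in main() are hypothetical.
 */
#include <stdio.h>

static unsigned long long toy_round_up_from(unsigned long long start,
					    unsigned long long offset,
					    unsigned long long align)
{
	unsigned long long tmp = offset - start + align - 1;

	tmp /= align;				/* whole strides from start */
	return tmp * align + start;		/* first aligned offset >= offset */
}

int main(void)
{
	unsigned long long start = 1ULL << 30;		/* block group start */
	unsigned long long entry = start + 70 * 1024;	/* entry offset */
	unsigned long long aligned = toy_round_up_from(start, entry, 64 * 1024);

	printf("align_off = %llu\n", aligned - entry);	/* 59392 (58KiB) */
	return 0;
}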
2057
34d52cb6 2058static void add_new_bitmap(struct btrfs_free_space_ctl *ctl,
96303081
JB
2059 struct btrfs_free_space *info, u64 offset)
2060{
34d52cb6 2061 info->offset = offset_to_bitmap(ctl, offset);
f019f426 2062 info->bytes = 0;
dfb79ddb 2063 info->bitmap_extents = 0;
f2d0f676 2064 INIT_LIST_HEAD(&info->list);
34d52cb6
LZ
2065 link_free_space(ctl, info);
2066 ctl->total_bitmaps++;
fa598b06 2067 recalculate_thresholds(ctl);
96303081
JB
2068}
2069
34d52cb6 2070static void free_bitmap(struct btrfs_free_space_ctl *ctl,
edf6e2d1
LZ
2071 struct btrfs_free_space *bitmap_info)
2072{
27f0afc7
DZ
2073 /*
2074 * Normally when this is called, the bitmap is completely empty. However,
2075 * if we are blowing up the free space cache for one reason or another
2076 * via __btrfs_remove_free_space_cache(), then it may not be freed and
2077 * we may leave stats on the table.
2078 */
2079 if (bitmap_info->bytes && !btrfs_free_space_trimmed(bitmap_info)) {
2080 ctl->discardable_extents[BTRFS_STAT_CURR] -=
2081 bitmap_info->bitmap_extents;
2082 ctl->discardable_bytes[BTRFS_STAT_CURR] -= bitmap_info->bytes;
2083
2084 }
32e1649b 2085 unlink_free_space(ctl, bitmap_info, true);
3acd4850 2086 kmem_cache_free(btrfs_free_space_bitmap_cachep, bitmap_info->bitmap);
dc89e982 2087 kmem_cache_free(btrfs_free_space_cachep, bitmap_info);
34d52cb6 2088 ctl->total_bitmaps--;
fa598b06 2089 recalculate_thresholds(ctl);
edf6e2d1
LZ
2090}
2091
34d52cb6 2092static noinline int remove_from_bitmap(struct btrfs_free_space_ctl *ctl,
96303081
JB
2093 struct btrfs_free_space *bitmap_info,
2094 u64 *offset, u64 *bytes)
2095{
2096 u64 end;
6606bb97
JB
2097 u64 search_start, search_bytes;
2098 int ret;
96303081
JB
2099
2100again:
34d52cb6 2101 end = bitmap_info->offset + (u64)(BITS_PER_BITMAP * ctl->unit) - 1;
96303081 2102
6606bb97 2103 /*
bdb7d303
JB
2104 * We need to search for bits in this bitmap. We could only cover some
2105 * of the extent in this bitmap thanks to how we add space, so we need
2106 * to search for as much as it as we can and clear that amount, and then
2107 * go searching for the next bit.
6606bb97
JB
2108 */
2109 search_start = *offset;
bdb7d303 2110 search_bytes = ctl->unit;
13dbc089 2111 search_bytes = min(search_bytes, end - search_start + 1);
0584f718
JB
2112 ret = search_bitmap(ctl, bitmap_info, &search_start, &search_bytes,
2113 false);
b50c6e25
JB
2114 if (ret < 0 || search_start != *offset)
2115 return -EINVAL;
6606bb97 2116
bdb7d303
JB
2117 /* We may have found more bits than what we need */
2118 search_bytes = min(search_bytes, *bytes);
2119
2120 /* Cannot clear past the end of the bitmap */
2121 search_bytes = min(search_bytes, end - search_start + 1);
2122
f594f13c 2123 bitmap_clear_bits(ctl, bitmap_info, search_start, search_bytes, true);
bdb7d303
JB
2124 *offset += search_bytes;
2125 *bytes -= search_bytes;
96303081
JB
2126
2127 if (*bytes) {
6606bb97 2128 struct rb_node *next = rb_next(&bitmap_info->offset_index);
edf6e2d1 2129 if (!bitmap_info->bytes)
34d52cb6 2130 free_bitmap(ctl, bitmap_info);
96303081 2131
6606bb97
JB
2132 /*
2133 * no entry after this bitmap, but we still have bytes to
2134 * remove, so something has gone wrong.
2135 */
2136 if (!next)
96303081
JB
2137 return -EINVAL;
2138
6606bb97
JB
2139 bitmap_info = rb_entry(next, struct btrfs_free_space,
2140 offset_index);
2141
2142 /*
2143 * if the next entry isn't a bitmap we need to return to let the
2144 * extent stuff do its work.
2145 */
96303081
JB
2146 if (!bitmap_info->bitmap)
2147 return -EAGAIN;
2148
6606bb97
JB
2149 /*
2150 * Ok the next item is a bitmap, but it may not actually hold
2151 * the information for the rest of this free space stuff, so
2152 * look for it, and if we don't find it return so we can try
2153 * everything over again.
2154 */
2155 search_start = *offset;
bdb7d303 2156 search_bytes = ctl->unit;
34d52cb6 2157 ret = search_bitmap(ctl, bitmap_info, &search_start,
0584f718 2158 &search_bytes, false);
6606bb97
JB
2159 if (ret < 0 || search_start != *offset)
2160 return -EAGAIN;
2161
96303081 2162 goto again;
edf6e2d1 2163 } else if (!bitmap_info->bytes)
34d52cb6 2164 free_bitmap(ctl, bitmap_info);
96303081
JB
2165
2166 return 0;
2167}
2168
2cdc342c
JB
2169static u64 add_bytes_to_bitmap(struct btrfs_free_space_ctl *ctl,
2170 struct btrfs_free_space *info, u64 offset,
da080fe1 2171 u64 bytes, enum btrfs_trim_state trim_state)
2cdc342c
JB
2172{
2173 u64 bytes_to_set = 0;
2174 u64 end;
2175
da080fe1
DZ
2176 /*
2177 * This is a tradeoff to make bitmap trim state minimal. We mark the
2178 * whole bitmap untrimmed if at any point we add untrimmed regions.
2179 */
dfb79ddb 2180 if (trim_state == BTRFS_TRIM_STATE_UNTRIMMED) {
5dc7c10b 2181 if (btrfs_free_space_trimmed(info)) {
dfb79ddb
DZ
2182 ctl->discardable_extents[BTRFS_STAT_CURR] +=
2183 info->bitmap_extents;
5dc7c10b
DZ
2184 ctl->discardable_bytes[BTRFS_STAT_CURR] += info->bytes;
2185 }
da080fe1 2186 info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
dfb79ddb 2187 }
da080fe1 2188
2cdc342c
JB
2189 end = info->offset + (u64)(BITS_PER_BITMAP * ctl->unit);
2190
2191 bytes_to_set = min(end - offset, bytes);
2192
2193 bitmap_set_bits(ctl, info, offset, bytes_to_set);
2194
2195 return bytes_to_set;
2196
2197}
2198
34d52cb6
LZ
2199static bool use_bitmap(struct btrfs_free_space_ctl *ctl,
2200 struct btrfs_free_space *info)
96303081 2201{
364be842 2202 struct btrfs_block_group *block_group = ctl->block_group;
0b246afa 2203 struct btrfs_fs_info *fs_info = block_group->fs_info;
d0bd4560
JB
2204 bool forced = false;
2205
2206#ifdef CONFIG_BTRFS_DEBUG
2ff7e61e 2207 if (btrfs_should_fragment_free_space(block_group))
d0bd4560
JB
2208 forced = true;
2209#endif
96303081 2210
5d90c5c7
DZ
2211 /* This is a way to reclaim large regions from the bitmaps. */
2212 if (!forced && info->bytes >= FORCE_EXTENT_THRESHOLD)
2213 return false;
2214
96303081
JB
2215 /*
2216 * If we are below the extents threshold then we can add this as an
2217 * extent, and don't have to deal with the bitmap
2218 */
d0bd4560 2219 if (!forced && ctl->free_extents < ctl->extents_thresh) {
32cb0840
JB
2220 /*
2221 * If this block group has some small extents we don't want to
2222 * use up all of our free slots in the cache with them, we want
01327610 2223 * to reserve them for larger extents, however if we have plenty
32cb0840
JB
2224 * of cache left then go ahead and add them, no sense in adding
2225 * the overhead of a bitmap if we don't have to.
2226 */
f9bb615a
DZ
2227 if (info->bytes <= fs_info->sectorsize * 8) {
2228 if (ctl->free_extents * 3 <= ctl->extents_thresh)
34d52cb6 2229 return false;
32cb0840 2230 } else {
34d52cb6 2231 return false;
32cb0840
JB
2232 }
2233 }
96303081
JB
2234
2235 /*
dde5740f
JB
2236 * The original block groups from mkfs can be really small, like 8
2237 * megabytes, so don't bother with a bitmap for those entries. However
2238 * some block groups can be smaller than what a bitmap would cover but
2239 * are still large enough that they could overflow the 32k memory limit,
2240 * so allow those block groups to still be allowed to have a bitmap
2241 * entry.
96303081 2242 */
b3470b5d 2243 if (((BITS_PER_BITMAP * ctl->unit) >> 1) > block_group->length)
34d52cb6
LZ
2244 return false;
2245
2246 return true;
2247}
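/*
 * Illustrative sketch, not part of the kernel source: the decision above,
 * condensed into one predicate (the CONFIG_BTRFS_DEBUG forcing is dropped and
 * a 4KiB sectorsize is assumed). Small extents only go into a bitmap once
 * extent entries use more than a third of the threshold, very large extents
 * never do, and tiny block groups never get bitmaps at all.
 */
#include <stdio.h>
#include <stdbool.h>

static bool toy_use_bitmap(unsigned long long bytes, int free_extents,
			   int extents_thresh, unsigned long long bg_length)
{
	const unsigned long long sectorsize = 4096;
	const unsigned long long force_extent = 1024 * 1024;		/* SZ_1M */
	const unsigned long long bitmap_range = 4096 * 8ULL * sectorsize; /* 128MiB */

	if (bytes >= force_extent)
		return false;			/* reclaim large regions as extents */
	if (free_extents < extents_thresh) {
		if (bytes > sectorsize * 8)
			return false;		/* plenty of slots, not a small extent */
		if (free_extents * 3 <= extents_thresh)
			return false;		/* still lots of room for small ones */
	}
	if (bitmap_range / 2 > bg_length)
		return false;			/* block group too small for a bitmap */
	return true;
}

int main(void)
{
	/* 16KiB extent, 200 of ~227 slots used, 1GiB block group */
	printf("%d\n", toy_use_bitmap(16384, 200, 227, 1ULL << 30));	/* 1 */
	return 0;
}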
2248
20e5506b 2249static const struct btrfs_free_space_op free_space_op = {
2cdc342c
JB
2250 .use_bitmap = use_bitmap,
2251};
2252
34d52cb6
LZ
2253static int insert_into_bitmap(struct btrfs_free_space_ctl *ctl,
2254 struct btrfs_free_space *info)
2255{
2256 struct btrfs_free_space *bitmap_info;
32da5386 2257 struct btrfs_block_group *block_group = NULL;
34d52cb6 2258 int added = 0;
2cdc342c 2259 u64 bytes, offset, bytes_added;
da080fe1 2260 enum btrfs_trim_state trim_state;
34d52cb6 2261 int ret;
96303081
JB
2262
2263 bytes = info->bytes;
2264 offset = info->offset;
da080fe1 2265 trim_state = info->trim_state;
96303081 2266
34d52cb6
LZ
2267 if (!ctl->op->use_bitmap(ctl, info))
2268 return 0;
2269
2cdc342c 2270 if (ctl->op == &free_space_op)
364be842 2271 block_group = ctl->block_group;
38e87880 2272again:
2cdc342c
JB
2273 /*
2274 * Since we link bitmaps right into the cluster we need to see if we
2275 * have a cluster here, and if so and it has our bitmap we need to add
2276 * the free space to that bitmap.
2277 */
2278 if (block_group && !list_empty(&block_group->cluster_list)) {
2279 struct btrfs_free_cluster *cluster;
2280 struct rb_node *node;
2281 struct btrfs_free_space *entry;
2282
2283 cluster = list_entry(block_group->cluster_list.next,
2284 struct btrfs_free_cluster,
2285 block_group_list);
2286 spin_lock(&cluster->lock);
2287 node = rb_first(&cluster->root);
2288 if (!node) {
2289 spin_unlock(&cluster->lock);
38e87880 2290 goto no_cluster_bitmap;
2cdc342c
JB
2291 }
2292
2293 entry = rb_entry(node, struct btrfs_free_space, offset_index);
2294 if (!entry->bitmap) {
2295 spin_unlock(&cluster->lock);
38e87880 2296 goto no_cluster_bitmap;
2cdc342c
JB
2297 }
2298
2299 if (entry->offset == offset_to_bitmap(ctl, offset)) {
da080fe1
DZ
2300 bytes_added = add_bytes_to_bitmap(ctl, entry, offset,
2301 bytes, trim_state);
2cdc342c
JB
2302 bytes -= bytes_added;
2303 offset += bytes_added;
2304 }
2305 spin_unlock(&cluster->lock);
2306 if (!bytes) {
2307 ret = 1;
2308 goto out;
2309 }
2310 }
38e87880
CM
2311
2312no_cluster_bitmap:
34d52cb6 2313 bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
96303081
JB
2314 1, 0);
2315 if (!bitmap_info) {
b12d6869 2316 ASSERT(added == 0);
96303081
JB
2317 goto new_bitmap;
2318 }
2319
da080fe1
DZ
2320 bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes,
2321 trim_state);
2cdc342c
JB
2322 bytes -= bytes_added;
2323 offset += bytes_added;
2324 added = 0;
96303081
JB
2325
2326 if (!bytes) {
2327 ret = 1;
2328 goto out;
2329 } else
2330 goto again;
2331
2332new_bitmap:
2333 if (info && info->bitmap) {
34d52cb6 2334 add_new_bitmap(ctl, info, offset);
96303081
JB
2335 added = 1;
2336 info = NULL;
2337 goto again;
2338 } else {
34d52cb6 2339 spin_unlock(&ctl->tree_lock);
96303081
JB
2340
2341 /* no pre-allocated info, allocate a new one */
2342 if (!info) {
dc89e982
JB
2343 info = kmem_cache_zalloc(btrfs_free_space_cachep,
2344 GFP_NOFS);
96303081 2345 if (!info) {
34d52cb6 2346 spin_lock(&ctl->tree_lock);
96303081
JB
2347 ret = -ENOMEM;
2348 goto out;
2349 }
2350 }
2351
2352 /* allocate the bitmap */
3acd4850
CL
2353 info->bitmap = kmem_cache_zalloc(btrfs_free_space_bitmap_cachep,
2354 GFP_NOFS);
da080fe1 2355 info->trim_state = BTRFS_TRIM_STATE_TRIMMED;
34d52cb6 2356 spin_lock(&ctl->tree_lock);
96303081
JB
2357 if (!info->bitmap) {
2358 ret = -ENOMEM;
2359 goto out;
2360 }
2361 goto again;
2362 }
2363
2364out:
2365 if (info) {
3acd4850
CL
2366 if (info->bitmap)
2367 kmem_cache_free(btrfs_free_space_bitmap_cachep,
2368 info->bitmap);
dc89e982 2369 kmem_cache_free(btrfs_free_space_cachep, info);
96303081 2370 }
0f9dd46c
JB
2371
2372 return ret;
2373}
2374
a7ccb255
DZ
2375/*
2376 * Free space merging rules:
2377 * 1) Merge trimmed areas together
2378 * 2) Let untrimmed areas coalesce with trimmed areas
2379 * 3) Always pull neighboring regions from bitmaps
2380 *
2381 * The above rules are for when we merge free space based on btrfs_trim_state.
2382 * Rules 2 and 3 are subtle because they are suboptimal, but are done for the
2383 * same reason: to promote larger extent regions which makes life easier for
2384 * find_free_extent(). Rule 2 enables coalescing based on the common path
2385 * of returning free space from btrfs_finish_extent_commit(). So when free
2386 * space is trimmed, it will prevent aggregating trimmed new region and
2387 * untrimmed regions in the rb_tree. Rule 3 is purely to obtain larger extents
2388 * and provide find_free_extent() with the largest extents possible hoping for
2389 * the reuse path.
2390 */
945d8962 2391static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl,
f333adb5 2392 struct btrfs_free_space *info, bool update_stat)
0f9dd46c 2393{
bf53d468 2394 struct btrfs_free_space *left_info = NULL;
120d66ee
LZ
2395 struct btrfs_free_space *right_info;
2396 bool merged = false;
2397 u64 offset = info->offset;
2398 u64 bytes = info->bytes;
a7ccb255 2399 const bool is_trimmed = btrfs_free_space_trimmed(info);
6226cb0a 2400
0f9dd46c
JB
2401 /*
2402 * first we want to see if there is free space adjacent to the range we
2403 * are adding, if there is remove that struct and add a new one to
2404 * cover the entire range
2405 */
34d52cb6 2406 right_info = tree_search_offset(ctl, offset + bytes, 0, 0);
96303081
JB
2407 if (right_info && rb_prev(&right_info->offset_index))
2408 left_info = rb_entry(rb_prev(&right_info->offset_index),
2409 struct btrfs_free_space, offset_index);
bf53d468 2410 else if (!right_info)
34d52cb6 2411 left_info = tree_search_offset(ctl, offset - 1, 0, 0);
0f9dd46c 2412
a7ccb255
DZ
2413 /* See try_merge_free_space() comment. */
2414 if (right_info && !right_info->bitmap &&
2415 (!is_trimmed || btrfs_free_space_trimmed(right_info))) {
32e1649b 2416 unlink_free_space(ctl, right_info, update_stat);
6226cb0a 2417 info->bytes += right_info->bytes;
dc89e982 2418 kmem_cache_free(btrfs_free_space_cachep, right_info);
120d66ee 2419 merged = true;
0f9dd46c
JB
2420 }
2421
a7ccb255 2422 /* See try_merge_free_space() comment. */
96303081 2423 if (left_info && !left_info->bitmap &&
a7ccb255
DZ
2424 left_info->offset + left_info->bytes == offset &&
2425 (!is_trimmed || btrfs_free_space_trimmed(left_info))) {
32e1649b 2426 unlink_free_space(ctl, left_info, update_stat);
6226cb0a
JB
2427 info->offset = left_info->offset;
2428 info->bytes += left_info->bytes;
dc89e982 2429 kmem_cache_free(btrfs_free_space_cachep, left_info);
120d66ee 2430 merged = true;
0f9dd46c
JB
2431 }
2432
120d66ee
LZ
2433 return merged;
2434}
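/*
 * Illustrative sketch, not part of the kernel source: the core of
 * try_merge_free_space() - absorb free extents that touch the new range on
 * either side. The bitmap and trim-state rules from the comment above are
 * deliberately left out.
 */
#include <stdio.h>
#include <stdbool.h>

struct toy_range { unsigned long long offset, bytes; };

static bool toy_merge(struct toy_range *info, const struct toy_range *left,
		      const struct toy_range *right)
{
	bool merged = false;

	if (right && info->offset + info->bytes == right->offset) {
		info->bytes += right->bytes;	/* absorb the right neighbour */
		merged = true;
	}
	if (left && left->offset + left->bytes == info->offset) {
		info->offset = left->offset;	/* absorb the left neighbour */
		info->bytes += left->bytes;
		merged = true;
	}
	return merged;
}

int main(void)
{
	struct toy_range left = { 0, 4096 }, right = { 8192, 4096 };
	struct toy_range info = { 4096, 4096 };

	toy_merge(&info, &left, &right);
	printf("[%llu, +%llu)\n", info.offset, info.bytes);	/* [0, +12288) */
	return 0;
}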
2435
20005523
FM
2436static bool steal_from_bitmap_to_end(struct btrfs_free_space_ctl *ctl,
2437 struct btrfs_free_space *info,
2438 bool update_stat)
2439{
2440 struct btrfs_free_space *bitmap;
2441 unsigned long i;
2442 unsigned long j;
2443 const u64 end = info->offset + info->bytes;
2444 const u64 bitmap_offset = offset_to_bitmap(ctl, end);
2445 u64 bytes;
2446
2447 bitmap = tree_search_offset(ctl, bitmap_offset, 1, 0);
2448 if (!bitmap)
2449 return false;
2450
2451 i = offset_to_bit(bitmap->offset, ctl->unit, end);
2452 j = find_next_zero_bit(bitmap->bitmap, BITS_PER_BITMAP, i);
2453 if (j == i)
2454 return false;
2455 bytes = (j - i) * ctl->unit;
2456 info->bytes += bytes;
2457
a7ccb255
DZ
2458 /* See try_merge_free_space() comment. */
2459 if (!btrfs_free_space_trimmed(bitmap))
2460 info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
2461
f594f13c 2462 bitmap_clear_bits(ctl, bitmap, end, bytes, update_stat);
20005523
FM
2463
2464 if (!bitmap->bytes)
2465 free_bitmap(ctl, bitmap);
2466
2467 return true;
2468}
2469
2470static bool steal_from_bitmap_to_front(struct btrfs_free_space_ctl *ctl,
2471 struct btrfs_free_space *info,
2472 bool update_stat)
2473{
2474 struct btrfs_free_space *bitmap;
2475 u64 bitmap_offset;
2476 unsigned long i;
2477 unsigned long j;
2478 unsigned long prev_j;
2479 u64 bytes;
2480
2481 bitmap_offset = offset_to_bitmap(ctl, info->offset);
2482 /* If we're on a boundary, try the previous logical bitmap. */
2483 if (bitmap_offset == info->offset) {
2484 if (info->offset == 0)
2485 return false;
2486 bitmap_offset = offset_to_bitmap(ctl, info->offset - 1);
2487 }
2488
2489 bitmap = tree_search_offset(ctl, bitmap_offset, 1, 0);
2490 if (!bitmap)
2491 return false;
2492
2493 i = offset_to_bit(bitmap->offset, ctl->unit, info->offset) - 1;
2494 j = 0;
2495 prev_j = (unsigned long)-1;
2496 for_each_clear_bit_from(j, bitmap->bitmap, BITS_PER_BITMAP) {
2497 if (j > i)
2498 break;
2499 prev_j = j;
2500 }
2501 if (prev_j == i)
2502 return false;
2503
2504 if (prev_j == (unsigned long)-1)
2505 bytes = (i + 1) * ctl->unit;
2506 else
2507 bytes = (i - prev_j) * ctl->unit;
2508
2509 info->offset -= bytes;
2510 info->bytes += bytes;
2511
a7ccb255
DZ
2512 /* See try_merge_free_space() comment. */
2513 if (!btrfs_free_space_trimmed(bitmap))
2514 info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
2515
f594f13c 2516 bitmap_clear_bits(ctl, bitmap, info->offset, bytes, update_stat);
20005523
FM
2517
2518 if (!bitmap->bytes)
2519 free_bitmap(ctl, bitmap);
2520
2521 return true;
2522}
2523
2524/*
2525 * We prefer always to allocate from extent entries, both for clustered and
2526 * non-clustered allocation requests. So when attempting to add a new extent
2527 * entry, try to see if there's adjacent free space in bitmap entries, and if
2528 * there is, migrate that space from the bitmaps to the extent.
2529 * Like this we get better chances of satisfying space allocation requests
2530 * because we attempt to satisfy them based on a single cache entry, and never
2531 * on 2 or more entries - even if the entries represent a contiguous free space
2532 * region (e.g. 1 extent entry + 1 bitmap entry starting where the extent entry
2533 * ends).
2534 */
2535static void steal_from_bitmap(struct btrfs_free_space_ctl *ctl,
2536 struct btrfs_free_space *info,
2537 bool update_stat)
2538{
2539 /*
2540 * Only work with disconnected entries, as we can change their offset,
2541 * and must be extent entries.
2542 */
2543 ASSERT(!info->bitmap);
2544 ASSERT(RB_EMPTY_NODE(&info->offset_index));
2545
2546 if (ctl->total_bitmaps > 0) {
2547 bool stole_end;
2548 bool stole_front = false;
2549
2550 stole_end = steal_from_bitmap_to_end(ctl, info, update_stat);
2551 if (ctl->total_bitmaps > 0)
2552 stole_front = steal_from_bitmap_to_front(ctl, info,
2553 update_stat);
2554
2555 if (stole_end || stole_front)
2556 try_merge_free_space(ctl, info, update_stat);
2557 }
2558}
2559
290ef19a 2560int __btrfs_add_free_space(struct btrfs_block_group *block_group,
a7ccb255
DZ
2561 u64 offset, u64 bytes,
2562 enum btrfs_trim_state trim_state)
120d66ee 2563{
290ef19a
NB
2564 struct btrfs_fs_info *fs_info = block_group->fs_info;
2565 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
120d66ee
LZ
2566 struct btrfs_free_space *info;
2567 int ret = 0;
7fe6d45e 2568 u64 filter_bytes = bytes;
120d66ee 2569
169e0da9
NA
2570 ASSERT(!btrfs_is_zoned(fs_info));
2571
dc89e982 2572 info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS);
120d66ee
LZ
2573 if (!info)
2574 return -ENOMEM;
2575
2576 info->offset = offset;
2577 info->bytes = bytes;
a7ccb255 2578 info->trim_state = trim_state;
20005523 2579 RB_CLEAR_NODE(&info->offset_index);
59c7b566 2580 RB_CLEAR_NODE(&info->bytes_index);
120d66ee 2581
34d52cb6 2582 spin_lock(&ctl->tree_lock);
120d66ee 2583
34d52cb6 2584 if (try_merge_free_space(ctl, info, true))
120d66ee
LZ
2585 goto link;
2586
2587 /*
2588 * There was no extent directly to the left or right of this new
2589 * extent, so we know we're going to have to allocate a new extent, and
2590 * before we do that see if we need to drop this into a bitmap
2591 */
34d52cb6 2592 ret = insert_into_bitmap(ctl, info);
120d66ee
LZ
2593 if (ret < 0) {
2594 goto out;
2595 } else if (ret) {
2596 ret = 0;
2597 goto out;
2598 }
2599link:
20005523
FM
2600 /*
2601 * Only steal free space from adjacent bitmaps if we're sure we're not
2602 * going to add the new free space to existing bitmap entries - because
2603 * that would mean unnecessary work that would be reverted. Therefore
2604 * attempt to steal space from bitmaps if we're adding an extent entry.
2605 */
2606 steal_from_bitmap(ctl, info, true);
2607
7fe6d45e
DZ
2608 filter_bytes = max(filter_bytes, info->bytes);
2609
34d52cb6 2610 ret = link_free_space(ctl, info);
0f9dd46c 2611 if (ret)
dc89e982 2612 kmem_cache_free(btrfs_free_space_cachep, info);
96303081 2613out:
66b53bae 2614 btrfs_discard_update_discardable(block_group);
34d52cb6 2615 spin_unlock(&ctl->tree_lock);
6226cb0a 2616
0f9dd46c 2617 if (ret) {
ab8d0fc4 2618 btrfs_crit(fs_info, "unable to add free space :%d", ret);
b12d6869 2619 ASSERT(ret != -EEXIST);
0f9dd46c
JB
2620 }
2621
7fe6d45e
DZ
2622 if (trim_state != BTRFS_TRIM_STATE_TRIMMED) {
2623 btrfs_discard_check_filter(block_group, filter_bytes);
b0643e59 2624 btrfs_discard_queue_work(&fs_info->discard_ctl, block_group);
7fe6d45e 2625 }
b0643e59 2626
0f9dd46c
JB
2627 return ret;
2628}
2629
169e0da9
NA
2630static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group,
2631 u64 bytenr, u64 size, bool used)
2632{
bb5a098d 2633 struct btrfs_space_info *sinfo = block_group->space_info;
169e0da9
NA
2634 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2635 u64 offset = bytenr - block_group->start;
2636 u64 to_free, to_unusable;
bb5a098d 2637 int bg_reclaim_threshold = 0;
98173255 2638 bool initial = (size == block_group->length);
d8da0e85 2639 u64 reclaimable_unusable;
98173255
NA
2640
2641 WARN_ON(!initial && offset + size > block_group->zone_capacity);
169e0da9 2642
bb5a098d
JB
2643 if (!initial)
2644 bg_reclaim_threshold = READ_ONCE(sinfo->bg_reclaim_threshold);
2645
169e0da9
NA
2646 spin_lock(&ctl->tree_lock);
2647 if (!used)
2648 to_free = size;
98173255
NA
2649 else if (initial)
2650 to_free = block_group->zone_capacity;
169e0da9
NA
2651 else if (offset >= block_group->alloc_offset)
2652 to_free = size;
2653 else if (offset + size <= block_group->alloc_offset)
2654 to_free = 0;
2655 else
2656 to_free = offset + size - block_group->alloc_offset;
2657 to_unusable = size - to_free;
2658
2659 ctl->free_space += to_free;
badae9c8
NA
2660 /*
2661 * If the block group is read-only, we should account freed space into
2662 * bytes_readonly.
2663 */
2664 if (!block_group->ro)
2665 block_group->zone_unusable += to_unusable;
169e0da9
NA
2666 spin_unlock(&ctl->tree_lock);
2667 if (!used) {
2668 spin_lock(&block_group->lock);
2669 block_group->alloc_offset -= size;
2670 spin_unlock(&block_group->lock);
2671 }
2672
d8da0e85
NA
2673 reclaimable_unusable = block_group->zone_unusable -
2674 (block_group->length - block_group->zone_capacity);
169e0da9 2675 /* All the region is now unusable. Mark it as unused and reclaim */
18bb8bbf 2676 if (block_group->zone_unusable == block_group->length) {
169e0da9 2677 btrfs_mark_bg_unused(block_group);
77233c2d 2678 } else if (bg_reclaim_threshold &&
d8da0e85
NA
2679 reclaimable_unusable >=
2680 div_factor_fine(block_group->zone_capacity,
2681 bg_reclaim_threshold)) {
18bb8bbf
JT
2682 btrfs_mark_bg_to_reclaim(block_group);
2683 }
169e0da9
NA
2684
2685 return 0;
2686}
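/*
 * Illustrative sketch, not part of the kernel source: how the zoned path above
 * splits a returned range into still-usable bytes and zone_unusable bytes
 * around the write pointer (alloc_offset). The initial-load special case is
 * left out and the numbers are hypothetical.
 */
#include <stdio.h>

static unsigned long long toy_to_free(unsigned long long offset,
				      unsigned long long size,
				      unsigned long long alloc_offset,
				      int used)
{
	if (!used)
		return size;			/* never allocated: fully reusable */
	if (offset >= alloc_offset)
		return size;			/* entirely above the write pointer */
	if (offset + size <= alloc_offset)
		return 0;			/* entirely below: unusable until zone reset */
	return offset + size - alloc_offset;	/* straddles the write pointer */
}

int main(void)
{
	/* 1MiB freed, write pointer sits 256KiB from the end of the range */
	unsigned long long size = 1024 * 1024;
	unsigned long long to_free = toy_to_free(768 * 1024, size, 1024 * 1024, 1);

	printf("to_free %llu, to_unusable %llu\n", to_free, size - to_free);
	return 0;	/* to_free 786432, to_unusable 262144 */
}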
2687
32da5386 2688int btrfs_add_free_space(struct btrfs_block_group *block_group,
478b4d9f
JB
2689 u64 bytenr, u64 size)
2690{
a7ccb255
DZ
2691 enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
2692
169e0da9
NA
2693 if (btrfs_is_zoned(block_group->fs_info))
2694 return __btrfs_add_free_space_zoned(block_group, bytenr, size,
2695 true);
2696
a7ccb255
DZ
2697 if (btrfs_test_opt(block_group->fs_info, DISCARD_SYNC))
2698 trim_state = BTRFS_TRIM_STATE_TRIMMED;
2699
290ef19a 2700 return __btrfs_add_free_space(block_group, bytenr, size, trim_state);
478b4d9f
JB
2701}
2702
169e0da9
NA
2703int btrfs_add_free_space_unused(struct btrfs_block_group *block_group,
2704 u64 bytenr, u64 size)
2705{
2706 if (btrfs_is_zoned(block_group->fs_info))
2707 return __btrfs_add_free_space_zoned(block_group, bytenr, size,
2708 false);
2709
2710 return btrfs_add_free_space(block_group, bytenr, size);
2711}
2712
b0643e59
DZ
2713/*
2714 * This is a subtle distinction because when adding free space back in general,
2715 * we want it to be added as untrimmed for async discard. But in the case where we add
2716 * it on loading of a block group, we want to consider it trimmed.
2717 */
2718int btrfs_add_free_space_async_trimmed(struct btrfs_block_group *block_group,
2719 u64 bytenr, u64 size)
2720{
2721 enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
2722
169e0da9
NA
2723 if (btrfs_is_zoned(block_group->fs_info))
2724 return __btrfs_add_free_space_zoned(block_group, bytenr, size,
2725 true);
2726
b0643e59
DZ
2727 if (btrfs_test_opt(block_group->fs_info, DISCARD_SYNC) ||
2728 btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC))
2729 trim_state = BTRFS_TRIM_STATE_TRIMMED;
2730
290ef19a 2731 return __btrfs_add_free_space(block_group, bytenr, size, trim_state);
b0643e59
DZ
2732}
2733
32da5386 2734int btrfs_remove_free_space(struct btrfs_block_group *block_group,
6226cb0a 2735 u64 offset, u64 bytes)
0f9dd46c 2736{
34d52cb6 2737 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
0f9dd46c 2738 struct btrfs_free_space *info;
b0175117
JB
2739 int ret;
2740 bool re_search = false;
0f9dd46c 2741
011b41bf
NA
2742 if (btrfs_is_zoned(block_group->fs_info)) {
2743 /*
2744 * This can happen with conventional zones when replaying log.
2745 * Since the allocation info of tree-log nodes are not recorded
2746 * to the extent-tree, calculate_alloc_pointer() failed to
2747 * advance the allocation pointer after last allocated tree log
2748 * node blocks.
2749 *
2750 * This function is called from
2751 * btrfs_pin_extent_for_log_replay() when replaying the log.
2752 * Advance the pointer not to overwrite the tree-log nodes.
2753 */
0ae79c6f
NA
2754 if (block_group->start + block_group->alloc_offset <
2755 offset + bytes) {
2756 block_group->alloc_offset =
2757 offset + bytes - block_group->start;
2758 }
169e0da9 2759 return 0;
011b41bf 2760 }
169e0da9 2761
34d52cb6 2762 spin_lock(&ctl->tree_lock);
6226cb0a 2763
96303081 2764again:
b0175117 2765 ret = 0;
bdb7d303
JB
2766 if (!bytes)
2767 goto out_lock;
2768
34d52cb6 2769 info = tree_search_offset(ctl, offset, 0, 0);
96303081 2770 if (!info) {
6606bb97
JB
2771 /*
2772 * oops didn't find an extent that matched the space we wanted
2773 * to remove, look for a bitmap instead
2774 */
34d52cb6 2775 info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
6606bb97
JB
2776 1, 0);
2777 if (!info) {
b0175117
JB
2778 /*
2779 * If we found a partial bit of our free space in a
2780 * bitmap but then couldn't find the other part this may
2781 * be a problem, so WARN about it.
24a70313 2782 */
b0175117 2783 WARN_ON(re_search);
6606bb97
JB
2784 goto out_lock;
2785 }
96303081
JB
2786 }
2787
b0175117 2788 re_search = false;
bdb7d303 2789 if (!info->bitmap) {
32e1649b 2790 unlink_free_space(ctl, info, true);
bdb7d303
JB
2791 if (offset == info->offset) {
2792 u64 to_free = min(bytes, info->bytes);
2793
2794 info->bytes -= to_free;
2795 info->offset += to_free;
2796 if (info->bytes) {
2797 ret = link_free_space(ctl, info);
2798 WARN_ON(ret);
2799 } else {
2800 kmem_cache_free(btrfs_free_space_cachep, info);
2801 }
0f9dd46c 2802
bdb7d303
JB
2803 offset += to_free;
2804 bytes -= to_free;
2805 goto again;
2806 } else {
2807 u64 old_end = info->bytes + info->offset;
9b49c9b9 2808
bdb7d303 2809 info->bytes = offset - info->offset;
34d52cb6 2810 ret = link_free_space(ctl, info);
96303081
JB
2811 WARN_ON(ret);
2812 if (ret)
2813 goto out_lock;
96303081 2814
bdb7d303
JB
2815 /* Not enough bytes in this entry to satisfy us */
2816 if (old_end < offset + bytes) {
2817 bytes -= old_end - offset;
2818 offset = old_end;
2819 goto again;
2820 } else if (old_end == offset + bytes) {
2821 /* all done */
2822 goto out_lock;
2823 }
2824 spin_unlock(&ctl->tree_lock);
2825
290ef19a 2826 ret = __btrfs_add_free_space(block_group,
a7ccb255
DZ
2827 offset + bytes,
2828 old_end - (offset + bytes),
2829 info->trim_state);
bdb7d303
JB
2830 WARN_ON(ret);
2831 goto out;
2832 }
0f9dd46c 2833 }
96303081 2834
34d52cb6 2835 ret = remove_from_bitmap(ctl, info, &offset, &bytes);
b0175117
JB
2836 if (ret == -EAGAIN) {
2837 re_search = true;
96303081 2838 goto again;
b0175117 2839 }
96303081 2840out_lock:
66b53bae 2841 btrfs_discard_update_discardable(block_group);
34d52cb6 2842 spin_unlock(&ctl->tree_lock);
0f9dd46c 2843out:
25179201
JB
2844 return ret;
2845}
2846
32da5386 2847void btrfs_dump_free_space(struct btrfs_block_group *block_group,
0f9dd46c
JB
2848 u64 bytes)
2849{
0b246afa 2850 struct btrfs_fs_info *fs_info = block_group->fs_info;
34d52cb6 2851 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
0f9dd46c
JB
2852 struct btrfs_free_space *info;
2853 struct rb_node *n;
2854 int count = 0;
2855
169e0da9
NA
2856 /*
2857 * Zoned btrfs does not use the free space tree or clusters. Just print
2858 * out the free space after the allocation offset.
2859 */
2860 if (btrfs_is_zoned(fs_info)) {
afba2bc0
NA
2861 btrfs_info(fs_info, "free space %llu active %d",
2862 block_group->zone_capacity - block_group->alloc_offset,
2863 block_group->zone_is_active);
169e0da9
NA
2864 return;
2865 }
2866
9084cb6a 2867 spin_lock(&ctl->tree_lock);
34d52cb6 2868 for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) {
0f9dd46c 2869 info = rb_entry(n, struct btrfs_free_space, offset_index);
f6175efa 2870 if (info->bytes >= bytes && !block_group->ro)
0f9dd46c 2871 count++;
0b246afa 2872 btrfs_crit(fs_info, "entry offset %llu, bytes %llu, bitmap %s",
efe120a0 2873 info->offset, info->bytes,
96303081 2874 (info->bitmap) ? "yes" : "no");
0f9dd46c 2875 }
9084cb6a 2876 spin_unlock(&ctl->tree_lock);
0b246afa 2877 btrfs_info(fs_info, "block group has cluster?: %s",
96303081 2878 list_empty(&block_group->cluster_list) ? "no" : "yes");
0b246afa 2879 btrfs_info(fs_info,
efe120a0 2880 "%d blocks of free space at or bigger than bytes is", count);
0f9dd46c
JB
2881}
2882
cd79909b
JB
2883void btrfs_init_free_space_ctl(struct btrfs_block_group *block_group,
2884 struct btrfs_free_space_ctl *ctl)
0f9dd46c 2885{
0b246afa 2886 struct btrfs_fs_info *fs_info = block_group->fs_info;
0f9dd46c 2887
34d52cb6 2888 spin_lock_init(&ctl->tree_lock);
0b246afa 2889 ctl->unit = fs_info->sectorsize;
b3470b5d 2890 ctl->start = block_group->start;
364be842 2891 ctl->block_group = block_group;
34d52cb6 2892 ctl->op = &free_space_op;
59c7b566 2893 ctl->free_space_bytes = RB_ROOT_CACHED;
55507ce3
FM
2894 INIT_LIST_HEAD(&ctl->trimming_ranges);
2895 mutex_init(&ctl->cache_writeout_mutex);
0f9dd46c 2896
34d52cb6
LZ
2897 /*
2898 * we only want to have 32k of ram per block group for keeping
2899 * track of free space, and if we pass 1/2 of that we want to
2900 * start converting things over to using bitmaps
2901 */
ee22184b 2902 ctl->extents_thresh = (SZ_32K / 2) / sizeof(struct btrfs_free_space);
0f9dd46c
JB
2903}
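/*
 * Illustrative sketch, not part of the kernel source: the extents_thresh
 * arithmetic above. The per-entry footprint below is an assumed, illustrative
 * value - the real sizeof(struct btrfs_free_space) depends on the kernel
 * build - but the shape of the calculation is the same: half of the 32K
 * budget divided by the cost of one extent entry.
 */
#include <stdio.h>

int main(void)
{
	unsigned long budget = 32 * 1024 / 2;	/* SZ_32K / 2 */
	unsigned long entry_size = 72;		/* assumed footprint, illustration only */

	printf("extents_thresh = %lu\n", budget / entry_size);	/* ~227 entries */
	return 0;
}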
2904
fa9c0d79
CM
2905/*
2906 * for a given cluster, put all of its extents back into the free
2907 * space cache. If the block group passed doesn't match the block group
2908 * pointed to by the cluster, someone else raced in and freed the
2909 * cluster already. In that case, we just return without changing anything
2910 */
69b0e093 2911static void __btrfs_return_cluster_to_free_space(
32da5386 2912 struct btrfs_block_group *block_group,
fa9c0d79
CM
2913 struct btrfs_free_cluster *cluster)
2914{
34d52cb6 2915 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
fa9c0d79
CM
2916 struct btrfs_free_space *entry;
2917 struct rb_node *node;
2918
2919 spin_lock(&cluster->lock);
95c85fba
JB
2920 if (cluster->block_group != block_group) {
2921 spin_unlock(&cluster->lock);
2922 return;
2923 }
fa9c0d79 2924
96303081 2925 cluster->block_group = NULL;
fa9c0d79 2926 cluster->window_start = 0;
96303081 2927 list_del_init(&cluster->block_group_list);
96303081 2928
fa9c0d79 2929 node = rb_first(&cluster->root);
96303081 2930 while (node) {
4e69b598
JB
2931 bool bitmap;
2932
fa9c0d79
CM
2933 entry = rb_entry(node, struct btrfs_free_space, offset_index);
2934 node = rb_next(&entry->offset_index);
2935 rb_erase(&entry->offset_index, &cluster->root);
20005523 2936 RB_CLEAR_NODE(&entry->offset_index);
4e69b598
JB
2937
2938 bitmap = (entry->bitmap != NULL);
20005523 2939 if (!bitmap) {
dfb79ddb 2940 /* Merging treats extents as if they were new */
5dc7c10b 2941 if (!btrfs_free_space_trimmed(entry)) {
dfb79ddb 2942 ctl->discardable_extents[BTRFS_STAT_CURR]--;
5dc7c10b
DZ
2943 ctl->discardable_bytes[BTRFS_STAT_CURR] -=
2944 entry->bytes;
2945 }
dfb79ddb 2946
34d52cb6 2947 try_merge_free_space(ctl, entry, false);
20005523 2948 steal_from_bitmap(ctl, entry, false);
dfb79ddb
DZ
2949
2950 /* As we insert directly, update these statistics */
5dc7c10b 2951 if (!btrfs_free_space_trimmed(entry)) {
dfb79ddb 2952 ctl->discardable_extents[BTRFS_STAT_CURR]++;
5dc7c10b
DZ
2953 ctl->discardable_bytes[BTRFS_STAT_CURR] +=
2954 entry->bytes;
2955 }
20005523 2956 }
34d52cb6 2957 tree_insert_offset(&ctl->free_space_offset,
4e69b598 2958 entry->offset, &entry->offset_index, bitmap);
59c7b566
JB
2959 rb_add_cached(&entry->bytes_index, &ctl->free_space_bytes,
2960 entry_less);
fa9c0d79 2961 }
6bef4d31 2962 cluster->root = RB_ROOT;
fa9c0d79 2963 spin_unlock(&cluster->lock);
96303081 2964 btrfs_put_block_group(block_group);
fa9c0d79
CM
2965}
2966
48a3b636
ES
2967static void __btrfs_remove_free_space_cache_locked(
2968 struct btrfs_free_space_ctl *ctl)
0f9dd46c
JB
2969{
2970 struct btrfs_free_space *info;
2971 struct rb_node *node;
581bb050 2972
581bb050
LZ
2973 while ((node = rb_last(&ctl->free_space_offset)) != NULL) {
2974 info = rb_entry(node, struct btrfs_free_space, offset_index);
9b90f513 2975 if (!info->bitmap) {
32e1649b 2976 unlink_free_space(ctl, info, true);
9b90f513
JB
2977 kmem_cache_free(btrfs_free_space_cachep, info);
2978 } else {
2979 free_bitmap(ctl, info);
2980 }
351810c1
DS
2981
2982 cond_resched_lock(&ctl->tree_lock);
581bb050 2983 }
09655373
CM
2984}
2985
2986void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl)
2987{
2988 spin_lock(&ctl->tree_lock);
2989 __btrfs_remove_free_space_cache_locked(ctl);
364be842
NB
2990 if (ctl->block_group)
2991 btrfs_discard_update_discardable(ctl->block_group);
581bb050
LZ
2992 spin_unlock(&ctl->tree_lock);
2993}
2994
32da5386 2995void btrfs_remove_free_space_cache(struct btrfs_block_group *block_group)
581bb050
LZ
2996{
2997 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
fa9c0d79 2998 struct btrfs_free_cluster *cluster;
96303081 2999 struct list_head *head;
0f9dd46c 3000
34d52cb6 3001 spin_lock(&ctl->tree_lock);
96303081
JB
3002 while ((head = block_group->cluster_list.next) !=
3003 &block_group->cluster_list) {
3004 cluster = list_entry(head, struct btrfs_free_cluster,
3005 block_group_list);
fa9c0d79
CM
3006
3007 WARN_ON(cluster->block_group != block_group);
3008 __btrfs_return_cluster_to_free_space(block_group, cluster);
351810c1
DS
3009
3010 cond_resched_lock(&ctl->tree_lock);
fa9c0d79 3011 }
09655373 3012 __btrfs_remove_free_space_cache_locked(ctl);
66b53bae 3013 btrfs_discard_update_discardable(block_group);
34d52cb6 3014 spin_unlock(&ctl->tree_lock);
fa9c0d79 3015
0f9dd46c
JB
3016}
3017
6e80d4f8
DZ
3018/**
3019 * btrfs_is_free_space_trimmed - see if everything is trimmed
3020 * @block_group: block_group of interest
3021 *
3022 * Walk @block_group's free space rb_tree to determine if everything is trimmed.
3023 */
3024bool btrfs_is_free_space_trimmed(struct btrfs_block_group *block_group)
3025{
3026 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3027 struct btrfs_free_space *info;
3028 struct rb_node *node;
3029 bool ret = true;
3030
3031 spin_lock(&ctl->tree_lock);
3032 node = rb_first(&ctl->free_space_offset);
3033
3034 while (node) {
3035 info = rb_entry(node, struct btrfs_free_space, offset_index);
3036
3037 if (!btrfs_free_space_trimmed(info)) {
3038 ret = false;
3039 break;
3040 }
3041
3042 node = rb_next(node);
3043 }
3044
3045 spin_unlock(&ctl->tree_lock);
3046 return ret;
3047}
3048
32da5386 3049u64 btrfs_find_space_for_alloc(struct btrfs_block_group *block_group,
a4820398
MX
3050 u64 offset, u64 bytes, u64 empty_size,
3051 u64 *max_extent_size)
0f9dd46c 3052{
34d52cb6 3053 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
9ddf648f
DZ
3054 struct btrfs_discard_ctl *discard_ctl =
3055 &block_group->fs_info->discard_ctl;
6226cb0a 3056 struct btrfs_free_space *entry = NULL;
96303081 3057 u64 bytes_search = bytes + empty_size;
6226cb0a 3058 u64 ret = 0;
53b381b3
DW
3059 u64 align_gap = 0;
3060 u64 align_gap_len = 0;
a7ccb255 3061 enum btrfs_trim_state align_gap_trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
59c7b566 3062 bool use_bytes_index = (offset == block_group->start);
0f9dd46c 3063
2eda5708
NA
3064 ASSERT(!btrfs_is_zoned(block_group->fs_info));
3065
34d52cb6 3066 spin_lock(&ctl->tree_lock);
53b381b3 3067 entry = find_free_space(ctl, &offset, &bytes_search,
59c7b566
JB
3068 block_group->full_stripe_len, max_extent_size,
3069 use_bytes_index);
6226cb0a 3070 if (!entry)
96303081
JB
3071 goto out;
3072
3073 ret = offset;
3074 if (entry->bitmap) {
f594f13c 3075 bitmap_clear_bits(ctl, entry, offset, bytes, true);
9ddf648f
DZ
3076
3077 if (!btrfs_free_space_trimmed(entry))
3078 atomic64_add(bytes, &discard_ctl->discard_bytes_saved);
3079
edf6e2d1 3080 if (!entry->bytes)
34d52cb6 3081 free_bitmap(ctl, entry);
96303081 3082 } else {
32e1649b 3083 unlink_free_space(ctl, entry, true);
53b381b3
DW
3084 align_gap_len = offset - entry->offset;
3085 align_gap = entry->offset;
a7ccb255 3086 align_gap_trim_state = entry->trim_state;
53b381b3 3087
9ddf648f
DZ
3088 if (!btrfs_free_space_trimmed(entry))
3089 atomic64_add(bytes, &discard_ctl->discard_bytes_saved);
3090
53b381b3
DW
3091 entry->offset = offset + bytes;
3092 WARN_ON(entry->bytes < bytes + align_gap_len);
3093
3094 entry->bytes -= bytes + align_gap_len;
6226cb0a 3095 if (!entry->bytes)
dc89e982 3096 kmem_cache_free(btrfs_free_space_cachep, entry);
6226cb0a 3097 else
34d52cb6 3098 link_free_space(ctl, entry);
6226cb0a 3099 }
96303081 3100out:
66b53bae 3101 btrfs_discard_update_discardable(block_group);
34d52cb6 3102 spin_unlock(&ctl->tree_lock);
817d52f8 3103
53b381b3 3104 if (align_gap_len)
290ef19a 3105 __btrfs_add_free_space(block_group, align_gap, align_gap_len,
a7ccb255 3106 align_gap_trim_state);
0f9dd46c
JB
3107 return ret;
3108}
fa9c0d79
CM
3109
3110/*
3111 * given a cluster, put all of its extents back into the free space
3112 * cache. If a block group is passed, this function will only free
3113 * a cluster that belongs to the passed block group.
3114 *
3115 * Otherwise, it'll get a reference on the block group pointed to by the
3116 * cluster and remove the cluster from it.
3117 */
69b0e093 3118void btrfs_return_cluster_to_free_space(
32da5386 3119 struct btrfs_block_group *block_group,
fa9c0d79
CM
3120 struct btrfs_free_cluster *cluster)
3121{
34d52cb6 3122 struct btrfs_free_space_ctl *ctl;
fa9c0d79
CM
3123
3124 /* first, get a safe pointer to the block group */
3125 spin_lock(&cluster->lock);
3126 if (!block_group) {
3127 block_group = cluster->block_group;
3128 if (!block_group) {
3129 spin_unlock(&cluster->lock);
69b0e093 3130 return;
fa9c0d79
CM
3131 }
3132 } else if (cluster->block_group != block_group) {
3133 /* someone else has already freed it don't redo their work */
3134 spin_unlock(&cluster->lock);
69b0e093 3135 return;
fa9c0d79 3136 }
b5790d51 3137 btrfs_get_block_group(block_group);
fa9c0d79
CM
3138 spin_unlock(&cluster->lock);
3139
34d52cb6
LZ
3140 ctl = block_group->free_space_ctl;
3141
fa9c0d79 3142 /* now return any extents the cluster had on it */
34d52cb6 3143 spin_lock(&ctl->tree_lock);
69b0e093 3144 __btrfs_return_cluster_to_free_space(block_group, cluster);
34d52cb6 3145 spin_unlock(&ctl->tree_lock);
fa9c0d79 3146
6e80d4f8
DZ
3147 btrfs_discard_queue_work(&block_group->fs_info->discard_ctl, block_group);
3148
fa9c0d79
CM
3149 /* finally drop our ref */
3150 btrfs_put_block_group(block_group);
fa9c0d79
CM
3151}
3152
32da5386 3153static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group *block_group,
96303081 3154 struct btrfs_free_cluster *cluster,
4e69b598 3155 struct btrfs_free_space *entry,
a4820398
MX
3156 u64 bytes, u64 min_start,
3157 u64 *max_extent_size)
96303081 3158{
34d52cb6 3159 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
96303081
JB
3160 int err;
3161 u64 search_start = cluster->window_start;
3162 u64 search_bytes = bytes;
3163 u64 ret = 0;
3164
96303081
JB
3165 search_start = min_start;
3166 search_bytes = bytes;
3167
0584f718 3168 err = search_bitmap(ctl, entry, &search_start, &search_bytes, true);
a4820398 3169 if (err) {
ad22cf6e
JB
3170 *max_extent_size = max(get_max_extent_size(entry),
3171 *max_extent_size);
4e69b598 3172 return 0;
a4820398 3173 }
96303081
JB
3174
3175 ret = search_start;
f594f13c 3176 bitmap_clear_bits(ctl, entry, ret, bytes, false);
96303081
JB
3177
3178 return ret;
3179}
3180
fa9c0d79
CM
3181/*
3182 * given a cluster, try to allocate 'bytes' from it, returns 0
3183 * if it couldn't find anything suitably large, or a logical disk offset
3184 * if things worked out
3185 */
32da5386 3186u64 btrfs_alloc_from_cluster(struct btrfs_block_group *block_group,
fa9c0d79 3187 struct btrfs_free_cluster *cluster, u64 bytes,
a4820398 3188 u64 min_start, u64 *max_extent_size)
fa9c0d79 3189{
34d52cb6 3190 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
9ddf648f
DZ
3191 struct btrfs_discard_ctl *discard_ctl =
3192 &block_group->fs_info->discard_ctl;
fa9c0d79
CM
3193 struct btrfs_free_space *entry = NULL;
3194 struct rb_node *node;
3195 u64 ret = 0;
3196
2eda5708
NA
3197 ASSERT(!btrfs_is_zoned(block_group->fs_info));
3198
fa9c0d79
CM
3199 spin_lock(&cluster->lock);
3200 if (bytes > cluster->max_size)
3201 goto out;
3202
3203 if (cluster->block_group != block_group)
3204 goto out;
3205
3206 node = rb_first(&cluster->root);
3207 if (!node)
3208 goto out;
3209
3210 entry = rb_entry(node, struct btrfs_free_space, offset_index);
67871254 3211 while (1) {
ad22cf6e
JB
3212 if (entry->bytes < bytes)
3213 *max_extent_size = max(get_max_extent_size(entry),
3214 *max_extent_size);
a4820398 3215
4e69b598
JB
3216 if (entry->bytes < bytes ||
3217 (!entry->bitmap && entry->offset < min_start)) {
fa9c0d79
CM
3218 node = rb_next(&entry->offset_index);
3219 if (!node)
3220 break;
3221 entry = rb_entry(node, struct btrfs_free_space,
3222 offset_index);
3223 continue;
3224 }
fa9c0d79 3225
4e69b598
JB
3226 if (entry->bitmap) {
3227 ret = btrfs_alloc_from_bitmap(block_group,
3228 cluster, entry, bytes,
a4820398
MX
3229 cluster->window_start,
3230 max_extent_size);
4e69b598 3231 if (ret == 0) {
4e69b598
JB
3232 node = rb_next(&entry->offset_index);
3233 if (!node)
3234 break;
3235 entry = rb_entry(node, struct btrfs_free_space,
3236 offset_index);
3237 continue;
3238 }
9b230628 3239 cluster->window_start += bytes;
4e69b598 3240 } else {
4e69b598
JB
3241 ret = entry->offset;
3242
3243 entry->offset += bytes;
3244 entry->bytes -= bytes;
3245 }
fa9c0d79 3246
fa9c0d79
CM
3247 break;
3248 }
3249out:
3250 spin_unlock(&cluster->lock);
96303081 3251
5e71b5d5
LZ
3252 if (!ret)
3253 return 0;
3254
34d52cb6 3255 spin_lock(&ctl->tree_lock);
5e71b5d5 3256
9ddf648f
DZ
3257 if (!btrfs_free_space_trimmed(entry))
3258 atomic64_add(bytes, &discard_ctl->discard_bytes_saved);
3259
34d52cb6 3260 ctl->free_space -= bytes;
5dc7c10b
DZ
3261 if (!entry->bitmap && !btrfs_free_space_trimmed(entry))
3262 ctl->discardable_bytes[BTRFS_STAT_CURR] -= bytes;
3c179165
NB
3263
3264 spin_lock(&cluster->lock);
5e71b5d5 3265 if (entry->bytes == 0) {
3c179165 3266 rb_erase(&entry->offset_index, &cluster->root);
34d52cb6 3267 ctl->free_extents--;
4e69b598 3268 if (entry->bitmap) {
3acd4850
CL
3269 kmem_cache_free(btrfs_free_space_bitmap_cachep,
3270 entry->bitmap);
34d52cb6 3271 ctl->total_bitmaps--;
fa598b06 3272 recalculate_thresholds(ctl);
dfb79ddb
DZ
3273 } else if (!btrfs_free_space_trimmed(entry)) {
3274 ctl->discardable_extents[BTRFS_STAT_CURR]--;
4e69b598 3275 }
dc89e982 3276 kmem_cache_free(btrfs_free_space_cachep, entry);
5e71b5d5
LZ
3277 }
3278
3c179165 3279 spin_unlock(&cluster->lock);
34d52cb6 3280 spin_unlock(&ctl->tree_lock);
5e71b5d5 3281
fa9c0d79
CM
3282 return ret;
3283}
3284
32da5386 3285static int btrfs_bitmap_cluster(struct btrfs_block_group *block_group,
96303081
JB
3286 struct btrfs_free_space *entry,
3287 struct btrfs_free_cluster *cluster,
1bb91902
AO
3288 u64 offset, u64 bytes,
3289 u64 cont1_bytes, u64 min_bytes)
96303081 3290{
34d52cb6 3291 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
96303081
JB
3292 unsigned long next_zero;
3293 unsigned long i;
1bb91902
AO
3294 unsigned long want_bits;
3295 unsigned long min_bits;
96303081 3296 unsigned long found_bits;
cef40483 3297 unsigned long max_bits = 0;
96303081
JB
3298 unsigned long start = 0;
3299 unsigned long total_found = 0;
4e69b598 3300 int ret;
96303081 3301
96009762 3302 i = offset_to_bit(entry->offset, ctl->unit,
96303081 3303 max_t(u64, offset, entry->offset));
96009762
WSH
3304 want_bits = bytes_to_bits(bytes, ctl->unit);
3305 min_bits = bytes_to_bits(min_bytes, ctl->unit);
96303081 3306
cef40483
JB
3307 /*
3308 * Don't bother looking for a cluster in this bitmap if it's heavily
3309 * fragmented.
3310 */
3311 if (entry->max_extent_size &&
3312 entry->max_extent_size < cont1_bytes)
3313 return -ENOSPC;
96303081
JB
3314again:
3315 found_bits = 0;
ebb3dad4 3316 for_each_set_bit_from(i, entry->bitmap, BITS_PER_BITMAP) {
96303081
JB
3317 next_zero = find_next_zero_bit(entry->bitmap,
3318 BITS_PER_BITMAP, i);
1bb91902 3319 if (next_zero - i >= min_bits) {
96303081 3320 found_bits = next_zero - i;
cef40483
JB
3321 if (found_bits > max_bits)
3322 max_bits = found_bits;
96303081
JB
3323 break;
3324 }
cef40483
JB
3325 if (next_zero - i > max_bits)
3326 max_bits = next_zero - i;
96303081
JB
3327 i = next_zero;
3328 }
3329
cef40483
JB
3330 if (!found_bits) {
3331 entry->max_extent_size = (u64)max_bits * ctl->unit;
4e69b598 3332 return -ENOSPC;
cef40483 3333 }
96303081 3334
1bb91902 3335 if (!total_found) {
96303081 3336 start = i;
b78d09bc 3337 cluster->max_size = 0;
96303081
JB
3338 }
3339
3340 total_found += found_bits;
3341
96009762
WSH
3342 if (cluster->max_size < found_bits * ctl->unit)
3343 cluster->max_size = found_bits * ctl->unit;
96303081 3344
1bb91902
AO
3345 if (total_found < want_bits || cluster->max_size < cont1_bytes) {
3346 i = next_zero + 1;
96303081
JB
3347 goto again;
3348 }
3349
96009762 3350 cluster->window_start = start * ctl->unit + entry->offset;
34d52cb6 3351 rb_erase(&entry->offset_index, &ctl->free_space_offset);
59c7b566
JB
3352 rb_erase_cached(&entry->bytes_index, &ctl->free_space_bytes);
3353
3354 /*
3355 * We need to know if we're currently on the normal space index when we
3356 * manipulate the bitmap so that we know we need to remove and re-insert
3357 * it into the space_index tree. Clear the bytes_index node here so the
3358 * bitmap manipulation helpers know not to mess with the space_index
3359 * until this bitmap entry is added back into the normal cache.
3360 */
3361 RB_CLEAR_NODE(&entry->bytes_index);
3362
4e69b598
JB
3363 ret = tree_insert_offset(&cluster->root, entry->offset,
3364 &entry->offset_index, 1);
b12d6869 3365 ASSERT(!ret); /* -EEXIST; Logic error */
96303081 3366
3f7de037 3367 trace_btrfs_setup_cluster(block_group, cluster,
96009762 3368 total_found * ctl->unit, 1);
96303081
JB
3369 return 0;
3370}
3371
4e69b598
JB
3372/*
3373 * This searches the block group for just extents to fill the cluster with.
1bb91902
AO
3374 * Try to find a cluster with at least bytes total bytes, at least one
3375 * extent of cont1_bytes, and other extents of at least min_bytes.
4e69b598 3376 */
3de85bb9 3377static noinline int
32da5386 3378setup_cluster_no_bitmap(struct btrfs_block_group *block_group,
3de85bb9
JB
3379 struct btrfs_free_cluster *cluster,
3380 struct list_head *bitmaps, u64 offset, u64 bytes,
1bb91902 3381 u64 cont1_bytes, u64 min_bytes)
4e69b598 3382{
34d52cb6 3383 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
4e69b598
JB
3384 struct btrfs_free_space *first = NULL;
3385 struct btrfs_free_space *entry = NULL;
4e69b598
JB
3386 struct btrfs_free_space *last;
3387 struct rb_node *node;
4e69b598
JB
3388 u64 window_free;
3389 u64 max_extent;
3f7de037 3390 u64 total_size = 0;
4e69b598 3391
34d52cb6 3392 entry = tree_search_offset(ctl, offset, 0, 1);
4e69b598
JB
3393 if (!entry)
3394 return -ENOSPC;
3395
3396 /*
3397 * We don't want bitmaps, so just move along until we find a normal
3398 * extent entry.
3399 */
1bb91902
AO
3400 while (entry->bitmap || entry->bytes < min_bytes) {
3401 if (entry->bitmap && list_empty(&entry->list))
86d4a77b 3402 list_add_tail(&entry->list, bitmaps);
4e69b598
JB
3403 node = rb_next(&entry->offset_index);
3404 if (!node)
3405 return -ENOSPC;
3406 entry = rb_entry(node, struct btrfs_free_space, offset_index);
3407 }
3408
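/*
 * Accumulate a window of usable extents starting at the first one:
 * window_free is the total free space seen and max_extent the largest
 * single extent.  The cluster only succeeds if window_free >= bytes and
 * max_extent >= cont1_bytes.
 */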
4e69b598
JB
3409 window_free = entry->bytes;
3410 max_extent = entry->bytes;
3411 first = entry;
3412 last = entry;
4e69b598 3413
1bb91902
AO
3414 for (node = rb_next(&entry->offset_index); node;
3415 node = rb_next(&entry->offset_index)) {
4e69b598
JB
3416 entry = rb_entry(node, struct btrfs_free_space, offset_index);
3417
86d4a77b
JB
3418 if (entry->bitmap) {
3419 if (list_empty(&entry->list))
3420 list_add_tail(&entry->list, bitmaps);
4e69b598 3421 continue;
86d4a77b
JB
3422 }
3423
1bb91902
AO
3424 if (entry->bytes < min_bytes)
3425 continue;
3426
3427 last = entry;
3428 window_free += entry->bytes;
3429 if (entry->bytes > max_extent)
4e69b598 3430 max_extent = entry->bytes;
4e69b598
JB
3431 }
3432
1bb91902
AO
3433 if (window_free < bytes || max_extent < cont1_bytes)
3434 return -ENOSPC;
3435
4e69b598
JB
3436 cluster->window_start = first->offset;
3437
3438 node = &first->offset_index;
3439
3440 /*
3441 * now we've found our entries, pull them out of the free space
3442 * cache and put them into the cluster rbtree
3443 */
3444 do {
3445 int ret;
3446
3447 entry = rb_entry(node, struct btrfs_free_space, offset_index);
3448 node = rb_next(&entry->offset_index);
1bb91902 3449 if (entry->bitmap || entry->bytes < min_bytes)
4e69b598
JB
3450 continue;
3451
34d52cb6 3452 rb_erase(&entry->offset_index, &ctl->free_space_offset);
59c7b566 3453 rb_erase_cached(&entry->bytes_index, &ctl->free_space_bytes);
4e69b598
JB
3454 ret = tree_insert_offset(&cluster->root, entry->offset,
3455 &entry->offset_index, 0);
3f7de037 3456 total_size += entry->bytes;
b12d6869 3457 ASSERT(!ret); /* -EEXIST; Logic error */
4e69b598
JB
3458 } while (node && entry != last);
3459
3460 cluster->max_size = max_extent;
3f7de037 3461 trace_btrfs_setup_cluster(block_group, cluster, total_size, 0);
4e69b598
JB
3462 return 0;
3463}
3464
3465/*
3466 * This specifically looks for bitmaps that may work in the cluster; we assume
3467 * that we have already failed to find extents that will work.
3468 */
3de85bb9 3469static noinline int
32da5386 3470setup_cluster_bitmap(struct btrfs_block_group *block_group,
3de85bb9
JB
3471 struct btrfs_free_cluster *cluster,
3472 struct list_head *bitmaps, u64 offset, u64 bytes,
1bb91902 3473 u64 cont1_bytes, u64 min_bytes)
4e69b598 3474{
34d52cb6 3475 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
1b9b922a 3476 struct btrfs_free_space *entry = NULL;
4e69b598 3477 int ret = -ENOSPC;
0f0fbf1d 3478 u64 bitmap_offset = offset_to_bitmap(ctl, offset);
4e69b598 3479
34d52cb6 3480 if (ctl->total_bitmaps == 0)
4e69b598
JB
3481 return -ENOSPC;
3482
0f0fbf1d
LZ
3483 /*
3484 * The bitmap that covers offset won't be in the list unless offset
3485 * is just its start offset.
3486 */
1b9b922a
CM
3487 if (!list_empty(bitmaps))
3488 entry = list_first_entry(bitmaps, struct btrfs_free_space, list);
3489
3490 if (!entry || entry->offset != bitmap_offset) {
0f0fbf1d
LZ
3491 entry = tree_search_offset(ctl, bitmap_offset, 1, 0);
3492 if (entry && list_empty(&entry->list))
3493 list_add(&entry->list, bitmaps);
3494 }
3495
86d4a77b 3496 list_for_each_entry(entry, bitmaps, list) {
357b9784 3497 if (entry->bytes < bytes)
86d4a77b
JB
3498 continue;
3499 ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset,
1bb91902 3500 bytes, cont1_bytes, min_bytes);
86d4a77b
JB
3501 if (!ret)
3502 return 0;
3503 }
3504
3505 /*
52621cb6
LZ
3506 * The bitmaps list has all the bitmaps that record free space
3507 * starting after offset, so no more search is required.
86d4a77b 3508 */
52621cb6 3509 return -ENOSPC;
4e69b598
JB
3510}
3511
fa9c0d79
CM
3512/*
3513 * Here we try to find a cluster of blocks in a block group. The goal
3514 * is to find at least bytes + empty_size bytes of free space.
fa9c0d79
CM
3515 * We might not find them all in one contiguous area.
3516 *
3517 * Returns zero and sets up the cluster if things worked out, otherwise
3518 * it returns -ENOSPC.
3519 */
32da5386 3520int btrfs_find_space_cluster(struct btrfs_block_group *block_group,
fa9c0d79
CM
3521 struct btrfs_free_cluster *cluster,
3522 u64 offset, u64 bytes, u64 empty_size)
3523{
2ceeae2e 3524 struct btrfs_fs_info *fs_info = block_group->fs_info;
34d52cb6 3525 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
86d4a77b 3526 struct btrfs_free_space *entry, *tmp;
52621cb6 3527 LIST_HEAD(bitmaps);
fa9c0d79 3528 u64 min_bytes;
1bb91902 3529 u64 cont1_bytes;
fa9c0d79
CM
3530 int ret;
3531
1bb91902
AO
3532 /*
3533 * Choose the minimum extent size we'll require for this
3534 * cluster. For SSD_SPREAD, don't allow any fragmentation.
3535 * For metadata, allow allocations with smaller extents. For
3536 * data, keep it dense.
3537 */
0b246afa 3538 if (btrfs_test_opt(fs_info, SSD_SPREAD)) {
c1867eb3
DS
3539 cont1_bytes = bytes + empty_size;
3540 min_bytes = cont1_bytes;
451d7585 3541 } else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) {
1bb91902 3542 cont1_bytes = bytes;
0b246afa 3543 min_bytes = fs_info->sectorsize;
1bb91902
AO
3544 } else {
3545 cont1_bytes = max(bytes, (bytes + empty_size) >> 2);
0b246afa 3546 min_bytes = fs_info->sectorsize;
1bb91902 3547 }
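/*
 * For example, a 1MiB data allocation with a 7MiB empty_size on a
 * filesystem with a 4KiB sectorsize yields cont1_bytes =
 * max(1MiB, 8MiB >> 2) = 2MiB and min_bytes = 4KiB.
 */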
fa9c0d79 3548
34d52cb6 3549 spin_lock(&ctl->tree_lock);
7d0d2e8e
JB
3550
3551 /*
3552 * If we know we don't have enough space to make a cluster don't even
3553 * bother doing all the work to try and find one.
3554 */
1bb91902 3555 if (ctl->free_space < bytes) {
34d52cb6 3556 spin_unlock(&ctl->tree_lock);
7d0d2e8e
JB
3557 return -ENOSPC;
3558 }
3559
fa9c0d79
CM
3560 spin_lock(&cluster->lock);
3561
3562 /* someone already found a cluster, hooray */
3563 if (cluster->block_group) {
3564 ret = 0;
3565 goto out;
3566 }
fa9c0d79 3567
3f7de037
JB
3568 trace_btrfs_find_cluster(block_group, offset, bytes, empty_size,
3569 min_bytes);
3570
86d4a77b 3571 ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset,
1bb91902
AO
3572 bytes + empty_size,
3573 cont1_bytes, min_bytes);
4e69b598 3574 if (ret)
86d4a77b 3575 ret = setup_cluster_bitmap(block_group, cluster, &bitmaps,
1bb91902
AO
3576 offset, bytes + empty_size,
3577 cont1_bytes, min_bytes);
86d4a77b
JB
3578
3579 /* Clear our temporary list */
3580 list_for_each_entry_safe(entry, tmp, &bitmaps, list)
3581 list_del_init(&entry->list);
fa9c0d79 3582
4e69b598 3583 if (!ret) {
b5790d51 3584 btrfs_get_block_group(block_group);
4e69b598
JB
3585 list_add_tail(&cluster->block_group_list,
3586 &block_group->cluster_list);
3587 cluster->block_group = block_group;
3f7de037
JB
3588 } else {
3589 trace_btrfs_failed_cluster_setup(block_group);
fa9c0d79 3590 }
fa9c0d79
CM
3591out:
3592 spin_unlock(&cluster->lock);
34d52cb6 3593 spin_unlock(&ctl->tree_lock);
fa9c0d79
CM
3594
3595 return ret;
3596}
3597
3598/*
3599 * simple code to zero out a cluster
3600 */
3601void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
3602{
3603 spin_lock_init(&cluster->lock);
3604 spin_lock_init(&cluster->refill_lock);
6bef4d31 3605 cluster->root = RB_ROOT;
fa9c0d79 3606 cluster->max_size = 0;
c759c4e1 3607 cluster->fragmented = false;
fa9c0d79
CM
3608 INIT_LIST_HEAD(&cluster->block_group_list);
3609 cluster->block_group = NULL;
3610}
3611
32da5386 3612static int do_trimming(struct btrfs_block_group *block_group,
7fe1e641 3613 u64 *total_trimmed, u64 start, u64 bytes,
55507ce3 3614 u64 reserved_start, u64 reserved_bytes,
b0643e59 3615 enum btrfs_trim_state reserved_trim_state,
55507ce3 3616 struct btrfs_trim_range *trim_entry)
f7039b1d 3617{
7fe1e641 3618 struct btrfs_space_info *space_info = block_group->space_info;
f7039b1d 3619 struct btrfs_fs_info *fs_info = block_group->fs_info;
55507ce3 3620 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
7fe1e641
LZ
3621 int ret;
3622 int update = 0;
b0643e59
DZ
3623 const u64 end = start + bytes;
3624 const u64 reserved_end = reserved_start + reserved_bytes;
3625 enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
7fe1e641 3626 u64 trimmed = 0;
f7039b1d 3627
7fe1e641
LZ
3628 spin_lock(&space_info->lock);
3629 spin_lock(&block_group->lock);
3630 if (!block_group->ro) {
3631 block_group->reserved += reserved_bytes;
3632 space_info->bytes_reserved += reserved_bytes;
3633 update = 1;
3634 }
3635 spin_unlock(&block_group->lock);
3636 spin_unlock(&space_info->lock);
3637
2ff7e61e 3638 ret = btrfs_discard_extent(fs_info, start, bytes, &trimmed);
b0643e59 3639 if (!ret) {
7fe1e641 3640 *total_trimmed += trimmed;
b0643e59
DZ
3641 trim_state = BTRFS_TRIM_STATE_TRIMMED;
3642 }
7fe1e641 3643
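/*
 * Only part of the reserved range may have been discarded: give back the
 * untrimmed head ([reserved_start, start)) and tail ([end, reserved_end))
 * with their previous trim state, then re-add the discarded range itself
 * with the state computed above.  This is done under the
 * cache_writeout_mutex, where the range is also dropped from the
 * trimming_ranges list.
 */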
55507ce3 3644 mutex_lock(&ctl->cache_writeout_mutex);
b0643e59 3645 if (reserved_start < start)
290ef19a 3646 __btrfs_add_free_space(block_group, reserved_start,
b0643e59
DZ
3647 start - reserved_start,
3648 reserved_trim_state);
3649 if (start + bytes < reserved_start + reserved_bytes)
290ef19a 3650 __btrfs_add_free_space(block_group, end, reserved_end - end,
b0643e59 3651 reserved_trim_state);
290ef19a 3652 __btrfs_add_free_space(block_group, start, bytes, trim_state);
55507ce3
FM
3653 list_del(&trim_entry->list);
3654 mutex_unlock(&ctl->cache_writeout_mutex);
7fe1e641
LZ
3655
3656 if (update) {
3657 spin_lock(&space_info->lock);
3658 spin_lock(&block_group->lock);
3659 if (block_group->ro)
3660 space_info->bytes_readonly += reserved_bytes;
3661 block_group->reserved -= reserved_bytes;
3662 space_info->bytes_reserved -= reserved_bytes;
7fe1e641 3663 spin_unlock(&block_group->lock);
8f63a840 3664 spin_unlock(&space_info->lock);
7fe1e641
LZ
3665 }
3666
3667 return ret;
3668}
3669
2bee7eb8
DZ
3670/*
3671 * If @async is set, then we will trim 1 region and return.
3672 */
32da5386 3673static int trim_no_bitmap(struct btrfs_block_group *block_group,
2bee7eb8
DZ
3674 u64 *total_trimmed, u64 start, u64 end, u64 minlen,
3675 bool async)
7fe1e641 3676{
19b2a2c7
DZ
3677 struct btrfs_discard_ctl *discard_ctl =
3678 &block_group->fs_info->discard_ctl;
7fe1e641
LZ
3679 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3680 struct btrfs_free_space *entry;
3681 struct rb_node *node;
3682 int ret = 0;
3683 u64 extent_start;
3684 u64 extent_bytes;
b0643e59 3685 enum btrfs_trim_state extent_trim_state;
7fe1e641 3686 u64 bytes;
19b2a2c7 3687 const u64 max_discard_size = READ_ONCE(discard_ctl->max_discard_size);
f7039b1d
LD
3688
3689 while (start < end) {
55507ce3
FM
3690 struct btrfs_trim_range trim_entry;
3691
3692 mutex_lock(&ctl->cache_writeout_mutex);
34d52cb6 3693 spin_lock(&ctl->tree_lock);
f7039b1d 3694
2bee7eb8
DZ
3695 if (ctl->free_space < minlen)
3696 goto out_unlock;
f7039b1d 3697
34d52cb6 3698 entry = tree_search_offset(ctl, start, 0, 1);
2bee7eb8
DZ
3699 if (!entry)
3700 goto out_unlock;
f7039b1d 3701
2bee7eb8
DZ
3702 /* Skip bitmaps and if async, already trimmed entries */
3703 while (entry->bitmap ||
3704 (async && btrfs_free_space_trimmed(entry))) {
7fe1e641 3705 node = rb_next(&entry->offset_index);
2bee7eb8
DZ
3706 if (!node)
3707 goto out_unlock;
7fe1e641
LZ
3708 entry = rb_entry(node, struct btrfs_free_space,
3709 offset_index);
f7039b1d
LD
3710 }
3711
2bee7eb8
DZ
3712 if (entry->offset >= end)
3713 goto out_unlock;
f7039b1d 3714
7fe1e641
LZ
3715 extent_start = entry->offset;
3716 extent_bytes = entry->bytes;
b0643e59 3717 extent_trim_state = entry->trim_state;
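/*
 * In async mode the entry itself defines the range: it is either consumed
 * whole or clipped to max_discard_size with the remainder re-linked.  In
 * sync mode the whole entry is pulled out but only its overlap with
 * [start, end) is discarded; do_trimming() re-adds the untrimmed portions.
 */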
4aa9ad52
DZ
3718 if (async) {
3719 start = entry->offset;
3720 bytes = entry->bytes;
3721 if (bytes < minlen) {
3722 spin_unlock(&ctl->tree_lock);
3723 mutex_unlock(&ctl->cache_writeout_mutex);
3724 goto next;
3725 }
32e1649b 3726 unlink_free_space(ctl, entry, true);
7fe6d45e
DZ
3727 /*
3728 * Let bytes = BTRFS_MAX_DISCARD_SIZE + X.
3729 * If X < BTRFS_ASYNC_DISCARD_MIN_FILTER, we won't trim
3730 * X when we come back around. So trim it now.
3731 */
3732 if (max_discard_size &&
3733 bytes >= (max_discard_size +
3734 BTRFS_ASYNC_DISCARD_MIN_FILTER)) {
19b2a2c7
DZ
3735 bytes = max_discard_size;
3736 extent_bytes = max_discard_size;
3737 entry->offset += max_discard_size;
3738 entry->bytes -= max_discard_size;
4aa9ad52
DZ
3739 link_free_space(ctl, entry);
3740 } else {
3741 kmem_cache_free(btrfs_free_space_cachep, entry);
3742 }
3743 } else {
3744 start = max(start, extent_start);
3745 bytes = min(extent_start + extent_bytes, end) - start;
3746 if (bytes < minlen) {
3747 spin_unlock(&ctl->tree_lock);
3748 mutex_unlock(&ctl->cache_writeout_mutex);
3749 goto next;
3750 }
f7039b1d 3751
32e1649b 3752 unlink_free_space(ctl, entry, true);
4aa9ad52
DZ
3753 kmem_cache_free(btrfs_free_space_cachep, entry);
3754 }
7fe1e641 3755
34d52cb6 3756 spin_unlock(&ctl->tree_lock);
55507ce3
FM
3757 trim_entry.start = extent_start;
3758 trim_entry.bytes = extent_bytes;
3759 list_add_tail(&trim_entry.list, &ctl->trimming_ranges);
3760 mutex_unlock(&ctl->cache_writeout_mutex);
f7039b1d 3761
7fe1e641 3762 ret = do_trimming(block_group, total_trimmed, start, bytes,
b0643e59
DZ
3763 extent_start, extent_bytes, extent_trim_state,
3764 &trim_entry);
2bee7eb8
DZ
3765 if (ret) {
3766 block_group->discard_cursor = start + bytes;
7fe1e641 3767 break;
2bee7eb8 3768 }
7fe1e641
LZ
3769next:
3770 start += bytes;
2bee7eb8
DZ
3771 block_group->discard_cursor = start;
3772 if (async && *total_trimmed)
3773 break;
f7039b1d 3774
7fe1e641
LZ
3775 if (fatal_signal_pending(current)) {
3776 ret = -ERESTARTSYS;
3777 break;
3778 }
3779
3780 cond_resched();
3781 }
2bee7eb8
DZ
3782
3783 return ret;
3784
3785out_unlock:
3786 block_group->discard_cursor = btrfs_block_group_end(block_group);
3787 spin_unlock(&ctl->tree_lock);
3788 mutex_unlock(&ctl->cache_writeout_mutex);
3789
7fe1e641
LZ
3790 return ret;
3791}
3792
da080fe1
DZ
3793/*
3794 * If we break out of trimming a bitmap prematurely, we should reset the
3795 * trimming bit. In a rather contrived case, it's possible to race here so
3796 * reset the state to BTRFS_TRIM_STATE_UNTRIMMED.
3797 *
3798 * start = start of bitmap
3799 * end = near end of bitmap
3800 *
3801 * Thread 1: Thread 2:
3802 * trim_bitmaps(start)
3803 * trim_bitmaps(end)
3804 * end_trimming_bitmap()
3805 * reset_trimming_bitmap()
3806 */
3807static void reset_trimming_bitmap(struct btrfs_free_space_ctl *ctl, u64 offset)
3808{
3809 struct btrfs_free_space *entry;
3810
3811 spin_lock(&ctl->tree_lock);
3812 entry = tree_search_offset(ctl, offset, 1, 0);
dfb79ddb 3813 if (entry) {
5dc7c10b 3814 if (btrfs_free_space_trimmed(entry)) {
dfb79ddb
DZ
3815 ctl->discardable_extents[BTRFS_STAT_CURR] +=
3816 entry->bitmap_extents;
5dc7c10b
DZ
3817 ctl->discardable_bytes[BTRFS_STAT_CURR] += entry->bytes;
3818 }
da080fe1 3819 entry->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
dfb79ddb
DZ
3820 }
3821
da080fe1
DZ
3822 spin_unlock(&ctl->tree_lock);
3823}
3824
dfb79ddb
DZ
3825static void end_trimming_bitmap(struct btrfs_free_space_ctl *ctl,
3826 struct btrfs_free_space *entry)
da080fe1 3827{
dfb79ddb 3828 if (btrfs_free_space_trimming_bitmap(entry)) {
da080fe1 3829 entry->trim_state = BTRFS_TRIM_STATE_TRIMMED;
dfb79ddb
DZ
3830 ctl->discardable_extents[BTRFS_STAT_CURR] -=
3831 entry->bitmap_extents;
5dc7c10b 3832 ctl->discardable_bytes[BTRFS_STAT_CURR] -= entry->bytes;
dfb79ddb 3833 }
da080fe1
DZ
3834}
3835
2bee7eb8
DZ
3836/*
3837 * If @async is set, then we will trim 1 region and return.
3838 */
32da5386 3839static int trim_bitmaps(struct btrfs_block_group *block_group,
2bee7eb8 3840 u64 *total_trimmed, u64 start, u64 end, u64 minlen,
7fe6d45e 3841 u64 maxlen, bool async)
7fe1e641 3842{
19b2a2c7
DZ
3843 struct btrfs_discard_ctl *discard_ctl =
3844 &block_group->fs_info->discard_ctl;
7fe1e641
LZ
3845 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3846 struct btrfs_free_space *entry;
3847 int ret = 0;
3848 int ret2;
3849 u64 bytes;
3850 u64 offset = offset_to_bitmap(ctl, start);
19b2a2c7 3851 const u64 max_discard_size = READ_ONCE(discard_ctl->max_discard_size);
7fe1e641
LZ
3852
3853 while (offset < end) {
3854 bool next_bitmap = false;
55507ce3 3855 struct btrfs_trim_range trim_entry;
7fe1e641 3856
55507ce3 3857 mutex_lock(&ctl->cache_writeout_mutex);
7fe1e641
LZ
3858 spin_lock(&ctl->tree_lock);
3859
3860 if (ctl->free_space < minlen) {
2bee7eb8
DZ
3861 block_group->discard_cursor =
3862 btrfs_block_group_end(block_group);
7fe1e641 3863 spin_unlock(&ctl->tree_lock);
55507ce3 3864 mutex_unlock(&ctl->cache_writeout_mutex);
7fe1e641
LZ
3865 break;
3866 }
3867
3868 entry = tree_search_offset(ctl, offset, 1, 0);
7fe6d45e
DZ
3869 /*
3870 * Bitmaps are marked trimmed lossily now to prevent constant
3871 * discarding of the same bitmap (the reason why we are bound
3872 * by the filters). So, retrim the block group bitmaps when we
3873 * are preparing to punt to the unused_bgs list. This uses
3874 * @minlen to determine if we are in BTRFS_DISCARD_INDEX_UNUSED
3875 * which is the only discard index which sets minlen to 0.
3876 */
3877 if (!entry || (async && minlen && start == offset &&
2bee7eb8 3878 btrfs_free_space_trimmed(entry))) {
7fe1e641 3879 spin_unlock(&ctl->tree_lock);
55507ce3 3880 mutex_unlock(&ctl->cache_writeout_mutex);
7fe1e641
LZ
3881 next_bitmap = true;
3882 goto next;
3883 }
3884
da080fe1
DZ
3885 /*
3886 * Async discard bitmap trimming begins by setting the start
3887 * to be key.objectid, and offset_to_bitmap() aligns it to the
3888 * start of the bitmap. This lets us know we are fully
3889 * scanning the bitmap rather than only some portion of it.
3890 */
3891 if (start == offset)
3892 entry->trim_state = BTRFS_TRIM_STATE_TRIMMING;
3893
7fe1e641 3894 bytes = minlen;
0584f718 3895 ret2 = search_bitmap(ctl, entry, &start, &bytes, false);
7fe1e641 3896 if (ret2 || start >= end) {
da080fe1 3897 /*
7fe6d45e
DZ
3898 * We lossily consider a bitmap trimmed if we only skip
3899 * over regions <= BTRFS_ASYNC_DISCARD_MIN_FILTER.
da080fe1 3900 */
7fe6d45e 3901 if (ret2 && minlen <= BTRFS_ASYNC_DISCARD_MIN_FILTER)
dfb79ddb 3902 end_trimming_bitmap(ctl, entry);
da080fe1
DZ
3903 else
3904 entry->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
7fe1e641 3905 spin_unlock(&ctl->tree_lock);
55507ce3 3906 mutex_unlock(&ctl->cache_writeout_mutex);
7fe1e641
LZ
3907 next_bitmap = true;
3908 goto next;
3909 }
3910
2bee7eb8
DZ
3911 /*
3912 * We already trimmed a region, but are using the locking above
3913 * to reset the trim_state.
3914 */
3915 if (async && *total_trimmed) {
3916 spin_unlock(&ctl->tree_lock);
3917 mutex_unlock(&ctl->cache_writeout_mutex);
3918 goto out;
3919 }
3920
7fe1e641 3921 bytes = min(bytes, end - start);
7fe6d45e 3922 if (bytes < minlen || (async && maxlen && bytes > maxlen)) {
7fe1e641 3923 spin_unlock(&ctl->tree_lock);
55507ce3 3924 mutex_unlock(&ctl->cache_writeout_mutex);
7fe1e641
LZ
3925 goto next;
3926 }
3927
7fe6d45e
DZ
3928 /*
3929 * Let bytes = BTRFS_MAX_DISCARD_SIZE + X.
3930 * If X < @minlen, we won't trim X when we come back around.
3931 * So trim it now. We differ here from trimming extents as we
3932 * don't keep individual state per bit.
3933 */
3934 if (async &&
3935 max_discard_size &&
3936 bytes > (max_discard_size + minlen))
19b2a2c7 3937 bytes = max_discard_size;
4aa9ad52 3938
f594f13c 3939 bitmap_clear_bits(ctl, entry, start, bytes, true);
7fe1e641
LZ
3940 if (entry->bytes == 0)
3941 free_bitmap(ctl, entry);
3942
3943 spin_unlock(&ctl->tree_lock);
55507ce3
FM
3944 trim_entry.start = start;
3945 trim_entry.bytes = bytes;
3946 list_add_tail(&trim_entry.list, &ctl->trimming_ranges);
3947 mutex_unlock(&ctl->cache_writeout_mutex);
7fe1e641
LZ
3948
3949 ret = do_trimming(block_group, total_trimmed, start, bytes,
b0643e59 3950 start, bytes, 0, &trim_entry);
da080fe1
DZ
3951 if (ret) {
3952 reset_trimming_bitmap(ctl, offset);
2bee7eb8
DZ
3953 block_group->discard_cursor =
3954 btrfs_block_group_end(block_group);
7fe1e641 3955 break;
da080fe1 3956 }
7fe1e641
LZ
3957next:
3958 if (next_bitmap) {
3959 offset += BITS_PER_BITMAP * ctl->unit;
da080fe1 3960 start = offset;
7fe1e641
LZ
3961 } else {
3962 start += bytes;
f7039b1d 3963 }
2bee7eb8 3964 block_group->discard_cursor = start;
f7039b1d
LD
3965
3966 if (fatal_signal_pending(current)) {
da080fe1
DZ
3967 if (start != offset)
3968 reset_trimming_bitmap(ctl, offset);
f7039b1d
LD
3969 ret = -ERESTARTSYS;
3970 break;
3971 }
3972
3973 cond_resched();
3974 }
3975
2bee7eb8
DZ
3976 if (offset >= end)
3977 block_group->discard_cursor = end;
3978
3979out:
f7039b1d
LD
3980 return ret;
3981}
581bb050 3982
32da5386 3983int btrfs_trim_block_group(struct btrfs_block_group *block_group,
e33e17ee
JM
3984 u64 *trimmed, u64 start, u64 end, u64 minlen)
3985{
da080fe1 3986 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
e33e17ee 3987 int ret;
da080fe1 3988 u64 rem = 0;
e33e17ee 3989
2eda5708
NA
3990 ASSERT(!btrfs_is_zoned(block_group->fs_info));
3991
e33e17ee
JM
3992 *trimmed = 0;
3993
3994 spin_lock(&block_group->lock);
3995 if (block_group->removed) {
04216820 3996 spin_unlock(&block_group->lock);
e33e17ee 3997 return 0;
04216820 3998 }
6b7304af 3999 btrfs_freeze_block_group(block_group);
e33e17ee
JM
4000 spin_unlock(&block_group->lock);
4001
2bee7eb8 4002 ret = trim_no_bitmap(block_group, trimmed, start, end, minlen, false);
e33e17ee
JM
4003 if (ret)
4004 goto out;
7fe1e641 4005
7fe6d45e 4006 ret = trim_bitmaps(block_group, trimmed, start, end, minlen, 0, false);
da080fe1
DZ
4007 div64_u64_rem(end, BITS_PER_BITMAP * ctl->unit, &rem);
4008 /* If we ended in the middle of a bitmap, reset the trimming flag */
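/*
 * With 4KiB pages and a 4KiB unit, for example, each bitmap covers
 * BITS_PER_BITMAP * ctl->unit = 128MiB, so any end offset that is not
 * 128MiB-aligned leaves rem non-zero.
 */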
4009 if (rem)
4010 reset_trimming_bitmap(ctl, offset_to_bitmap(ctl, end));
e33e17ee 4011out:
6b7304af 4012 btrfs_unfreeze_block_group(block_group);
7fe1e641
LZ
4013 return ret;
4014}
4015
2bee7eb8
DZ
4016int btrfs_trim_block_group_extents(struct btrfs_block_group *block_group,
4017 u64 *trimmed, u64 start, u64 end, u64 minlen,
4018 bool async)
4019{
4020 int ret;
4021
4022 *trimmed = 0;
4023
4024 spin_lock(&block_group->lock);
4025 if (block_group->removed) {
4026 spin_unlock(&block_group->lock);
4027 return 0;
4028 }
6b7304af 4029 btrfs_freeze_block_group(block_group);
2bee7eb8
DZ
4030 spin_unlock(&block_group->lock);
4031
4032 ret = trim_no_bitmap(block_group, trimmed, start, end, minlen, async);
6b7304af 4033 btrfs_unfreeze_block_group(block_group);
2bee7eb8
DZ
4034
4035 return ret;
4036}
4037
4038int btrfs_trim_block_group_bitmaps(struct btrfs_block_group *block_group,
4039 u64 *trimmed, u64 start, u64 end, u64 minlen,
7fe6d45e 4040 u64 maxlen, bool async)
2bee7eb8
DZ
4041{
4042 int ret;
4043
4044 *trimmed = 0;
4045
4046 spin_lock(&block_group->lock);
4047 if (block_group->removed) {
4048 spin_unlock(&block_group->lock);
4049 return 0;
4050 }
6b7304af 4051 btrfs_freeze_block_group(block_group);
2bee7eb8
DZ
4052 spin_unlock(&block_group->lock);
4053
7fe6d45e
DZ
4054 ret = trim_bitmaps(block_group, trimmed, start, end, minlen, maxlen,
4055 async);
4056
6b7304af 4057 btrfs_unfreeze_block_group(block_group);
2bee7eb8
DZ
4058
4059 return ret;
4060}
4061
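/*
 * The v1 free space cache is considered active when the superblock still
 * carries a non-zero cache generation.
 */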
94846229
BB
4062bool btrfs_free_space_cache_v1_active(struct btrfs_fs_info *fs_info)
4063{
4064 return btrfs_super_cache_generation(fs_info->super_copy);
4065}
4066
36b216c8
BB
4067static int cleanup_free_space_cache_v1(struct btrfs_fs_info *fs_info,
4068 struct btrfs_trans_handle *trans)
4069{
4070 struct btrfs_block_group *block_group;
4071 struct rb_node *node;
77364faf 4072 int ret = 0;
36b216c8
BB
4073
4074 btrfs_info(fs_info, "cleaning free space cache v1");
4075
08dddb29 4076 node = rb_first_cached(&fs_info->block_group_cache_tree);
36b216c8
BB
4077 while (node) {
4078 block_group = rb_entry(node, struct btrfs_block_group, cache_node);
4079 ret = btrfs_remove_free_space_inode(trans, NULL, block_group);
4080 if (ret)
4081 goto out;
4082 node = rb_next(node);
4083 }
4084out:
4085 return ret;
4086}
4087
94846229
BB
4088int btrfs_set_free_space_cache_v1_active(struct btrfs_fs_info *fs_info, bool active)
4089{
4090 struct btrfs_trans_handle *trans;
4091 int ret;
4092
4093 /*
36b216c8
BB
4094 * update_super_roots will appropriately set or unset
4095 * super_copy->cache_generation based on SPACE_CACHE and
4096 * BTRFS_FS_CLEANUP_SPACE_CACHE_V1. For this reason, we need a
4097 * transaction commit whether we are enabling space cache v1 and don't
4098 * have any other work to do, or are disabling it and removing free
4099 * space inodes.
94846229
BB
4100 */
4101 trans = btrfs_start_transaction(fs_info->tree_root, 0);
4102 if (IS_ERR(trans))
4103 return PTR_ERR(trans);
4104
36b216c8 4105 if (!active) {
94846229 4106 set_bit(BTRFS_FS_CLEANUP_SPACE_CACHE_V1, &fs_info->flags);
36b216c8
BB
4107 ret = cleanup_free_space_cache_v1(fs_info, trans);
4108 if (ret) {
4109 btrfs_abort_transaction(trans, ret);
4110 btrfs_end_transaction(trans);
4111 goto out;
4112 }
4113 }
94846229
BB
4114
4115 ret = btrfs_commit_transaction(trans);
36b216c8 4116out:
94846229
BB
4117 clear_bit(BTRFS_FS_CLEANUP_SPACE_CACHE_V1, &fs_info->flags);
4118
4119 return ret;
4120}
4121
74255aa0 4122#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
dc11dd5d
JB
4123/*
4124 * Use this if you need to make a bitmap or extent entry specifically. It
4125 * doesn't do any of the merging that add_free_space does; it acts a lot like
4126 * how the free space cache loading stuff works, so you can get really weird
4127 * configurations.
4128 */
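/*
 * For example, a self-test can seed a plain 4MiB extent entry with
 * test_add_free_space_entry(cache, 0, SZ_4M, false), or pass true to force
 * the same range into a bitmap entry instead.
 */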
32da5386 4129int test_add_free_space_entry(struct btrfs_block_group *cache,
dc11dd5d 4130 u64 offset, u64 bytes, bool bitmap)
74255aa0 4131{
dc11dd5d
JB
4132 struct btrfs_free_space_ctl *ctl = cache->free_space_ctl;
4133 struct btrfs_free_space *info = NULL, *bitmap_info;
4134 void *map = NULL;
da080fe1 4135 enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_TRIMMED;
dc11dd5d
JB
4136 u64 bytes_added;
4137 int ret;
74255aa0 4138
dc11dd5d
JB
4139again:
4140 if (!info) {
4141 info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS);
4142 if (!info)
4143 return -ENOMEM;
74255aa0
JB
4144 }
4145
dc11dd5d
JB
4146 if (!bitmap) {
4147 spin_lock(&ctl->tree_lock);
4148 info->offset = offset;
4149 info->bytes = bytes;
cef40483 4150 info->max_extent_size = 0;
dc11dd5d
JB
4151 ret = link_free_space(ctl, info);
4152 spin_unlock(&ctl->tree_lock);
4153 if (ret)
4154 kmem_cache_free(btrfs_free_space_cachep, info);
4155 return ret;
4156 }
4157
4158 if (!map) {
3acd4850 4159 map = kmem_cache_zalloc(btrfs_free_space_bitmap_cachep, GFP_NOFS);
dc11dd5d
JB
4160 if (!map) {
4161 kmem_cache_free(btrfs_free_space_cachep, info);
4162 return -ENOMEM;
4163 }
4164 }
4165
4166 spin_lock(&ctl->tree_lock);
4167 bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
4168 1, 0);
4169 if (!bitmap_info) {
4170 info->bitmap = map;
4171 map = NULL;
4172 add_new_bitmap(ctl, info, offset);
4173 bitmap_info = info;
20005523 4174 info = NULL;
dc11dd5d 4175 }
74255aa0 4176
da080fe1
DZ
4177 bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes,
4178 trim_state);
cef40483 4179
dc11dd5d
JB
4180 bytes -= bytes_added;
4181 offset += bytes_added;
4182 spin_unlock(&ctl->tree_lock);
74255aa0 4183
dc11dd5d
JB
4184 if (bytes)
4185 goto again;
74255aa0 4186
20005523
FM
4187 if (info)
4188 kmem_cache_free(btrfs_free_space_cachep, info);
3acd4850
CL
4189 if (map)
4190 kmem_cache_free(btrfs_free_space_bitmap_cachep, map);
dc11dd5d 4191 return 0;
74255aa0
JB
4192}
4193
4194/*
4195 * Checks to see if the given range is in the free space cache. This is really
4196 * just used to check the absence of space, so if there is free space in the
4197 * range at all we will return 1.
4198 */
32da5386 4199int test_check_exists(struct btrfs_block_group *cache,
dc11dd5d 4200 u64 offset, u64 bytes)
74255aa0
JB
4201{
4202 struct btrfs_free_space_ctl *ctl = cache->free_space_ctl;
4203 struct btrfs_free_space *info;
4204 int ret = 0;
4205
4206 spin_lock(&ctl->tree_lock);
4207 info = tree_search_offset(ctl, offset, 0, 0);
4208 if (!info) {
4209 info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
4210 1, 0);
4211 if (!info)
4212 goto out;
4213 }
4214
4215have_info:
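/*
 * For a bitmap entry, first search for any set bits inside the requested
 * range, then walk neighbouring entries in case the range straddles this
 * entry's boundaries.
 */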
4216 if (info->bitmap) {
4217 u64 bit_off, bit_bytes;
4218 struct rb_node *n;
4219 struct btrfs_free_space *tmp;
4220
4221 bit_off = offset;
4222 bit_bytes = ctl->unit;
0584f718 4223 ret = search_bitmap(ctl, info, &bit_off, &bit_bytes, false);
74255aa0
JB
4224 if (!ret) {
4225 if (bit_off == offset) {
4226 ret = 1;
4227 goto out;
4228 } else if (bit_off > offset &&
4229 offset + bytes > bit_off) {
4230 ret = 1;
4231 goto out;
4232 }
4233 }
4234
4235 n = rb_prev(&info->offset_index);
4236 while (n) {
4237 tmp = rb_entry(n, struct btrfs_free_space,
4238 offset_index);
4239 if (tmp->offset + tmp->bytes < offset)
4240 break;
4241 if (offset + bytes < tmp->offset) {
5473e0c4 4242 n = rb_prev(&tmp->offset_index);
74255aa0
JB
4243 continue;
4244 }
4245 info = tmp;
4246 goto have_info;
4247 }
4248
4249 n = rb_next(&info->offset_index);
4250 while (n) {
4251 tmp = rb_entry(n, struct btrfs_free_space,
4252 offset_index);
4253 if (offset + bytes < tmp->offset)
4254 break;
4255 if (tmp->offset + tmp->bytes < offset) {
5473e0c4 4256 n = rb_next(&tmp->offset_index);
74255aa0
JB
4257 continue;
4258 }
4259 info = tmp;
4260 goto have_info;
4261 }
4262
20005523 4263 ret = 0;
74255aa0
JB
4264 goto out;
4265 }
4266
4267 if (info->offset == offset) {
4268 ret = 1;
4269 goto out;
4270 }
4271
4272 if (offset > info->offset && offset < info->offset + info->bytes)
4273 ret = 1;
4274out:
4275 spin_unlock(&ctl->tree_lock);
4276 return ret;
4277}
dc11dd5d 4278#endif /* CONFIG_BTRFS_FS_RUN_SANITY_TESTS */