From: Greg Kroah-Hartman Date: Tue, 3 Jan 2017 19:24:40 +0000 (+0100) Subject: 4.8-stable patches X-Git-Tag: v4.9.1~27 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=e9cea67cafe0a9517a7d9a0a898a2865944437b7;p=thirdparty%2Fkernel%2Fstable-queue.git 4.8-stable patches added patches: aoe-fix-crash-in-page-count-manipulation.patch btrfs-fix-bug_on-in-btrfs_mark_buffer_dirty.patch btrfs-fix-deadlock-caused-by-fsync-when-logging-directory-entries.patch btrfs-fix-emptiness-check-for-dirtied-extent-buffers-at-check_leaf.patch btrfs-fix-qgroup-rescan-worker-initialization.patch btrfs-fix-relocation-incorrectly-dropping-data-references.patch btrfs-fix-tree-search-logic-when-replaying-directory-entry-deletes.patch btrfs-limit-async_work-allocation-and-worker-func-duration.patch btrfs-store-and-load-values-of-stripes_min-stripes_max-in-balance-status-item.patch --- diff --git a/queue-4.8/aoe-fix-crash-in-page-count-manipulation.patch b/queue-4.8/aoe-fix-crash-in-page-count-manipulation.patch new file mode 100644 index 00000000000..b6e24f4e07e --- /dev/null +++ b/queue-4.8/aoe-fix-crash-in-page-count-manipulation.patch @@ -0,0 +1,86 @@ +From 0cbc72a1781250f373327dd7e306e33859a42154 Mon Sep 17 00:00:00 2001 +From: Jens Axboe +Date: Fri, 11 Nov 2016 18:28:50 -0700 +Subject: aoe: fix crash in page count manipulation + +From: Jens Axboe + +commit 0cbc72a1781250f373327dd7e306e33859a42154 upstream. + +aoeblk contains some mysterious code, that wants to elevate the bio +vec page counts while it's under IO. That is not needed, it's +fragile, and it's causing kernel oopses for some. 
+ +Reported-by: Don Koch +Tested-by: Don Koch +Signed-off-by: Jens Axboe +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/block/aoe/aoecmd.c | 41 ----------------------------------------- + 1 file changed, 41 deletions(-) + +--- a/drivers/block/aoe/aoecmd.c ++++ b/drivers/block/aoe/aoecmd.c +@@ -853,45 +853,6 @@ rqbiocnt(struct request *r) + return n; + } + +-/* This can be removed if we are certain that no users of the block +- * layer will ever use zero-count pages in bios. Otherwise we have to +- * protect against the put_page sometimes done by the network layer. +- * +- * See http://oss.sgi.com/archives/xfs/2007-01/msg00594.html for +- * discussion. +- * +- * We cannot use get_page in the workaround, because it insists on a +- * positive page count as a precondition. So we use _refcount directly. +- */ +-static void +-bio_pageinc(struct bio *bio) +-{ +- struct bio_vec bv; +- struct page *page; +- struct bvec_iter iter; +- +- bio_for_each_segment(bv, bio, iter) { +- /* Non-zero page count for non-head members of +- * compound pages is no longer allowed by the kernel. 
+- */ +- page = compound_head(bv.bv_page); +- page_ref_inc(page); +- } +-} +- +-static void +-bio_pagedec(struct bio *bio) +-{ +- struct page *page; +- struct bio_vec bv; +- struct bvec_iter iter; +- +- bio_for_each_segment(bv, bio, iter) { +- page = compound_head(bv.bv_page); +- page_ref_dec(page); +- } +-} +- + static void + bufinit(struct buf *buf, struct request *rq, struct bio *bio) + { +@@ -899,7 +860,6 @@ bufinit(struct buf *buf, struct request + buf->rq = rq; + buf->bio = bio; + buf->iter = bio->bi_iter; +- bio_pageinc(bio); + } + + static struct buf * +@@ -1127,7 +1087,6 @@ aoe_end_buf(struct aoedev *d, struct buf + if (buf == d->ip.buf) + d->ip.buf = NULL; + rq = buf->rq; +- bio_pagedec(buf->bio); + mempool_free(buf, d->bufpool); + n = (unsigned long) rq->special; + rq->special = (void *) --n; diff --git a/queue-4.8/btrfs-fix-bug_on-in-btrfs_mark_buffer_dirty.patch b/queue-4.8/btrfs-fix-bug_on-in-btrfs_mark_buffer_dirty.patch new file mode 100644 index 00000000000..64a7bd8a625 --- /dev/null +++ b/queue-4.8/btrfs-fix-bug_on-in-btrfs_mark_buffer_dirty.patch @@ -0,0 +1,53 @@ +From ef85b25e982b5bba1530b936e283ef129f02ab9d Mon Sep 17 00:00:00 2001 +From: Liu Bo +Date: Fri, 2 Sep 2016 12:35:34 -0700 +Subject: Btrfs: fix BUG_ON in btrfs_mark_buffer_dirty + +From: Liu Bo + +commit ef85b25e982b5bba1530b936e283ef129f02ab9d upstream. + +This can only happen with CONFIG_BTRFS_FS_CHECK_INTEGRITY=y. + +Commit 1ba98d0 ("Btrfs: detect corruption when non-root leaf has zero item") +assumes that a leaf is its root when leaf->bytenr == btrfs_root_bytenr(root), +however, we should not use btrfs_root_bytenr(root) since it's mainly got +updated during committing transaction. So the check can fail when doing +COW on this leaf while it is a root. + +This changes to use "if (leaf == btrfs_root_node(root))" instead, just like +how we check whether leaf is a root in __btrfs_cow_block(). 
+ +Fixes: 1ba98d086fe3 (Btrfs: detect corruption when non-root leaf has zero item) +Reported-by: Jeff Mahoney +Signed-off-by: Liu Bo +Reviewed-by: Filipe Manana +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/disk-io.c | 10 +++++++--- + 1 file changed, 7 insertions(+), 3 deletions(-) + +--- a/fs/btrfs/disk-io.c ++++ b/fs/btrfs/disk-io.c +@@ -572,13 +572,17 @@ static noinline int check_leaf(struct bt + * open_ctree() some roots has not yet been set up. + */ + if (!IS_ERR_OR_NULL(check_root)) { ++ struct extent_buffer *eb; ++ ++ eb = btrfs_root_node(check_root); + /* if leaf is the root, then it's fine */ +- if (leaf->start != +- btrfs_root_bytenr(&check_root->root_item)) { ++ if (leaf != eb) { + CORRUPT("non-root leaf's nritems is 0", +- leaf, root, 0); ++ leaf, check_root, 0); ++ free_extent_buffer(eb); + return -EIO; + } ++ free_extent_buffer(eb); + } + return 0; + } diff --git a/queue-4.8/btrfs-fix-deadlock-caused-by-fsync-when-logging-directory-entries.patch b/queue-4.8/btrfs-fix-deadlock-caused-by-fsync-when-logging-directory-entries.patch new file mode 100644 index 00000000000..a9157b77eb2 --- /dev/null +++ b/queue-4.8/btrfs-fix-deadlock-caused-by-fsync-when-logging-directory-entries.patch @@ -0,0 +1,175 @@ +From ec125cfb7ae2157af3dd45dd8abe823e3e233eec Mon Sep 17 00:00:00 2001 +From: Robbie Ko +Date: Fri, 28 Oct 2016 10:48:26 +0800 +Subject: Btrfs: fix deadlock caused by fsync when logging directory entries + +From: Robbie Ko + +commit ec125cfb7ae2157af3dd45dd8abe823e3e233eec upstream. + +While logging new directory entries, at tree-log.c:log_new_dir_dentries(), +after we call btrfs_search_forward() we get a leaf with a read lock on it, +and without unlocking that leaf we can end up calling btrfs_iget() to get +an inode pointer. 
The later (btrfs_iget()) can end up doing a read-only +search on the same tree again, if the inode is not in memory already, which +ends up causing a deadlock if some other task in the meanwhile started a +write search on the tree and is attempting to write lock the same leaf +that btrfs_search_forward() locked while holding write locks on upper +levels of the tree blocking the read search from btrfs_iget(). In this +scenario we get a deadlock. + +So fix this by releasing the search path before calling btrfs_iget() at +tree-log.c:log_new_dir_dentries(). + +Example trace of such deadlock: + +[ 4077.478852] kworker/u24:10 D ffff88107fc90640 0 14431 2 0x00000000 +[ 4077.486752] Workqueue: btrfs-endio-write btrfs_endio_write_helper [btrfs] +[ 4077.494346] ffff880ffa56bad0 0000000000000046 0000000000009000 ffff880ffa56bfd8 +[ 4077.502629] ffff880ffa56bfd8 ffff881016ce21c0 ffffffffa06ecb26 ffff88101a5d6138 +[ 4077.510915] ffff880ebb5173b0 ffff880ffa56baf8 ffff880ebb517410 ffff881016ce21c0 +[ 4077.519202] Call Trace: +[ 4077.528752] [] ? btrfs_tree_lock+0xdd/0x2f0 [btrfs] +[ 4077.536049] [] ? wake_up_atomic_t+0x30/0x30 +[ 4077.542574] [] ? btrfs_search_slot+0x79f/0xb10 [btrfs] +[ 4077.550171] [] ? btrfs_lookup_file_extent+0x33/0x40 [btrfs] +[ 4077.558252] [] ? __btrfs_drop_extents+0x13b/0xdf0 [btrfs] +[ 4077.566140] [] ? add_delayed_data_ref+0xe2/0x150 [btrfs] +[ 4077.573928] [] ? btrfs_add_delayed_data_ref+0x149/0x1d0 [btrfs] +[ 4077.582399] [] ? __set_extent_bit+0x4c0/0x5c0 [btrfs] +[ 4077.589896] [] ? insert_reserved_file_extent.constprop.75+0xa4/0x320 [btrfs] +[ 4077.599632] [] ? start_transaction+0x8d/0x470 [btrfs] +[ 4077.607134] [] ? btrfs_finish_ordered_io+0x2e7/0x600 [btrfs] +[ 4077.615329] [] ? process_one_work+0x142/0x3d0 +[ 4077.622043] [] ? worker_thread+0x109/0x3b0 +[ 4077.628459] [] ? manage_workers.isra.26+0x270/0x270 +[ 4077.635759] [] ? kthread+0xaf/0xc0 +[ 4077.641404] [] ? kthread_create_on_node+0x110/0x110 +[ 4077.648696] [] ? 
ret_from_fork+0x58/0x90 +[ 4077.654926] [] ? kthread_create_on_node+0x110/0x110 + +[ 4078.358087] kworker/u24:15 D ffff88107fcd0640 0 14436 2 0x00000000 +[ 4078.365981] Workqueue: btrfs-endio-write btrfs_endio_write_helper [btrfs] +[ 4078.373574] ffff880ffa57fad0 0000000000000046 0000000000009000 ffff880ffa57ffd8 +[ 4078.381864] ffff880ffa57ffd8 ffff88103004d0a0 ffffffffa06ecb26 ffff88101a5d6138 +[ 4078.390163] ffff880fbeffc298 ffff880ffa57faf8 ffff880fbeffc2f8 ffff88103004d0a0 +[ 4078.398466] Call Trace: +[ 4078.408019] [] ? btrfs_tree_lock+0xdd/0x2f0 [btrfs] +[ 4078.415322] [] ? wake_up_atomic_t+0x30/0x30 +[ 4078.421844] [] ? btrfs_search_slot+0x79f/0xb10 [btrfs] +[ 4078.429438] [] ? btrfs_lookup_file_extent+0x33/0x40 [btrfs] +[ 4078.437518] [] ? __btrfs_drop_extents+0x13b/0xdf0 [btrfs] +[ 4078.445404] [] ? add_delayed_data_ref+0xe2/0x150 [btrfs] +[ 4078.453194] [] ? btrfs_add_delayed_data_ref+0x149/0x1d0 [btrfs] +[ 4078.461663] [] ? __set_extent_bit+0x4c0/0x5c0 [btrfs] +[ 4078.469161] [] ? insert_reserved_file_extent.constprop.75+0xa4/0x320 [btrfs] +[ 4078.478893] [] ? start_transaction+0x8d/0x470 [btrfs] +[ 4078.486388] [] ? btrfs_finish_ordered_io+0x2e7/0x600 [btrfs] +[ 4078.494561] [] ? process_one_work+0x142/0x3d0 +[ 4078.501278] [] ? pwq_activate_delayed_work+0x27/0x40 +[ 4078.508673] [] ? worker_thread+0x109/0x3b0 +[ 4078.515098] [] ? manage_workers.isra.26+0x270/0x270 +[ 4078.522396] [] ? kthread+0xaf/0xc0 +[ 4078.528032] [] ? kthread_create_on_node+0x110/0x110 +[ 4078.535325] [] ? ret_from_fork+0x58/0x90 +[ 4078.541552] [] ? kthread_create_on_node+0x110/0x110 + +[ 4079.355824] user-space-program D ffff88107fd30640 0 32020 1 0x00000000 +[ 4079.363716] ffff880eae8eba10 0000000000000086 0000000000009000 ffff880eae8ebfd8 +[ 4079.372003] ffff880eae8ebfd8 ffff881016c162c0 ffffffffa06ecb26 ffff88101a5d6138 +[ 4079.380294] ffff880fbed4b4c8 ffff880eae8eba38 ffff880fbed4b528 ffff881016c162c0 +[ 4079.388586] Call Trace: +[ 4079.398134] [] ? 
btrfs_tree_lock+0x85/0x2f0 [btrfs] +[ 4079.405431] [] ? wake_up_atomic_t+0x30/0x30 +[ 4079.411955] [] ? btrfs_lock_root_node+0x2b/0x40 [btrfs] +[ 4079.419644] [] ? btrfs_search_slot+0xa03/0xb10 [btrfs] +[ 4079.427237] [] ? btrfs_buffer_uptodate+0x52/0x70 [btrfs] +[ 4079.435041] [] ? generic_bin_search.constprop.38+0x80/0x190 [btrfs] +[ 4079.443897] [] ? btrfs_insert_empty_items+0x74/0xd0 [btrfs] +[ 4079.451975] [] ? copy_items+0x128/0x850 [btrfs] +[ 4079.458890] [] ? btrfs_log_inode+0x629/0xbf3 [btrfs] +[ 4079.466292] [] ? btrfs_log_inode_parent+0xc61/0xf30 [btrfs] +[ 4079.474373] [] ? btrfs_log_dentry_safe+0x59/0x80 [btrfs] +[ 4079.482161] [] ? btrfs_sync_file+0x20d/0x330 [btrfs] +[ 4079.489558] [] ? do_fsync+0x4c/0x80 +[ 4079.495300] [] ? SyS_fdatasync+0xa/0x10 +[ 4079.501422] [] ? system_call_fastpath+0x16/0x1b + +[ 4079.508334] user-space-program D ffff88107fc30640 0 32021 1 0x00000004 +[ 4079.516226] ffff880eae8efbf8 0000000000000086 0000000000009000 ffff880eae8effd8 +[ 4079.524513] ffff880eae8effd8 ffff881030279610 ffffffffa06ecb26 ffff88101a5d6138 +[ 4079.532802] ffff880ebb671d88 ffff880eae8efc20 ffff880ebb671de8 ffff881030279610 +[ 4079.541092] Call Trace: +[ 4079.550642] [] ? btrfs_tree_lock+0x85/0x2f0 [btrfs] +[ 4079.557941] [] ? wake_up_atomic_t+0x30/0x30 +[ 4079.564463] [] ? btrfs_search_slot+0x79f/0xb10 [btrfs] +[ 4079.572058] [] ? btrfs_truncate_inode_items+0x168/0xb90 [btrfs] +[ 4079.580526] [] ? join_transaction.isra.15+0x1e/0x3a0 [btrfs] +[ 4079.588701] [] ? start_transaction+0x8d/0x470 [btrfs] +[ 4079.596196] [] ? block_rsv_add_bytes+0x16/0x50 [btrfs] +[ 4079.603789] [] ? btrfs_truncate+0xe9/0x2e0 [btrfs] +[ 4079.610994] [] ? btrfs_setattr+0x30b/0x410 [btrfs] +[ 4079.618197] [] ? notify_change+0x1dc/0x680 +[ 4079.624625] [] ? aa_path_perm+0xd4/0x160 +[ 4079.630854] [] ? do_truncate+0x5b/0x90 +[ 4079.636889] [] ? do_sys_ftruncate.constprop.15+0x10a/0x160 +[ 4079.644869] [] ? SyS_fcntl+0x5b/0x570 +[ 4079.650805] [] ? 
system_call_fastpath+0x16/0x1b + +[ 4080.410607] user-space-program D ffff88107fc70640 0 32028 12639 0x00000004 +[ 4080.418489] ffff880eaeccbbe0 0000000000000086 0000000000009000 ffff880eaeccbfd8 +[ 4080.426778] ffff880eaeccbfd8 ffff880f317ef1e0 ffffffffa06ecb26 ffff88101a5d6138 +[ 4080.435067] ffff880ef7e93928 ffff880f317ef1e0 ffff880eaeccbc08 ffff880f317ef1e0 +[ 4080.443353] Call Trace: +[ 4080.452920] [] ? btrfs_tree_read_lock+0xdd/0x190 [btrfs] +[ 4080.460703] [] ? wake_up_atomic_t+0x30/0x30 +[ 4080.467225] [] ? btrfs_read_lock_root_node+0x2b/0x40 [btrfs] +[ 4080.475400] [] ? btrfs_search_slot+0x801/0xb10 [btrfs] +[ 4080.482994] [] ? btrfs_clean_one_deleted_snapshot+0xe0/0xe0 [btrfs] +[ 4080.491857] [] ? btrfs_lookup_inode+0x26/0x90 [btrfs] +[ 4080.499353] [] ? kmem_cache_alloc+0xaf/0xc0 +[ 4080.505879] [] ? btrfs_iget+0xd5/0x5d0 [btrfs] +[ 4080.512696] [] ? btrfs_get_token_64+0x104/0x120 [btrfs] +[ 4080.520387] [] ? btrfs_log_inode_parent+0xbdf/0xf30 [btrfs] +[ 4080.528469] [] ? btrfs_log_dentry_safe+0x59/0x80 [btrfs] +[ 4080.536258] [] ? btrfs_sync_file+0x20d/0x330 [btrfs] +[ 4080.543657] [] ? do_fsync+0x4c/0x80 +[ 4080.549399] [] ? SyS_fdatasync+0xa/0x10 +[ 4080.555534] [] ? 
system_call_fastpath+0x16/0x1b + +Signed-off-by: Robbie Ko +Reviewed-by: Filipe Manana +Fixes: 2f2ff0ee5e43 (Btrfs: fix metadata inconsistencies after directory fsync) +Signed-off-by: Filipe Manana +[Modified changelog for clarity and correctness] +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/tree-log.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +--- a/fs/btrfs/tree-log.c ++++ b/fs/btrfs/tree-log.c +@@ -5205,6 +5205,7 @@ process_leaf: + if (di_key.type == BTRFS_ROOT_ITEM_KEY) + continue; + ++ btrfs_release_path(path); + di_inode = btrfs_iget(root->fs_info->sb, &di_key, + root, NULL); + if (IS_ERR(di_inode)) { +@@ -5214,13 +5215,12 @@ process_leaf: + + if (btrfs_inode_in_log(di_inode, trans->transid)) { + iput(di_inode); +- continue; ++ break; + } + + ctx->log_new_dentries = false; + if (type == BTRFS_FT_DIR || type == BTRFS_FT_SYMLINK) + log_mode = LOG_INODE_ALL; +- btrfs_release_path(path); + ret = btrfs_log_inode(trans, root, di_inode, + log_mode, 0, LLONG_MAX, ctx); + if (!ret && diff --git a/queue-4.8/btrfs-fix-emptiness-check-for-dirtied-extent-buffers-at-check_leaf.patch b/queue-4.8/btrfs-fix-emptiness-check-for-dirtied-extent-buffers-at-check_leaf.patch new file mode 100644 index 00000000000..f976940b979 --- /dev/null +++ b/queue-4.8/btrfs-fix-emptiness-check-for-dirtied-extent-buffers-at-check_leaf.patch @@ -0,0 +1,117 @@ +From f177d73949bf758542ca15a1c1945bd2e802cc65 Mon Sep 17 00:00:00 2001 +From: Filipe Manana +Date: Wed, 23 Nov 2016 16:21:18 +0000 +Subject: Btrfs: fix emptiness check for dirtied extent buffers at check_leaf() + +From: Filipe Manana + +commit f177d73949bf758542ca15a1c1945bd2e802cc65 upstream. + +We can not simply use the owner field from an extent buffer's header to +get the id of the respective tree when the extent buffer is from a +relocation tree. 
When we create the root for a relocation tree we leave +(on purpose) the owner field with the same value as the subvolume's tree +root (we do this at ctree.c:btrfs_copy_root()). So we must ignore extent +buffers from relocation trees, which have the BTRFS_HEADER_FLAG_RELOC +flag set, because otherwise we will always consider the extent buffer +as not being the root of the tree (the root of original subvolume tree +is always different from the root of the respective relocation tree). + +This led to assertion failures when running with the integrity checker +enabled (CONFIG_BTRFS_FS_CHECK_INTEGRITY=y) such as the following: + +[ 643.393409] BTRFS critical (device sdg): corrupt leaf, non-root leaf's nritems is 0: block=38506496, root=260, slot=0 +[ 643.397609] BTRFS info (device sdg): leaf 38506496 total ptrs 0 free space 3995 +[ 643.407075] assertion failed: 0, file: fs/btrfs/disk-io.c, line: 4078 +[ 643.408425] ------------[ cut here ]------------ +[ 643.409112] kernel BUG at fs/btrfs/ctree.h:3419! 
+[ 643.409773] invalid opcode: 0000 [#1] PREEMPT SMP +[ 643.410447] Modules linked in: dm_flakey dm_mod crc32c_generic btrfs xor raid6_pq ppdev psmouse acpi_cpufreq parport_pc evdev parport tpm_tis tpm_tis_core pcspkr serio_raw i2c_piix4 sg tpm i2c_core button processor loop autofs4 ext4 crc16 jbd2 mbcache sr_mod cdrom sd_mod ata_generic virtio_scsi ata_piix libata virtio_pci virtio_ring scsi_mod virtio e1000 floppy +[ 643.414356] CPU: 11 PID: 32726 Comm: btrfs Not tainted 4.8.0-rc8-btrfs-next-35+ #1 +[ 643.414356] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.9.1-0-gb3ef39f-prebuilt.qemu-project.org 04/01/2014 +[ 643.414356] task: ffff880145e95b00 task.stack: ffff88014826c000 +[ 643.414356] RIP: 0010:[] [] assfail.constprop.41+0x1c/0x1e [btrfs] +[ 643.414356] RSP: 0018:ffff88014826fa28 EFLAGS: 00010292 +[ 643.414356] RAX: 0000000000000039 RBX: ffff88014e2d7c38 RCX: 0000000000000001 +[ 643.414356] RDX: ffff88023f4d2f58 RSI: ffffffff81806c63 RDI: 00000000ffffffff +[ 643.414356] RBP: ffff88014826fa28 R08: 0000000000000001 R09: 0000000000000000 +[ 643.414356] R10: ffff88014826f918 R11: ffffffff82f3c5ed R12: ffff880172910000 +[ 643.414356] R13: ffff880233992230 R14: ffff8801a68a3310 R15: fffffffffffffff8 +[ 643.414356] FS: 00007f9ca305e8c0(0000) GS:ffff88023f4c0000(0000) knlGS:0000000000000000 +[ 643.414356] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 +[ 643.414356] CR2: 00007f9ca3071000 CR3: 000000015d01b000 CR4: 00000000000006e0 +[ 643.414356] Stack: +[ 643.414356] ffff88014826fa50 ffffffffa02d655a 000000000000000a ffff88014e2d7c38 +[ 643.414356] 0000000000000000 ffff88014826faa8 ffffffffa02b72f3 ffff88014826fab8 +[ 643.414356] 00ffffffa03228e4 0000000000000000 0000000000000000 ffff8801bbd4e000 +[ 643.414356] Call Trace: +[ 643.414356] [] btrfs_mark_buffer_dirty+0xdf/0xe5 [btrfs] +[ 643.414356] [] btrfs_copy_root+0x18a/0x1d1 [btrfs] +[ 643.414356] [] create_reloc_root+0x72/0x1ba [btrfs] +[ 643.414356] [] btrfs_init_reloc_root+0x7b/0xa7 
[btrfs] +[ 643.414356] [] record_root_in_trans+0xdf/0xed [btrfs] +[ 643.414356] [] btrfs_record_root_in_trans+0x50/0x6a [btrfs] +[ 643.414356] [] create_subvol+0x472/0x773 [btrfs] +[ 643.414356] [] btrfs_mksubvol+0x3da/0x463 [btrfs] +[ 643.414356] [] ? btrfs_mksubvol+0x3da/0x463 [btrfs] +[ 643.414356] [] ? preempt_count_add+0x65/0x68 +[ 643.414356] [] ? __mnt_want_write+0x62/0x77 +[ 643.414356] [] btrfs_ioctl_snap_create_transid+0xce/0x187 [btrfs] +[ 643.414356] [] btrfs_ioctl_snap_create+0x67/0x81 [btrfs] +[ 643.414356] [] btrfs_ioctl+0x508/0x20dd [btrfs] +[ 643.414356] [] ? __this_cpu_preempt_check+0x13/0x15 +[ 643.414356] [] ? handle_mm_fault+0x976/0x9ab +[ 643.414356] [] ? arch_local_irq_save+0x9/0xc +[ 643.414356] [] vfs_ioctl+0x18/0x34 +[ 643.414356] [] do_vfs_ioctl+0x581/0x600 +[ 643.414356] [] ? entry_SYSCALL_64_fastpath+0x5/0xa8 +[ 643.414356] [] ? trace_hardirqs_on_caller+0x17b/0x197 +[ 643.414356] [] SyS_ioctl+0x57/0x79 +[ 643.414356] [] entry_SYSCALL_64_fastpath+0x18/0xa8 +[ 643.414356] [] ? trace_hardirqs_off_caller+0x3f/0xaa +[ 643.414356] Code: 89 83 88 00 00 00 31 c0 5b 41 5c 41 5d 5d c3 55 89 f1 48 c7 c2 98 bc 35 a0 48 89 fe 48 c7 c7 05 be 35 a0 48 89 e5 e8 13 46 dd e0 <0f> 0b 55 89 f1 48 c7 c2 9f d3 35 a0 48 89 fe 48 c7 c7 7a d5 35 +[ 643.414356] RIP [] assfail.constprop.41+0x1c/0x1e [btrfs] +[ 643.414356] RSP +[ 643.468267] ---[ end trace 6a1b3fb1a9d7d6e3 ]--- + +This can be easily reproduced by running xfstests with the integrity +checker enabled. 
+ +Fixes: 1ba98d086fe3 (Btrfs: detect corruption when non-root leaf has zero item) +Signed-off-by: Filipe Manana +Reviewed-by: Liu Bo +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/disk-io.c | 13 ++++++++++++- + 1 file changed, 12 insertions(+), 1 deletion(-) + +--- a/fs/btrfs/disk-io.c ++++ b/fs/btrfs/disk-io.c +@@ -559,7 +559,15 @@ static noinline int check_leaf(struct bt + u32 nritems = btrfs_header_nritems(leaf); + int slot; + +- if (nritems == 0) { ++ /* ++ * Extent buffers from a relocation tree have a owner field that ++ * corresponds to the subvolume tree they are based on. So just from an ++ * extent buffer alone we can not find out what is the id of the ++ * corresponding subvolume tree, so we can not figure out if the extent ++ * buffer corresponds to the root of the relocation tree or not. So skip ++ * this check for relocation trees. ++ */ ++ if (nritems == 0 && !btrfs_header_flag(leaf, BTRFS_HEADER_FLAG_RELOC)) { + struct btrfs_root *check_root; + + key.objectid = btrfs_header_owner(leaf); +@@ -587,6 +595,9 @@ static noinline int check_leaf(struct bt + return 0; + } + ++ if (nritems == 0) ++ return 0; ++ + /* Check the 0 item */ + if (btrfs_item_offset_nr(leaf, 0) + btrfs_item_size_nr(leaf, 0) != + BTRFS_LEAF_DATA_SIZE(root)) { diff --git a/queue-4.8/btrfs-fix-qgroup-rescan-worker-initialization.patch b/queue-4.8/btrfs-fix-qgroup-rescan-worker-initialization.patch new file mode 100644 index 00000000000..d399156e5f1 --- /dev/null +++ b/queue-4.8/btrfs-fix-qgroup-rescan-worker-initialization.patch @@ -0,0 +1,48 @@ +From 8d9eddad19467b008e0c881bc3133d7da94b7ec1 Mon Sep 17 00:00:00 2001 +From: Filipe Manana +Date: Thu, 24 Nov 2016 02:09:04 +0000 +Subject: Btrfs: fix qgroup rescan worker initialization + +From: Filipe Manana + +commit 8d9eddad19467b008e0c881bc3133d7da94b7ec1 upstream. + +We were setting the qgroup_rescan_running flag to true only after the +rescan worker started (which is a task run by a queue). 
So if a user +space task starts a rescan and immediately after asks to wait for the +rescan worker to finish, this second call might happen before the rescan +worker task starts running, in which case the rescan wait ioctl returns +immediately, not waiting for the rescan worker to finish. + +This was making the fstest btrfs/022 fail very often. + +Fixes: d2c609b834d6 (btrfs: properly track when rescan worker is running) +Signed-off-by: Filipe Manana +Reviewed-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/qgroup.c | 5 +---- + 1 file changed, 1 insertion(+), 4 deletions(-) + +--- a/fs/btrfs/qgroup.c ++++ b/fs/btrfs/qgroup.c +@@ -2332,10 +2332,6 @@ static void btrfs_qgroup_rescan_worker(s + int err = -ENOMEM; + int ret = 0; + +- mutex_lock(&fs_info->qgroup_rescan_lock); +- fs_info->qgroup_rescan_running = true; +- mutex_unlock(&fs_info->qgroup_rescan_lock); +- + path = btrfs_alloc_path(); + if (!path) + goto out; +@@ -2446,6 +2442,7 @@ qgroup_rescan_init(struct btrfs_fs_info + sizeof(fs_info->qgroup_rescan_progress)); + fs_info->qgroup_rescan_progress.objectid = progress_objectid; + init_completion(&fs_info->qgroup_rescan_completion); ++ fs_info->qgroup_rescan_running = true; + + spin_unlock(&fs_info->qgroup_lock); + mutex_unlock(&fs_info->qgroup_rescan_lock); diff --git a/queue-4.8/btrfs-fix-relocation-incorrectly-dropping-data-references.patch b/queue-4.8/btrfs-fix-relocation-incorrectly-dropping-data-references.patch new file mode 100644 index 00000000000..3954ec4cead --- /dev/null +++ b/queue-4.8/btrfs-fix-relocation-incorrectly-dropping-data-references.patch @@ -0,0 +1,215 @@ +From 054570a1dc94de20e7a612cddcc5a97db9c37b6f Mon Sep 17 00:00:00 2001 +From: Filipe Manana +Date: Tue, 1 Nov 2016 11:23:31 +0000 +Subject: Btrfs: fix relocation incorrectly dropping data references + +From: Filipe Manana + +commit 054570a1dc94de20e7a612cddcc5a97db9c37b6f upstream. 
+ +During relocation of a data block group we create a relocation tree +for each fs/subvol tree by making a snapshot of each tree using +btrfs_copy_root() and the tree's commit root, and then setting the last +snapshot field for the fs/subvol tree's root to the value of the current +transaction id minus 1. However this can lead to relocation later +dropping references that it did not create if we have qgroups enabled, +leaving the filesystem in an inconsistent state that keeps aborting +transactions. + +Lets consider the following example to explain the problem, which requires +qgroups to be enabled. + +We are relocating data block group Y, we have a subvolume with id 258 that +has a root at level 1, that subvolume is used to store directory entries +for snapshots and we are currently at transaction 3404. + +When committing transaction 3404, we have a pending snapshot and therefore +we call btrfs_run_delayed_items() at transaction.c:create_pending_snapshot() +in order to create its dentry at subvolume 258. This results in COWing +leaf A from root 258 in order to add the dentry. Note that leaf A +also contains file extent items referring to extents from some other +block group X (we are currently relocating block group Y). Later on, still +at create_pending_snapshot() we call qgroup_account_snapshot(), which +switches the commit root for root 258 when it calls switch_commit_roots(), +so now the COWed version of leaf A, lets call it leaf A', is accessible +from the commit root of tree 258. At the end of qgroup_account_snapshot(), +we call record_root_in_trans() with 258 as its argument, which results +in btrfs_init_reloc_root() being called, which in turn calls +relocation.c:create_reloc_root() in order to create a relocation tree +associated to root 258, which results in assigning the value of 3403 +(which is the current transaction id minus 1 = 3404 - 1) to the +last_snapshot field of root 258. 
When creating the relocation tree root +at ctree.c:btrfs_copy_root() we add a shared reference for leaf A', +corresponding to the relocation tree's root, when we call btrfs_inc_ref() +against the COWed root (a copy of the commit root from tree 258), which +is at level 1. So at this point leaf A' has 2 references, one normal +reference corresponding to root 258 and one shared reference corresponding +to the root of the relocation tree. + +Transaction 3404 finishes its commit and transaction 3405 is started by +relocation when calling merge_reloc_root() for the relocation tree +associated to root 258. In the meanwhile leaf A' is COWed again, in +response to some filesystem operation, when we are still at transaction +3405. However when we COW leaf A', at ctree.c:update_ref_for_cow(), we +call btrfs_block_can_be_shared() in order to figure out if other trees +refer to the leaf and if any such trees exist, add a full back reference +to leaf A' - but btrfs_block_can_be_shared() incorrectly returns false +because the following condition is false: + + btrfs_header_generation(buf) <= btrfs_root_last_snapshot(&root->root_item) + +which evaluates to 3404 <= 3403. So after leaf A' is COWed, it stays with +only one reference, corresponding to the shared reference we created when +we called btrfs_copy_root() to create the relocation tree's root and +btrfs_inc_ref() ends up not being called for leaf A' nor we end up setting +the flag BTRFS_BLOCK_FLAG_FULL_BACKREF in leaf A'. This results in not +adding shared references for the extents from block group X that leaf A' +refers to with its file extent items. + +Later, after merging the relocation root we do a call to +btrfs_drop_snapshot() in order to delete the relocation tree. 
This ends +up calling do_walk_down() when path->slots[1] points to leaf A', which +results in calling btrfs_lookup_extent_info() to get the number of +references for leaf A', which is 1 at this time (only the shared reference +exists) and this value is stored at wc->refs[0]. After this walk_up_proc() +is called when wc->level is 0 and path->nodes[0] corresponds to leaf A'. +Because the current level is 0 and wc->refs[0] is 1, it does call +btrfs_dec_ref() against leaf A', which results in removing the single +references that the extents from block group X have which are associated +to root 258 - the expectation was to have each of these extents with 2 +references - one reference for root 258 and one shared reference related +to the root of the relocation tree, and so we would drop only the shared +reference (because leaf A' was supposed to have the flag +BTRFS_BLOCK_FLAG_FULL_BACKREF set). + +This leaves the filesystem in an inconsistent state as we now have file +extent items in a subvolume tree that point to extents from block group X +without references in the extent tree. So later on when we try to decrement +the references for these extents, for example due to a file unlink operation, +truncate operation or overwriting ranges of a file, we fail because the +expected references do not exist in the extent tree. 
+ +This leads to warnings and transaction aborts like the following: + +[ 588.965795] ------------[ cut here ]------------ +[ 588.965815] WARNING: CPU: 2 PID: 2479 at fs/btrfs/extent-tree.c:1625 lookup_inline_extent_backref+0x432/0x5b0 [btrfs] +[ 588.965816] Modules linked in: af_packet iscsi_ibft iscsi_boot_sysfs xfs libcrc32c ppdev acpi_cpufreq button tpm_tis e1000 i2c_piix4 pcspkr parport_pc +parport tpm qemu_fw_cfg joydev btrfs xor raid6_pq sr_mod cdrom ata_generic virtio_scsi ata_piix virtio_pci bochs_drm virtio_ring drm_kms_helper syscopyarea +sysfillrect sysimgblt fb_sys_fops virtio ttm serio_raw drm floppy sg +[ 588.965831] CPU: 2 PID: 2479 Comm: kworker/u8:7 Not tainted 4.7.3-3-default-fdm+ #1 +[ 588.965832] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.9.1-0-gb3ef39f-prebuilt.qemu-project.org 04/01/2014 +[ 588.965844] Workqueue: btrfs-extent-refs btrfs_extent_refs_helper [btrfs] +[ 588.965845] 0000000000000000 ffff8802263bfa28 ffffffff813af542 0000000000000000 +[ 588.965847] 0000000000000000 ffff8802263bfa68 ffffffff81081e8b 0000065900000000 +[ 588.965848] ffff8801db2af000 000000012bbe2000 0000000000000000 ffff880215703b48 +[ 588.965849] Call Trace: +[ 588.965852] [] dump_stack+0x63/0x81 +[ 588.965854] [] __warn+0xcb/0xf0 +[ 588.965855] [] warn_slowpath_null+0x1d/0x20 +[ 588.965863] [] lookup_inline_extent_backref+0x432/0x5b0 [btrfs] +[ 588.965865] [] ? trace_clock_local+0x10/0x30 +[ 588.965867] [] ? rb_reserve_next_event+0x6f/0x460 +[ 588.965875] [] insert_inline_extent_backref+0x55/0xd0 [btrfs] +[ 588.965882] [] __btrfs_inc_extent_ref.isra.55+0x8f/0x240 [btrfs] +[ 588.965890] [] __btrfs_run_delayed_refs+0x74a/0x1260 [btrfs] +[ 588.965892] [] ? 
cpuacct_charge+0x86/0xa0 +[ 588.965900] [] btrfs_run_delayed_refs+0x9f/0x2c0 [btrfs] +[ 588.965908] [] delayed_ref_async_start+0x94/0xb0 [btrfs] +[ 588.965918] [] btrfs_scrubparity_helper+0xca/0x350 [btrfs] +[ 588.965928] [] btrfs_extent_refs_helper+0xe/0x10 [btrfs] +[ 588.965930] [] process_one_work+0x1f3/0x4e0 +[ 588.965931] [] worker_thread+0x48/0x4e0 +[ 588.965932] [] ? process_one_work+0x4e0/0x4e0 +[ 588.965934] [] kthread+0xc9/0xe0 +[ 588.965936] [] ret_from_fork+0x1f/0x40 +[ 588.965937] [] ? kthread_worker_fn+0x170/0x170 +[ 588.965938] ---[ end trace 34e5232c933a1749 ]--- +[ 588.966187] ------------[ cut here ]------------ +[ 588.966196] WARNING: CPU: 2 PID: 2479 at fs/btrfs/extent-tree.c:2966 btrfs_run_delayed_refs+0x28c/0x2c0 [btrfs] +[ 588.966196] BTRFS: Transaction aborted (error -5) +[ 588.966197] Modules linked in: af_packet iscsi_ibft iscsi_boot_sysfs xfs libcrc32c ppdev acpi_cpufreq button tpm_tis e1000 i2c_piix4 pcspkr parport_pc +parport tpm qemu_fw_cfg joydev btrfs xor raid6_pq sr_mod cdrom ata_generic virtio_scsi ata_piix virtio_pci bochs_drm virtio_ring drm_kms_helper syscopyarea +sysfillrect sysimgblt fb_sys_fops virtio ttm serio_raw drm floppy sg +[ 588.966206] CPU: 2 PID: 2479 Comm: kworker/u8:7 Tainted: G W 4.7.3-3-default-fdm+ #1 +[ 588.966207] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.9.1-0-gb3ef39f-prebuilt.qemu-project.org 04/01/2014 +[ 588.966217] Workqueue: btrfs-extent-refs btrfs_extent_refs_helper [btrfs] +[ 588.966217] 0000000000000000 ffff8802263bfc98 ffffffff813af542 ffff8802263bfce8 +[ 588.966219] 0000000000000000 ffff8802263bfcd8 ffffffff81081e8b 00000b96345ee000 +[ 588.966220] ffffffffa021ae1c ffff880215703b48 00000000000005fe ffff8802345ee000 +[ 588.966221] Call Trace: +[ 588.966223] [] dump_stack+0x63/0x81 +[ 588.966224] [] __warn+0xcb/0xf0 +[ 588.966225] [] warn_slowpath_fmt+0x4f/0x60 +[ 588.966233] [] btrfs_run_delayed_refs+0x28c/0x2c0 [btrfs] +[ 588.966241] [] delayed_ref_async_start+0x94/0xb0 
[btrfs] +[ 588.966250] [] btrfs_scrubparity_helper+0xca/0x350 [btrfs] +[ 588.966259] [] btrfs_extent_refs_helper+0xe/0x10 [btrfs] +[ 588.966260] [] process_one_work+0x1f3/0x4e0 +[ 588.966261] [] worker_thread+0x48/0x4e0 +[ 588.966263] [] ? process_one_work+0x4e0/0x4e0 +[ 588.966264] [] kthread+0xc9/0xe0 +[ 588.966265] [] ret_from_fork+0x1f/0x40 +[ 588.966267] [] ? kthread_worker_fn+0x170/0x170 +[ 588.966268] ---[ end trace 34e5232c933a174a ]--- +[ 588.966269] BTRFS: error (device sda2) in btrfs_run_delayed_refs:2966: errno=-5 IO failure +[ 588.966270] BTRFS info (device sda2): forced readonly + +This was happening often on openSUSE and SLE systems using btrfs as the +root filesystem (with its default layout where multiple subvolumes are +used) where balance happens in the background triggered by a cron job and +snapshots are automatically created before/after package installations, +upgrades and removals. The issue could be triggered simply by running the +following loop on the first system boot post installation: + + while true; do + zypper -n in nfs-kernel-server + zypper -n rm nfs-kernel-server + done + +(If we were fast enough and made that loop before the cron job triggered +a balance operation and the balance finished) + +So fix by setting the last_snapshot field of the root to the value of the +generation of its commit root. Like this btrfs_block_can_be_shared() +behaves correctly for the case where the relocation root is created during +a transaction commit and for the case where it's created before a +transaction commit. 
+ +Fixes: 6426c7ad697d (btrfs: qgroup: Fix qgroup accounting when creating snapshot) +Signed-off-by: Filipe Manana +Reviewed-by: Josef Bacik +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/relocation.c | 15 ++++++++++++--- + 1 file changed, 12 insertions(+), 3 deletions(-) + +--- a/fs/btrfs/relocation.c ++++ b/fs/btrfs/relocation.c +@@ -1387,14 +1387,23 @@ static struct btrfs_root *create_reloc_r + root_key.offset = objectid; + + if (root->root_key.objectid == objectid) { ++ u64 commit_root_gen; ++ + /* called by btrfs_init_reloc_root */ + ret = btrfs_copy_root(trans, root, root->commit_root, &eb, + BTRFS_TREE_RELOC_OBJECTID); + BUG_ON(ret); +- + last_snap = btrfs_root_last_snapshot(&root->root_item); +- btrfs_set_root_last_snapshot(&root->root_item, +- trans->transid - 1); ++ /* ++ * Set the last_snapshot field to the generation of the commit ++ * root - like this ctree.c:btrfs_block_can_be_shared() behaves ++ * correctly (returns true) when the relocation root is created ++ * either inside the critical section of a transaction commit ++ * (through transaction.c:qgroup_account_snapshot()) and when ++ * it's created before the transaction commit is started. ++ */ ++ commit_root_gen = btrfs_header_generation(root->commit_root); ++ btrfs_set_root_last_snapshot(&root->root_item, commit_root_gen); + } else { + /* + * called by btrfs_reloc_post_snapshot_hook. 
diff --git a/queue-4.8/btrfs-fix-tree-search-logic-when-replaying-directory-entry-deletes.patch b/queue-4.8/btrfs-fix-tree-search-logic-when-replaying-directory-entry-deletes.patch new file mode 100644 index 00000000000..239bd447f35 --- /dev/null +++ b/queue-4.8/btrfs-fix-tree-search-logic-when-replaying-directory-entry-deletes.patch @@ -0,0 +1,64 @@ +From 2a7bf53f577e49c43de4ffa7776056de26db65d9 Mon Sep 17 00:00:00 2001 +From: Robbie Ko +Date: Fri, 7 Oct 2016 17:30:47 +0800 +Subject: Btrfs: fix tree search logic when replaying directory entry deletes + +From: Robbie Ko + +commit 2a7bf53f577e49c43de4ffa7776056de26db65d9 upstream. + +If a log tree has a layout like the following: + +leaf N: + ... + item 240 key (282 DIR_LOG_ITEM 0) itemoff 8189 itemsize 8 + dir log end 1275809046 +leaf N + 1: + item 0 key (282 DIR_LOG_ITEM 3936149215) itemoff 16275 itemsize 8 + dir log end 18446744073709551615 + ... + +When we pass the value 1275809046 + 1 as the parameter start_ret to the +function tree-log.c:find_dir_range() (done by replay_dir_deletes()), we +end up with path->slots[0] having the value 239 (points to the last item +of leaf N, item 240). Because the dir log item in that position has an +offset value smaller than *start_ret (1275809046 + 1) we need to move on +to the next leaf, however the logic for that is wrong since it compares +the current slot to the number of items in the leaf, which is smaller +and therefore we don't look up the next leaf but instead we set the +slot to point to an item that does not exist, at slot 240, and we later +operate on that slot which has unexpected content or in the worst case +can result in an invalid memory access (accessing beyond the last page +of leaf N's extent buffer). + +So fix the logic that checks when we need to look up the next leaf +by first incrementing the slot and only afterwards checking if that slot +is beyond the last item of the current leaf. 
+ +Signed-off-by: Robbie Ko +Reviewed-by: Filipe Manana +Fixes: e02119d5a7b4 (Btrfs: Add a write ahead tree log to optimize synchronous operations) +Signed-off-by: Filipe Manana +[Modified changelog for clarity and correctness] +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/tree-log.c | 3 +-- + 1 file changed, 1 insertion(+), 2 deletions(-) + +--- a/fs/btrfs/tree-log.c ++++ b/fs/btrfs/tree-log.c +@@ -1940,12 +1940,11 @@ static noinline int find_dir_range(struc + next: + /* check the next slot in the tree to see if it is a valid item */ + nritems = btrfs_header_nritems(path->nodes[0]); ++ path->slots[0]++; + if (path->slots[0] >= nritems) { + ret = btrfs_next_leaf(root, path); + if (ret) + goto out; +- } else { +- path->slots[0]++; + } + + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); diff --git a/queue-4.8/btrfs-limit-async_work-allocation-and-worker-func-duration.patch b/queue-4.8/btrfs-limit-async_work-allocation-and-worker-func-duration.patch new file mode 100644 index 00000000000..1c26b29f88f --- /dev/null +++ b/queue-4.8/btrfs-limit-async_work-allocation-and-worker-func-duration.patch @@ -0,0 +1,128 @@ +From 2939e1a86f758b55cdba73e29397dd3d94df13bc Mon Sep 17 00:00:00 2001 +From: Maxim Patlasov +Date: Mon, 12 Dec 2016 14:32:44 -0800 +Subject: btrfs: limit async_work allocation and worker func duration + +From: Maxim Patlasov + +commit 2939e1a86f758b55cdba73e29397dd3d94df13bc upstream. + +Problem statement: unprivileged user who has read-write access to more than +one btrfs subvolume may easily consume all kernel memory (eventually +triggering oom-killer). 
+ +Reproducer (./mkrmdir below essentially loops over mkdir/rmdir): + +[root@kteam1 ~]# cat prep.sh + +DEV=/dev/sdb +mkfs.btrfs -f $DEV +mount $DEV /mnt +for i in `seq 1 16` +do + mkdir /mnt/$i + btrfs subvolume create /mnt/SV_$i + ID=`btrfs subvolume list /mnt |grep "SV_$i$" |cut -d ' ' -f 2` + mount -t btrfs -o subvolid=$ID $DEV /mnt/$i + chmod a+rwx /mnt/$i +done + +[root@kteam1 ~]# sh prep.sh + +[maxim@kteam1 ~]$ for i in `seq 1 16`; do ./mkrmdir /mnt/$i 2000 2000 & done + +[root@kteam1 ~]# for i in `seq 1 4`; do grep "kmalloc-128" /proc/slabinfo | grep -v dma; sleep 60; done +kmalloc-128 10144 10144 128 32 1 : tunables 0 0 0 : slabdata 317 317 0 +kmalloc-128 9992352 9992352 128 32 1 : tunables 0 0 0 : slabdata 312261 312261 0 +kmalloc-128 24226752 24226752 128 32 1 : tunables 0 0 0 : slabdata 757086 757086 0 +kmalloc-128 42754240 42754240 128 32 1 : tunables 0 0 0 : slabdata 1336070 1336070 0 + +The huge numbers above come from insane number of async_work-s allocated +and queued by btrfs_wq_run_delayed_node. + +The problem is caused by btrfs_wq_run_delayed_node() queuing more and more +works if the number of delayed items is above BTRFS_DELAYED_BACKGROUND. The +worker func (btrfs_async_run_delayed_root) processes at least +BTRFS_DELAYED_BATCH items (if they are present in the list). So, the machinery +works as expected while the list is almost empty. As soon as it is getting +bigger, worker func starts to process more than one item at a time, it takes +longer, and the chances to have async_works queued more than needed is getting +higher. + +The problem above is worsened by another flaw of delayed-inode implementation: +if async_work was queued in a throttling branch (number of items >= +BTRFS_DELAYED_WRITEBACK), corresponding worker func won't quit until +the number of items < BTRFS_DELAYED_BACKGROUND / 2. 
So, it is possible that +the func occupies CPU infinitely (up to 30sec in my experiments): while the +func is trying to drain the list, the user activity may add more and more +items to the list. + +The patch fixes both problems in straightforward way: refuse queuing too +many works in btrfs_wq_run_delayed_node and bail out of worker func if +at least BTRFS_DELAYED_WRITEBACK items are processed. + +Changed in v2: remove support of thresh == NO_THRESHOLD. + +Signed-off-by: Maxim Patlasov +Signed-off-by: Chris Mason +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/async-thread.c | 14 ++++++++++++++ + fs/btrfs/async-thread.h | 1 + + fs/btrfs/delayed-inode.c | 6 ++++-- + 3 files changed, 19 insertions(+), 2 deletions(-) + +--- a/fs/btrfs/async-thread.c ++++ b/fs/btrfs/async-thread.c +@@ -86,6 +86,20 @@ btrfs_work_owner(struct btrfs_work *work + return work->wq->fs_info; + } + ++bool btrfs_workqueue_normal_congested(struct btrfs_workqueue *wq) ++{ ++ /* ++ * We could compare wq->normal->pending with num_online_cpus() ++ * to support "thresh == NO_THRESHOLD" case, but it requires ++ * moving up atomic_inc/dec in thresh_queue/exec_hook. Let's ++ * postpone it until someone needs the support of that case. 
++ */ ++ if (wq->normal->thresh == NO_THRESHOLD) ++ return false; ++ ++ return atomic_read(&wq->normal->pending) > wq->normal->thresh * 2; ++} ++ + BTRFS_WORK_HELPER(worker_helper); + BTRFS_WORK_HELPER(delalloc_helper); + BTRFS_WORK_HELPER(flush_delalloc_helper); +--- a/fs/btrfs/async-thread.h ++++ b/fs/btrfs/async-thread.h +@@ -84,4 +84,5 @@ void btrfs_workqueue_set_max(struct btrf + void btrfs_set_work_high_priority(struct btrfs_work *work); + struct btrfs_fs_info *btrfs_work_owner(struct btrfs_work *work); + struct btrfs_fs_info *btrfs_workqueue_owner(struct __btrfs_workqueue *wq); ++bool btrfs_workqueue_normal_congested(struct btrfs_workqueue *wq); + #endif +--- a/fs/btrfs/delayed-inode.c ++++ b/fs/btrfs/delayed-inode.c +@@ -1356,7 +1356,8 @@ release_path: + total_done++; + + btrfs_release_prepared_delayed_node(delayed_node); +- if (async_work->nr == 0 || total_done < async_work->nr) ++ if ((async_work->nr == 0 && total_done < BTRFS_DELAYED_WRITEBACK) || ++ total_done < async_work->nr) + goto again; + + free_path: +@@ -1372,7 +1373,8 @@ static int btrfs_wq_run_delayed_node(str + { + struct btrfs_async_delayed_work *async_work; + +- if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND) ++ if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND || ++ btrfs_workqueue_normal_congested(fs_info->delayed_workers)) + return 0; + + async_work = kmalloc(sizeof(*async_work), GFP_NOFS); diff --git a/queue-4.8/btrfs-store-and-load-values-of-stripes_min-stripes_max-in-balance-status-item.patch b/queue-4.8/btrfs-store-and-load-values-of-stripes_min-stripes_max-in-balance-status-item.patch new file mode 100644 index 00000000000..ad5fcc541b2 --- /dev/null +++ b/queue-4.8/btrfs-store-and-load-values-of-stripes_min-stripes_max-in-balance-status-item.patch @@ -0,0 +1,42 @@ +From ed0df618b1b06d7431ee4d985317fc5419a5d559 Mon Sep 17 00:00:00 2001 +From: David Sterba +Date: Tue, 1 Nov 2016 14:21:23 +0100 +Subject: btrfs: store and load values of 
stripes_min/stripes_max in balance status item + +From: David Sterba + +commit ed0df618b1b06d7431ee4d985317fc5419a5d559 upstream. + +The balance status item contains currently known filter values, but the +stripes filter was unintentionally not among them. This would mean that an +interrupted and automatically restarted balance does not apply the +stripe filters. + +Fixes: dee32d0ac3719ef8d640efaf0884111df444730f +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/ctree.h | 4 ++++ + 1 file changed, 4 insertions(+) + +--- a/fs/btrfs/ctree.h ++++ b/fs/btrfs/ctree.h +@@ -2210,6 +2210,8 @@ btrfs_disk_balance_args_to_cpu(struct bt + cpu->target = le64_to_cpu(disk->target); + cpu->flags = le64_to_cpu(disk->flags); + cpu->limit = le64_to_cpu(disk->limit); ++ cpu->stripes_min = le32_to_cpu(disk->stripes_min); ++ cpu->stripes_max = le32_to_cpu(disk->stripes_max); + } + + static inline void +@@ -2228,6 +2230,8 @@ btrfs_cpu_balance_args_to_disk(struct bt + disk->target = cpu_to_le64(cpu->target); + disk->flags = cpu_to_le64(cpu->flags); + disk->limit = cpu_to_le64(cpu->limit); ++ disk->stripes_min = cpu_to_le32(cpu->stripes_min); ++ disk->stripes_max = cpu_to_le32(cpu->stripes_max); + } + + /* struct btrfs_super_block */