--- /dev/null
+From 0cbc72a1781250f373327dd7e306e33859a42154 Mon Sep 17 00:00:00 2001
+From: Jens Axboe <axboe@fb.com>
+Date: Fri, 11 Nov 2016 18:28:50 -0700
+Subject: aoe: fix crash in page count manipulation
+
+From: Jens Axboe <axboe@fb.com>
+
+commit 0cbc72a1781250f373327dd7e306e33859a42154 upstream.
+
+aoeblk contains some mysterious code that wants to elevate the bio vec
+page counts while the bio is under IO. That is not needed, it's
+fragile, and it's causing kernel oopses for some users.
+
+Reported-by: Don Koch <kochd@us.ibm.com>
+Tested-by: Don Koch <kochd@us.ibm.com>
+Signed-off-by: Jens Axboe <axboe@fb.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/block/aoe/aoecmd.c | 41 -----------------------------------------
+ 1 file changed, 41 deletions(-)
+
+--- a/drivers/block/aoe/aoecmd.c
++++ b/drivers/block/aoe/aoecmd.c
+@@ -853,45 +853,6 @@ rqbiocnt(struct request *r)
+ return n;
+ }
+
+-/* This can be removed if we are certain that no users of the block
+- * layer will ever use zero-count pages in bios. Otherwise we have to
+- * protect against the put_page sometimes done by the network layer.
+- *
+- * See http://oss.sgi.com/archives/xfs/2007-01/msg00594.html for
+- * discussion.
+- *
+- * We cannot use get_page in the workaround, because it insists on a
+- * positive page count as a precondition. So we use _refcount directly.
+- */
+-static void
+-bio_pageinc(struct bio *bio)
+-{
+- struct bio_vec bv;
+- struct page *page;
+- struct bvec_iter iter;
+-
+- bio_for_each_segment(bv, bio, iter) {
+- /* Non-zero page count for non-head members of
+- * compound pages is no longer allowed by the kernel.
+- */
+- page = compound_head(bv.bv_page);
+- page_ref_inc(page);
+- }
+-}
+-
+-static void
+-bio_pagedec(struct bio *bio)
+-{
+- struct page *page;
+- struct bio_vec bv;
+- struct bvec_iter iter;
+-
+- bio_for_each_segment(bv, bio, iter) {
+- page = compound_head(bv.bv_page);
+- page_ref_dec(page);
+- }
+-}
+-
+ static void
+ bufinit(struct buf *buf, struct request *rq, struct bio *bio)
+ {
+@@ -899,7 +860,6 @@ bufinit(struct buf *buf, struct request
+ buf->rq = rq;
+ buf->bio = bio;
+ buf->iter = bio->bi_iter;
+- bio_pageinc(bio);
+ }
+
+ static struct buf *
+@@ -1127,7 +1087,6 @@ aoe_end_buf(struct aoedev *d, struct buf
+ if (buf == d->ip.buf)
+ d->ip.buf = NULL;
+ rq = buf->rq;
+- bio_pagedec(buf->bio);
+ mempool_free(buf, d->bufpool);
+ n = (unsigned long) rq->special;
+ rq->special = (void *) --n;
--- /dev/null
+From ef85b25e982b5bba1530b936e283ef129f02ab9d Mon Sep 17 00:00:00 2001
+From: Liu Bo <bo.li.liu@oracle.com>
+Date: Fri, 2 Sep 2016 12:35:34 -0700
+Subject: Btrfs: fix BUG_ON in btrfs_mark_buffer_dirty
+
+From: Liu Bo <bo.li.liu@oracle.com>
+
+commit ef85b25e982b5bba1530b936e283ef129f02ab9d upstream.
+
+This can only happen with CONFIG_BTRFS_FS_CHECK_INTEGRITY=y.
+
+Commit 1ba98d0 ("Btrfs: detect corruption when non-root leaf has zero item")
+assumes that a leaf is its tree's root when leaf->bytenr == btrfs_root_bytenr(root).
+However, we should not use btrfs_root_bytenr(root), since it is mainly
+updated when committing a transaction, so the check can fail when doing
+COW on this leaf while it is a root.
+
+This changes the check to "if (leaf == btrfs_root_node(root))" instead, just
+like how we check whether a leaf is a root in __btrfs_cow_block().
+
+Fixes: 1ba98d086fe3 (Btrfs: detect corruption when non-root leaf has zero item)
+Reported-by: Jeff Mahoney <jeffm@suse.com>
+Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
+Reviewed-by: Filipe Manana <fdmanana@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/btrfs/disk-io.c | 10 +++++++---
+ 1 file changed, 7 insertions(+), 3 deletions(-)
+
+--- a/fs/btrfs/disk-io.c
++++ b/fs/btrfs/disk-io.c
+@@ -572,13 +572,17 @@ static noinline int check_leaf(struct bt
+ * open_ctree() some roots has not yet been set up.
+ */
+ if (!IS_ERR_OR_NULL(check_root)) {
++ struct extent_buffer *eb;
++
++ eb = btrfs_root_node(check_root);
+ /* if leaf is the root, then it's fine */
+- if (leaf->start !=
+- btrfs_root_bytenr(&check_root->root_item)) {
++ if (leaf != eb) {
+ CORRUPT("non-root leaf's nritems is 0",
+- leaf, root, 0);
++ leaf, check_root, 0);
++ free_extent_buffer(eb);
+ return -EIO;
+ }
++ free_extent_buffer(eb);
+ }
+ return 0;
+ }
--- /dev/null
+From ec125cfb7ae2157af3dd45dd8abe823e3e233eec Mon Sep 17 00:00:00 2001
+From: Robbie Ko <robbieko@synology.com>
+Date: Fri, 28 Oct 2016 10:48:26 +0800
+Subject: Btrfs: fix deadlock caused by fsync when logging directory entries
+
+From: Robbie Ko <robbieko@synology.com>
+
+commit ec125cfb7ae2157af3dd45dd8abe823e3e233eec upstream.
+
+While logging new directory entries, at tree-log.c:log_new_dir_dentries(),
+after we call btrfs_search_forward() we get a leaf with a read lock on it,
+and without unlocking that leaf we can end up calling btrfs_iget() to get
+an inode pointer. The latter (btrfs_iget()) can end up doing a read-only
+search on the same tree again, if the inode is not in memory already. If,
+in the meanwhile, some other task started a write search on the tree and,
+while holding write locks on upper levels of the tree, is attempting to
+write lock the same leaf that btrfs_search_forward() locked, then the read
+search from btrfs_iget() is blocked and we end up with a deadlock.
+
+So fix this by releasing the search path before calling btrfs_iget() at
+tree-log.c:log_new_dir_dentries().
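+
+As a rough sketch (editor's note, not part of the original changelog), the
+problematic ordering before this change was:
+
+  btrfs_search_forward(...);   /* leaf referenced by path is left read locked */
+  ...
+  di_inode = btrfs_iget(...);  /* may search the same tree again and block
+                                  behind a writer waiting for that same leaf */
+  ...
+  btrfs_release_path(path);    /* path only released after btrfs_iget() */
+
+while the fixed code calls btrfs_release_path(path) right before btrfs_iget().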
+
+Example trace of such deadlock:
+
+[ 4077.478852] kworker/u24:10 D ffff88107fc90640 0 14431 2 0x00000000
+[ 4077.486752] Workqueue: btrfs-endio-write btrfs_endio_write_helper [btrfs]
+[ 4077.494346] ffff880ffa56bad0 0000000000000046 0000000000009000 ffff880ffa56bfd8
+[ 4077.502629] ffff880ffa56bfd8 ffff881016ce21c0 ffffffffa06ecb26 ffff88101a5d6138
+[ 4077.510915] ffff880ebb5173b0 ffff880ffa56baf8 ffff880ebb517410 ffff881016ce21c0
+[ 4077.519202] Call Trace:
+[ 4077.528752] [<ffffffffa06ed5ed>] ? btrfs_tree_lock+0xdd/0x2f0 [btrfs]
+[ 4077.536049] [<ffffffff81053680>] ? wake_up_atomic_t+0x30/0x30
+[ 4077.542574] [<ffffffffa068cc1f>] ? btrfs_search_slot+0x79f/0xb10 [btrfs]
+[ 4077.550171] [<ffffffffa06a5073>] ? btrfs_lookup_file_extent+0x33/0x40 [btrfs]
+[ 4077.558252] [<ffffffffa06c600b>] ? __btrfs_drop_extents+0x13b/0xdf0 [btrfs]
+[ 4077.566140] [<ffffffffa06fc9e2>] ? add_delayed_data_ref+0xe2/0x150 [btrfs]
+[ 4077.573928] [<ffffffffa06fd629>] ? btrfs_add_delayed_data_ref+0x149/0x1d0 [btrfs]
+[ 4077.582399] [<ffffffffa06cf3c0>] ? __set_extent_bit+0x4c0/0x5c0 [btrfs]
+[ 4077.589896] [<ffffffffa06b4a64>] ? insert_reserved_file_extent.constprop.75+0xa4/0x320 [btrfs]
+[ 4077.599632] [<ffffffffa06b206d>] ? start_transaction+0x8d/0x470 [btrfs]
+[ 4077.607134] [<ffffffffa06bab57>] ? btrfs_finish_ordered_io+0x2e7/0x600 [btrfs]
+[ 4077.615329] [<ffffffff8104cbc2>] ? process_one_work+0x142/0x3d0
+[ 4077.622043] [<ffffffff8104d729>] ? worker_thread+0x109/0x3b0
+[ 4077.628459] [<ffffffff8104d620>] ? manage_workers.isra.26+0x270/0x270
+[ 4077.635759] [<ffffffff81052b0f>] ? kthread+0xaf/0xc0
+[ 4077.641404] [<ffffffff81052a60>] ? kthread_create_on_node+0x110/0x110
+[ 4077.648696] [<ffffffff814a9ac8>] ? ret_from_fork+0x58/0x90
+[ 4077.654926] [<ffffffff81052a60>] ? kthread_create_on_node+0x110/0x110
+
+[ 4078.358087] kworker/u24:15 D ffff88107fcd0640 0 14436 2 0x00000000
+[ 4078.365981] Workqueue: btrfs-endio-write btrfs_endio_write_helper [btrfs]
+[ 4078.373574] ffff880ffa57fad0 0000000000000046 0000000000009000 ffff880ffa57ffd8
+[ 4078.381864] ffff880ffa57ffd8 ffff88103004d0a0 ffffffffa06ecb26 ffff88101a5d6138
+[ 4078.390163] ffff880fbeffc298 ffff880ffa57faf8 ffff880fbeffc2f8 ffff88103004d0a0
+[ 4078.398466] Call Trace:
+[ 4078.408019] [<ffffffffa06ed5ed>] ? btrfs_tree_lock+0xdd/0x2f0 [btrfs]
+[ 4078.415322] [<ffffffff81053680>] ? wake_up_atomic_t+0x30/0x30
+[ 4078.421844] [<ffffffffa068cc1f>] ? btrfs_search_slot+0x79f/0xb10 [btrfs]
+[ 4078.429438] [<ffffffffa06a5073>] ? btrfs_lookup_file_extent+0x33/0x40 [btrfs]
+[ 4078.437518] [<ffffffffa06c600b>] ? __btrfs_drop_extents+0x13b/0xdf0 [btrfs]
+[ 4078.445404] [<ffffffffa06fc9e2>] ? add_delayed_data_ref+0xe2/0x150 [btrfs]
+[ 4078.453194] [<ffffffffa06fd629>] ? btrfs_add_delayed_data_ref+0x149/0x1d0 [btrfs]
+[ 4078.461663] [<ffffffffa06cf3c0>] ? __set_extent_bit+0x4c0/0x5c0 [btrfs]
+[ 4078.469161] [<ffffffffa06b4a64>] ? insert_reserved_file_extent.constprop.75+0xa4/0x320 [btrfs]
+[ 4078.478893] [<ffffffffa06b206d>] ? start_transaction+0x8d/0x470 [btrfs]
+[ 4078.486388] [<ffffffffa06bab57>] ? btrfs_finish_ordered_io+0x2e7/0x600 [btrfs]
+[ 4078.494561] [<ffffffff8104cbc2>] ? process_one_work+0x142/0x3d0
+[ 4078.501278] [<ffffffff8104a507>] ? pwq_activate_delayed_work+0x27/0x40
+[ 4078.508673] [<ffffffff8104d729>] ? worker_thread+0x109/0x3b0
+[ 4078.515098] [<ffffffff8104d620>] ? manage_workers.isra.26+0x270/0x270
+[ 4078.522396] [<ffffffff81052b0f>] ? kthread+0xaf/0xc0
+[ 4078.528032] [<ffffffff81052a60>] ? kthread_create_on_node+0x110/0x110
+[ 4078.535325] [<ffffffff814a9ac8>] ? ret_from_fork+0x58/0x90
+[ 4078.541552] [<ffffffff81052a60>] ? kthread_create_on_node+0x110/0x110
+
+[ 4079.355824] user-space-program D ffff88107fd30640 0 32020 1 0x00000000
+[ 4079.363716] ffff880eae8eba10 0000000000000086 0000000000009000 ffff880eae8ebfd8
+[ 4079.372003] ffff880eae8ebfd8 ffff881016c162c0 ffffffffa06ecb26 ffff88101a5d6138
+[ 4079.380294] ffff880fbed4b4c8 ffff880eae8eba38 ffff880fbed4b528 ffff881016c162c0
+[ 4079.388586] Call Trace:
+[ 4079.398134] [<ffffffffa06ed595>] ? btrfs_tree_lock+0x85/0x2f0 [btrfs]
+[ 4079.405431] [<ffffffff81053680>] ? wake_up_atomic_t+0x30/0x30
+[ 4079.411955] [<ffffffffa06876fb>] ? btrfs_lock_root_node+0x2b/0x40 [btrfs]
+[ 4079.419644] [<ffffffffa068ce83>] ? btrfs_search_slot+0xa03/0xb10 [btrfs]
+[ 4079.427237] [<ffffffffa06aba52>] ? btrfs_buffer_uptodate+0x52/0x70 [btrfs]
+[ 4079.435041] [<ffffffffa0689b60>] ? generic_bin_search.constprop.38+0x80/0x190 [btrfs]
+[ 4079.443897] [<ffffffffa068ea44>] ? btrfs_insert_empty_items+0x74/0xd0 [btrfs]
+[ 4079.451975] [<ffffffffa072c443>] ? copy_items+0x128/0x850 [btrfs]
+[ 4079.458890] [<ffffffffa072da10>] ? btrfs_log_inode+0x629/0xbf3 [btrfs]
+[ 4079.466292] [<ffffffffa06f34a1>] ? btrfs_log_inode_parent+0xc61/0xf30 [btrfs]
+[ 4079.474373] [<ffffffffa06f45a9>] ? btrfs_log_dentry_safe+0x59/0x80 [btrfs]
+[ 4079.482161] [<ffffffffa06c298d>] ? btrfs_sync_file+0x20d/0x330 [btrfs]
+[ 4079.489558] [<ffffffff8112777c>] ? do_fsync+0x4c/0x80
+[ 4079.495300] [<ffffffff81127a0a>] ? SyS_fdatasync+0xa/0x10
+[ 4079.501422] [<ffffffff814a9b72>] ? system_call_fastpath+0x16/0x1b
+
+[ 4079.508334] user-space-program D ffff88107fc30640 0 32021 1 0x00000004
+[ 4079.516226] ffff880eae8efbf8 0000000000000086 0000000000009000 ffff880eae8effd8
+[ 4079.524513] ffff880eae8effd8 ffff881030279610 ffffffffa06ecb26 ffff88101a5d6138
+[ 4079.532802] ffff880ebb671d88 ffff880eae8efc20 ffff880ebb671de8 ffff881030279610
+[ 4079.541092] Call Trace:
+[ 4079.550642] [<ffffffffa06ed595>] ? btrfs_tree_lock+0x85/0x2f0 [btrfs]
+[ 4079.557941] [<ffffffff81053680>] ? wake_up_atomic_t+0x30/0x30
+[ 4079.564463] [<ffffffffa068cc1f>] ? btrfs_search_slot+0x79f/0xb10 [btrfs]
+[ 4079.572058] [<ffffffffa06bb7d8>] ? btrfs_truncate_inode_items+0x168/0xb90 [btrfs]
+[ 4079.580526] [<ffffffffa06b04be>] ? join_transaction.isra.15+0x1e/0x3a0 [btrfs]
+[ 4079.588701] [<ffffffffa06b206d>] ? start_transaction+0x8d/0x470 [btrfs]
+[ 4079.596196] [<ffffffffa0690ac6>] ? block_rsv_add_bytes+0x16/0x50 [btrfs]
+[ 4079.603789] [<ffffffffa06bc2e9>] ? btrfs_truncate+0xe9/0x2e0 [btrfs]
+[ 4079.610994] [<ffffffffa06bd00b>] ? btrfs_setattr+0x30b/0x410 [btrfs]
+[ 4079.618197] [<ffffffff81117c1c>] ? notify_change+0x1dc/0x680
+[ 4079.624625] [<ffffffff8123c8a4>] ? aa_path_perm+0xd4/0x160
+[ 4079.630854] [<ffffffff810f4fcb>] ? do_truncate+0x5b/0x90
+[ 4079.636889] [<ffffffff810f59fa>] ? do_sys_ftruncate.constprop.15+0x10a/0x160
+[ 4079.644869] [<ffffffff8110d87b>] ? SyS_fcntl+0x5b/0x570
+[ 4079.650805] [<ffffffff814a9b72>] ? system_call_fastpath+0x16/0x1b
+
+[ 4080.410607] user-space-program D ffff88107fc70640 0 32028 12639 0x00000004
+[ 4080.418489] ffff880eaeccbbe0 0000000000000086 0000000000009000 ffff880eaeccbfd8
+[ 4080.426778] ffff880eaeccbfd8 ffff880f317ef1e0 ffffffffa06ecb26 ffff88101a5d6138
+[ 4080.435067] ffff880ef7e93928 ffff880f317ef1e0 ffff880eaeccbc08 ffff880f317ef1e0
+[ 4080.443353] Call Trace:
+[ 4080.452920] [<ffffffffa06ed15d>] ? btrfs_tree_read_lock+0xdd/0x190 [btrfs]
+[ 4080.460703] [<ffffffff81053680>] ? wake_up_atomic_t+0x30/0x30
+[ 4080.467225] [<ffffffffa06876bb>] ? btrfs_read_lock_root_node+0x2b/0x40 [btrfs]
+[ 4080.475400] [<ffffffffa068cc81>] ? btrfs_search_slot+0x801/0xb10 [btrfs]
+[ 4080.482994] [<ffffffffa06b2df0>] ? btrfs_clean_one_deleted_snapshot+0xe0/0xe0 [btrfs]
+[ 4080.491857] [<ffffffffa06a70a6>] ? btrfs_lookup_inode+0x26/0x90 [btrfs]
+[ 4080.499353] [<ffffffff810ec42f>] ? kmem_cache_alloc+0xaf/0xc0
+[ 4080.505879] [<ffffffffa06bd905>] ? btrfs_iget+0xd5/0x5d0 [btrfs]
+[ 4080.512696] [<ffffffffa06caf04>] ? btrfs_get_token_64+0x104/0x120 [btrfs]
+[ 4080.520387] [<ffffffffa06f341f>] ? btrfs_log_inode_parent+0xbdf/0xf30 [btrfs]
+[ 4080.528469] [<ffffffffa06f45a9>] ? btrfs_log_dentry_safe+0x59/0x80 [btrfs]
+[ 4080.536258] [<ffffffffa06c298d>] ? btrfs_sync_file+0x20d/0x330 [btrfs]
+[ 4080.543657] [<ffffffff8112777c>] ? do_fsync+0x4c/0x80
+[ 4080.549399] [<ffffffff81127a0a>] ? SyS_fdatasync+0xa/0x10
+[ 4080.555534] [<ffffffff814a9b72>] ? system_call_fastpath+0x16/0x1b
+
+Signed-off-by: Robbie Ko <robbieko@synology.com>
+Reviewed-by: Filipe Manana <fdmanana@suse.com>
+Fixes: 2f2ff0ee5e43 (Btrfs: fix metadata inconsistencies after directory fsync)
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+[Modified changelog for clarity and correctness]
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/btrfs/tree-log.c | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+--- a/fs/btrfs/tree-log.c
++++ b/fs/btrfs/tree-log.c
+@@ -5205,6 +5205,7 @@ process_leaf:
+ if (di_key.type == BTRFS_ROOT_ITEM_KEY)
+ continue;
+
++ btrfs_release_path(path);
+ di_inode = btrfs_iget(root->fs_info->sb, &di_key,
+ root, NULL);
+ if (IS_ERR(di_inode)) {
+@@ -5214,13 +5215,12 @@ process_leaf:
+
+ if (btrfs_inode_in_log(di_inode, trans->transid)) {
+ iput(di_inode);
+- continue;
++ break;
+ }
+
+ ctx->log_new_dentries = false;
+ if (type == BTRFS_FT_DIR || type == BTRFS_FT_SYMLINK)
+ log_mode = LOG_INODE_ALL;
+- btrfs_release_path(path);
+ ret = btrfs_log_inode(trans, root, di_inode,
+ log_mode, 0, LLONG_MAX, ctx);
+ if (!ret &&
--- /dev/null
+From f177d73949bf758542ca15a1c1945bd2e802cc65 Mon Sep 17 00:00:00 2001
+From: Filipe Manana <fdmanana@suse.com>
+Date: Wed, 23 Nov 2016 16:21:18 +0000
+Subject: Btrfs: fix emptiness check for dirtied extent buffers at check_leaf()
+
+From: Filipe Manana <fdmanana@suse.com>
+
+commit f177d73949bf758542ca15a1c1945bd2e802cc65 upstream.
+
+We can not simply use the owner field from an extent buffer's header to
+get the id of the respective tree when the extent buffer is from a
+relocation tree. When we create the root for a relocation tree we leave
+(on purpose) the owner field with the same value as the subvolume's tree
+root (we do this at ctree.c:btrfs_copy_root()). So we must ignore extent
+buffers from relocation trees, which have the BTRFS_HEADER_FLAG_RELOC
+flag set, because otherwise we will always consider the extent buffer
+as not being the root of the tree (the root of the original subvolume tree
+is always different from the root of the respective relocation tree).
+
+This led to assertion failures when running with the integrity checker
+enabled (CONFIG_BTRFS_FS_CHECK_INTEGRITY=y) such as the following:
+
+[ 643.393409] BTRFS critical (device sdg): corrupt leaf, non-root leaf's nritems is 0: block=38506496, root=260, slot=0
+[ 643.397609] BTRFS info (device sdg): leaf 38506496 total ptrs 0 free space 3995
+[ 643.407075] assertion failed: 0, file: fs/btrfs/disk-io.c, line: 4078
+[ 643.408425] ------------[ cut here ]------------
+[ 643.409112] kernel BUG at fs/btrfs/ctree.h:3419!
+[ 643.409773] invalid opcode: 0000 [#1] PREEMPT SMP
+[ 643.410447] Modules linked in: dm_flakey dm_mod crc32c_generic btrfs xor raid6_pq ppdev psmouse acpi_cpufreq parport_pc evdev parport tpm_tis tpm_tis_core pcspkr serio_raw i2c_piix4 sg tpm i2c_core button processor loop autofs4 ext4 crc16 jbd2 mbcache sr_mod cdrom sd_mod ata_generic virtio_scsi ata_piix libata virtio_pci virtio_ring scsi_mod virtio e1000 floppy
+[ 643.414356] CPU: 11 PID: 32726 Comm: btrfs Not tainted 4.8.0-rc8-btrfs-next-35+ #1
+[ 643.414356] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.9.1-0-gb3ef39f-prebuilt.qemu-project.org 04/01/2014
+[ 643.414356] task: ffff880145e95b00 task.stack: ffff88014826c000
+[ 643.414356] RIP: 0010:[<ffffffffa0352759>] [<ffffffffa0352759>] assfail.constprop.41+0x1c/0x1e [btrfs]
+[ 643.414356] RSP: 0018:ffff88014826fa28 EFLAGS: 00010292
+[ 643.414356] RAX: 0000000000000039 RBX: ffff88014e2d7c38 RCX: 0000000000000001
+[ 643.414356] RDX: ffff88023f4d2f58 RSI: ffffffff81806c63 RDI: 00000000ffffffff
+[ 643.414356] RBP: ffff88014826fa28 R08: 0000000000000001 R09: 0000000000000000
+[ 643.414356] R10: ffff88014826f918 R11: ffffffff82f3c5ed R12: ffff880172910000
+[ 643.414356] R13: ffff880233992230 R14: ffff8801a68a3310 R15: fffffffffffffff8
+[ 643.414356] FS: 00007f9ca305e8c0(0000) GS:ffff88023f4c0000(0000) knlGS:0000000000000000
+[ 643.414356] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+[ 643.414356] CR2: 00007f9ca3071000 CR3: 000000015d01b000 CR4: 00000000000006e0
+[ 643.414356] Stack:
+[ 643.414356] ffff88014826fa50 ffffffffa02d655a 000000000000000a ffff88014e2d7c38
+[ 643.414356] 0000000000000000 ffff88014826faa8 ffffffffa02b72f3 ffff88014826fab8
+[ 643.414356] 00ffffffa03228e4 0000000000000000 0000000000000000 ffff8801bbd4e000
+[ 643.414356] Call Trace:
+[ 643.414356] [<ffffffffa02d655a>] btrfs_mark_buffer_dirty+0xdf/0xe5 [btrfs]
+[ 643.414356] [<ffffffffa02b72f3>] btrfs_copy_root+0x18a/0x1d1 [btrfs]
+[ 643.414356] [<ffffffffa0322921>] create_reloc_root+0x72/0x1ba [btrfs]
+[ 643.414356] [<ffffffffa03267c2>] btrfs_init_reloc_root+0x7b/0xa7 [btrfs]
+[ 643.414356] [<ffffffffa02d9e44>] record_root_in_trans+0xdf/0xed [btrfs]
+[ 643.414356] [<ffffffffa02db04e>] btrfs_record_root_in_trans+0x50/0x6a [btrfs]
+[ 643.414356] [<ffffffffa030ad2b>] create_subvol+0x472/0x773 [btrfs]
+[ 643.414356] [<ffffffffa030b406>] btrfs_mksubvol+0x3da/0x463 [btrfs]
+[ 643.414356] [<ffffffffa030b406>] ? btrfs_mksubvol+0x3da/0x463 [btrfs]
+[ 643.414356] [<ffffffff810781ac>] ? preempt_count_add+0x65/0x68
+[ 643.414356] [<ffffffff811a6e97>] ? __mnt_want_write+0x62/0x77
+[ 643.414356] [<ffffffffa030b55d>] btrfs_ioctl_snap_create_transid+0xce/0x187 [btrfs]
+[ 643.414356] [<ffffffffa030b67d>] btrfs_ioctl_snap_create+0x67/0x81 [btrfs]
+[ 643.414356] [<ffffffffa030ecfd>] btrfs_ioctl+0x508/0x20dd [btrfs]
+[ 643.414356] [<ffffffff81293e39>] ? __this_cpu_preempt_check+0x13/0x15
+[ 643.414356] [<ffffffff81155eca>] ? handle_mm_fault+0x976/0x9ab
+[ 643.414356] [<ffffffff81091300>] ? arch_local_irq_save+0x9/0xc
+[ 643.414356] [<ffffffff8119a2b0>] vfs_ioctl+0x18/0x34
+[ 643.414356] [<ffffffff8119a8e8>] do_vfs_ioctl+0x581/0x600
+[ 643.414356] [<ffffffff814b9552>] ? entry_SYSCALL_64_fastpath+0x5/0xa8
+[ 643.414356] [<ffffffff81093fe9>] ? trace_hardirqs_on_caller+0x17b/0x197
+[ 643.414356] [<ffffffff8119a9be>] SyS_ioctl+0x57/0x79
+[ 643.414356] [<ffffffff814b9565>] entry_SYSCALL_64_fastpath+0x18/0xa8
+[ 643.414356] [<ffffffff81091b08>] ? trace_hardirqs_off_caller+0x3f/0xaa
+[ 643.414356] Code: 89 83 88 00 00 00 31 c0 5b 41 5c 41 5d 5d c3 55 89 f1 48 c7 c2 98 bc 35 a0 48 89 fe 48 c7 c7 05 be 35 a0 48 89 e5 e8 13 46 dd e0 <0f> 0b 55 89 f1 48 c7 c2 9f d3 35 a0 48 89 fe 48 c7 c7 7a d5 35
+[ 643.414356] RIP [<ffffffffa0352759>] assfail.constprop.41+0x1c/0x1e [btrfs]
+[ 643.414356] RSP <ffff88014826fa28>
+[ 643.468267] ---[ end trace 6a1b3fb1a9d7d6e3 ]---
+
+This can be easily reproduced by running xfstests with the integrity
+checker enabled.
+
+Fixes: 1ba98d086fe3 (Btrfs: detect corruption when non-root leaf has zero item)
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Reviewed-by: Liu Bo <bo.li.liu@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/btrfs/disk-io.c | 13 ++++++++++++-
+ 1 file changed, 12 insertions(+), 1 deletion(-)
+
+--- a/fs/btrfs/disk-io.c
++++ b/fs/btrfs/disk-io.c
+@@ -559,7 +559,15 @@ static noinline int check_leaf(struct bt
+ u32 nritems = btrfs_header_nritems(leaf);
+ int slot;
+
+- if (nritems == 0) {
++ /*
++ * Extent buffers from a relocation tree have a owner field that
++ * corresponds to the subvolume tree they are based on. So just from an
++ * extent buffer alone we can not find out what is the id of the
++ * corresponding subvolume tree, so we can not figure out if the extent
++ * buffer corresponds to the root of the relocation tree or not. So skip
++ * this check for relocation trees.
++ */
++ if (nritems == 0 && !btrfs_header_flag(leaf, BTRFS_HEADER_FLAG_RELOC)) {
+ struct btrfs_root *check_root;
+
+ key.objectid = btrfs_header_owner(leaf);
+@@ -587,6 +595,9 @@ static noinline int check_leaf(struct bt
+ return 0;
+ }
+
++ if (nritems == 0)
++ return 0;
++
+ /* Check the 0 item */
+ if (btrfs_item_offset_nr(leaf, 0) + btrfs_item_size_nr(leaf, 0) !=
+ BTRFS_LEAF_DATA_SIZE(root)) {
--- /dev/null
+From 8d9eddad19467b008e0c881bc3133d7da94b7ec1 Mon Sep 17 00:00:00 2001
+From: Filipe Manana <fdmanana@suse.com>
+Date: Thu, 24 Nov 2016 02:09:04 +0000
+Subject: Btrfs: fix qgroup rescan worker initialization
+
+From: Filipe Manana <fdmanana@suse.com>
+
+commit 8d9eddad19467b008e0c881bc3133d7da94b7ec1 upstream.
+
+We were setting the qgroup_rescan_running flag to true only after the
+rescan worker started (which is a work item run from a workqueue). So if a
+user space task starts a rescan and immediately afterwards asks to wait for
+the rescan worker to finish, this second call might happen before the
+rescan worker task starts running, in which case the rescan wait ioctl
+returns immediately, not waiting for the rescan worker to finish.
+
+This was making the fstest btrfs/022 fail very often.
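+
+An illustrative interleaving with the old placement of the flag (editor's
+sketch, not part of the original changelog):
+
+  user space task                          rescan worker
+  ---------------------------------        ---------------------------------
+  rescan ioctl: qgroup_rescan_init(),
+  queue the worker
+  rescan wait ioctl: sees
+  qgroup_rescan_running == false,
+  returns immediately
+                                           qgroup_rescan_running = true
+                                           ... performs the rescan ...
+
+Setting the flag in qgroup_rescan_init(), before the work is queued, closes
+that window.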
+
+Fixes: d2c609b834d6 (btrfs: properly track when rescan worker is running)
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/btrfs/qgroup.c | 5 +----
+ 1 file changed, 1 insertion(+), 4 deletions(-)
+
+--- a/fs/btrfs/qgroup.c
++++ b/fs/btrfs/qgroup.c
+@@ -2332,10 +2332,6 @@ static void btrfs_qgroup_rescan_worker(s
+ int err = -ENOMEM;
+ int ret = 0;
+
+- mutex_lock(&fs_info->qgroup_rescan_lock);
+- fs_info->qgroup_rescan_running = true;
+- mutex_unlock(&fs_info->qgroup_rescan_lock);
+-
+ path = btrfs_alloc_path();
+ if (!path)
+ goto out;
+@@ -2446,6 +2442,7 @@ qgroup_rescan_init(struct btrfs_fs_info
+ sizeof(fs_info->qgroup_rescan_progress));
+ fs_info->qgroup_rescan_progress.objectid = progress_objectid;
+ init_completion(&fs_info->qgroup_rescan_completion);
++ fs_info->qgroup_rescan_running = true;
+
+ spin_unlock(&fs_info->qgroup_lock);
+ mutex_unlock(&fs_info->qgroup_rescan_lock);
--- /dev/null
+From 054570a1dc94de20e7a612cddcc5a97db9c37b6f Mon Sep 17 00:00:00 2001
+From: Filipe Manana <fdmanana@suse.com>
+Date: Tue, 1 Nov 2016 11:23:31 +0000
+Subject: Btrfs: fix relocation incorrectly dropping data references
+
+From: Filipe Manana <fdmanana@suse.com>
+
+commit 054570a1dc94de20e7a612cddcc5a97db9c37b6f upstream.
+
+During relocation of a data block group we create a relocation tree
+for each fs/subvol tree by making a snapshot of each tree using
+btrfs_copy_root() and the tree's commit root, and then setting the last
+snapshot field for the fs/subvol tree's root to the value of the current
+transaction id minus 1. However this can lead to relocation later
+dropping references that it did not create if we have qgroups enabled,
+leaving the filesystem in an inconsistent state that keeps aborting
+transactions.
+
+Let's consider the following example to explain the problem, which requires
+qgroups to be enabled.
+
+We are relocating data block group Y, we have a subvolume with id 258 that
+has a root at level 1, that subvolume is used to store directory entries
+for snapshots and we are currently at transaction 3404.
+
+When committing transaction 3404, we have a pending snapshot and therefore
+we call btrfs_run_delayed_items() at transaction.c:create_pending_snapshot()
+in order to create its dentry at subvolume 258. This results in COWing
+leaf A from root 258 in order to add the dentry. Note that leaf A
+also contains file extent items referring to extents from some other
+block group X (we are currently relocating block group Y). Later on, still
+at create_pending_snapshot() we call qgroup_account_snapshot(), which
+switches the commit root for root 258 when it calls switch_commit_roots(),
+so now the COWed version of leaf A, let's call it leaf A', is accessible
+from the commit root of tree 258. At the end of qgroup_account_snapshot(),
+we call record_root_in_trans() with 258 as its argument, which results
+in btrfs_init_reloc_root() being called, which in turn calls
+relocation.c:create_reloc_root() in order to create a relocation tree
+associated to root 258, which results in assigning the value of 3403
+(which is the current transaction id minus 1 = 3404 - 1) to the
+last_snapshot field of root 258. When creating the relocation tree root
+at ctree.c:btrfs_copy_root() we add a shared reference for leaf A',
+corresponding to the relocation tree's root, when we call btrfs_inc_ref()
+against the COWed root (a copy of the commit root from tree 258), which
+is at level 1. So at this point leaf A' has 2 references, one normal
+reference corresponding to root 258 and one shared reference corresponding
+to the root of the relocation tree.
+
+Transaction 3404 finishes its commit and transaction 3405 is started by
+relocation when calling merge_reloc_root() for the relocation tree
+associated to root 258. In the meanwhile leaf A' is COWed again, in
+response to some filesystem operation, when we are still at transaction
+3405. However when we COW leaf A', at ctree.c:update_ref_for_cow(), we
+call btrfs_block_can_be_shared() in order to figure out if other trees
+refer to the leaf and if any such trees exists, add a full back reference
+to leaf A' - but btrfs_block_can_be_shared() incorrectly returns false
+because the following condition is false:
+
+ btrfs_header_generation(buf) <= btrfs_root_last_snapshot(&root->root_item)
+
+which evaluates to 3404 <= 3403. So after leaf A' is COWed, it stays with
+only one reference, corresponding to the shared reference we created when
+we called btrfs_copy_root() to create the relocation tree's root and
+btrfs_inc_ref() ends up not being called for leaf A' nor we end up setting
+the flag BTRFS_BLOCK_FLAG_FULL_BACKREF in leaf A'. This results in not
+adding shared references for the extents from block group X that leaf A'
+refers to with its file extent items.
+
+Later, after merging the relocation root we do a call to
+btrfs_drop_snapshot() in order to delete the relocation tree. This ends
+up calling do_walk_down() when path->slots[1] points to leaf A', which
+results in calling btrfs_lookup_extent_info() to get the number of
+references for leaf A', which is 1 at this time (only the shared reference
+exists) and this value is stored at wc->refs[0]. After this walk_up_proc()
+is called when wc->level is 0 and path->nodes[0] corresponds to leaf A'.
+Because the current level is 0 and wc->refs[0] is 1, it does call
+btrfs_dec_ref() against leaf A', which results in removing the single
+reference, associated with root 258, that each of the extents from block
+group X has - the expectation was to have each of these extents with 2
+references - one reference for root 258 and one shared reference related
+to the root of the relocation tree, and so we would drop only the shared
+reference (because leaf A' was supposed to have the flag
+BTRFS_BLOCK_FLAG_FULL_BACKREF set).
+
+This leaves the filesystem in an inconsistent state as we now have file
+extent items in a subvolume tree that point to extents from block group X
+without references in the extent tree. So later on when we try to decrement
+the references for these extents, for example due to a file unlink operation,
+truncate operation or overwriting ranges of a file, we fail because the
+expected references do not exist in the extent tree.
+
+This leads to warnings and transaction aborts like the following:
+
+[ 588.965795] ------------[ cut here ]------------
+[ 588.965815] WARNING: CPU: 2 PID: 2479 at fs/btrfs/extent-tree.c:1625 lookup_inline_extent_backref+0x432/0x5b0 [btrfs]
+[ 588.965816] Modules linked in: af_packet iscsi_ibft iscsi_boot_sysfs xfs libcrc32c ppdev acpi_cpufreq button tpm_tis e1000 i2c_piix4 pcspkr parport_pc
+parport tpm qemu_fw_cfg joydev btrfs xor raid6_pq sr_mod cdrom ata_generic virtio_scsi ata_piix virtio_pci bochs_drm virtio_ring drm_kms_helper syscopyarea
+sysfillrect sysimgblt fb_sys_fops virtio ttm serio_raw drm floppy sg
+[ 588.965831] CPU: 2 PID: 2479 Comm: kworker/u8:7 Not tainted 4.7.3-3-default-fdm+ #1
+[ 588.965832] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.9.1-0-gb3ef39f-prebuilt.qemu-project.org 04/01/2014
+[ 588.965844] Workqueue: btrfs-extent-refs btrfs_extent_refs_helper [btrfs]
+[ 588.965845] 0000000000000000 ffff8802263bfa28 ffffffff813af542 0000000000000000
+[ 588.965847] 0000000000000000 ffff8802263bfa68 ffffffff81081e8b 0000065900000000
+[ 588.965848] ffff8801db2af000 000000012bbe2000 0000000000000000 ffff880215703b48
+[ 588.965849] Call Trace:
+[ 588.965852] [<ffffffff813af542>] dump_stack+0x63/0x81
+[ 588.965854] [<ffffffff81081e8b>] __warn+0xcb/0xf0
+[ 588.965855] [<ffffffff81081f7d>] warn_slowpath_null+0x1d/0x20
+[ 588.965863] [<ffffffffa0175042>] lookup_inline_extent_backref+0x432/0x5b0 [btrfs]
+[ 588.965865] [<ffffffff81143220>] ? trace_clock_local+0x10/0x30
+[ 588.965867] [<ffffffff8114c5df>] ? rb_reserve_next_event+0x6f/0x460
+[ 588.965875] [<ffffffffa0175215>] insert_inline_extent_backref+0x55/0xd0 [btrfs]
+[ 588.965882] [<ffffffffa017531f>] __btrfs_inc_extent_ref.isra.55+0x8f/0x240 [btrfs]
+[ 588.965890] [<ffffffffa017acea>] __btrfs_run_delayed_refs+0x74a/0x1260 [btrfs]
+[ 588.965892] [<ffffffff810cb046>] ? cpuacct_charge+0x86/0xa0
+[ 588.965900] [<ffffffffa017e74f>] btrfs_run_delayed_refs+0x9f/0x2c0 [btrfs]
+[ 588.965908] [<ffffffffa017ea04>] delayed_ref_async_start+0x94/0xb0 [btrfs]
+[ 588.965918] [<ffffffffa01c799a>] btrfs_scrubparity_helper+0xca/0x350 [btrfs]
+[ 588.965928] [<ffffffffa01c7c5e>] btrfs_extent_refs_helper+0xe/0x10 [btrfs]
+[ 588.965930] [<ffffffff8109b323>] process_one_work+0x1f3/0x4e0
+[ 588.965931] [<ffffffff8109b658>] worker_thread+0x48/0x4e0
+[ 588.965932] [<ffffffff8109b610>] ? process_one_work+0x4e0/0x4e0
+[ 588.965934] [<ffffffff810a1659>] kthread+0xc9/0xe0
+[ 588.965936] [<ffffffff816f2f1f>] ret_from_fork+0x1f/0x40
+[ 588.965937] [<ffffffff810a1590>] ? kthread_worker_fn+0x170/0x170
+[ 588.965938] ---[ end trace 34e5232c933a1749 ]---
+[ 588.966187] ------------[ cut here ]------------
+[ 588.966196] WARNING: CPU: 2 PID: 2479 at fs/btrfs/extent-tree.c:2966 btrfs_run_delayed_refs+0x28c/0x2c0 [btrfs]
+[ 588.966196] BTRFS: Transaction aborted (error -5)
+[ 588.966197] Modules linked in: af_packet iscsi_ibft iscsi_boot_sysfs xfs libcrc32c ppdev acpi_cpufreq button tpm_tis e1000 i2c_piix4 pcspkr parport_pc
+parport tpm qemu_fw_cfg joydev btrfs xor raid6_pq sr_mod cdrom ata_generic virtio_scsi ata_piix virtio_pci bochs_drm virtio_ring drm_kms_helper syscopyarea
+sysfillrect sysimgblt fb_sys_fops virtio ttm serio_raw drm floppy sg
+[ 588.966206] CPU: 2 PID: 2479 Comm: kworker/u8:7 Tainted: G W 4.7.3-3-default-fdm+ #1
+[ 588.966207] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.9.1-0-gb3ef39f-prebuilt.qemu-project.org 04/01/2014
+[ 588.966217] Workqueue: btrfs-extent-refs btrfs_extent_refs_helper [btrfs]
+[ 588.966217] 0000000000000000 ffff8802263bfc98 ffffffff813af542 ffff8802263bfce8
+[ 588.966219] 0000000000000000 ffff8802263bfcd8 ffffffff81081e8b 00000b96345ee000
+[ 588.966220] ffffffffa021ae1c ffff880215703b48 00000000000005fe ffff8802345ee000
+[ 588.966221] Call Trace:
+[ 588.966223] [<ffffffff813af542>] dump_stack+0x63/0x81
+[ 588.966224] [<ffffffff81081e8b>] __warn+0xcb/0xf0
+[ 588.966225] [<ffffffff81081eff>] warn_slowpath_fmt+0x4f/0x60
+[ 588.966233] [<ffffffffa017e93c>] btrfs_run_delayed_refs+0x28c/0x2c0 [btrfs]
+[ 588.966241] [<ffffffffa017ea04>] delayed_ref_async_start+0x94/0xb0 [btrfs]
+[ 588.966250] [<ffffffffa01c799a>] btrfs_scrubparity_helper+0xca/0x350 [btrfs]
+[ 588.966259] [<ffffffffa01c7c5e>] btrfs_extent_refs_helper+0xe/0x10 [btrfs]
+[ 588.966260] [<ffffffff8109b323>] process_one_work+0x1f3/0x4e0
+[ 588.966261] [<ffffffff8109b658>] worker_thread+0x48/0x4e0
+[ 588.966263] [<ffffffff8109b610>] ? process_one_work+0x4e0/0x4e0
+[ 588.966264] [<ffffffff810a1659>] kthread+0xc9/0xe0
+[ 588.966265] [<ffffffff816f2f1f>] ret_from_fork+0x1f/0x40
+[ 588.966267] [<ffffffff810a1590>] ? kthread_worker_fn+0x170/0x170
+[ 588.966268] ---[ end trace 34e5232c933a174a ]---
+[ 588.966269] BTRFS: error (device sda2) in btrfs_run_delayed_refs:2966: errno=-5 IO failure
+[ 588.966270] BTRFS info (device sda2): forced readonly
+
+This was happening often on openSUSE and SLE systems using btrfs as the
+root filesystem (with its default layout where multiple subvolumes are
+used) where balance happens in the background triggered by a cron job and
+snapshots are automatically created before/after package installations,
+upgrades and removals. The issue could be triggered simply by running the
+following loop on the first system boot post installation:
+
+ while true; do
+ zypper -n in nfs-kernel-server
+ zypper -n rm nfs-kernel-server
+ done
+
+(If we were fast enough and ran that loop before the cron job triggered
+a balance operation and the balance finished)
+
+So fix this by setting the last_snapshot field of the root to the generation
+of its commit root. This way btrfs_block_can_be_shared() behaves correctly
+both for the case where the relocation root is created during a transaction
+commit and for the case where it's created before a transaction commit.
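+
+To make that concrete with the example above (editor's note, not part of the
+original changelog): the commit root of root 258 is the version of the tree
+that was COWed in transaction 3404, so its header generation is 3404. With
+the fix, last_snapshot becomes 3404 and the condition
+
+ btrfs_header_generation(buf) <= btrfs_root_last_snapshot(&root->root_item)
+
+evaluates as 3404 <= 3404, which is true, so btrfs_block_can_be_shared()
+returns true and the COW of leaf A' takes the full back reference path it was
+skipping before.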
+
+Fixes: 6426c7ad697d (btrfs: qgroup: Fix qgroup accounting when creating snapshot)
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Reviewed-by: Josef Bacik <jbacik@fb.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/btrfs/relocation.c | 15 ++++++++++++---
+ 1 file changed, 12 insertions(+), 3 deletions(-)
+
+--- a/fs/btrfs/relocation.c
++++ b/fs/btrfs/relocation.c
+@@ -1387,14 +1387,23 @@ static struct btrfs_root *create_reloc_r
+ root_key.offset = objectid;
+
+ if (root->root_key.objectid == objectid) {
++ u64 commit_root_gen;
++
+ /* called by btrfs_init_reloc_root */
+ ret = btrfs_copy_root(trans, root, root->commit_root, &eb,
+ BTRFS_TREE_RELOC_OBJECTID);
+ BUG_ON(ret);
+-
+ last_snap = btrfs_root_last_snapshot(&root->root_item);
+- btrfs_set_root_last_snapshot(&root->root_item,
+- trans->transid - 1);
++ /*
++ * Set the last_snapshot field to the generation of the commit
++ * root - like this ctree.c:btrfs_block_can_be_shared() behaves
++ * correctly (returns true) when the relocation root is created
++ * either inside the critical section of a transaction commit
++ * (through transaction.c:qgroup_account_snapshot()) and when
++ * it's created before the transaction commit is started.
++ */
++ commit_root_gen = btrfs_header_generation(root->commit_root);
++ btrfs_set_root_last_snapshot(&root->root_item, commit_root_gen);
+ } else {
+ /*
+ * called by btrfs_reloc_post_snapshot_hook.
--- /dev/null
+From 2a7bf53f577e49c43de4ffa7776056de26db65d9 Mon Sep 17 00:00:00 2001
+From: Robbie Ko <robbieko@synology.com>
+Date: Fri, 7 Oct 2016 17:30:47 +0800
+Subject: Btrfs: fix tree search logic when replaying directory entry deletes
+
+From: Robbie Ko <robbieko@synology.com>
+
+commit 2a7bf53f577e49c43de4ffa7776056de26db65d9 upstream.
+
+If a log tree has a layout like the following:
+
+leaf N:
+ ...
+ item 240 key (282 DIR_LOG_ITEM 0) itemoff 8189 itemsize 8
+ dir log end 1275809046
+leaf N + 1:
+ item 0 key (282 DIR_LOG_ITEM 3936149215) itemoff 16275 itemsize 8
+ dir log end 18446744073709551615
+ ...
+
+When we pass the value 1275809046 + 1 as the parameter start_ret to the
+function tree-log.c:find_dir_range() (done by replay_dir_deletes()), we
+end up with path->slots[0] having the value 239 (points to the last item
+of leaf N, item 240). Because the dir log item in that position has an
+offset value smaller than *start_ret (1275809046 + 1) we need to move on
+to the next leaf. However, the logic for that is wrong: it compares the
+current slot to the number of items in the leaf and, since the slot is
+smaller, we don't look up the next leaf but instead set the slot to point
+to an item that does not exist, at slot 240. We later operate on that
+slot, which has unexpected content or, in the worst case,
+can result in an invalid memory access (accessing beyond the last page
+of leaf N's extent buffer).
+
+So fix the logic that checks when we need to move on to the next leaf
+by first incrementing the slot and only then checking if that slot
+is beyond the last item of the current leaf.
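+
+For clarity, a sketch of the old logic in find_dir_range() (editor's note,
+mirroring the diff below):
+
+  if (path->slots[0] >= nritems)
+          btrfs_next_leaf(root, path);
+  else
+          path->slots[0]++;
+
+so when the slot already pointed at the last valid item it was simply
+incremented past the end instead of moving to the next leaf, while the fixed
+code increments the slot first and only then checks whether it went past the
+last item.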
+
+Signed-off-by: Robbie Ko <robbieko@synology.com>
+Reviewed-by: Filipe Manana <fdmanana@suse.com>
+Fixes: e02119d5a7b4 (Btrfs: Add a write ahead tree log to optimize synchronous operations)
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+[Modified changelog for clarity and correctness]
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/btrfs/tree-log.c | 3 +--
+ 1 file changed, 1 insertion(+), 2 deletions(-)
+
+--- a/fs/btrfs/tree-log.c
++++ b/fs/btrfs/tree-log.c
+@@ -1940,12 +1940,11 @@ static noinline int find_dir_range(struc
+ next:
+ /* check the next slot in the tree to see if it is a valid item */
+ nritems = btrfs_header_nritems(path->nodes[0]);
++ path->slots[0]++;
+ if (path->slots[0] >= nritems) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret)
+ goto out;
+- } else {
+- path->slots[0]++;
+ }
+
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
--- /dev/null
+From 2939e1a86f758b55cdba73e29397dd3d94df13bc Mon Sep 17 00:00:00 2001
+From: Maxim Patlasov <mpatlasov@virtuozzo.com>
+Date: Mon, 12 Dec 2016 14:32:44 -0800
+Subject: btrfs: limit async_work allocation and worker func duration
+
+From: Maxim Patlasov <mpatlasov@virtuozzo.com>
+
+commit 2939e1a86f758b55cdba73e29397dd3d94df13bc upstream.
+
+Problem statement: an unprivileged user who has read-write access to more than
+one btrfs subvolume may easily consume all kernel memory (eventually
+triggering oom-killer).
+
+Reproducer (./mkrmdir below essentially loops over mkdir/rmdir):
+
+[root@kteam1 ~]# cat prep.sh
+
+DEV=/dev/sdb
+mkfs.btrfs -f $DEV
+mount $DEV /mnt
+for i in `seq 1 16`
+do
+ mkdir /mnt/$i
+ btrfs subvolume create /mnt/SV_$i
+ ID=`btrfs subvolume list /mnt |grep "SV_$i$" |cut -d ' ' -f 2`
+ mount -t btrfs -o subvolid=$ID $DEV /mnt/$i
+ chmod a+rwx /mnt/$i
+done
+
+[root@kteam1 ~]# sh prep.sh
+
+[maxim@kteam1 ~]$ for i in `seq 1 16`; do ./mkrmdir /mnt/$i 2000 2000 & done
+
+[root@kteam1 ~]# for i in `seq 1 4`; do grep "kmalloc-128" /proc/slabinfo | grep -v dma; sleep 60; done
+kmalloc-128 10144 10144 128 32 1 : tunables 0 0 0 : slabdata 317 317 0
+kmalloc-128 9992352 9992352 128 32 1 : tunables 0 0 0 : slabdata 312261 312261 0
+kmalloc-128 24226752 24226752 128 32 1 : tunables 0 0 0 : slabdata 757086 757086 0
+kmalloc-128 42754240 42754240 128 32 1 : tunables 0 0 0 : slabdata 1336070 1336070 0
+
+The huge numbers above come from the insane number of async_work structures
+allocated and queued by btrfs_wq_run_delayed_node().
+
+The problem is caused by btrfs_wq_run_delayed_node() queuing more and more
+work items if the number of delayed items is above BTRFS_DELAYED_BACKGROUND.
+The worker func (btrfs_async_run_delayed_root) processes at least
+BTRFS_DELAYED_BATCH items (if they are present in the list). So the machinery
+works as expected while the list is almost empty. As soon as the list gets
+bigger, the worker func starts to process more than one item at a time, each
+run takes longer, and the chances of having more async_works queued than
+needed get higher.
+
+The problem above is worsened by another flaw of the delayed-inode
+implementation: if an async_work was queued in a throttling branch (number of
+items >= BTRFS_DELAYED_WRITEBACK), the corresponding worker func won't quit
+until the number of items drops below BTRFS_DELAYED_BACKGROUND / 2. So it is
+possible for the func to occupy the CPU for a very long time (up to 30 sec in
+my experiments): while the func is trying to drain the list, user activity may
+keep adding more and more items to the list.
+
+The patch fixes both problems in a straightforward way: refuse to queue too
+many works in btrfs_wq_run_delayed_node() and bail out of the worker func once
+at least BTRFS_DELAYED_WRITEBACK items have been processed.
+
+Changed in v2: remove support of thresh == NO_THRESHOLD.
+
+Signed-off-by: Maxim Patlasov <mpatlasov@virtuozzo.com>
+Signed-off-by: Chris Mason <clm@fb.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/btrfs/async-thread.c | 14 ++++++++++++++
+ fs/btrfs/async-thread.h | 1 +
+ fs/btrfs/delayed-inode.c | 6 ++++--
+ 3 files changed, 19 insertions(+), 2 deletions(-)
+
+--- a/fs/btrfs/async-thread.c
++++ b/fs/btrfs/async-thread.c
+@@ -86,6 +86,20 @@ btrfs_work_owner(struct btrfs_work *work
+ return work->wq->fs_info;
+ }
+
++bool btrfs_workqueue_normal_congested(struct btrfs_workqueue *wq)
++{
++ /*
++ * We could compare wq->normal->pending with num_online_cpus()
++ * to support "thresh == NO_THRESHOLD" case, but it requires
++ * moving up atomic_inc/dec in thresh_queue/exec_hook. Let's
++ * postpone it until someone needs the support of that case.
++ */
++ if (wq->normal->thresh == NO_THRESHOLD)
++ return false;
++
++ return atomic_read(&wq->normal->pending) > wq->normal->thresh * 2;
++}
++
+ BTRFS_WORK_HELPER(worker_helper);
+ BTRFS_WORK_HELPER(delalloc_helper);
+ BTRFS_WORK_HELPER(flush_delalloc_helper);
+--- a/fs/btrfs/async-thread.h
++++ b/fs/btrfs/async-thread.h
+@@ -84,4 +84,5 @@ void btrfs_workqueue_set_max(struct btrf
+ void btrfs_set_work_high_priority(struct btrfs_work *work);
+ struct btrfs_fs_info *btrfs_work_owner(struct btrfs_work *work);
+ struct btrfs_fs_info *btrfs_workqueue_owner(struct __btrfs_workqueue *wq);
++bool btrfs_workqueue_normal_congested(struct btrfs_workqueue *wq);
+ #endif
+--- a/fs/btrfs/delayed-inode.c
++++ b/fs/btrfs/delayed-inode.c
+@@ -1356,7 +1356,8 @@ release_path:
+ total_done++;
+
+ btrfs_release_prepared_delayed_node(delayed_node);
+- if (async_work->nr == 0 || total_done < async_work->nr)
++ if ((async_work->nr == 0 && total_done < BTRFS_DELAYED_WRITEBACK) ||
++ total_done < async_work->nr)
+ goto again;
+
+ free_path:
+@@ -1372,7 +1373,8 @@ static int btrfs_wq_run_delayed_node(str
+ {
+ struct btrfs_async_delayed_work *async_work;
+
+- if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND)
++ if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND ||
++ btrfs_workqueue_normal_congested(fs_info->delayed_workers))
+ return 0;
+
+ async_work = kmalloc(sizeof(*async_work), GFP_NOFS);
--- /dev/null
+From ed0df618b1b06d7431ee4d985317fc5419a5d559 Mon Sep 17 00:00:00 2001
+From: David Sterba <dsterba@suse.com>
+Date: Tue, 1 Nov 2016 14:21:23 +0100
+Subject: btrfs: store and load values of stripes_min/stripes_max in balance status item
+
+From: David Sterba <dsterba@suse.com>
+
+commit ed0df618b1b06d7431ee4d985317fc5419a5d559 upstream.
+
+The balance status item contains the currently known filter values, but the
+stripes filter was unintentionally not among them. This means that an
+interrupted and automatically restarted balance does not apply the
+stripes filter.
+
+Fixes: dee32d0ac3719ef8d640efaf0884111df444730f
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/btrfs/ctree.h | 4 ++++
+ 1 file changed, 4 insertions(+)
+
+--- a/fs/btrfs/ctree.h
++++ b/fs/btrfs/ctree.h
+@@ -2210,6 +2210,8 @@ btrfs_disk_balance_args_to_cpu(struct bt
+ cpu->target = le64_to_cpu(disk->target);
+ cpu->flags = le64_to_cpu(disk->flags);
+ cpu->limit = le64_to_cpu(disk->limit);
++ cpu->stripes_min = le32_to_cpu(disk->stripes_min);
++ cpu->stripes_max = le32_to_cpu(disk->stripes_max);
+ }
+
+ static inline void
+@@ -2228,6 +2230,8 @@ btrfs_cpu_balance_args_to_disk(struct bt
+ disk->target = cpu_to_le64(cpu->target);
+ disk->flags = cpu_to_le64(cpu->flags);
+ disk->limit = cpu_to_le64(cpu->limit);
++ disk->stripes_min = cpu_to_le32(cpu->stripes_min);
++ disk->stripes_max = cpu_to_le32(cpu->stripes_max);
+ }
+
+ /* struct btrfs_super_block */