--- /dev/null
+From 0cbc72a1781250f373327dd7e306e33859a42154 Mon Sep 17 00:00:00 2001
+From: Jens Axboe <axboe@fb.com>
+Date: Fri, 11 Nov 2016 18:28:50 -0700
+Subject: aoe: fix crash in page count manipulation
+
+From: Jens Axboe <axboe@fb.com>
+
+commit 0cbc72a1781250f373327dd7e306e33859a42154 upstream.
+
+aoeblk contains some mysterious code that wants to elevate the bio vec
+page counts while the bio is under IO. That is not needed, it's
+fragile, and it's causing kernel oopses for some users.
+
+Reported-by: Don Koch <kochd@us.ibm.com>
+Tested-by: Don Koch <kochd@us.ibm.com>
+Signed-off-by: Jens Axboe <axboe@fb.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/block/aoe/aoecmd.c | 41 -----------------------------------------
+ 1 file changed, 41 deletions(-)
+
+--- a/drivers/block/aoe/aoecmd.c
++++ b/drivers/block/aoe/aoecmd.c
+@@ -853,45 +853,6 @@ rqbiocnt(struct request *r)
+ return n;
+ }
+
+-/* This can be removed if we are certain that no users of the block
+- * layer will ever use zero-count pages in bios. Otherwise we have to
+- * protect against the put_page sometimes done by the network layer.
+- *
+- * See http://oss.sgi.com/archives/xfs/2007-01/msg00594.html for
+- * discussion.
+- *
+- * We cannot use get_page in the workaround, because it insists on a
+- * positive page count as a precondition. So we use _refcount directly.
+- */
+-static void
+-bio_pageinc(struct bio *bio)
+-{
+- struct bio_vec bv;
+- struct page *page;
+- struct bvec_iter iter;
+-
+- bio_for_each_segment(bv, bio, iter) {
+- /* Non-zero page count for non-head members of
+- * compound pages is no longer allowed by the kernel.
+- */
+- page = compound_head(bv.bv_page);
+- page_ref_inc(page);
+- }
+-}
+-
+-static void
+-bio_pagedec(struct bio *bio)
+-{
+- struct page *page;
+- struct bio_vec bv;
+- struct bvec_iter iter;
+-
+- bio_for_each_segment(bv, bio, iter) {
+- page = compound_head(bv.bv_page);
+- page_ref_dec(page);
+- }
+-}
+-
+ static void
+ bufinit(struct buf *buf, struct request *rq, struct bio *bio)
+ {
+@@ -899,7 +860,6 @@ bufinit(struct buf *buf, struct request
+ buf->rq = rq;
+ buf->bio = bio;
+ buf->iter = bio->bi_iter;
+- bio_pageinc(bio);
+ }
+
+ static struct buf *
+@@ -1127,7 +1087,6 @@ aoe_end_buf(struct aoedev *d, struct buf
+ if (buf == d->ip.buf)
+ d->ip.buf = NULL;
+ rq = buf->rq;
+- bio_pagedec(buf->bio);
+ mempool_free(buf, d->bufpool);
+ n = (unsigned long) rq->special;
+ rq->special = (void *) --n;
--- /dev/null
+From ef85b25e982b5bba1530b936e283ef129f02ab9d Mon Sep 17 00:00:00 2001
+From: Liu Bo <bo.li.liu@oracle.com>
+Date: Fri, 2 Sep 2016 12:35:34 -0700
+Subject: Btrfs: fix BUG_ON in btrfs_mark_buffer_dirty
+
+From: Liu Bo <bo.li.liu@oracle.com>
+
+commit ef85b25e982b5bba1530b936e283ef129f02ab9d upstream.
+
+This can only happen with CONFIG_BTRFS_FS_CHECK_INTEGRITY=y.
+
+Commit 1ba98d0 ("Btrfs: detect corruption when non-root leaf has zero item")
+assumes that a leaf is its tree's root when leaf->bytenr == btrfs_root_bytenr(root).
+However, we should not use btrfs_root_bytenr(root), since it is mainly
+updated when committing a transaction, so the check can fail when doing
+COW on this leaf while it is a root.
+
+This changes the check to "if (leaf == btrfs_root_node(root))" instead, just
+like how we check whether a leaf is a root in __btrfs_cow_block().
+
+Fixes: 1ba98d086fe3 (Btrfs: detect corruption when non-root leaf has zero item)
+Reported-by: Jeff Mahoney <jeffm@suse.com>
+Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
+Reviewed-by: Filipe Manana <fdmanana@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/btrfs/disk-io.c | 10 +++++++---
+ 1 file changed, 7 insertions(+), 3 deletions(-)
+
+--- a/fs/btrfs/disk-io.c
++++ b/fs/btrfs/disk-io.c
+@@ -572,13 +572,17 @@ static noinline int check_leaf(struct bt
+ * open_ctree() some roots has not yet been set up.
+ */
+ if (!IS_ERR_OR_NULL(check_root)) {
++ struct extent_buffer *eb;
++
++ eb = btrfs_root_node(check_root);
+ /* if leaf is the root, then it's fine */
+- if (leaf->start !=
+- btrfs_root_bytenr(&check_root->root_item)) {
++ if (leaf != eb) {
+ CORRUPT("non-root leaf's nritems is 0",
+- leaf, root, 0);
++ leaf, check_root, 0);
++ free_extent_buffer(eb);
+ return -EIO;
+ }
++ free_extent_buffer(eb);
+ }
+ return 0;
+ }
--- /dev/null
+From ec125cfb7ae2157af3dd45dd8abe823e3e233eec Mon Sep 17 00:00:00 2001
+From: Robbie Ko <robbieko@synology.com>
+Date: Fri, 28 Oct 2016 10:48:26 +0800
+Subject: Btrfs: fix deadlock caused by fsync when logging directory entries
+
+From: Robbie Ko <robbieko@synology.com>
+
+commit ec125cfb7ae2157af3dd45dd8abe823e3e233eec upstream.
+
+While logging new directory entries, at tree-log.c:log_new_dir_dentries(),
+after we call btrfs_search_forward() we get a leaf with a read lock on it,
+and without unlocking that leaf we can end up calling btrfs_iget() to get
+an inode pointer. The latter (btrfs_iget()) can end up doing a read-only
+search on the same tree again, if the inode is not in memory already. If,
+in the meanwhile, some other task started a write search on the tree and,
+while holding write locks on upper levels of the tree, is attempting to
+write lock the same leaf that btrfs_search_forward() locked, then the read
+search from btrfs_iget() is blocked and we end up with a deadlock.
+
+So fix this by releasing the search path before calling btrfs_iget() at
+tree-log.c:log_new_dir_dentries().
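+
+As a rough sketch (editor's note, not part of the original changelog), the
+problematic ordering before this change was:
+
+  btrfs_search_forward(...);   /* leaf referenced by path is left read locked */
+  ...
+  di_inode = btrfs_iget(...);  /* may search the same tree again and block
+                                  behind a writer waiting for that same leaf */
+  ...
+  btrfs_release_path(path);    /* path only released after btrfs_iget() */
+
+while the fixed code calls btrfs_release_path(path) right before btrfs_iget().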
+
+Example trace of such deadlock:
+
+[ 4077.478852] kworker/u24:10 D ffff88107fc90640 0 14431 2 0x00000000
+[ 4077.486752] Workqueue: btrfs-endio-write btrfs_endio_write_helper [btrfs]
+[ 4077.494346] ffff880ffa56bad0 0000000000000046 0000000000009000 ffff880ffa56bfd8
+[ 4077.502629] ffff880ffa56bfd8 ffff881016ce21c0 ffffffffa06ecb26 ffff88101a5d6138
+[ 4077.510915] ffff880ebb5173b0 ffff880ffa56baf8 ffff880ebb517410 ffff881016ce21c0
+[ 4077.519202] Call Trace:
+[ 4077.528752] [<ffffffffa06ed5ed>] ? btrfs_tree_lock+0xdd/0x2f0 [btrfs]
+[ 4077.536049] [<ffffffff81053680>] ? wake_up_atomic_t+0x30/0x30
+[ 4077.542574] [<ffffffffa068cc1f>] ? btrfs_search_slot+0x79f/0xb10 [btrfs]
+[ 4077.550171] [<ffffffffa06a5073>] ? btrfs_lookup_file_extent+0x33/0x40 [btrfs]
+[ 4077.558252] [<ffffffffa06c600b>] ? __btrfs_drop_extents+0x13b/0xdf0 [btrfs]
+[ 4077.566140] [<ffffffffa06fc9e2>] ? add_delayed_data_ref+0xe2/0x150 [btrfs]
+[ 4077.573928] [<ffffffffa06fd629>] ? btrfs_add_delayed_data_ref+0x149/0x1d0 [btrfs]
+[ 4077.582399] [<ffffffffa06cf3c0>] ? __set_extent_bit+0x4c0/0x5c0 [btrfs]
+[ 4077.589896] [<ffffffffa06b4a64>] ? insert_reserved_file_extent.constprop.75+0xa4/0x320 [btrfs]
+[ 4077.599632] [<ffffffffa06b206d>] ? start_transaction+0x8d/0x470 [btrfs]
+[ 4077.607134] [<ffffffffa06bab57>] ? btrfs_finish_ordered_io+0x2e7/0x600 [btrfs]
+[ 4077.615329] [<ffffffff8104cbc2>] ? process_one_work+0x142/0x3d0
+[ 4077.622043] [<ffffffff8104d729>] ? worker_thread+0x109/0x3b0
+[ 4077.628459] [<ffffffff8104d620>] ? manage_workers.isra.26+0x270/0x270
+[ 4077.635759] [<ffffffff81052b0f>] ? kthread+0xaf/0xc0
+[ 4077.641404] [<ffffffff81052a60>] ? kthread_create_on_node+0x110/0x110
+[ 4077.648696] [<ffffffff814a9ac8>] ? ret_from_fork+0x58/0x90
+[ 4077.654926] [<ffffffff81052a60>] ? kthread_create_on_node+0x110/0x110
+
+[ 4078.358087] kworker/u24:15 D ffff88107fcd0640 0 14436 2 0x00000000
+[ 4078.365981] Workqueue: btrfs-endio-write btrfs_endio_write_helper [btrfs]
+[ 4078.373574] ffff880ffa57fad0 0000000000000046 0000000000009000 ffff880ffa57ffd8
+[ 4078.381864] ffff880ffa57ffd8 ffff88103004d0a0 ffffffffa06ecb26 ffff88101a5d6138
+[ 4078.390163] ffff880fbeffc298 ffff880ffa57faf8 ffff880fbeffc2f8 ffff88103004d0a0
+[ 4078.398466] Call Trace:
+[ 4078.408019] [<ffffffffa06ed5ed>] ? btrfs_tree_lock+0xdd/0x2f0 [btrfs]
+[ 4078.415322] [<ffffffff81053680>] ? wake_up_atomic_t+0x30/0x30
+[ 4078.421844] [<ffffffffa068cc1f>] ? btrfs_search_slot+0x79f/0xb10 [btrfs]
+[ 4078.429438] [<ffffffffa06a5073>] ? btrfs_lookup_file_extent+0x33/0x40 [btrfs]
+[ 4078.437518] [<ffffffffa06c600b>] ? __btrfs_drop_extents+0x13b/0xdf0 [btrfs]
+[ 4078.445404] [<ffffffffa06fc9e2>] ? add_delayed_data_ref+0xe2/0x150 [btrfs]
+[ 4078.453194] [<ffffffffa06fd629>] ? btrfs_add_delayed_data_ref+0x149/0x1d0 [btrfs]
+[ 4078.461663] [<ffffffffa06cf3c0>] ? __set_extent_bit+0x4c0/0x5c0 [btrfs]
+[ 4078.469161] [<ffffffffa06b4a64>] ? insert_reserved_file_extent.constprop.75+0xa4/0x320 [btrfs]
+[ 4078.478893] [<ffffffffa06b206d>] ? start_transaction+0x8d/0x470 [btrfs]
+[ 4078.486388] [<ffffffffa06bab57>] ? btrfs_finish_ordered_io+0x2e7/0x600 [btrfs]
+[ 4078.494561] [<ffffffff8104cbc2>] ? process_one_work+0x142/0x3d0
+[ 4078.501278] [<ffffffff8104a507>] ? pwq_activate_delayed_work+0x27/0x40
+[ 4078.508673] [<ffffffff8104d729>] ? worker_thread+0x109/0x3b0
+[ 4078.515098] [<ffffffff8104d620>] ? manage_workers.isra.26+0x270/0x270
+[ 4078.522396] [<ffffffff81052b0f>] ? kthread+0xaf/0xc0
+[ 4078.528032] [<ffffffff81052a60>] ? kthread_create_on_node+0x110/0x110
+[ 4078.535325] [<ffffffff814a9ac8>] ? ret_from_fork+0x58/0x90
+[ 4078.541552] [<ffffffff81052a60>] ? kthread_create_on_node+0x110/0x110
+
+[ 4079.355824] user-space-program D ffff88107fd30640 0 32020 1 0x00000000
+[ 4079.363716] ffff880eae8eba10 0000000000000086 0000000000009000 ffff880eae8ebfd8
+[ 4079.372003] ffff880eae8ebfd8 ffff881016c162c0 ffffffffa06ecb26 ffff88101a5d6138
+[ 4079.380294] ffff880fbed4b4c8 ffff880eae8eba38 ffff880fbed4b528 ffff881016c162c0
+[ 4079.388586] Call Trace:
+[ 4079.398134] [<ffffffffa06ed595>] ? btrfs_tree_lock+0x85/0x2f0 [btrfs]
+[ 4079.405431] [<ffffffff81053680>] ? wake_up_atomic_t+0x30/0x30
+[ 4079.411955] [<ffffffffa06876fb>] ? btrfs_lock_root_node+0x2b/0x40 [btrfs]
+[ 4079.419644] [<ffffffffa068ce83>] ? btrfs_search_slot+0xa03/0xb10 [btrfs]
+[ 4079.427237] [<ffffffffa06aba52>] ? btrfs_buffer_uptodate+0x52/0x70 [btrfs]
+[ 4079.435041] [<ffffffffa0689b60>] ? generic_bin_search.constprop.38+0x80/0x190 [btrfs]
+[ 4079.443897] [<ffffffffa068ea44>] ? btrfs_insert_empty_items+0x74/0xd0 [btrfs]
+[ 4079.451975] [<ffffffffa072c443>] ? copy_items+0x128/0x850 [btrfs]
+[ 4079.458890] [<ffffffffa072da10>] ? btrfs_log_inode+0x629/0xbf3 [btrfs]
+[ 4079.466292] [<ffffffffa06f34a1>] ? btrfs_log_inode_parent+0xc61/0xf30 [btrfs]
+[ 4079.474373] [<ffffffffa06f45a9>] ? btrfs_log_dentry_safe+0x59/0x80 [btrfs]
+[ 4079.482161] [<ffffffffa06c298d>] ? btrfs_sync_file+0x20d/0x330 [btrfs]
+[ 4079.489558] [<ffffffff8112777c>] ? do_fsync+0x4c/0x80
+[ 4079.495300] [<ffffffff81127a0a>] ? SyS_fdatasync+0xa/0x10
+[ 4079.501422] [<ffffffff814a9b72>] ? system_call_fastpath+0x16/0x1b
+
+[ 4079.508334] user-space-program D ffff88107fc30640 0 32021 1 0x00000004
+[ 4079.516226] ffff880eae8efbf8 0000000000000086 0000000000009000 ffff880eae8effd8
+[ 4079.524513] ffff880eae8effd8 ffff881030279610 ffffffffa06ecb26 ffff88101a5d6138
+[ 4079.532802] ffff880ebb671d88 ffff880eae8efc20 ffff880ebb671de8 ffff881030279610
+[ 4079.541092] Call Trace:
+[ 4079.550642] [<ffffffffa06ed595>] ? btrfs_tree_lock+0x85/0x2f0 [btrfs]
+[ 4079.557941] [<ffffffff81053680>] ? wake_up_atomic_t+0x30/0x30
+[ 4079.564463] [<ffffffffa068cc1f>] ? btrfs_search_slot+0x79f/0xb10 [btrfs]
+[ 4079.572058] [<ffffffffa06bb7d8>] ? btrfs_truncate_inode_items+0x168/0xb90 [btrfs]
+[ 4079.580526] [<ffffffffa06b04be>] ? join_transaction.isra.15+0x1e/0x3a0 [btrfs]
+[ 4079.588701] [<ffffffffa06b206d>] ? start_transaction+0x8d/0x470 [btrfs]
+[ 4079.596196] [<ffffffffa0690ac6>] ? block_rsv_add_bytes+0x16/0x50 [btrfs]
+[ 4079.603789] [<ffffffffa06bc2e9>] ? btrfs_truncate+0xe9/0x2e0 [btrfs]
+[ 4079.610994] [<ffffffffa06bd00b>] ? btrfs_setattr+0x30b/0x410 [btrfs]
+[ 4079.618197] [<ffffffff81117c1c>] ? notify_change+0x1dc/0x680
+[ 4079.624625] [<ffffffff8123c8a4>] ? aa_path_perm+0xd4/0x160
+[ 4079.630854] [<ffffffff810f4fcb>] ? do_truncate+0x5b/0x90
+[ 4079.636889] [<ffffffff810f59fa>] ? do_sys_ftruncate.constprop.15+0x10a/0x160
+[ 4079.644869] [<ffffffff8110d87b>] ? SyS_fcntl+0x5b/0x570
+[ 4079.650805] [<ffffffff814a9b72>] ? system_call_fastpath+0x16/0x1b
+
+[ 4080.410607] user-space-program D ffff88107fc70640 0 32028 12639 0x00000004
+[ 4080.418489] ffff880eaeccbbe0 0000000000000086 0000000000009000 ffff880eaeccbfd8
+[ 4080.426778] ffff880eaeccbfd8 ffff880f317ef1e0 ffffffffa06ecb26 ffff88101a5d6138
+[ 4080.435067] ffff880ef7e93928 ffff880f317ef1e0 ffff880eaeccbc08 ffff880f317ef1e0
+[ 4080.443353] Call Trace:
+[ 4080.452920] [<ffffffffa06ed15d>] ? btrfs_tree_read_lock+0xdd/0x190 [btrfs]
+[ 4080.460703] [<ffffffff81053680>] ? wake_up_atomic_t+0x30/0x30
+[ 4080.467225] [<ffffffffa06876bb>] ? btrfs_read_lock_root_node+0x2b/0x40 [btrfs]
+[ 4080.475400] [<ffffffffa068cc81>] ? btrfs_search_slot+0x801/0xb10 [btrfs]
+[ 4080.482994] [<ffffffffa06b2df0>] ? btrfs_clean_one_deleted_snapshot+0xe0/0xe0 [btrfs]
+[ 4080.491857] [<ffffffffa06a70a6>] ? btrfs_lookup_inode+0x26/0x90 [btrfs]
+[ 4080.499353] [<ffffffff810ec42f>] ? kmem_cache_alloc+0xaf/0xc0
+[ 4080.505879] [<ffffffffa06bd905>] ? btrfs_iget+0xd5/0x5d0 [btrfs]
+[ 4080.512696] [<ffffffffa06caf04>] ? btrfs_get_token_64+0x104/0x120 [btrfs]
+[ 4080.520387] [<ffffffffa06f341f>] ? btrfs_log_inode_parent+0xbdf/0xf30 [btrfs]
+[ 4080.528469] [<ffffffffa06f45a9>] ? btrfs_log_dentry_safe+0x59/0x80 [btrfs]
+[ 4080.536258] [<ffffffffa06c298d>] ? btrfs_sync_file+0x20d/0x330 [btrfs]
+[ 4080.543657] [<ffffffff8112777c>] ? do_fsync+0x4c/0x80
+[ 4080.549399] [<ffffffff81127a0a>] ? SyS_fdatasync+0xa/0x10
+[ 4080.555534] [<ffffffff814a9b72>] ? system_call_fastpath+0x16/0x1b
+
+Signed-off-by: Robbie Ko <robbieko@synology.com>
+Reviewed-by: Filipe Manana <fdmanana@suse.com>
+Fixes: 2f2ff0ee5e43 (Btrfs: fix metadata inconsistencies after directory fsync)
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+[Modified changelog for clarity and correctness]
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/btrfs/tree-log.c | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+--- a/fs/btrfs/tree-log.c
++++ b/fs/btrfs/tree-log.c
+@@ -5205,6 +5205,7 @@ process_leaf:
+ if (di_key.type == BTRFS_ROOT_ITEM_KEY)
+ continue;
+
++ btrfs_release_path(path);
+ di_inode = btrfs_iget(root->fs_info->sb, &di_key,
+ root, NULL);
+ if (IS_ERR(di_inode)) {
+@@ -5214,13 +5215,12 @@ process_leaf:
+
+ if (btrfs_inode_in_log(di_inode, trans->transid)) {
+ iput(di_inode);
+- continue;
++ break;
+ }
+
+ ctx->log_new_dentries = false;
+ if (type == BTRFS_FT_DIR || type == BTRFS_FT_SYMLINK)
+ log_mode = LOG_INODE_ALL;
+- btrfs_release_path(path);
+ ret = btrfs_log_inode(trans, root, di_inode,
+ log_mode, 0, LLONG_MAX, ctx);
+ if (!ret &&
--- /dev/null
+From f177d73949bf758542ca15a1c1945bd2e802cc65 Mon Sep 17 00:00:00 2001
+From: Filipe Manana <fdmanana@suse.com>
+Date: Wed, 23 Nov 2016 16:21:18 +0000
+Subject: Btrfs: fix emptiness check for dirtied extent buffers at check_leaf()
+
+From: Filipe Manana <fdmanana@suse.com>
+
+commit f177d73949bf758542ca15a1c1945bd2e802cc65 upstream.
+
+We can not simply use the owner field from an extent buffer's header to
+get the id of the respective tree when the extent buffer is from a
+relocation tree. When we create the root for a relocation tree we leave
+(on purpose) the owner field with the same value as the subvolume's tree
+root (we do this at ctree.c:btrfs_copy_root()). So we must ignore extent
+buffers from relocation trees, which have the BTRFS_HEADER_FLAG_RELOC
+flag set, because otherwise we will always consider the extent buffer
+as not being the root of the tree (the root of the original subvolume tree
+is always different from the root of the respective relocation tree).
+
+This led to assertion failures when running with the integrity checker
+enabled (CONFIG_BTRFS_FS_CHECK_INTEGRITY=y) such as the following:
+
+[ 643.393409] BTRFS critical (device sdg): corrupt leaf, non-root leaf's nritems is 0: block=38506496, root=260, slot=0
+[ 643.397609] BTRFS info (device sdg): leaf 38506496 total ptrs 0 free space 3995
+[ 643.407075] assertion failed: 0, file: fs/btrfs/disk-io.c, line: 4078
+[ 643.408425] ------------[ cut here ]------------
+[ 643.409112] kernel BUG at fs/btrfs/ctree.h:3419!
+[ 643.409773] invalid opcode: 0000 [#1] PREEMPT SMP
+[ 643.410447] Modules linked in: dm_flakey dm_mod crc32c_generic btrfs xor raid6_pq ppdev psmouse acpi_cpufreq parport_pc evdev parport tpm_tis tpm_tis_core pcspkr serio_raw i2c_piix4 sg tpm i2c_core button processor loop autofs4 ext4 crc16 jbd2 mbcache sr_mod cdrom sd_mod ata_generic virtio_scsi ata_piix libata virtio_pci virtio_ring scsi_mod virtio e1000 floppy
+[ 643.414356] CPU: 11 PID: 32726 Comm: btrfs Not tainted 4.8.0-rc8-btrfs-next-35+ #1
+[ 643.414356] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.9.1-0-gb3ef39f-prebuilt.qemu-project.org 04/01/2014
+[ 643.414356] task: ffff880145e95b00 task.stack: ffff88014826c000
+[ 643.414356] RIP: 0010:[<ffffffffa0352759>] [<ffffffffa0352759>] assfail.constprop.41+0x1c/0x1e [btrfs]
+[ 643.414356] RSP: 0018:ffff88014826fa28 EFLAGS: 00010292
+[ 643.414356] RAX: 0000000000000039 RBX: ffff88014e2d7c38 RCX: 0000000000000001
+[ 643.414356] RDX: ffff88023f4d2f58 RSI: ffffffff81806c63 RDI: 00000000ffffffff
+[ 643.414356] RBP: ffff88014826fa28 R08: 0000000000000001 R09: 0000000000000000
+[ 643.414356] R10: ffff88014826f918 R11: ffffffff82f3c5ed R12: ffff880172910000
+[ 643.414356] R13: ffff880233992230 R14: ffff8801a68a3310 R15: fffffffffffffff8
+[ 643.414356] FS: 00007f9ca305e8c0(0000) GS:ffff88023f4c0000(0000) knlGS:0000000000000000
+[ 643.414356] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+[ 643.414356] CR2: 00007f9ca3071000 CR3: 000000015d01b000 CR4: 00000000000006e0
+[ 643.414356] Stack:
+[ 643.414356] ffff88014826fa50 ffffffffa02d655a 000000000000000a ffff88014e2d7c38
+[ 643.414356] 0000000000000000 ffff88014826faa8 ffffffffa02b72f3 ffff88014826fab8
+[ 643.414356] 00ffffffa03228e4 0000000000000000 0000000000000000 ffff8801bbd4e000
+[ 643.414356] Call Trace:
+[ 643.414356] [<ffffffffa02d655a>] btrfs_mark_buffer_dirty+0xdf/0xe5 [btrfs]
+[ 643.414356] [<ffffffffa02b72f3>] btrfs_copy_root+0x18a/0x1d1 [btrfs]
+[ 643.414356] [<ffffffffa0322921>] create_reloc_root+0x72/0x1ba [btrfs]
+[ 643.414356] [<ffffffffa03267c2>] btrfs_init_reloc_root+0x7b/0xa7 [btrfs]
+[ 643.414356] [<ffffffffa02d9e44>] record_root_in_trans+0xdf/0xed [btrfs]
+[ 643.414356] [<ffffffffa02db04e>] btrfs_record_root_in_trans+0x50/0x6a [btrfs]
+[ 643.414356] [<ffffffffa030ad2b>] create_subvol+0x472/0x773 [btrfs]
+[ 643.414356] [<ffffffffa030b406>] btrfs_mksubvol+0x3da/0x463 [btrfs]
+[ 643.414356] [<ffffffffa030b406>] ? btrfs_mksubvol+0x3da/0x463 [btrfs]
+[ 643.414356] [<ffffffff810781ac>] ? preempt_count_add+0x65/0x68
+[ 643.414356] [<ffffffff811a6e97>] ? __mnt_want_write+0x62/0x77
+[ 643.414356] [<ffffffffa030b55d>] btrfs_ioctl_snap_create_transid+0xce/0x187 [btrfs]
+[ 643.414356] [<ffffffffa030b67d>] btrfs_ioctl_snap_create+0x67/0x81 [btrfs]
+[ 643.414356] [<ffffffffa030ecfd>] btrfs_ioctl+0x508/0x20dd [btrfs]
+[ 643.414356] [<ffffffff81293e39>] ? __this_cpu_preempt_check+0x13/0x15
+[ 643.414356] [<ffffffff81155eca>] ? handle_mm_fault+0x976/0x9ab
+[ 643.414356] [<ffffffff81091300>] ? arch_local_irq_save+0x9/0xc
+[ 643.414356] [<ffffffff8119a2b0>] vfs_ioctl+0x18/0x34
+[ 643.414356] [<ffffffff8119a8e8>] do_vfs_ioctl+0x581/0x600
+[ 643.414356] [<ffffffff814b9552>] ? entry_SYSCALL_64_fastpath+0x5/0xa8
+[ 643.414356] [<ffffffff81093fe9>] ? trace_hardirqs_on_caller+0x17b/0x197
+[ 643.414356] [<ffffffff8119a9be>] SyS_ioctl+0x57/0x79
+[ 643.414356] [<ffffffff814b9565>] entry_SYSCALL_64_fastpath+0x18/0xa8
+[ 643.414356] [<ffffffff81091b08>] ? trace_hardirqs_off_caller+0x3f/0xaa
+[ 643.414356] Code: 89 83 88 00 00 00 31 c0 5b 41 5c 41 5d 5d c3 55 89 f1 48 c7 c2 98 bc 35 a0 48 89 fe 48 c7 c7 05 be 35 a0 48 89 e5 e8 13 46 dd e0 <0f> 0b 55 89 f1 48 c7 c2 9f d3 35 a0 48 89 fe 48 c7 c7 7a d5 35
+[ 643.414356] RIP [<ffffffffa0352759>] assfail.constprop.41+0x1c/0x1e [btrfs]
+[ 643.414356] RSP <ffff88014826fa28>
+[ 643.468267] ---[ end trace 6a1b3fb1a9d7d6e3 ]---
+
+This can be easily reproduced by running xfstests with the integrity
+checker enabled.
+
+Fixes: 1ba98d086fe3 (Btrfs: detect corruption when non-root leaf has zero item)
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Reviewed-by: Liu Bo <bo.li.liu@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/btrfs/disk-io.c | 13 ++++++++++++-
+ 1 file changed, 12 insertions(+), 1 deletion(-)
+
+--- a/fs/btrfs/disk-io.c
++++ b/fs/btrfs/disk-io.c
+@@ -559,7 +559,15 @@ static noinline int check_leaf(struct bt
+ u32 nritems = btrfs_header_nritems(leaf);
+ int slot;
+
+- if (nritems == 0) {
++ /*
++ * Extent buffers from a relocation tree have a owner field that
++ * corresponds to the subvolume tree they are based on. So just from an
++ * extent buffer alone we can not find out what is the id of the
++ * corresponding subvolume tree, so we can not figure out if the extent
++ * buffer corresponds to the root of the relocation tree or not. So skip
++ * this check for relocation trees.
++ */
++ if (nritems == 0 && !btrfs_header_flag(leaf, BTRFS_HEADER_FLAG_RELOC)) {
+ struct btrfs_root *check_root;
+
+ key.objectid = btrfs_header_owner(leaf);
+@@ -587,6 +595,9 @@ static noinline int check_leaf(struct bt
+ return 0;
+ }
+
++ if (nritems == 0)
++ return 0;
++
+ /* Check the 0 item */
+ if (btrfs_item_offset_nr(leaf, 0) + btrfs_item_size_nr(leaf, 0) !=
+ BTRFS_LEAF_DATA_SIZE(root)) {
--- /dev/null
+From 8d9eddad19467b008e0c881bc3133d7da94b7ec1 Mon Sep 17 00:00:00 2001
+From: Filipe Manana <fdmanana@suse.com>
+Date: Thu, 24 Nov 2016 02:09:04 +0000
+Subject: Btrfs: fix qgroup rescan worker initialization
+
+From: Filipe Manana <fdmanana@suse.com>
+
+commit 8d9eddad19467b008e0c881bc3133d7da94b7ec1 upstream.
+
+We were setting the qgroup_rescan_running flag to true only after the
+rescan worker started (which is a work item run from a workqueue). So if a
+user space task starts a rescan and immediately afterwards asks to wait for
+the rescan worker to finish, this second call might happen before the
+rescan worker task starts running, in which case the rescan wait ioctl
+returns immediately, not waiting for the rescan worker to finish.
+
+This was making the fstest btrfs/022 fail very often.
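+
+An illustrative interleaving with the old placement of the flag (editor's
+sketch, not part of the original changelog):
+
+  user space task                          rescan worker
+  ---------------------------------        ---------------------------------
+  rescan ioctl: qgroup_rescan_init(),
+  queue the worker
+  rescan wait ioctl: sees
+  qgroup_rescan_running == false,
+  returns immediately
+                                           qgroup_rescan_running = true
+                                           ... performs the rescan ...
+
+Setting the flag in qgroup_rescan_init(), before the work is queued, closes
+that window.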
+
+Fixes: d2c609b834d6 (btrfs: properly track when rescan worker is running)
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/btrfs/qgroup.c | 5 +----
+ 1 file changed, 1 insertion(+), 4 deletions(-)
+
+--- a/fs/btrfs/qgroup.c
++++ b/fs/btrfs/qgroup.c
+@@ -2332,10 +2332,6 @@ static void btrfs_qgroup_rescan_worker(s
+ int err = -ENOMEM;
+ int ret = 0;
+
+- mutex_lock(&fs_info->qgroup_rescan_lock);
+- fs_info->qgroup_rescan_running = true;
+- mutex_unlock(&fs_info->qgroup_rescan_lock);
+-
+ path = btrfs_alloc_path();
+ if (!path)
+ goto out;
+@@ -2446,6 +2442,7 @@ qgroup_rescan_init(struct btrfs_fs_info
+ sizeof(fs_info->qgroup_rescan_progress));
+ fs_info->qgroup_rescan_progress.objectid = progress_objectid;
+ init_completion(&fs_info->qgroup_rescan_completion);
++ fs_info->qgroup_rescan_running = true;
+
+ spin_unlock(&fs_info->qgroup_lock);
+ mutex_unlock(&fs_info->qgroup_rescan_lock);
--- /dev/null
+From 054570a1dc94de20e7a612cddcc5a97db9c37b6f Mon Sep 17 00:00:00 2001
+From: Filipe Manana <fdmanana@suse.com>
+Date: Tue, 1 Nov 2016 11:23:31 +0000
+Subject: Btrfs: fix relocation incorrectly dropping data references
+
+From: Filipe Manana <fdmanana@suse.com>
+
+commit 054570a1dc94de20e7a612cddcc5a97db9c37b6f upstream.
+
+During relocation of a data block group we create a relocation tree
+for each fs/subvol tree by making a snapshot of each tree using
+btrfs_copy_root() and the tree's commit root, and then setting the last
+snapshot field for the fs/subvol tree's root to the value of the current
+transaction id minus 1. However this can lead to relocation later
+dropping references that it did not create if we have qgroups enabled,
+leaving the filesystem in an inconsistent state that keeps aborting
+transactions.
+
+Let's consider the following example to explain the problem, which requires
+qgroups to be enabled.
+
+We are relocating data block group Y, we have a subvolume with id 258 that
+has a root at level 1, that subvolume is used to store directory entries
+for snapshots and we are currently at transaction 3404.
+
+When committing transaction 3404, we have a pending snapshot and therefore
+we call btrfs_run_delayed_items() at transaction.c:create_pending_snapshot()
+in order to create its dentry at subvolume 258. This results in COWing
+leaf A from root 258 in order to add the dentry. Note that leaf A
+also contains file extent items referring to extents from some other
+block group X (we are currently relocating block group Y). Later on, still
+at create_pending_snapshot() we call qgroup_account_snapshot(), which
+switches the commit root for root 258 when it calls switch_commit_roots(),
+so now the COWed version of leaf A, let's call it leaf A', is accessible
+from the commit root of tree 258. At the end of qgroup_account_snapshot(),
+we call record_root_in_trans() with 258 as its argument, which results
+in btrfs_init_reloc_root() being called, which in turn calls
+relocation.c:create_reloc_root() in order to create a relocation tree
+associated to root 258, which results in assigning the value of 3403
+(which is the current transaction id minus 1 = 3404 - 1) to the
+last_snapshot field of root 258. When creating the relocation tree root
+at ctree.c:btrfs_copy_root() we add a shared reference for leaf A',
+corresponding to the relocation tree's root, when we call btrfs_inc_ref()
+against the COWed root (a copy of the commit root from tree 258), which
+is at level 1. So at this point leaf A' has 2 references, one normal
+reference corresponding to root 258 and one shared reference corresponding
+to the root of the relocation tree.
+
+Transaction 3404 finishes its commit and transaction 3405 is started by
+relocation when calling merge_reloc_root() for the relocation tree
+associated to root 258. In the meanwhile leaf A' is COWed again, in
+response to some filesystem operation, when we are still at transaction
+3405. However when we COW leaf A', at ctree.c:update_ref_for_cow(), we
+call btrfs_block_can_be_shared() in order to figure out if other trees
+refer to the leaf and if any such trees exists, add a full back reference
+to leaf A' - but btrfs_block_can_be_shared() incorrectly returns false
+because the following condition is false:
+
+ btrfs_header_generation(buf) <= btrfs_root_last_snapshot(&root->root_item)
+
+which evaluates to 3404 <= 3403. So after leaf A' is COWed, it stays with
+only one reference, corresponding to the shared reference we created when
+we called btrfs_copy_root() to create the relocation tree's root and
+btrfs_inc_ref() ends up not being called for leaf A' nor we end up setting
+the flag BTRFS_BLOCK_FLAG_FULL_BACKREF in leaf A'. This results in not
+adding shared references for the extents from block group X that leaf A'
+refers to with its file extent items.
+
+Later, after merging the relocation root we do a call to
+btrfs_drop_snapshot() in order to delete the relocation tree. This ends
+up calling do_walk_down() when path->slots[1] points to leaf A', which
+results in calling btrfs_lookup_extent_info() to get the number of
+references for leaf A', which is 1 at this time (only the shared reference
+exists) and this value is stored at wc->refs[0]. After this walk_up_proc()
+is called when wc->level is 0 and path->nodes[0] corresponds to leaf A'.
+Because the current level is 0 and wc->refs[0] is 1, it does call
+btrfs_dec_ref() against leaf A', which results in removing the single
+reference, associated with root 258, that each of the extents from block
+group X has - the expectation was to have each of these extents with 2
+references - one reference for root 258 and one shared reference related
+to the root of the relocation tree, and so we would drop only the shared
+reference (because leaf A' was supposed to have the flag
+BTRFS_BLOCK_FLAG_FULL_BACKREF set).
+
+This leaves the filesystem in an inconsistent state as we now have file
+extent items in a subvolume tree that point to extents from block group X
+without references in the extent tree. So later on when we try to decrement
+the references for these extents, for example due to a file unlink operation,
+truncate operation or overwriting ranges of a file, we fail because the
+expected references do not exist in the extent tree.
+
+This leads to warnings and transaction aborts like the following:
+
+[ 588.965795] ------------[ cut here ]------------
+[ 588.965815] WARNING: CPU: 2 PID: 2479 at fs/btrfs/extent-tree.c:1625 lookup_inline_extent_backref+0x432/0x5b0 [btrfs]
+[ 588.965816] Modules linked in: af_packet iscsi_ibft iscsi_boot_sysfs xfs libcrc32c ppdev acpi_cpufreq button tpm_tis e1000 i2c_piix4 pcspkr parport_pc
+parport tpm qemu_fw_cfg joydev btrfs xor raid6_pq sr_mod cdrom ata_generic virtio_scsi ata_piix virtio_pci bochs_drm virtio_ring drm_kms_helper syscopyarea
+sysfillrect sysimgblt fb_sys_fops virtio ttm serio_raw drm floppy sg
+[ 588.965831] CPU: 2 PID: 2479 Comm: kworker/u8:7 Not tainted 4.7.3-3-default-fdm+ #1
+[ 588.965832] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.9.1-0-gb3ef39f-prebuilt.qemu-project.org 04/01/2014
+[ 588.965844] Workqueue: btrfs-extent-refs btrfs_extent_refs_helper [btrfs]
+[ 588.965845] 0000000000000000 ffff8802263bfa28 ffffffff813af542 0000000000000000
+[ 588.965847] 0000000000000000 ffff8802263bfa68 ffffffff81081e8b 0000065900000000
+[ 588.965848] ffff8801db2af000 000000012bbe2000 0000000000000000 ffff880215703b48
+[ 588.965849] Call Trace:
+[ 588.965852] [<ffffffff813af542>] dump_stack+0x63/0x81
+[ 588.965854] [<ffffffff81081e8b>] __warn+0xcb/0xf0
+[ 588.965855] [<ffffffff81081f7d>] warn_slowpath_null+0x1d/0x20
+[ 588.965863] [<ffffffffa0175042>] lookup_inline_extent_backref+0x432/0x5b0 [btrfs]
+[ 588.965865] [<ffffffff81143220>] ? trace_clock_local+0x10/0x30
+[ 588.965867] [<ffffffff8114c5df>] ? rb_reserve_next_event+0x6f/0x460
+[ 588.965875] [<ffffffffa0175215>] insert_inline_extent_backref+0x55/0xd0 [btrfs]
+[ 588.965882] [<ffffffffa017531f>] __btrfs_inc_extent_ref.isra.55+0x8f/0x240 [btrfs]
+[ 588.965890] [<ffffffffa017acea>] __btrfs_run_delayed_refs+0x74a/0x1260 [btrfs]
+[ 588.965892] [<ffffffff810cb046>] ? cpuacct_charge+0x86/0xa0
+[ 588.965900] [<ffffffffa017e74f>] btrfs_run_delayed_refs+0x9f/0x2c0 [btrfs]
+[ 588.965908] [<ffffffffa017ea04>] delayed_ref_async_start+0x94/0xb0 [btrfs]
+[ 588.965918] [<ffffffffa01c799a>] btrfs_scrubparity_helper+0xca/0x350 [btrfs]
+[ 588.965928] [<ffffffffa01c7c5e>] btrfs_extent_refs_helper+0xe/0x10 [btrfs]
+[ 588.965930] [<ffffffff8109b323>] process_one_work+0x1f3/0x4e0
+[ 588.965931] [<ffffffff8109b658>] worker_thread+0x48/0x4e0
+[ 588.965932] [<ffffffff8109b610>] ? process_one_work+0x4e0/0x4e0
+[ 588.965934] [<ffffffff810a1659>] kthread+0xc9/0xe0
+[ 588.965936] [<ffffffff816f2f1f>] ret_from_fork+0x1f/0x40
+[ 588.965937] [<ffffffff810a1590>] ? kthread_worker_fn+0x170/0x170
+[ 588.965938] ---[ end trace 34e5232c933a1749 ]---
+[ 588.966187] ------------[ cut here ]------------
+[ 588.966196] WARNING: CPU: 2 PID: 2479 at fs/btrfs/extent-tree.c:2966 btrfs_run_delayed_refs+0x28c/0x2c0 [btrfs]
+[ 588.966196] BTRFS: Transaction aborted (error -5)
+[ 588.966197] Modules linked in: af_packet iscsi_ibft iscsi_boot_sysfs xfs libcrc32c ppdev acpi_cpufreq button tpm_tis e1000 i2c_piix4 pcspkr parport_pc
+parport tpm qemu_fw_cfg joydev btrfs xor raid6_pq sr_mod cdrom ata_generic virtio_scsi ata_piix virtio_pci bochs_drm virtio_ring drm_kms_helper syscopyarea
+sysfillrect sysimgblt fb_sys_fops virtio ttm serio_raw drm floppy sg
+[ 588.966206] CPU: 2 PID: 2479 Comm: kworker/u8:7 Tainted: G W 4.7.3-3-default-fdm+ #1
+[ 588.966207] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.9.1-0-gb3ef39f-prebuilt.qemu-project.org 04/01/2014
+[ 588.966217] Workqueue: btrfs-extent-refs btrfs_extent_refs_helper [btrfs]
+[ 588.966217] 0000000000000000 ffff8802263bfc98 ffffffff813af542 ffff8802263bfce8
+[ 588.966219] 0000000000000000 ffff8802263bfcd8 ffffffff81081e8b 00000b96345ee000
+[ 588.966220] ffffffffa021ae1c ffff880215703b48 00000000000005fe ffff8802345ee000
+[ 588.966221] Call Trace:
+[ 588.966223] [<ffffffff813af542>] dump_stack+0x63/0x81
+[ 588.966224] [<ffffffff81081e8b>] __warn+0xcb/0xf0
+[ 588.966225] [<ffffffff81081eff>] warn_slowpath_fmt+0x4f/0x60
+[ 588.966233] [<ffffffffa017e93c>] btrfs_run_delayed_refs+0x28c/0x2c0 [btrfs]
+[ 588.966241] [<ffffffffa017ea04>] delayed_ref_async_start+0x94/0xb0 [btrfs]
+[ 588.966250] [<ffffffffa01c799a>] btrfs_scrubparity_helper+0xca/0x350 [btrfs]
+[ 588.966259] [<ffffffffa01c7c5e>] btrfs_extent_refs_helper+0xe/0x10 [btrfs]
+[ 588.966260] [<ffffffff8109b323>] process_one_work+0x1f3/0x4e0
+[ 588.966261] [<ffffffff8109b658>] worker_thread+0x48/0x4e0
+[ 588.966263] [<ffffffff8109b610>] ? process_one_work+0x4e0/0x4e0
+[ 588.966264] [<ffffffff810a1659>] kthread+0xc9/0xe0
+[ 588.966265] [<ffffffff816f2f1f>] ret_from_fork+0x1f/0x40
+[ 588.966267] [<ffffffff810a1590>] ? kthread_worker_fn+0x170/0x170
+[ 588.966268] ---[ end trace 34e5232c933a174a ]---
+[ 588.966269] BTRFS: error (device sda2) in btrfs_run_delayed_refs:2966: errno=-5 IO failure
+[ 588.966270] BTRFS info (device sda2): forced readonly
+
+This was happening often on openSUSE and SLE systems using btrfs as the
+root filesystem (with its default layout where multiple subvolumes are
+used) where balance happens in the background triggered by a cron job and
+snapshots are automatically created before/after package installations,
+upgrades and removals. The issue could be triggered simply by running the
+following loop on the first system boot post installation:
+
+ while true; do
+ zypper -n in nfs-kernel-server
+ zypper -n rm nfs-kernel-server
+ done
+
+(If we were fast enough and ran that loop before the cron job triggered
+a balance operation and the balance finished)
+
+So fix this by setting the last_snapshot field of the root to the generation
+of its commit root. This way btrfs_block_can_be_shared() behaves correctly
+both for the case where the relocation root is created during a transaction
+commit and for the case where it's created before a transaction commit.
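+
+To make that concrete with the example above (editor's note, not part of the
+original changelog): the commit root of root 258 is the version of the tree
+that was COWed in transaction 3404, so its header generation is 3404. With
+the fix, last_snapshot becomes 3404 and the condition
+
+ btrfs_header_generation(buf) <= btrfs_root_last_snapshot(&root->root_item)
+
+evaluates as 3404 <= 3404, which is true, so btrfs_block_can_be_shared()
+returns true and the COW of leaf A' takes the full back reference path it was
+skipping before.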
+
+Fixes: 6426c7ad697d (btrfs: qgroup: Fix qgroup accounting when creating snapshot)
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Reviewed-by: Josef Bacik <jbacik@fb.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/btrfs/relocation.c | 15 ++++++++++++---
+ 1 file changed, 12 insertions(+), 3 deletions(-)
+
+--- a/fs/btrfs/relocation.c
++++ b/fs/btrfs/relocation.c
+@@ -1387,14 +1387,23 @@ static struct btrfs_root *create_reloc_r
+ root_key.offset = objectid;
+
+ if (root->root_key.objectid == objectid) {
++ u64 commit_root_gen;
++
+ /* called by btrfs_init_reloc_root */
+ ret = btrfs_copy_root(trans, root, root->commit_root, &eb,
+ BTRFS_TREE_RELOC_OBJECTID);
+ BUG_ON(ret);
+-
+ last_snap = btrfs_root_last_snapshot(&root->root_item);
+- btrfs_set_root_last_snapshot(&root->root_item,
+- trans->transid - 1);
++ /*
++ * Set the last_snapshot field to the generation of the commit
++ * root - like this ctree.c:btrfs_block_can_be_shared() behaves
++ * correctly (returns true) when the relocation root is created
++ * either inside the critical section of a transaction commit
++ * (through transaction.c:qgroup_account_snapshot()) and when
++ * it's created before the transaction commit is started.
++ */
++ commit_root_gen = btrfs_header_generation(root->commit_root);
++ btrfs_set_root_last_snapshot(&root->root_item, commit_root_gen);
+ } else {
+ /*
+ * called by btrfs_reloc_post_snapshot_hook.
--- /dev/null
+From 2a7bf53f577e49c43de4ffa7776056de26db65d9 Mon Sep 17 00:00:00 2001
+From: Robbie Ko <robbieko@synology.com>
+Date: Fri, 7 Oct 2016 17:30:47 +0800
+Subject: Btrfs: fix tree search logic when replaying directory entry deletes
+
+From: Robbie Ko <robbieko@synology.com>
+
+commit 2a7bf53f577e49c43de4ffa7776056de26db65d9 upstream.
+
+If a log tree has a layout like the following:
+
+leaf N:
+ ...
+ item 240 key (282 DIR_LOG_ITEM 0) itemoff 8189 itemsize 8
+ dir log end 1275809046
+leaf N + 1:
+ item 0 key (282 DIR_LOG_ITEM 3936149215) itemoff 16275 itemsize 8
+ dir log end 18446744073709551615
+ ...
+
+When we pass the value 1275809046 + 1 as the parameter start_ret to the
+function tree-log.c:find_dir_range() (done by replay_dir_deletes()), we
+end up with path->slots[0] having the value 239 (points to the last item
+of leaf N, item 240). Because the dir log item in that position has an
+offset value smaller than *start_ret (1275809046 + 1) we need to move on
+to the next leaf. However, the logic for that is wrong: it compares the
+current slot to the number of items in the leaf and, since the slot is
+smaller, we don't look up the next leaf but instead set the slot to point
+to an item that does not exist, at slot 240. We later operate on that
+slot, which has unexpected content or, in the worst case,
+can result in an invalid memory access (accessing beyond the last page
+of leaf N's extent buffer).
+
+So fix the logic that checks when we need to move on to the next leaf
+by first incrementing the slot and only then checking if that slot
+is beyond the last item of the current leaf.
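+
+For clarity, a sketch of the old logic in find_dir_range() (editor's note,
+mirroring the diff below):
+
+  if (path->slots[0] >= nritems)
+          btrfs_next_leaf(root, path);
+  else
+          path->slots[0]++;
+
+so when the slot already pointed at the last valid item it was simply
+incremented past the end instead of moving to the next leaf, while the fixed
+code increments the slot first and only then checks whether it went past the
+last item.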
+
+Signed-off-by: Robbie Ko <robbieko@synology.com>
+Reviewed-by: Filipe Manana <fdmanana@suse.com>
+Fixes: e02119d5a7b4 (Btrfs: Add a write ahead tree log to optimize synchronous operations)
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+[Modified changelog for clarity and correctness]
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/btrfs/tree-log.c | 3 +--
+ 1 file changed, 1 insertion(+), 2 deletions(-)
+
+--- a/fs/btrfs/tree-log.c
++++ b/fs/btrfs/tree-log.c
+@@ -1940,12 +1940,11 @@ static noinline int find_dir_range(struc
+ next:
+ /* check the next slot in the tree to see if it is a valid item */
+ nritems = btrfs_header_nritems(path->nodes[0]);
++ path->slots[0]++;
+ if (path->slots[0] >= nritems) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret)
+ goto out;
+- } else {
+- path->slots[0]++;
+ }
+
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
--- /dev/null
+From 2939e1a86f758b55cdba73e29397dd3d94df13bc Mon Sep 17 00:00:00 2001
+From: Maxim Patlasov <mpatlasov@virtuozzo.com>
+Date: Mon, 12 Dec 2016 14:32:44 -0800
+Subject: btrfs: limit async_work allocation and worker func duration
+
+From: Maxim Patlasov <mpatlasov@virtuozzo.com>
+
+commit 2939e1a86f758b55cdba73e29397dd3d94df13bc upstream.
+
+Problem statement: an unprivileged user who has read-write access to more than
+one btrfs subvolume may easily consume all kernel memory (eventually
+triggering oom-killer).
+
+Reproducer (./mkrmdir below essentially loops over mkdir/rmdir):
+
+[root@kteam1 ~]# cat prep.sh
+
+DEV=/dev/sdb
+mkfs.btrfs -f $DEV
+mount $DEV /mnt
+for i in `seq 1 16`
+do
+ mkdir /mnt/$i
+ btrfs subvolume create /mnt/SV_$i
+ ID=`btrfs subvolume list /mnt |grep "SV_$i$" |cut -d ' ' -f 2`
+ mount -t btrfs -o subvolid=$ID $DEV /mnt/$i
+ chmod a+rwx /mnt/$i
+done
+
+[root@kteam1 ~]# sh prep.sh
+
+[maxim@kteam1 ~]$ for i in `seq 1 16`; do ./mkrmdir /mnt/$i 2000 2000 & done
+
+[root@kteam1 ~]# for i in `seq 1 4`; do grep "kmalloc-128" /proc/slabinfo | grep -v dma; sleep 60; done
+kmalloc-128 10144 10144 128 32 1 : tunables 0 0 0 : slabdata 317 317 0
+kmalloc-128 9992352 9992352 128 32 1 : tunables 0 0 0 : slabdata 312261 312261 0
+kmalloc-128 24226752 24226752 128 32 1 : tunables 0 0 0 : slabdata 757086 757086 0
+kmalloc-128 42754240 42754240 128 32 1 : tunables 0 0 0 : slabdata 1336070 1336070 0
+
+The huge numbers above come from the insane number of async_work structures
+allocated and queued by btrfs_wq_run_delayed_node().
+
+The problem is caused by btrfs_wq_run_delayed_node() queuing more and more
+work items if the number of delayed items is above BTRFS_DELAYED_BACKGROUND.
+The worker func (btrfs_async_run_delayed_root) processes at least
+BTRFS_DELAYED_BATCH items (if they are present in the list). So the machinery
+works as expected while the list is almost empty. As soon as the list gets
+bigger, the worker func starts to process more than one item at a time, each
+run takes longer, and the chances of having more async_works queued than
+needed get higher.
+
+The problem above is worsened by another flaw of the delayed-inode
+implementation: if an async_work was queued in a throttling branch (number of
+items >= BTRFS_DELAYED_WRITEBACK), the corresponding worker func won't quit
+until the number of items drops below BTRFS_DELAYED_BACKGROUND / 2. So it is
+possible for the func to occupy the CPU for a very long time (up to 30 sec in
+my experiments): while the func is trying to drain the list, user activity may
+keep adding more and more items to the list.
+
+The patch fixes both problems in a straightforward way: refuse to queue too
+many works in btrfs_wq_run_delayed_node() and bail out of the worker func once
+at least BTRFS_DELAYED_WRITEBACK items have been processed.
+
+Changed in v2: remove support of thresh == NO_THRESHOLD.
+
+Signed-off-by: Maxim Patlasov <mpatlasov@virtuozzo.com>
+Signed-off-by: Chris Mason <clm@fb.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/btrfs/async-thread.c | 14 ++++++++++++++
+ fs/btrfs/async-thread.h | 1 +
+ fs/btrfs/delayed-inode.c | 6 ++++--
+ 3 files changed, 19 insertions(+), 2 deletions(-)
+
+--- a/fs/btrfs/async-thread.c
++++ b/fs/btrfs/async-thread.c
+@@ -86,6 +86,20 @@ btrfs_work_owner(struct btrfs_work *work
+ return work->wq->fs_info;
+ }
+
++bool btrfs_workqueue_normal_congested(struct btrfs_workqueue *wq)
++{
++ /*
++ * We could compare wq->normal->pending with num_online_cpus()
++ * to support "thresh == NO_THRESHOLD" case, but it requires
++ * moving up atomic_inc/dec in thresh_queue/exec_hook. Let's
++ * postpone it until someone needs the support of that case.
++ */
++ if (wq->normal->thresh == NO_THRESHOLD)
++ return false;
++
++ return atomic_read(&wq->normal->pending) > wq->normal->thresh * 2;
++}
++
+ BTRFS_WORK_HELPER(worker_helper);
+ BTRFS_WORK_HELPER(delalloc_helper);
+ BTRFS_WORK_HELPER(flush_delalloc_helper);
+--- a/fs/btrfs/async-thread.h
++++ b/fs/btrfs/async-thread.h
+@@ -84,4 +84,5 @@ void btrfs_workqueue_set_max(struct btrf
+ void btrfs_set_work_high_priority(struct btrfs_work *work);
+ struct btrfs_fs_info *btrfs_work_owner(struct btrfs_work *work);
+ struct btrfs_fs_info *btrfs_workqueue_owner(struct __btrfs_workqueue *wq);
++bool btrfs_workqueue_normal_congested(struct btrfs_workqueue *wq);
+ #endif
+--- a/fs/btrfs/delayed-inode.c
++++ b/fs/btrfs/delayed-inode.c
+@@ -1356,7 +1356,8 @@ release_path:
+ total_done++;
+
+ btrfs_release_prepared_delayed_node(delayed_node);
+- if (async_work->nr == 0 || total_done < async_work->nr)
++ if ((async_work->nr == 0 && total_done < BTRFS_DELAYED_WRITEBACK) ||
++ total_done < async_work->nr)
+ goto again;
+
+ free_path:
+@@ -1372,7 +1373,8 @@ static int btrfs_wq_run_delayed_node(str
+ {
+ struct btrfs_async_delayed_work *async_work;
+
+- if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND)
++ if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND ||
++ btrfs_workqueue_normal_congested(fs_info->delayed_workers))
+ return 0;
+
+ async_work = kmalloc(sizeof(*async_work), GFP_NOFS);
--- /dev/null
+From ed0df618b1b06d7431ee4d985317fc5419a5d559 Mon Sep 17 00:00:00 2001
+From: David Sterba <dsterba@suse.com>
+Date: Tue, 1 Nov 2016 14:21:23 +0100
+Subject: btrfs: store and load values of stripes_min/stripes_max in balance status item
+
+From: David Sterba <dsterba@suse.com>
+
+commit ed0df618b1b06d7431ee4d985317fc5419a5d559 upstream.
+
+The balance status item contains the currently known filter values, but the
+stripes filter was unintentionally not among them. This means that an
+interrupted and automatically restarted balance does not apply the
+stripes filter.
+
+Fixes: dee32d0ac3719ef8d640efaf0884111df444730f
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/btrfs/ctree.h | 4 ++++
+ 1 file changed, 4 insertions(+)
+
+--- a/fs/btrfs/ctree.h
++++ b/fs/btrfs/ctree.h
+@@ -2210,6 +2210,8 @@ btrfs_disk_balance_args_to_cpu(struct bt
+ cpu->target = le64_to_cpu(disk->target);
+ cpu->flags = le64_to_cpu(disk->flags);
+ cpu->limit = le64_to_cpu(disk->limit);
++ cpu->stripes_min = le32_to_cpu(disk->stripes_min);
++ cpu->stripes_max = le32_to_cpu(disk->stripes_max);
+ }
+
+ static inline void
+@@ -2228,6 +2230,8 @@ btrfs_cpu_balance_args_to_disk(struct bt
+ disk->target = cpu_to_le64(cpu->target);
+ disk->flags = cpu_to_le64(cpu->flags);
+ disk->limit = cpu_to_le64(cpu->limit);
++ disk->stripes_min = cpu_to_le32(cpu->stripes_min);
++ disk->stripes_max = cpu_to_le32(cpu->stripes_max);
+ }
+
+ /* struct btrfs_super_block */