From: Greg Kroah-Hartman Date: Fri, 11 Dec 2015 17:19:25 +0000 (-0800) Subject: 4.1-stable patches X-Git-Tag: v4.1.15~13 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=28a0960448af1cb439319aaea0718bb888e2faf5;p=thirdparty%2Fkernel%2Fstable-queue.git 4.1-stable patches added patches: alsa-hda-hdmi-apply-skylake-fix-ups-to-broxton-display-codec.patch btrfs-check-unsupported-filters-in-balance-arguments.patch btrfs-fix-file-corruption-and-data-loss-after-cloning-inline-extents.patch btrfs-fix-race-leading-to-bug_on-when-running-delalloc-for-nodatacow.patch btrfs-fix-race-leading-to-incorrect-item-deletion-when-dropping-extents.patch btrfs-fix-race-when-listing-an-inode-s-xattrs.patch btrfs-fix-truncation-of-compressed-and-inlined-extents.patch ceph-fix-message-length-computation.patch debugfs-fix-refcount-imbalance-in-start_creating.patch ext4-crypto-fix-memory-leak-in-ext4_bio_write_page.patch ext4-fix-potential-use-after-free-in-__ext4_journal_stop.patch ext4-jbd2-ensure-entering-into-panic-after-recording-an-error-in-superblock.patch firewire-ohci-fix-jmicron-jmb38x-it-context-discovery.patch nfs-if-we-have-no-valid-attrs-then-don-t-declare-the-attribute-cache-valid.patch nfs4-start-callback_ident-at-idr-1.patch nfsd-eliminate-sending-duplicate-and-repeated-delegations.patch nfsd-serialize-state-seqid-morphing-operations.patch ocfs2-fix-umask-ignored-issue.patch rbd-don-t-put-snap_context-twice-in-rbd_queue_workfn.patch --- diff --git a/queue-4.1/alsa-hda-hdmi-apply-skylake-fix-ups-to-broxton-display-codec.patch b/queue-4.1/alsa-hda-hdmi-apply-skylake-fix-ups-to-broxton-display-codec.patch new file mode 100644 index 00000000000..542db357766 --- /dev/null +++ b/queue-4.1/alsa-hda-hdmi-apply-skylake-fix-ups-to-broxton-display-codec.patch @@ -0,0 +1,33 @@ +From e2656412f2a7343ecfd13eb74bac0a6e6e9c5aad Mon Sep 17 00:00:00 2001 +From: "Lu, Han" +Date: Wed, 11 Nov 2015 16:54:27 +0800 +Subject: ALSA: hda/hdmi - apply Skylake fix-ups to Broxton display codec + 
+From: "Lu, Han" + +commit e2656412f2a7343ecfd13eb74bac0a6e6e9c5aad upstream. + +Broxton and Skylake have the same behavior on display audio. So this patch +applies Skylake fix-ups to Broxton. + +Signed-off-by: Lu, Han +Signed-off-by: Takashi Iwai +Signed-off-by: Greg Kroah-Hartman + +--- + sound/pci/hda/patch_hdmi.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +--- a/sound/pci/hda/patch_hdmi.c ++++ b/sound/pci/hda/patch_hdmi.c +@@ -48,8 +48,9 @@ MODULE_PARM_DESC(static_hdmi_pcm, "Don't + #define is_haswell(codec) ((codec)->core.vendor_id == 0x80862807) + #define is_broadwell(codec) ((codec)->core.vendor_id == 0x80862808) + #define is_skylake(codec) ((codec)->core.vendor_id == 0x80862809) ++#define is_broxton(codec) ((codec)->core.vendor_id == 0x8086280a) + #define is_haswell_plus(codec) (is_haswell(codec) || is_broadwell(codec) \ +- || is_skylake(codec)) ++ || is_skylake(codec) || is_broxton(codec)) + + #define is_valleyview(codec) ((codec)->core.vendor_id == 0x80862882) + #define is_cherryview(codec) ((codec)->core.vendor_id == 0x80862883) diff --git a/queue-4.1/btrfs-check-unsupported-filters-in-balance-arguments.patch b/queue-4.1/btrfs-check-unsupported-filters-in-balance-arguments.patch new file mode 100644 index 00000000000..aaef7ed7fcd --- /dev/null +++ b/queue-4.1/btrfs-check-unsupported-filters-in-balance-arguments.patch @@ -0,0 +1,57 @@ +From 849ef9286f30c88113906dc35f44a499c0cb385d Mon Sep 17 00:00:00 2001 +From: David Sterba +Date: Mon, 12 Oct 2015 16:55:54 +0200 +Subject: btrfs: check unsupported filters in balance arguments + +From: David Sterba + +commit 849ef9286f30c88113906dc35f44a499c0cb385d upstream. + +We don't verify that all the balance filter arguments supplemented by +the flags are actually known to the kernel. Thus we let it silently pass +and do nothing. + +At the moment this means only the 'limit' filter, but we're going to add +a few more soon so it's better to have that fixed. 
Also in older stable +kernels so that it works with newer userspace tools. + +Signed-off-by: David Sterba +Signed-off-by: Chris Mason +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/ioctl.c | 5 +++++ + fs/btrfs/volumes.h | 8 ++++++++ + 2 files changed, 13 insertions(+) + +--- a/fs/btrfs/ioctl.c ++++ b/fs/btrfs/ioctl.c +@@ -4497,6 +4497,11 @@ locked: + goto out_bctl; + } + ++ if (bctl->flags & ~(BTRFS_BALANCE_ARGS_MASK | BTRFS_BALANCE_TYPE_MASK)) { ++ ret = -EINVAL; ++ goto out_bargs; ++ } ++ + do_balance: + /* + * Ownership of bctl and mutually_exclusive_operation_running +--- a/fs/btrfs/volumes.h ++++ b/fs/btrfs/volumes.h +@@ -380,6 +380,14 @@ struct map_lookup { + BTRFS_BALANCE_ARGS_VRANGE | \ + BTRFS_BALANCE_ARGS_LIMIT) + ++#define BTRFS_BALANCE_ARGS_MASK \ ++ (BTRFS_BALANCE_ARGS_PROFILES | \ ++ BTRFS_BALANCE_ARGS_USAGE | \ ++ BTRFS_BALANCE_ARGS_DEVID | \ ++ BTRFS_BALANCE_ARGS_DRANGE | \ ++ BTRFS_BALANCE_ARGS_VRANGE | \ ++ BTRFS_BALANCE_ARGS_LIMIT) ++ + /* + * Profile changing flags. When SOFT is set we won't relocate chunk if + * it already has the target profile (even though it may be diff --git a/queue-4.1/btrfs-fix-file-corruption-and-data-loss-after-cloning-inline-extents.patch b/queue-4.1/btrfs-fix-file-corruption-and-data-loss-after-cloning-inline-extents.patch new file mode 100644 index 00000000000..87e2c97689f --- /dev/null +++ b/queue-4.1/btrfs-fix-file-corruption-and-data-loss-after-cloning-inline-extents.patch @@ -0,0 +1,443 @@ +From 8039d87d9e473aeb740d4fdbd59b9d2f89b2ced9 Mon Sep 17 00:00:00 2001 +From: Filipe Manana +Date: Tue, 13 Oct 2015 15:15:00 +0100 +Subject: Btrfs: fix file corruption and data loss after cloning inline extents + +From: Filipe Manana + +commit 8039d87d9e473aeb740d4fdbd59b9d2f89b2ced9 upstream. + +Currently the clone ioctl allows to clone an inline extent from one file +to another that already has other (non-inlined) extents. 
This is a problem +because btrfs is not designed to deal with files having inline and regular +extents, if a file has an inline extent then it must be the only extent +in the file and must start at file offset 0. Having a file with an inline +extent followed by regular extents results in EIO errors when doing reads +or writes against the first 4K of the file. + +Also, the clone ioctl allows one to lose data if the source file consists +of a single inline extent, with a size of N bytes, and the destination +file consists of a single inline extent with a size of M bytes, where we +have M > N. In this case the clone operation removes the inline extent +from the destination file and then copies the inline extent from the +source file into the destination file - we lose the M - N bytes from the +destination file, a read operation will get the value 0x00 for any bytes +in the range [N, M] (the destination inode's i_size remained as M, +that's why we can read past N bytes). + +So fix this by not allowing such destructive operations to happen and +return errno EOPNOTSUPP to user space. + +Currently the fstest btrfs/035 tests the data loss case but it totally +ignores this - i.e. expects the operation to succeed and does not check +that we got data loss. + +The following test case for fstests exercises all these cases that result +in file corruption and data loss: + + seq=`basename $0` + seqres=$RESULT_DIR/$seq + echo "QA output created by $seq" + tmp=/tmp/$$ + status=1 # failure is the default! + trap "_cleanup; exit \$status" 0 1 2 3 15 + + _cleanup() + { + rm -f $tmp.* + } + + # get standard environment, filters and checks + . ./common/rc + . 
./common/filter + + # real QA test starts here + _need_to_be_root + _supported_fs btrfs + _supported_os Linux + _require_scratch + _require_cloner + _require_btrfs_fs_feature "no_holes" + _require_btrfs_mkfs_feature "no-holes" + + rm -f $seqres.full + + test_cloning_inline_extents() + { + local mkfs_opts=$1 + local mount_opts=$2 + + _scratch_mkfs $mkfs_opts >>$seqres.full 2>&1 + _scratch_mount $mount_opts + + # File bar, the source for all the following clone operations, consists + # of a single inline extent (50 bytes). + $XFS_IO_PROG -f -c "pwrite -S 0xbb 0 50" $SCRATCH_MNT/bar \ + | _filter_xfs_io + + # Test cloning into a file with an extent (non-inlined) where the + # destination offset overlaps that extent. It should not be possible to + # clone the inline extent from file bar into this file. + $XFS_IO_PROG -f -c "pwrite -S 0xaa 0K 16K" $SCRATCH_MNT/foo \ + | _filter_xfs_io + $CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo + + # Doing IO against any range in the first 4K of the file should work. + # Due to a past clone ioctl bug which allowed cloning the inline extent, + # these operations resulted in EIO errors. + echo "File foo data after clone operation:" + # All bytes should have the value 0xaa (clone operation failed and did + # not modify our file). + od -t x1 $SCRATCH_MNT/foo + $XFS_IO_PROG -c "pwrite -S 0xcc 0 100" $SCRATCH_MNT/foo | _filter_xfs_io + + # Test cloning the inline extent against a file which has a hole in its + # first 4K followed by a non-inlined extent. It should not be possible + # as well to clone the inline extent from file bar into this file. + $XFS_IO_PROG -f -c "pwrite -S 0xdd 4K 12K" $SCRATCH_MNT/foo2 \ + | _filter_xfs_io + $CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo2 + + # Doing IO against any range in the first 4K of the file should work. + # Due to a past clone ioctl bug which allowed cloning the inline extent, + # these operations resulted in EIO errors. 
+ echo "File foo2 data after clone operation:" + # All bytes should have the value 0x00 (clone operation failed and did + # not modify our file). + od -t x1 $SCRATCH_MNT/foo2 + $XFS_IO_PROG -c "pwrite -S 0xee 0 90" $SCRATCH_MNT/foo2 | _filter_xfs_io + + # Test cloning the inline extent against a file which has a size of zero + # but has a prealloc extent. It should not be possible as well to clone + # the inline extent from file bar into this file. + $XFS_IO_PROG -f -c "falloc -k 0 1M" $SCRATCH_MNT/foo3 | _filter_xfs_io + $CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo3 + + # Doing IO against any range in the first 4K of the file should work. + # Due to a past clone ioctl bug which allowed cloning the inline extent, + # these operations resulted in EIO errors. + echo "First 50 bytes of foo3 after clone operation:" + # Should not be able to read any bytes, file has 0 bytes i_size (the + # clone operation failed and did not modify our file). + od -t x1 $SCRATCH_MNT/foo3 + $XFS_IO_PROG -c "pwrite -S 0xff 0 90" $SCRATCH_MNT/foo3 | _filter_xfs_io + + # Test cloning the inline extent against a file which consists of a + # single inline extent that has a size not greater than the size of + # bar's inline extent (40 < 50). + # It should be possible to do the extent cloning from bar to this file. + $XFS_IO_PROG -f -c "pwrite -S 0x01 0 40" $SCRATCH_MNT/foo4 \ + | _filter_xfs_io + $CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo4 + + # Doing IO against any range in the first 4K of the file should work. + echo "File foo4 data after clone operation:" + # Must match file bar's content. + od -t x1 $SCRATCH_MNT/foo4 + $XFS_IO_PROG -c "pwrite -S 0x02 0 90" $SCRATCH_MNT/foo4 | _filter_xfs_io + + # Test cloning the inline extent against a file which consists of a + # single inline extent that has a size greater than the size of bar's + # inline extent (60 > 50). + # It should not be possible to clone the inline extent from file bar + # into this file. 
+ $XFS_IO_PROG -f -c "pwrite -S 0x03 0 60" $SCRATCH_MNT/foo5 \ + | _filter_xfs_io + $CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo5 + + # Reading the file should not fail. + echo "File foo5 data after clone operation:" + # Must have a size of 60 bytes, with all bytes having a value of 0x03 + # (the clone operation failed and did not modify our file). + od -t x1 $SCRATCH_MNT/foo5 + + # Test cloning the inline extent against a file which has no extents but + # has a size greater than bar's inline extent (16K > 50). + # It should not be possible to clone the inline extent from file bar + # into this file. + $XFS_IO_PROG -f -c "truncate 16K" $SCRATCH_MNT/foo6 | _filter_xfs_io + $CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo6 + + # Reading the file should not fail. + echo "File foo6 data after clone operation:" + # Must have a size of 16K, with all bytes having a value of 0x00 (the + # clone operation failed and did not modify our file). + od -t x1 $SCRATCH_MNT/foo6 + + # Test cloning the inline extent against a file which has no extents but + # has a size not greater than bar's inline extent (30 < 50). + # It should be possible to clone the inline extent from file bar into + # this file. + $XFS_IO_PROG -f -c "truncate 30" $SCRATCH_MNT/foo7 | _filter_xfs_io + $CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo7 + + # Reading the file should not fail. + echo "File foo7 data after clone operation:" + # Must have a size of 50 bytes, with all bytes having a value of 0xbb. + od -t x1 $SCRATCH_MNT/foo7 + + # Test cloning the inline extent against a file which has a size not + # greater than the size of bar's inline extent (20 < 50) but has + # a prealloc extent that goes beyond the file's size. It should not be + # possible to clone the inline extent from bar into this file. 
+ $XFS_IO_PROG -f -c "falloc -k 0 1M" \ + -c "pwrite -S 0x88 0 20" \ + $SCRATCH_MNT/foo8 | _filter_xfs_io + $CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo8 + + echo "File foo8 data after clone operation:" + # Must have a size of 20 bytes, with all bytes having a value of 0x88 + # (the clone operation did not modify our file). + od -t x1 $SCRATCH_MNT/foo8 + + _scratch_unmount + } + + echo -e "\nTesting without compression and without the no-holes feature...\n" + test_cloning_inline_extents + + echo -e "\nTesting with compression and without the no-holes feature...\n" + test_cloning_inline_extents "" "-o compress" + + echo -e "\nTesting without compression and with the no-holes feature...\n" + test_cloning_inline_extents "-O no-holes" "" + + echo -e "\nTesting with compression and with the no-holes feature...\n" + test_cloning_inline_extents "-O no-holes" "-o compress" + + status=0 + exit + +Signed-off-by: Filipe Manana +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/ioctl.c | 195 ++++++++++++++++++++++++++++++++++++++++++------------- + 1 file changed, 152 insertions(+), 43 deletions(-) + +--- a/fs/btrfs/ioctl.c ++++ b/fs/btrfs/ioctl.c +@@ -3166,6 +3166,150 @@ static void clone_update_extent_map(stru + &BTRFS_I(inode)->runtime_flags); + } + ++/* ++ * Make sure we do not end up inserting an inline extent into a file that has ++ * already other (non-inline) extents. If a file has an inline extent it can ++ * not have any other extents and the (single) inline extent must start at the ++ * file offset 0. 
Failing to respect these rules will lead to file corruption, ++ * resulting in EIO errors on read/write operations, hitting BUG_ON's in mm, etc ++ * ++ * We can have extents that have been already written to disk or we can have ++ * dirty ranges still in delalloc, in which case the extent maps and items are ++ * created only when we run delalloc, and the delalloc ranges might fall outside ++ * the range we are currently locking in the inode's io tree. So we check the ++ * inode's i_size because of that (i_size updates are done while holding the ++ * i_mutex, which we are holding here). ++ * We also check to see if the inode has a size not greater than "datal" but has ++ * extents beyond it, due to an fallocate with FALLOC_FL_KEEP_SIZE (and we are ++ * protected against such concurrent fallocate calls by the i_mutex). ++ * ++ * If the file has no extents but a size greater than datal, do not allow the ++ * copy because we would need turn the inline extent into a non-inline one (even ++ * with NO_HOLES enabled). If we find our destination inode only has one inline ++ * extent, just overwrite it with the source inline extent if its size is less ++ * than the source extent's size, or we could copy the source inline extent's ++ * data into the destination inode's inline extent if the later is greater then ++ * the former. 
++ */ ++static int clone_copy_inline_extent(struct inode *src, ++ struct inode *dst, ++ struct btrfs_trans_handle *trans, ++ struct btrfs_path *path, ++ struct btrfs_key *new_key, ++ const u64 drop_start, ++ const u64 datal, ++ const u64 skip, ++ const u64 size, ++ char *inline_data) ++{ ++ struct btrfs_root *root = BTRFS_I(dst)->root; ++ const u64 aligned_end = ALIGN(new_key->offset + datal, ++ root->sectorsize); ++ int ret; ++ struct btrfs_key key; ++ ++ if (new_key->offset > 0) ++ return -EOPNOTSUPP; ++ ++ key.objectid = btrfs_ino(dst); ++ key.type = BTRFS_EXTENT_DATA_KEY; ++ key.offset = 0; ++ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); ++ if (ret < 0) { ++ return ret; ++ } else if (ret > 0) { ++ if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { ++ ret = btrfs_next_leaf(root, path); ++ if (ret < 0) ++ return ret; ++ else if (ret > 0) ++ goto copy_inline_extent; ++ } ++ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); ++ if (key.objectid == btrfs_ino(dst) && ++ key.type == BTRFS_EXTENT_DATA_KEY) { ++ ASSERT(key.offset > 0); ++ return -EOPNOTSUPP; ++ } ++ } else if (i_size_read(dst) <= datal) { ++ struct btrfs_file_extent_item *ei; ++ u64 ext_len; ++ ++ /* ++ * If the file size is <= datal, make sure there are no other ++ * extents following (can happen do to an fallocate call with ++ * the flag FALLOC_FL_KEEP_SIZE). ++ */ ++ ei = btrfs_item_ptr(path->nodes[0], path->slots[0], ++ struct btrfs_file_extent_item); ++ /* ++ * If it's an inline extent, it can not have other extents ++ * following it. 
++ */ ++ if (btrfs_file_extent_type(path->nodes[0], ei) == ++ BTRFS_FILE_EXTENT_INLINE) ++ goto copy_inline_extent; ++ ++ ext_len = btrfs_file_extent_num_bytes(path->nodes[0], ei); ++ if (ext_len > aligned_end) ++ return -EOPNOTSUPP; ++ ++ ret = btrfs_next_item(root, path); ++ if (ret < 0) { ++ return ret; ++ } else if (ret == 0) { ++ btrfs_item_key_to_cpu(path->nodes[0], &key, ++ path->slots[0]); ++ if (key.objectid == btrfs_ino(dst) && ++ key.type == BTRFS_EXTENT_DATA_KEY) ++ return -EOPNOTSUPP; ++ } ++ } ++ ++copy_inline_extent: ++ /* ++ * We have no extent items, or we have an extent at offset 0 which may ++ * or may not be inlined. All these cases are dealt the same way. ++ */ ++ if (i_size_read(dst) > datal) { ++ /* ++ * If the destination inode has an inline extent... ++ * This would require copying the data from the source inline ++ * extent into the beginning of the destination's inline extent. ++ * But this is really complex, both extents can be compressed ++ * or just one of them, which would require decompressing and ++ * re-compressing data (which could increase the new compressed ++ * size, not allowing the compressed data to fit anymore in an ++ * inline extent). ++ * So just don't support this case for now (it should be rare, ++ * we are not really saving space when cloning inline extents). 
++ */ ++ return -EOPNOTSUPP; ++ } ++ ++ btrfs_release_path(path); ++ ret = btrfs_drop_extents(trans, root, dst, drop_start, aligned_end, 1); ++ if (ret) ++ return ret; ++ ret = btrfs_insert_empty_item(trans, root, path, new_key, size); ++ if (ret) ++ return ret; ++ ++ if (skip) { ++ const u32 start = btrfs_file_extent_calc_inline_size(0); ++ ++ memmove(inline_data + start, inline_data + start + skip, datal); ++ } ++ ++ write_extent_buffer(path->nodes[0], inline_data, ++ btrfs_item_ptr_offset(path->nodes[0], ++ path->slots[0]), ++ size); ++ inode_add_bytes(dst, datal); ++ ++ return 0; ++} ++ + /** + * btrfs_clone() - clone a range from inode file to another + * +@@ -3432,21 +3576,6 @@ process_slot: + } else if (type == BTRFS_FILE_EXTENT_INLINE) { + u64 skip = 0; + u64 trim = 0; +- u64 aligned_end = 0; +- +- /* +- * Don't copy an inline extent into an offset +- * greater than zero. Having an inline extent +- * at such an offset results in chaos as btrfs +- * isn't prepared for such cases. Just skip +- * this case for the same reasons as commented +- * at btrfs_ioctl_clone(). 
+- */ +- if (last_dest_end > 0) { +- ret = -EOPNOTSUPP; +- btrfs_end_transaction(trans, root); +- goto out; +- } + + if (off > key.offset) { + skip = off - key.offset; +@@ -3464,42 +3593,22 @@ process_slot: + size -= skip + trim; + datal -= skip + trim; + +- aligned_end = ALIGN(new_key.offset + datal, +- root->sectorsize); +- ret = btrfs_drop_extents(trans, root, inode, +- drop_start, +- aligned_end, +- 1); ++ ret = clone_copy_inline_extent(src, inode, ++ trans, path, ++ &new_key, ++ drop_start, ++ datal, ++ skip, size, buf); + if (ret) { + if (ret != -EOPNOTSUPP) + btrfs_abort_transaction(trans, +- root, ret); +- btrfs_end_transaction(trans, root); +- goto out; +- } +- +- ret = btrfs_insert_empty_item(trans, root, path, +- &new_key, size); +- if (ret) { +- btrfs_abort_transaction(trans, root, +- ret); ++ root, ++ ret); + btrfs_end_transaction(trans, root); + goto out; + } +- +- if (skip) { +- u32 start = +- btrfs_file_extent_calc_inline_size(0); +- memmove(buf+start, buf+start+skip, +- datal); +- } +- + leaf = path->nodes[0]; + slot = path->slots[0]; +- write_extent_buffer(leaf, buf, +- btrfs_item_ptr_offset(leaf, slot), +- size); +- inode_add_bytes(inode, datal); + } + + /* If we have an implicit hole (NO_HOLES feature). */ diff --git a/queue-4.1/btrfs-fix-race-leading-to-bug_on-when-running-delalloc-for-nodatacow.patch b/queue-4.1/btrfs-fix-race-leading-to-bug_on-when-running-delalloc-for-nodatacow.patch new file mode 100644 index 00000000000..b613df92037 --- /dev/null +++ b/queue-4.1/btrfs-fix-race-leading-to-bug_on-when-running-delalloc-for-nodatacow.patch @@ -0,0 +1,122 @@ +From 1d512cb77bdbda80f0dd0620a3b260d697fd581d Mon Sep 17 00:00:00 2001 +From: Filipe Manana +Date: Mon, 9 Nov 2015 00:33:58 +0000 +Subject: Btrfs: fix race leading to BUG_ON when running delalloc for nodatacow + +From: Filipe Manana + +commit 1d512cb77bdbda80f0dd0620a3b260d697fd581d upstream. 
+ +If we are using the NO_HOLES feature, we have a tiny time window when +running delalloc for a nodatacow inode where we can race with a concurrent +link or xattr add operation leading to a BUG_ON. + +This happens because at run_delalloc_nocow() we end up casting a leaf item +of type BTRFS_INODE_[REF|EXTREF]_KEY or of type BTRFS_XATTR_ITEM_KEY to a +file extent item (struct btrfs_file_extent_item) and then analyse its +extent type field, which won't match any of the expected extent types +(values BTRFS_FILE_EXTENT_[REG|PREALLOC|INLINE]) and therefore trigger an +explicit BUG_ON(1). + +The following sequence diagram shows how the race happens when running a +no-cow dellaloc range [4K, 8K[ for inode 257 and we have the following +neighbour leafs: + + Leaf X (has N items) Leaf Y + + [ ... (257 INODE_ITEM 0) (257 INODE_REF 256) ] [ (257 EXTENT_DATA 8192), ... ] + slot N - 2 slot N - 1 slot 0 + + (Note the implicit hole for inode 257 regarding the [0, 8K[ range) + + CPU 1 CPU 2 + + run_dealloc_nocow() + btrfs_lookup_file_extent() + --> searches for a key with value + (257 EXTENT_DATA 4096) in the + fs/subvol tree + --> returns us a path with + path->nodes[0] == leaf X and + path->slots[0] == N + + because path->slots[0] is >= + btrfs_header_nritems(leaf X), it + calls btrfs_next_leaf() + + btrfs_next_leaf() + --> releases the path + + hard link added to our inode, + with key (257 INODE_REF 500) + added to the end of leaf X, + so leaf X now has N + 1 keys + + --> searches for the key + (257 INODE_REF 256), because + it was the last key in leaf X + before it released the path, + with path->keep_locks set to 1 + + --> ends up at leaf X again and + it verifies that the key + (257 INODE_REF 256) is no longer + the last key in the leaf, so it + returns with path->nodes[0] == + leaf X and path->slots[0] == N, + pointing to the new item with + key (257 INODE_REF 500) + + the loop iteration of run_dealloc_nocow() + does not break out the loop and continues + because the key 
referenced in the path + at path->nodes[0] and path->slots[0] is + for inode 257, its type is < BTRFS_EXTENT_DATA_KEY + and its offset (500) is less then our delalloc + range's end (8192) + + the item pointed by the path, an inode reference item, + is (incorrectly) interpreted as a file extent item and + we get an invalid extent type, leading to the BUG_ON(1): + + if (extent_type == BTRFS_FILE_EXTENT_REG || + extent_type == BTRFS_FILE_EXTENT_PREALLOC) { + (...) + } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { + (...) + } else { + BUG_ON(1) + } + +The same can happen if a xattr is added concurrently and ends up having +a key with an offset smaller then the delalloc's range end. + +So fix this by skipping keys with a type smaller than +BTRFS_EXTENT_DATA_KEY. + +Signed-off-by: Filipe Manana +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/inode.c | 10 ++++++++-- + 1 file changed, 8 insertions(+), 2 deletions(-) + +--- a/fs/btrfs/inode.c ++++ b/fs/btrfs/inode.c +@@ -1294,8 +1294,14 @@ next_slot: + num_bytes = 0; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + +- if (found_key.objectid > ino || +- found_key.type > BTRFS_EXTENT_DATA_KEY || ++ if (found_key.objectid > ino) ++ break; ++ if (WARN_ON_ONCE(found_key.objectid < ino) || ++ found_key.type < BTRFS_EXTENT_DATA_KEY) { ++ path->slots[0]++; ++ goto next_slot; ++ } ++ if (found_key.type > BTRFS_EXTENT_DATA_KEY || + found_key.offset > end) + break; + diff --git a/queue-4.1/btrfs-fix-race-leading-to-incorrect-item-deletion-when-dropping-extents.patch b/queue-4.1/btrfs-fix-race-leading-to-incorrect-item-deletion-when-dropping-extents.patch new file mode 100644 index 00000000000..3211eec4c23 --- /dev/null +++ b/queue-4.1/btrfs-fix-race-leading-to-incorrect-item-deletion-when-dropping-extents.patch @@ -0,0 +1,198 @@ +From aeafbf8486c9e2bd53f5cc3c10c0b7fd7149d69c Mon Sep 17 00:00:00 2001 +From: Filipe Manana +Date: Fri, 6 Nov 2015 13:33:33 +0000 +Subject: Btrfs: fix race leading to incorrect item 
deletion when dropping extents + +From: Filipe Manana + +commit aeafbf8486c9e2bd53f5cc3c10c0b7fd7149d69c upstream. + +While running a stress test I got the following warning triggered: + + [191627.672810] ------------[ cut here ]------------ + [191627.673949] WARNING: CPU: 8 PID: 8447 at fs/btrfs/file.c:779 __btrfs_drop_extents+0x391/0xa50 [btrfs]() + (...) + [191627.701485] Call Trace: + [191627.702037] [] dump_stack+0x4f/0x7b + [191627.702992] [] ? console_unlock+0x356/0x3a2 + [191627.704091] [] warn_slowpath_common+0xa1/0xbb + [191627.705380] [] ? __btrfs_drop_extents+0x391/0xa50 [btrfs] + [191627.706637] [] warn_slowpath_null+0x1a/0x1c + [191627.707789] [] __btrfs_drop_extents+0x391/0xa50 [btrfs] + [191627.709155] [] ? cache_alloc_debugcheck_after.isra.32+0x171/0x1d0 + [191627.712444] [] ? kmemleak_alloc_recursive.constprop.40+0x16/0x18 + [191627.714162] [] insert_reserved_file_extent.constprop.40+0x83/0x24e [btrfs] + [191627.715887] [] ? start_transaction+0x3bb/0x610 [btrfs] + [191627.717287] [] btrfs_finish_ordered_io+0x273/0x4e2 [btrfs] + [191627.728865] [] finish_ordered_fn+0x15/0x17 [btrfs] + [191627.730045] [] normal_work_helper+0x14c/0x32c [btrfs] + [191627.731256] [] btrfs_endio_write_helper+0x12/0x14 [btrfs] + [191627.732661] [] process_one_work+0x24c/0x4ae + [191627.733822] [] worker_thread+0x206/0x2c2 + [191627.734857] [] ? process_scheduled_works+0x2f/0x2f + [191627.736052] [] ? process_scheduled_works+0x2f/0x2f + [191627.737349] [] kthread+0xef/0xf7 + [191627.738267] [] ? time_hardirqs_on+0x15/0x28 + [191627.739330] [] ? __kthread_parkme+0xad/0xad + [191627.741976] [] ret_from_fork+0x42/0x70 + [191627.743080] [] ? __kthread_parkme+0xad/0xad + [191627.744206] ---[ end trace bbfddacb7aaada8d ]--- + + $ cat -n fs/btrfs/file.c + 691 int __btrfs_drop_extents(struct btrfs_trans_handle *trans, + (...) 
+ 758 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + 759 if (key.objectid > ino || + 760 key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= end) + 761 break; + 762 + 763 fi = btrfs_item_ptr(leaf, path->slots[0], + 764 struct btrfs_file_extent_item); + 765 extent_type = btrfs_file_extent_type(leaf, fi); + 766 + 767 if (extent_type == BTRFS_FILE_EXTENT_REG || + 768 extent_type == BTRFS_FILE_EXTENT_PREALLOC) { + (...) + 774 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { + (...) + 778 } else { + 779 WARN_ON(1); + 780 extent_end = search_start; + 781 } + (...) + +This happened because the item we were processing did not match a file +extent item (its key type != BTRFS_EXTENT_DATA_KEY), and even on this +case we cast the item to a struct btrfs_file_extent_item pointer and +then find a type field value that does not match any of the expected +values (BTRFS_FILE_EXTENT_[REG|PREALLOC|INLINE]). This scenario happens +due to a tiny time window where a race can happen as exemplified below. +For example, consider the following scenario where we're using the +NO_HOLES feature and we have the following two neighbour leafs: + + Leaf X (has N items) Leaf Y + +[ ... (257 INODE_ITEM 0) (257 INODE_REF 256) ] [ (257 EXTENT_DATA 8192), ... ] + slot N - 2 slot N - 1 slot 0 + +Our inode 257 has an implicit hole in the range [0, 8K[ (implicit rather +than explicit because NO_HOLES is enabled). 
Now if our inode has an +ordered extent for the range [4K, 8K[ that is finishing, the following +can happen: + + CPU 1 CPU 2 + + btrfs_finish_ordered_io() + insert_reserved_file_extent() + __btrfs_drop_extents() + Searches for the key + (257 EXTENT_DATA 4096) through + btrfs_lookup_file_extent() + + Key not found and we get a path where + path->nodes[0] == leaf X and + path->slots[0] == N + + Because path->slots[0] is >= + btrfs_header_nritems(leaf X), we call + btrfs_next_leaf() + + btrfs_next_leaf() releases the path + + inserts key + (257 INODE_REF 4096) + at the end of leaf X, + leaf X now has N + 1 keys, + and the new key is at + slot N + + btrfs_next_leaf() searches for + key (257 INODE_REF 256), with + path->keep_locks set to 1, + because it was the last key it + saw in leaf X + + finds it in leaf X again and + notices it's no longer the last + key of the leaf, so it returns 0 + with path->nodes[0] == leaf X and + path->slots[0] == N (which is now + < btrfs_header_nritems(leaf X)), + pointing to the new key + (257 INODE_REF 4096) + + __btrfs_drop_extents() casts the + item at path->nodes[0], slot + path->slots[0], to a struct + btrfs_file_extent_item - it does + not skip keys for the target + inode with a type less than + BTRFS_EXTENT_DATA_KEY + (BTRFS_INODE_REF_KEY < BTRFS_EXTENT_DATA_KEY) + + sees a bogus value for the type + field triggering the WARN_ON in + the trace shown above, and sets + extent_end = search_start (4096) + + does the if-then-else logic to + fixup 0 length extent items created + by a past bug from hole punching: + + if (extent_end == key.offset && + extent_end >= search_start) + goto delete_extent_item; + + that evaluates to true and it ends + up deleting the key pointed to by + path->slots[0], (257 INODE_REF 4096), + from leaf X + +The same could happen for example for a xattr that ends up having a key +with an offset value that matches search_start (very unlikely but not +impossible). 
+ +So fix this by ensuring that keys smaller than BTRFS_EXTENT_DATA_KEY are +skipped, never casted to struct btrfs_file_extent_item and never deleted +by accident. Also protect against the unexpected case of getting a key +for a lower inode number by skipping that key and issuing a warning. + +Signed-off-by: Filipe Manana +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/file.c | 16 ++++++++++++---- + 1 file changed, 12 insertions(+), 4 deletions(-) + +--- a/fs/btrfs/file.c ++++ b/fs/btrfs/file.c +@@ -756,8 +756,16 @@ next_slot: + } + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); +- if (key.objectid > ino || +- key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= end) ++ ++ if (key.objectid > ino) ++ break; ++ if (WARN_ON_ONCE(key.objectid < ino) || ++ key.type < BTRFS_EXTENT_DATA_KEY) { ++ ASSERT(del_nr == 0); ++ path->slots[0]++; ++ goto next_slot; ++ } ++ if (key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= end) + break; + + fi = btrfs_item_ptr(leaf, path->slots[0], +@@ -776,8 +784,8 @@ next_slot: + btrfs_file_extent_inline_len(leaf, + path->slots[0], fi); + } else { +- WARN_ON(1); +- extent_end = search_start; ++ /* can't happen */ ++ BUG(); + } + + /* diff --git a/queue-4.1/btrfs-fix-race-when-listing-an-inode-s-xattrs.patch b/queue-4.1/btrfs-fix-race-when-listing-an-inode-s-xattrs.patch new file mode 100644 index 00000000000..0a13530cd6c --- /dev/null +++ b/queue-4.1/btrfs-fix-race-when-listing-an-inode-s-xattrs.patch @@ -0,0 +1,92 @@ +From f1cd1f0b7d1b5d4aaa5711e8f4e4898b0045cb6d Mon Sep 17 00:00:00 2001 +From: Filipe Manana +Date: Mon, 9 Nov 2015 18:06:38 +0000 +Subject: Btrfs: fix race when listing an inode's xattrs + +From: Filipe Manana + +commit f1cd1f0b7d1b5d4aaa5711e8f4e4898b0045cb6d upstream. + +When listing a inode's xattrs we have a time window where we race against +a concurrent operation for adding a new hard link for our inode that makes +us not return any xattr to user space. 
In order for this to happen, the +first xattr of our inode needs to be at slot 0 of a leaf and the previous +leaf must still have room for an inode ref (or extref) item, and this can +happen because an inode's listxattrs callback does not lock the inode's +i_mutex (nor does the VFS do it for us), but adding a hard link to an +inode makes the VFS lock the inode's i_mutex before calling the inode's +link callback. + +If we have the following leafs: + + Leaf X (has N items) Leaf Y + + [ ... (257 INODE_ITEM 0) (257 INODE_REF 256) ] [ (257 XATTR_ITEM 12345), ... ] + slot N - 2 slot N - 1 slot 0 + +The race illustrated by the following sequence diagram is possible: + + CPU 1 CPU 2 + + btrfs_listxattr() + + searches for key (257 XATTR_ITEM 0) + + gets path with path->nodes[0] == leaf X + and path->slots[0] == N + + because path->slots[0] is >= + btrfs_header_nritems(leaf X), it calls + btrfs_next_leaf() + + btrfs_next_leaf() + releases the path + + adds key (257 INODE_REF 666) + to the end of leaf X (slot N), + and leaf X now has N + 1 items + + searches for the key (257 INODE_REF 256), + with path->keep_locks == 1, because that + is the last key it saw in leaf X before + releasing the path + + ends up at leaf X again and it verifies + that the key (257 INODE_REF 256) is no + longer the last key in leaf X, so it + returns with path->nodes[0] == leaf X + and path->slots[0] == N, pointing to + the new item with key (257 INODE_REF 666) + + btrfs_listxattr's loop iteration sees that + the type of the key pointed by the path is + different from the type BTRFS_XATTR_ITEM_KEY + and so it breaks the loop and stops looking + for more xattr items + --> the application doesn't get any xattr + listed for our inode + +So fix this by breaking the loop only if the key's type is greater than +BTRFS_XATTR_ITEM_KEY and skip the current key if its type is smaller.
+ +Signed-off-by: Filipe Manana +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/xattr.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +--- a/fs/btrfs/xattr.c ++++ b/fs/btrfs/xattr.c +@@ -313,8 +313,10 @@ ssize_t btrfs_listxattr(struct dentry *d + /* check to make sure this item is what we want */ + if (found_key.objectid != key.objectid) + break; +- if (found_key.type != BTRFS_XATTR_ITEM_KEY) ++ if (found_key.type > BTRFS_XATTR_ITEM_KEY) + break; ++ if (found_key.type < BTRFS_XATTR_ITEM_KEY) ++ goto next; + + di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); + if (verify_dir_item(root, leaf, di)) diff --git a/queue-4.1/btrfs-fix-truncation-of-compressed-and-inlined-extents.patch b/queue-4.1/btrfs-fix-truncation-of-compressed-and-inlined-extents.patch new file mode 100644 index 00000000000..d4d7f449ef2 --- /dev/null +++ b/queue-4.1/btrfs-fix-truncation-of-compressed-and-inlined-extents.patch @@ -0,0 +1,288 @@ +From 0305cd5f7fca85dae392b9ba85b116896eb7c1c7 Mon Sep 17 00:00:00 2001 +From: Filipe Manana +Date: Fri, 16 Oct 2015 12:34:25 +0100 +Subject: Btrfs: fix truncation of compressed and inlined extents + +From: Filipe Manana + +commit 0305cd5f7fca85dae392b9ba85b116896eb7c1c7 upstream. + +When truncating a file to a smaller size which consists of an inline +extent that is compressed, we did not discard (or made unusable) the +data between the new file size and the old file size, wasting metadata +space and allowing for the truncated data to be leaked and the data +corruption/loss mentioned below. +We were also not correctly decrementing the number of bytes used by the +inode, we were setting it to zero, giving a wrong report for callers of +the stat(2) syscall. The fsck tool also reported an error about a mismatch +between the nbytes of the file versus the real space used by the file. 
+ +Now because we weren't discarding the truncated region of the file, it +was possible for a caller of the clone ioctl to actually read the data +that was truncated, allowing for a security breach without requiring root +access to the system, using only standard filesystem operations. The +scenario is the following: + + 1) User A creates a file which consists of an inline and compressed + extent with a size of 2000 bytes - the file is not accessible to + any other users (no read, write or execution permission for anyone + else); + + 2) The user truncates the file to a size of 1000 bytes; + + 3) User A makes the file world readable; + + 4) User B creates a file consisting of an inline extent of 2000 bytes; + + 5) User B issues a clone operation from user A's file into its own + file (using a length argument of 0, clone the whole range); + + 6) User B now gets to see the 1000 bytes that user A truncated from + its file before it made its file world readable. User B also lost + the bytes in the range [1000, 2000[ bytes from its own file, but + that might be ok if his/her intention was reading stale data from + user A that was never supposed to be public. + +Note that this contrasts with the case where we truncate a file from 2000 +bytes to 1000 bytes and then truncate it back from 1000 to 2000 bytes. In +this case reading any byte from the range [1000, 2000[ will return a value +of 0x00, instead of the original data. + +This problem exists since the clone ioctl was added and happens both with +and without my recent data loss and file corruption fixes for the clone +ioctl (patch "Btrfs: fix file corruption and data loss after cloning +inline extents"). + +So fix this by truncating the compressed inline extents as we do for the +non-compressed case, which involves decompressing, if the data isn't already +in the page cache, compressing the truncated version of the extent, writing +the compressed content into the inline extent and then truncate it.
+ +The following test case for fstests reproduces the problem. In order for +the test to pass both this fix and my previous fix for the clone ioctl +that forbids cloning a smaller inline extent into a larger one, +which is titled "Btrfs: fix file corruption and data loss after cloning +inline extents", are needed. Without that other fix the test fails in a +different way that does not leak the truncated data, instead part of +destination file gets replaced with zeroes (because the destination file +has a larger inline extent than the source). + + seq=`basename $0` + seqres=$RESULT_DIR/$seq + echo "QA output created by $seq" + tmp=/tmp/$$ + status=1 # failure is the default! + trap "_cleanup; exit \$status" 0 1 2 3 15 + + _cleanup() + { + rm -f $tmp.* + } + + # get standard environment, filters and checks + . ./common/rc + . ./common/filter + + # real QA test starts here + _need_to_be_root + _supported_fs btrfs + _supported_os Linux + _require_scratch + _require_cloner + + rm -f $seqres.full + + _scratch_mkfs >>$seqres.full 2>&1 + _scratch_mount "-o compress" + + # Create our test files. File foo is going to be the source of a clone operation + # and consists of a single inline extent with an uncompressed size of 512 bytes, + # while file bar consists of a single inline extent with an uncompressed size of + # 256 bytes. For our test's purpose, it's important that file bar has an inline + # extent with a size smaller than foo's inline extent. + $XFS_IO_PROG -f -c "pwrite -S 0xa1 0 128" \ + -c "pwrite -S 0x2a 128 384" \ + $SCRATCH_MNT/foo | _filter_xfs_io + $XFS_IO_PROG -f -c "pwrite -S 0xbb 0 256" $SCRATCH_MNT/bar | _filter_xfs_io + + # Now durably persist all metadata and data. We do this to make sure that we get + # on disk an inline extent with a size of 512 bytes for file foo. + sync + + # Now truncate our file foo to a smaller size. 
Because it consists of a + # compressed and inline extent, btrfs did not shrink the inline extent to the + # new size (if the extent was not compressed, btrfs would shrink it to 128 + # bytes), it only updates the inode's i_size to 128 bytes. + $XFS_IO_PROG -c "truncate 128" $SCRATCH_MNT/foo + + # Now clone foo's inline extent into bar. + # This clone operation should fail with errno EOPNOTSUPP because the source + # file consists only of an inline extent and the file's size is smaller than + # the inline extent of the destination (128 bytes < 256 bytes). However the + # clone ioctl was not prepared to deal with a file that has a size smaller + # than the size of its inline extent (something that happens only for compressed + # inline extents), resulting in copying the full inline extent from the source + # file into the destination file. + # + # Note that btrfs' clone operation for inline extents consists of removing the + # inline extent from the destination inode and copy the inline extent from the + # source inode into the destination inode, meaning that if the destination + # inode's inline extent is larger (N bytes) than the source inode's inline + # extent (M bytes), some bytes (N - M bytes) will be lost from the destination + # file. Btrfs could copy the source inline extent's data into the destination's + # inline extent so that we would not lose any data, but that's currently not + # done due to the complexity that would be needed to deal with such cases + # (specially when one or both extents are compressed), returning EOPNOTSUPP, as + # it's normally not a very common case to clone very small files (only case + # where we get inline extents) and copying inline extents does not save any + # space (unlike for normal, non-inlined extents). 
+ $CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/foo $SCRATCH_MNT/bar + + # Now because the above clone operation used to succeed, and due to foo's inline + # extent not being shrunk by the truncate operation, our file bar got the whole + # inline extent copied from foo, making us lose the last 128 bytes from bar + # which got replaced by the bytes in range [128, 256[ from foo before foo was + # truncated - in other words, data loss from bar and being able to read old and + # stale data from foo that should not be possible to read anymore through normal + # filesystem operations. Contrast with the case where we truncate a file from a + # size N to a smaller size M, truncate it back to size N and then read the range + # [M, N[, we should always get the value 0x00 for all the bytes in that range. + + # We expected the clone operation to fail with errno EOPNOTSUPP and therefore + # not modify our file bar's data/metadata. So its content should be 256 bytes + # long with all bytes having the value 0xbb. + # + # Without the btrfs bug fix, the clone operation succeeded and resulted in + # leaking truncated data from foo, the bytes that belonged to its range + # [128, 256[, and losing data from bar in that same range.
So reading the + # file gave us the following content: + # + # 0000000 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 + # * + # 0000200 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a + # * + # 0000400 + echo "File bar's content after the clone operation:" + od -t x1 $SCRATCH_MNT/bar + + # Also because the foo's inline extent was not shrunk by the truncate + # operation, btrfs' fsck, which is run by the fstests framework everytime a + # test completes, failed reporting the following error: + # + # root 5 inode 257 errors 400, nbytes wrong + + status=0 + exit + +Signed-off-by: Filipe Manana +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/inode.c | 82 +++++++++++++++++++++++++++++++++++++++++++++---------- + 1 file changed, 68 insertions(+), 14 deletions(-) + +--- a/fs/btrfs/inode.c ++++ b/fs/btrfs/inode.c +@@ -4184,6 +4184,47 @@ static int truncate_space_check(struct b + + } + ++static int truncate_inline_extent(struct inode *inode, ++ struct btrfs_path *path, ++ struct btrfs_key *found_key, ++ const u64 item_end, ++ const u64 new_size) ++{ ++ struct extent_buffer *leaf = path->nodes[0]; ++ int slot = path->slots[0]; ++ struct btrfs_file_extent_item *fi; ++ u32 size = (u32)(new_size - found_key->offset); ++ struct btrfs_root *root = BTRFS_I(inode)->root; ++ ++ fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); ++ ++ if (btrfs_file_extent_compression(leaf, fi) != BTRFS_COMPRESS_NONE) { ++ loff_t offset = new_size; ++ loff_t page_end = ALIGN(offset, PAGE_CACHE_SIZE); ++ ++ /* ++ * Zero out the remaining of the last page of our inline extent, ++ * instead of directly truncating our inline extent here - that ++ * would be much more complex (decompressing all the data, then ++ * compressing the truncated data, which might be bigger than ++ * the size of the inline extent, resize the extent, etc). ++ * We release the path because to get the page we might need to ++ * read the extent item from disk (data not in the page cache). 
++ */ ++ btrfs_release_path(path); ++ return btrfs_truncate_page(inode, offset, page_end - offset, 0); ++ } ++ ++ btrfs_set_file_extent_ram_bytes(leaf, fi, size); ++ size = btrfs_file_extent_calc_inline_size(size); ++ btrfs_truncate_item(root, path, size, 1); ++ ++ if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) ++ inode_sub_bytes(inode, item_end + 1 - new_size); ++ ++ return 0; ++} ++ + /* + * this can truncate away extent items, csum items and directory items. + * It starts at a high offset and removes keys until it can't find +@@ -4378,27 +4419,40 @@ search_again: + * special encodings + */ + if (!del_item && +- btrfs_file_extent_compression(leaf, fi) == 0 && + btrfs_file_extent_encryption(leaf, fi) == 0 && + btrfs_file_extent_other_encoding(leaf, fi) == 0) { +- u32 size = new_size - found_key.offset; +- +- if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) +- inode_sub_bytes(inode, item_end + 1 - +- new_size); + + /* +- * update the ram bytes to properly reflect +- * the new size of our item ++ * Need to release path in order to truncate a ++ * compressed extent. So delete any accumulated ++ * extent items so far. 
+ */ +- btrfs_set_file_extent_ram_bytes(leaf, fi, size); +- size = +- btrfs_file_extent_calc_inline_size(size); +- btrfs_truncate_item(root, path, size, 1); ++ if (btrfs_file_extent_compression(leaf, fi) != ++ BTRFS_COMPRESS_NONE && pending_del_nr) { ++ err = btrfs_del_items(trans, root, path, ++ pending_del_slot, ++ pending_del_nr); ++ if (err) { ++ btrfs_abort_transaction(trans, ++ root, ++ err); ++ goto error; ++ } ++ pending_del_nr = 0; ++ } ++ ++ err = truncate_inline_extent(inode, path, ++ &found_key, ++ item_end, ++ new_size); ++ if (err) { ++ btrfs_abort_transaction(trans, ++ root, err); ++ goto error; ++ } + } else if (test_bit(BTRFS_ROOT_REF_COWS, + &root->state)) { +- inode_sub_bytes(inode, item_end + 1 - +- found_key.offset); ++ inode_sub_bytes(inode, item_end + 1 - new_size); + } + } + delete: diff --git a/queue-4.1/ceph-fix-message-length-computation.patch b/queue-4.1/ceph-fix-message-length-computation.patch new file mode 100644 index 00000000000..e4059b8175c --- /dev/null +++ b/queue-4.1/ceph-fix-message-length-computation.patch @@ -0,0 +1,37 @@ +From 777d738a5e58ba3b6f3932ab1543ce93703f4873 Mon Sep 17 00:00:00 2001 +From: Arnd Bergmann +Date: Wed, 30 Sep 2015 15:04:42 +0200 +Subject: ceph: fix message length computation + +From: Arnd Bergmann + +commit 777d738a5e58ba3b6f3932ab1543ce93703f4873 upstream. + +create_request_message() computes the maximum length of a message, +but uses the wrong type for the time stamp: sizeof(struct timespec) +may be 8 or 16 depending on the architecture, while sizeof(struct +ceph_timespec) is always 8, and that is what gets put into the +message. + +Found while auditing the uses of timespec for y2038 problems. 
+ +Fixes: b8e69066d8af ("ceph: include time stamp in every MDS request") +Signed-off-by: Arnd Bergmann +Signed-off-by: Yan, Zheng +Signed-off-by: Greg Kroah-Hartman + +--- + fs/ceph/mds_client.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/fs/ceph/mds_client.c ++++ b/fs/ceph/mds_client.c +@@ -1905,7 +1905,7 @@ static struct ceph_msg *create_request_m + + len = sizeof(*head) + + pathlen1 + pathlen2 + 2*(1 + sizeof(u32) + sizeof(u64)) + +- sizeof(struct timespec); ++ sizeof(struct ceph_timespec); + + /* calculate (max) length for cap releases */ + len += sizeof(struct ceph_mds_request_release) * diff --git a/queue-4.1/debugfs-fix-refcount-imbalance-in-start_creating.patch b/queue-4.1/debugfs-fix-refcount-imbalance-in-start_creating.patch new file mode 100644 index 00000000000..a1eebf4c141 --- /dev/null +++ b/queue-4.1/debugfs-fix-refcount-imbalance-in-start_creating.patch @@ -0,0 +1,47 @@ +From 0ee9608c89e81a1ccee52ecb58a7ff040e2522d9 Mon Sep 17 00:00:00 2001 +From: Daniel Borkmann +Date: Thu, 5 Nov 2015 00:01:51 +0100 +Subject: debugfs: fix refcount imbalance in start_creating + +From: Daniel Borkmann + +commit 0ee9608c89e81a1ccee52ecb58a7ff040e2522d9 upstream. + +In debugfs' start_creating(), we pin the file system to safely access +its root. When we failed to create a file, we unpin the file system via +failed_creating() to release the mount count and eventually the reference +of the vfsmount. + +However, when we run into an error during lookup_one_len() when still +in start_creating(), we only release the parent's mutex but not so the +reference on the mount. Looks like it was done in the past, but after +splitting portions of __create_file() into start_creating() and +end_creating() via 190afd81e4a5 ("debugfs: split the beginning and the +end of __create_file() off"), this seemed missed. Noticed during code +review. 
+ +Fixes: 190afd81e4a5 ("debugfs: split the beginning and the end of __create_file() off") +Signed-off-by: Daniel Borkmann +Signed-off-by: Al Viro +Signed-off-by: Greg Kroah-Hartman + +--- + fs/debugfs/inode.c | 6 +++++- + 1 file changed, 5 insertions(+), 1 deletion(-) + +--- a/fs/debugfs/inode.c ++++ b/fs/debugfs/inode.c +@@ -276,8 +276,12 @@ static struct dentry *start_creating(con + dput(dentry); + dentry = ERR_PTR(-EEXIST); + } +- if (IS_ERR(dentry)) ++ ++ if (IS_ERR(dentry)) { + mutex_unlock(&d_inode(parent)->i_mutex); ++ simple_release_fs(&debugfs_mount, &debugfs_mount_count); ++ } ++ + return dentry; + } + diff --git a/queue-4.1/ext4-crypto-fix-memory-leak-in-ext4_bio_write_page.patch b/queue-4.1/ext4-crypto-fix-memory-leak-in-ext4_bio_write_page.patch new file mode 100644 index 00000000000..a027d21e87c --- /dev/null +++ b/queue-4.1/ext4-crypto-fix-memory-leak-in-ext4_bio_write_page.patch @@ -0,0 +1,51 @@ +From 937d7b84dca58f2565715f2c8e52f14c3d65fb22 Mon Sep 17 00:00:00 2001 +From: Theodore Ts'o +Date: Fri, 2 Oct 2015 23:54:58 -0400 +Subject: ext4 crypto: fix memory leak in ext4_bio_write_page() + +From: Theodore Ts'o + +commit 937d7b84dca58f2565715f2c8e52f14c3d65fb22 upstream. + +There are times when ext4_bio_write_page() is called even though we +don't actually need to do any I/O. This happens when ext4_writepage() +gets called by the jbd2 commit path when an inode needs to force its +pages written out in order to provide data=ordered guarantees --- and +a page is backed by an unwritten (e.g., uninitialized) block on disk, +or if delayed allocation means the page's backing store hasn't been +allocated yet. In that case, we need to skip the call to +ext4_encrypt_page(), since in addition to wasting CPU, it leads to a +bounce page and an ext4 crypto context getting leaked. 
+ +Signed-off-by: Theodore Ts'o +Signed-off-by: Greg Kroah-Hartman + +--- + fs/ext4/page-io.c | 5 ++++- + 1 file changed, 4 insertions(+), 1 deletion(-) + +--- a/fs/ext4/page-io.c ++++ b/fs/ext4/page-io.c +@@ -426,6 +426,7 @@ int ext4_bio_write_page(struct ext4_io_s + struct buffer_head *bh, *head; + int ret = 0; + int nr_submitted = 0; ++ int nr_to_submit = 0; + + blocksize = 1 << inode->i_blkbits; + +@@ -478,11 +479,13 @@ int ext4_bio_write_page(struct ext4_io_s + unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr); + } + set_buffer_async_write(bh); ++ nr_to_submit++; + } while ((bh = bh->b_this_page) != head); + + bh = head = page_buffers(page); + +- if (ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode)) { ++ if (ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode) && ++ nr_to_submit) { + data_page = ext4_encrypt(inode, page); + if (IS_ERR(data_page)) { + ret = PTR_ERR(data_page); diff --git a/queue-4.1/ext4-fix-potential-use-after-free-in-__ext4_journal_stop.patch b/queue-4.1/ext4-fix-potential-use-after-free-in-__ext4_journal_stop.patch new file mode 100644 index 00000000000..a427ddc7a20 --- /dev/null +++ b/queue-4.1/ext4-fix-potential-use-after-free-in-__ext4_journal_stop.patch @@ -0,0 +1,44 @@ +From 6934da9238da947628be83635e365df41064b09b Mon Sep 17 00:00:00 2001 +From: Lukas Czerner +Date: Sat, 17 Oct 2015 22:57:06 -0400 +Subject: ext4: fix potential use after free in __ext4_journal_stop + +From: Lukas Czerner + +commit 6934da9238da947628be83635e365df41064b09b upstream. + +There is a use-after-free possibility in __ext4_journal_stop() in the +case that we free the handle in the first jbd2_journal_stop() because +we're referencing handle->h_err afterwards. This was introduced in +9705acd63b125dee8b15c705216d7186daea4625 and it is wrong. Fix it by +storing the handle->h_err value beforehand and avoid referencing +potentially freed handle. 
+ +Fixes: 9705acd63b125dee8b15c705216d7186daea4625 +Signed-off-by: Lukas Czerner +Reviewed-by: Andreas Dilger +Signed-off-by: Greg Kroah-Hartman + +--- + fs/ext4/ext4_jbd2.c | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +--- a/fs/ext4/ext4_jbd2.c ++++ b/fs/ext4/ext4_jbd2.c +@@ -88,13 +88,13 @@ int __ext4_journal_stop(const char *wher + return 0; + } + ++ err = handle->h_err; + if (!handle->h_transaction) { +- err = jbd2_journal_stop(handle); +- return handle->h_err ? handle->h_err : err; ++ rc = jbd2_journal_stop(handle); ++ return err ? err : rc; + } + + sb = handle->h_transaction->t_journal->j_private; +- err = handle->h_err; + rc = jbd2_journal_stop(handle); + + if (!err) diff --git a/queue-4.1/ext4-jbd2-ensure-entering-into-panic-after-recording-an-error-in-superblock.patch b/queue-4.1/ext4-jbd2-ensure-entering-into-panic-after-recording-an-error-in-superblock.patch new file mode 100644 index 00000000000..aec773085a7 --- /dev/null +++ b/queue-4.1/ext4-jbd2-ensure-entering-into-panic-after-recording-an-error-in-superblock.patch @@ -0,0 +1,104 @@ +From 4327ba52afd03fc4b5afa0ee1d774c9c5b0e85c5 Mon Sep 17 00:00:00 2001 +From: Daeho Jeong +Date: Sun, 18 Oct 2015 17:02:56 -0400 +Subject: ext4, jbd2: ensure entering into panic after recording an error in superblock + +From: Daeho Jeong + +commit 4327ba52afd03fc4b5afa0ee1d774c9c5b0e85c5 upstream. + +If a EXT4 filesystem utilizes JBD2 journaling and an error occurs, the +journaling will be aborted first and the error number will be recorded +into JBD2 superblock and, finally, the system will enter into the +panic state in "errors=panic" option. But, in the rare case, this +sequence is little twisted like the below figure and it will happen +that the system enters into panic state, which means the system reset +in mobile environment, before completion of recording an error in the +journal superblock. 
In this case, e2fsck cannot recognize that the +filesystem failure occurred in the previous run and the corruption +wouldn't be fixed. + +Task A Task B +ext4_handle_error() +-> jbd2_journal_abort() + -> __journal_abort_soft() + -> __jbd2_journal_abort_hard() + | -> journal->j_flags |= JBD2_ABORT; + | + | __ext4_abort() + | -> jbd2_journal_abort() + | | -> __journal_abort_soft() + | | -> if (journal->j_flags & JBD2_ABORT) + | | return; + | -> panic() + | + -> jbd2_journal_update_sb_errno() + +Tested-by: Hobin Woo +Signed-off-by: Daeho Jeong +Signed-off-by: Theodore Ts'o +Signed-off-by: Greg Kroah-Hartman + +--- + fs/ext4/super.c | 12 ++++++++++-- + fs/jbd2/journal.c | 6 +++++- + include/linux/jbd2.h | 1 + + 3 files changed, 16 insertions(+), 3 deletions(-) + +--- a/fs/ext4/super.c ++++ b/fs/ext4/super.c +@@ -396,9 +396,13 @@ static void ext4_handle_error(struct sup + smp_wmb(); + sb->s_flags |= MS_RDONLY; + } +- if (test_opt(sb, ERRORS_PANIC)) ++ if (test_opt(sb, ERRORS_PANIC)) { ++ if (EXT4_SB(sb)->s_journal && ++ !(EXT4_SB(sb)->s_journal->j_flags & JBD2_REC_ERR)) ++ return; + panic("EXT4-fs (device %s): panic forced after error\n", + sb->s_id); ++ } + } + + #define ext4_error_ratelimit(sb) \ +@@ -587,8 +591,12 @@ void __ext4_abort(struct super_block *sb + jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO); + save_error_info(sb, function, line); + } +- if (test_opt(sb, ERRORS_PANIC)) ++ if (test_opt(sb, ERRORS_PANIC)) { ++ if (EXT4_SB(sb)->s_journal && ++ !(EXT4_SB(sb)->s_journal->j_flags & JBD2_REC_ERR)) ++ return; + panic("EXT4-fs panic from previous error\n"); ++ } + } + + void __ext4_msg(struct super_block *sb, +--- a/fs/jbd2/journal.c ++++ b/fs/jbd2/journal.c +@@ -2086,8 +2086,12 @@ static void __journal_abort_soft (journa + + __jbd2_journal_abort_hard(journal); + +- if (errno) ++ if (errno) { + jbd2_journal_update_sb_errno(journal); ++ write_lock(&journal->j_state_lock); ++ journal->j_flags |= JBD2_REC_ERR; ++ write_unlock(&journal->j_state_lock); ++ } + } + + 
/** +--- a/include/linux/jbd2.h ++++ b/include/linux/jbd2.h +@@ -1007,6 +1007,7 @@ struct journal_s + #define JBD2_ABORT_ON_SYNCDATA_ERR 0x040 /* Abort the journal on file + * data write error in ordered + * mode */ ++#define JBD2_REC_ERR 0x080 /* The errno in the sb has been recorded */ + + /* + * Function declarations for the journaling transaction and buffer diff --git a/queue-4.1/firewire-ohci-fix-jmicron-jmb38x-it-context-discovery.patch b/queue-4.1/firewire-ohci-fix-jmicron-jmb38x-it-context-discovery.patch new file mode 100644 index 00000000000..a034d06b8b0 --- /dev/null +++ b/queue-4.1/firewire-ohci-fix-jmicron-jmb38x-it-context-discovery.patch @@ -0,0 +1,71 @@ +From 100ceb66d5c40cc0c7018e06a9474302470be73c Mon Sep 17 00:00:00 2001 +From: Stefan Richter +Date: Tue, 3 Nov 2015 01:46:21 +0100 +Subject: firewire: ohci: fix JMicron JMB38x IT context discovery + +From: Stefan Richter + +commit 100ceb66d5c40cc0c7018e06a9474302470be73c upstream. + +Reported by Clifford and Craig for JMicron OHCI-1394 + SDHCI combo +controllers: Often or even most of the time, the controller is +initialized with the message "added OHCI v1.10 device as card 0, 4 IR + +0 IT contexts, quirks 0x10". With 0 isochronous transmit DMA contexts +(IT contexts), applications like audio output are impossible. + +However, OHCI-1394 demands that at least 4 IT contexts are implemented +by the link layer controller, and indeed JMicron JMB38x do implement +four of them. Only their IsoXmitIntMask register is unreliable at early +access. + +With my own JMB381 single function controller I found: + - I can reproduce the problem with a lower probability than Craig's. + - If I put a loop around the section which clears and reads + IsoXmitIntMask, then either the first or the second attempt will + return the correct initial mask of 0x0000000f. I never encountered + a case of needing more than a second attempt. 
+ - Consequently, if I put a dummy reg_read(...IsoXmitIntMaskSet) + before the first write, the subsequent read will return the correct + result. + - If I merely ignore a wrong read result and force the known real + result, later isochronous transmit DMA usage works just fine. + +So let's just fix this chip bug up by the latter method. Tested with +JMB381 on kernel 3.13 and 4.3. + +Since OHCI-1394 generally requires 4 IT contexts at a minimum, this +workaround is simply applied whenever the initial read of IsoXmitIntMask +returns 0, regardless of whether it's a JMicron chip or not. I never heard +of this issue together with any other chip though. + +I am not 100% sure that this fix works on the OHCI-1394 part of JMB380 +and JMB388 combo controllers exactly the same as on the JMB381 single- +function controller, but so far I haven't had a chance to let an owner +of a combo chip run a patched kernel. + +Strangely enough, IsoRecvIntMask is always reported correctly, even +though it is probed right before IsoXmitIntMask.
+ +Reported-by: Clifford Dunn +Reported-by: Craig Moore +Signed-off-by: Stefan Richter +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/firewire/ohci.c | 5 +++++ + 1 file changed, 5 insertions(+) + +--- a/drivers/firewire/ohci.c ++++ b/drivers/firewire/ohci.c +@@ -3675,6 +3675,11 @@ static int pci_probe(struct pci_dev *dev + + reg_write(ohci, OHCI1394_IsoXmitIntMaskSet, ~0); + ohci->it_context_support = reg_read(ohci, OHCI1394_IsoXmitIntMaskSet); ++ /* JMicron JMB38x often shows 0 at first read, just ignore it */ ++ if (!ohci->it_context_support) { ++ ohci_notice(ohci, "overriding IsoXmitIntMask\n"); ++ ohci->it_context_support = 0xf; ++ } + reg_write(ohci, OHCI1394_IsoXmitIntMaskClear, ~0); + ohci->it_context_mask = ohci->it_context_support; + ohci->n_it = hweight32(ohci->it_context_mask); diff --git a/queue-4.1/nfs-if-we-have-no-valid-attrs-then-don-t-declare-the-attribute-cache-valid.patch b/queue-4.1/nfs-if-we-have-no-valid-attrs-then-don-t-declare-the-attribute-cache-valid.patch new file mode 100644 index 00000000000..7a18a7b25c8 --- /dev/null +++ b/queue-4.1/nfs-if-we-have-no-valid-attrs-then-don-t-declare-the-attribute-cache-valid.patch @@ -0,0 +1,39 @@ +From c812012f9ca7cf89c9e1a1cd512e6c3b5be04b85 Mon Sep 17 00:00:00 2001 +From: Jeff Layton +Date: Wed, 25 Nov 2015 13:50:11 -0500 +Subject: nfs: if we have no valid attrs, then don't declare the attribute cache valid + +From: Jeff Layton + +commit c812012f9ca7cf89c9e1a1cd512e6c3b5be04b85 upstream. + +If we pass in an empty nfs_fattr struct to nfs_update_inode, it will +(correctly) not update any of the attributes, but it then clears the +NFS_INO_INVALID_ATTR flag, which indicates that the attributes are +up to date. Don't clear the flag if the fattr struct has no valid +attrs to apply. 
+ +Reviewed-by: Steve French +Signed-off-by: Jeff Layton +Signed-off-by: Trond Myklebust +Signed-off-by: Greg Kroah-Hartman + +--- + fs/nfs/inode.c | 6 +++++- + 1 file changed, 5 insertions(+), 1 deletion(-) + +--- a/fs/nfs/inode.c ++++ b/fs/nfs/inode.c +@@ -1813,7 +1813,11 @@ static int nfs_update_inode(struct inode + if ((long)fattr->gencount - (long)nfsi->attr_gencount > 0) + nfsi->attr_gencount = fattr->gencount; + } +- invalid &= ~NFS_INO_INVALID_ATTR; ++ ++ /* Don't declare attrcache up to date if there were no attrs! */ ++ if (fattr->valid != 0) ++ invalid &= ~NFS_INO_INVALID_ATTR; ++ + /* Don't invalidate the data if we were to blame */ + if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) + || S_ISLNK(inode->i_mode))) diff --git a/queue-4.1/nfs4-start-callback_ident-at-idr-1.patch b/queue-4.1/nfs4-start-callback_ident-at-idr-1.patch new file mode 100644 index 00000000000..c9ef3180ecd --- /dev/null +++ b/queue-4.1/nfs4-start-callback_ident-at-idr-1.patch @@ -0,0 +1,33 @@ +From c68a027c05709330fe5b2f50c50d5fa02124b5d8 Mon Sep 17 00:00:00 2001 +From: Benjamin Coddington +Date: Fri, 20 Nov 2015 09:56:20 -0500 +Subject: nfs4: start callback_ident at idr 1 + +From: Benjamin Coddington + +commit c68a027c05709330fe5b2f50c50d5fa02124b5d8 upstream. + +If clp->cl_cb_ident is zero, then nfs_cb_idr_remove_locked() skips removing +it when the nfs_client is freed. A decoding or server bug can then find +and try to put that first nfs_client which would lead to a crash. 
+ +Signed-off-by: Benjamin Coddington +Fixes: d6870312659d ("nfs4client: convert to idr_alloc()") +Signed-off-by: Trond Myklebust +Signed-off-by: Greg Kroah-Hartman + +--- + fs/nfs/nfs4client.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/fs/nfs/nfs4client.c ++++ b/fs/nfs/nfs4client.c +@@ -33,7 +33,7 @@ static int nfs_get_cb_ident_idr(struct n + return ret; + idr_preload(GFP_KERNEL); + spin_lock(&nn->nfs_client_lock); +- ret = idr_alloc(&nn->cb_ident_idr, clp, 0, 0, GFP_NOWAIT); ++ ret = idr_alloc(&nn->cb_ident_idr, clp, 1, 0, GFP_NOWAIT); + if (ret >= 0) + clp->cl_cb_ident = ret; + spin_unlock(&nn->nfs_client_lock); diff --git a/queue-4.1/nfsd-eliminate-sending-duplicate-and-repeated-delegations.patch b/queue-4.1/nfsd-eliminate-sending-duplicate-and-repeated-delegations.patch new file mode 100644 index 00000000000..e4631f71635 --- /dev/null +++ b/queue-4.1/nfsd-eliminate-sending-duplicate-and-repeated-delegations.patch @@ -0,0 +1,200 @@ +From 34ed9872e745fa56f10e9bef2cf3d2336c6c8816 Mon Sep 17 00:00:00 2001 +From: Andrew Elble +Date: Thu, 15 Oct 2015 12:07:28 -0400 +Subject: nfsd: eliminate sending duplicate and repeated delegations + +From: Andrew Elble + +commit 34ed9872e745fa56f10e9bef2cf3d2336c6c8816 upstream. + +We've observed the nfsd server in a state where there are +multiple delegations on the same nfs4_file for the same client. +The nfs client does attempt to DELEGRETURN these when they are presented to +it - but apparently under some (unknown) circumstances the client does not +manage to return all of them. This leads to the eventual +attempt to CB_RECALL more than one delegation with the same nfs +filehandle to the same client. The first recall will succeed, but the +next recall will fail with NFS4ERR_BADHANDLE. This leads to the server +having delegations on cl_revoked that the client has no way to FREE +or DELEGRETURN, with resulting inability to recover. 
The state manager +on the server will continually assert SEQ4_STATUS_RECALLABLE_STATE_REVOKED, +and the state manager on the client will be looping unable to satisfy +the server. + +List discussion also reports a race between OPEN and DELEGRETURN that +will be avoided by only sending the delegation once to the +client. This is also logically in accordance with RFC 5661 9.1.1 and 10.2. + +So, let's: + +1.) Not hand out duplicate delegations. +2.) Only send them to the client once. + +RFC 5661: + +9.1.1: +"Delegations and layouts, on the other hand, are not associated with a +specific owner but are associated with the client as a whole +(identified by a client ID)." + +10.2: +"...the stateid for a delegation is associated with a client ID and may be +used on behalf of all the open-owners for the given client. A +delegation is made to the client as a whole and not to any specific +process or thread of control within it." + +Reported-by: Eric Meddaugh +Cc: Trond Myklebust +Cc: Olga Kornievskaia +Signed-off-by: Andrew Elble +Signed-off-by: J. Bruce Fields +Signed-off-by: Greg Kroah-Hartman + +--- + fs/nfsd/nfs4state.c | 94 ++++++++++++++++++++++++++++++++++++++++++++++------ + 1 file changed, 84 insertions(+), 10 deletions(-) + +--- a/fs/nfsd/nfs4state.c ++++ b/fs/nfsd/nfs4state.c +@@ -765,16 +765,68 @@ void nfs4_unhash_stid(struct nfs4_stid * + s->sc_type = 0; + } + +-static void ++/** ++ * nfs4_get_existing_delegation - Discover if this delegation already exists ++ * @clp: a pointer to the nfs4_client we're granting a delegation to ++ * @fp: a pointer to the nfs4_file we're granting a delegation on ++ * ++ * Return: ++ * On success: 0 if an existing delegation was not found. ++ * ++ * On error: -EAGAIN if one was previously granted to this nfs4_client ++ * for this nfs4_file.
++ * ++ */ ++ ++static int ++nfs4_get_existing_delegation(struct nfs4_client *clp, struct nfs4_file *fp) ++{ ++ struct nfs4_delegation *searchdp = NULL; ++ struct nfs4_client *searchclp = NULL; ++ ++ lockdep_assert_held(&state_lock); ++ lockdep_assert_held(&fp->fi_lock); ++ ++ list_for_each_entry(searchdp, &fp->fi_delegations, dl_perfile) { ++ searchclp = searchdp->dl_stid.sc_client; ++ if (clp == searchclp) { ++ return -EAGAIN; ++ } ++ } ++ return 0; ++} ++ ++/** ++ * hash_delegation_locked - Add a delegation to the appropriate lists ++ * @dp: a pointer to the nfs4_delegation we are adding. ++ * @fp: a pointer to the nfs4_file we're granting a delegation on ++ * ++ * Return: ++ * On success: 0 if the delegation was successfully hashed. ++ * ++ * On error: -EAGAIN if one was previously granted to this ++ * nfs4_client for this nfs4_file. Delegation is not hashed. ++ * ++ */ ++ ++static int + hash_delegation_locked(struct nfs4_delegation *dp, struct nfs4_file *fp) + { ++ int status; ++ struct nfs4_client *clp = dp->dl_stid.sc_client; ++ + lockdep_assert_held(&state_lock); + lockdep_assert_held(&fp->fi_lock); + ++ status = nfs4_get_existing_delegation(clp, fp); ++ if (status) ++ return status; ++ ++fp->fi_delegees; + atomic_inc(&dp->dl_stid.sc_count); + dp->dl_stid.sc_type = NFS4_DELEG_STID; + list_add(&dp->dl_perfile, &fp->fi_delegations); +- list_add(&dp->dl_perclnt, &dp->dl_stid.sc_client->cl_delegations); ++ list_add(&dp->dl_perclnt, &clp->cl_delegations); ++ return 0; + } + + static bool +@@ -3941,6 +3993,18 @@ static struct file_lock *nfs4_alloc_init + return fl; + } + ++/** ++ * nfs4_setlease - Obtain a delegation by requesting lease from vfs layer ++ * @dp: a pointer to the nfs4_delegation we're adding. ++ * ++ * Return: ++ * On success: Return code will be 0 on success. ++ * ++ * On error: -EAGAIN if there was an existing delegation. ++ * nonzero if there is an error in other cases.
++ * ++ */ ++ + static int nfs4_setlease(struct nfs4_delegation *dp) + { + struct nfs4_file *fp = dp->dl_stid.sc_file; +@@ -3972,16 +4036,19 @@ static int nfs4_setlease(struct nfs4_del + goto out_unlock; + /* Race breaker */ + if (fp->fi_deleg_file) { +- status = 0; +- ++fp->fi_delegees; +- hash_delegation_locked(dp, fp); ++ status = hash_delegation_locked(dp, fp); + goto out_unlock; + } + fp->fi_deleg_file = filp; +- fp->fi_delegees = 1; +- hash_delegation_locked(dp, fp); ++ fp->fi_delegees = 0; ++ status = hash_delegation_locked(dp, fp); + spin_unlock(&fp->fi_lock); + spin_unlock(&state_lock); ++ if (status) { ++ /* Should never happen, this is a new fi_deleg_file */ ++ WARN_ON_ONCE(1); ++ goto out_fput; ++ } + return 0; + out_unlock: + spin_unlock(&fp->fi_lock); +@@ -4001,6 +4068,15 @@ nfs4_set_delegation(struct nfs4_client * + if (fp->fi_had_conflict) + return ERR_PTR(-EAGAIN); + ++ spin_lock(&state_lock); ++ spin_lock(&fp->fi_lock); ++ status = nfs4_get_existing_delegation(clp, fp); ++ spin_unlock(&fp->fi_lock); ++ spin_unlock(&state_lock); ++ ++ if (status) ++ return ERR_PTR(status); ++ + dp = alloc_init_deleg(clp, fh, odstate); + if (!dp) + return ERR_PTR(-ENOMEM); +@@ -4019,9 +4095,7 @@ nfs4_set_delegation(struct nfs4_client * + status = -EAGAIN; + goto out_unlock; + } +- ++fp->fi_delegees; +- hash_delegation_locked(dp, fp); +- status = 0; ++ status = hash_delegation_locked(dp, fp); + out_unlock: + spin_unlock(&fp->fi_lock); + spin_unlock(&state_lock); diff --git a/queue-4.1/nfsd-serialize-state-seqid-morphing-operations.patch b/queue-4.1/nfsd-serialize-state-seqid-morphing-operations.patch new file mode 100644 index 00000000000..388e6cdaee5 --- /dev/null +++ b/queue-4.1/nfsd-serialize-state-seqid-morphing-operations.patch @@ -0,0 +1,207 @@ +From 35a92fe8770ce54c5eb275cd76128645bea2d200 Mon Sep 17 00:00:00 2001 +From: Jeff Layton +Date: Thu, 17 Sep 2015 07:47:08 -0400 +Subject: nfsd: serialize state seqid morphing operations + +From: Jeff Layton + +commit 
35a92fe8770ce54c5eb275cd76128645bea2d200 upstream. + +Andrew was seeing a race occur when an OPEN and OPEN_DOWNGRADE were +running in parallel. The server would receive the OPEN_DOWNGRADE first +and check its seqid, but then an OPEN would race in and bump it. The +OPEN_DOWNGRADE would then complete and bump the seqid again. The result +was that the OPEN_DOWNGRADE would be applied after the OPEN, even though +it should have been rejected since the seqid changed. + +The only recourse we have here I think is to serialize operations that +bump the seqid in a stateid, particularly when we're given a seqid in +the call. To address this, we add a new rw_semaphore to the +nfs4_ol_stateid struct. We do a down_write prior to checking the seqid +after looking up the stateid to ensure that nothing else is going to +bump it while we're operating on it. + +In the case of OPEN, we do a down_read, as the call doesn't contain a +seqid. Those can run in parallel -- we just need to serialize them when +there is a concurrent OPEN_DOWNGRADE or CLOSE. + +LOCK and LOCKU however always take the write lock as there is no +opportunity for parallelizing those. + +Reported-and-Tested-by: Andrew W Elble +Signed-off-by: Jeff Layton +Signed-off-by: J. 
Bruce Fields +Signed-off-by: Greg Kroah-Hartman + +--- + fs/nfsd/nfs4state.c | 33 ++++++++++++++++++++++++++++----- + fs/nfsd/state.h | 19 ++++++++++--------- + 2 files changed, 38 insertions(+), 14 deletions(-) + +--- a/fs/nfsd/nfs4state.c ++++ b/fs/nfsd/nfs4state.c +@@ -3351,6 +3351,7 @@ static void init_open_stateid(struct nfs + stp->st_access_bmap = 0; + stp->st_deny_bmap = 0; + stp->st_openstp = NULL; ++ init_rwsem(&stp->st_rwsem); + spin_lock(&oo->oo_owner.so_client->cl_lock); + list_add(&stp->st_perstateowner, &oo->oo_owner.so_stateids); + spin_lock(&fp->fi_lock); +@@ -4181,15 +4182,20 @@ nfsd4_process_open2(struct svc_rqst *rqs + */ + if (stp) { + /* Stateid was found, this is an OPEN upgrade */ ++ down_read(&stp->st_rwsem); + status = nfs4_upgrade_open(rqstp, fp, current_fh, stp, open); +- if (status) ++ if (status) { ++ up_read(&stp->st_rwsem); + goto out; ++ } + } else { + stp = open->op_stp; + open->op_stp = NULL; + init_open_stateid(stp, fp, open); ++ down_read(&stp->st_rwsem); + status = nfs4_get_vfs_file(rqstp, fp, current_fh, stp, open); + if (status) { ++ up_read(&stp->st_rwsem); + release_open_stateid(stp); + goto out; + } +@@ -4201,6 +4207,7 @@ nfsd4_process_open2(struct svc_rqst *rqs + } + update_stateid(&stp->st_stid.sc_stateid); + memcpy(&open->op_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t)); ++ up_read(&stp->st_rwsem); + + if (nfsd4_has_session(&resp->cstate)) { + if (open->op_deleg_want & NFS4_SHARE_WANT_NO_DELEG) { +@@ -4777,10 +4784,13 @@ static __be32 nfs4_seqid_op_checks(struc + * revoked delegations are kept only for free_stateid. 
+ */ + return nfserr_bad_stateid; ++ down_write(&stp->st_rwsem); + status = check_stateid_generation(stateid, &stp->st_stid.sc_stateid, nfsd4_has_session(cstate)); +- if (status) +- return status; +- return nfs4_check_fh(current_fh, &stp->st_stid); ++ if (status == nfs_ok) ++ status = nfs4_check_fh(current_fh, &stp->st_stid); ++ if (status != nfs_ok) ++ up_write(&stp->st_rwsem); ++ return status; + } + + /* +@@ -4827,6 +4837,7 @@ static __be32 nfs4_preprocess_confirmed_ + return status; + oo = openowner(stp->st_stateowner); + if (!(oo->oo_flags & NFS4_OO_CONFIRMED)) { ++ up_write(&stp->st_rwsem); + nfs4_put_stid(&stp->st_stid); + return nfserr_bad_stateid; + } +@@ -4857,11 +4868,14 @@ nfsd4_open_confirm(struct svc_rqst *rqst + goto out; + oo = openowner(stp->st_stateowner); + status = nfserr_bad_stateid; +- if (oo->oo_flags & NFS4_OO_CONFIRMED) ++ if (oo->oo_flags & NFS4_OO_CONFIRMED) { ++ up_write(&stp->st_rwsem); + goto put_stateid; ++ } + oo->oo_flags |= NFS4_OO_CONFIRMED; + update_stateid(&stp->st_stid.sc_stateid); + memcpy(&oc->oc_resp_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t)); ++ up_write(&stp->st_rwsem); + dprintk("NFSD: %s: success, seqid=%d stateid=" STATEID_FMT "\n", + __func__, oc->oc_seqid, STATEID_VAL(&stp->st_stid.sc_stateid)); + +@@ -4940,6 +4954,7 @@ nfsd4_open_downgrade(struct svc_rqst *rq + memcpy(&od->od_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t)); + status = nfs_ok; + put_stateid: ++ up_write(&stp->st_rwsem); + nfs4_put_stid(&stp->st_stid); + out: + nfsd4_bump_seqid(cstate, status); +@@ -4993,6 +5008,7 @@ nfsd4_close(struct svc_rqst *rqstp, stru + goto out; + update_stateid(&stp->st_stid.sc_stateid); + memcpy(&close->cl_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t)); ++ up_write(&stp->st_rwsem); + + nfsd4_close_open_stateid(stp); + +@@ -5223,6 +5239,7 @@ init_lock_stateid(struct nfs4_ol_stateid + stp->st_access_bmap = 0; + stp->st_deny_bmap = open_stp->st_deny_bmap; + stp->st_openstp = open_stp; ++ 
init_rwsem(&stp->st_rwsem); + list_add(&stp->st_locks, &open_stp->st_locks); + list_add(&stp->st_perstateowner, &lo->lo_owner.so_stateids); + spin_lock(&fp->fi_lock); +@@ -5391,6 +5408,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struc + &open_stp, nn); + if (status) + goto out; ++ up_write(&open_stp->st_rwsem); + open_sop = openowner(open_stp->st_stateowner); + status = nfserr_bad_stateid; + if (!same_clid(&open_sop->oo_owner.so_client->cl_clientid, +@@ -5398,6 +5416,8 @@ nfsd4_lock(struct svc_rqst *rqstp, struc + goto out; + status = lookup_or_create_lock_state(cstate, open_stp, lock, + &lock_stp, &new); ++ if (status == nfs_ok) ++ down_write(&lock_stp->st_rwsem); + } else { + status = nfs4_preprocess_seqid_op(cstate, + lock->lk_old_lock_seqid, +@@ -5503,6 +5523,8 @@ out: + seqid_mutating_err(ntohl(status))) + lock_sop->lo_owner.so_seqid++; + ++ up_write(&lock_stp->st_rwsem); ++ + /* + * If this is a new, never-before-used stateid, and we are + * returning an error, then just go ahead and release it. +@@ -5673,6 +5695,7 @@ nfsd4_locku(struct svc_rqst *rqstp, stru + fput: + fput(filp); + put_stateid: ++ up_write(&stp->st_rwsem); + nfs4_put_stid(&stp->st_stid); + out: + nfsd4_bump_seqid(cstate, status); +--- a/fs/nfsd/state.h ++++ b/fs/nfsd/state.h +@@ -533,15 +533,16 @@ struct nfs4_file { + * Better suggestions welcome. 
+ */ + struct nfs4_ol_stateid { +- struct nfs4_stid st_stid; /* must be first field */ +- struct list_head st_perfile; +- struct list_head st_perstateowner; +- struct list_head st_locks; +- struct nfs4_stateowner * st_stateowner; +- struct nfs4_clnt_odstate * st_clnt_odstate; +- unsigned char st_access_bmap; +- unsigned char st_deny_bmap; +- struct nfs4_ol_stateid * st_openstp; ++ struct nfs4_stid st_stid; ++ struct list_head st_perfile; ++ struct list_head st_perstateowner; ++ struct list_head st_locks; ++ struct nfs4_stateowner *st_stateowner; ++ struct nfs4_clnt_odstate *st_clnt_odstate; ++ unsigned char st_access_bmap; ++ unsigned char st_deny_bmap; ++ struct nfs4_ol_stateid *st_openstp; ++ struct rw_semaphore st_rwsem; + }; + + static inline struct nfs4_ol_stateid *openlockstateid(struct nfs4_stid *s) diff --git a/queue-4.1/ocfs2-fix-umask-ignored-issue.patch b/queue-4.1/ocfs2-fix-umask-ignored-issue.patch new file mode 100644 index 00000000000..e7564e8b475 --- /dev/null +++ b/queue-4.1/ocfs2-fix-umask-ignored-issue.patch @@ -0,0 +1,36 @@ +From 8f1eb48758aacf6c1ffce18179295adbf3bd7640 Mon Sep 17 00:00:00 2001 +From: Junxiao Bi +Date: Fri, 20 Nov 2015 15:57:30 -0800 +Subject: ocfs2: fix umask ignored issue + +From: Junxiao Bi + +commit 8f1eb48758aacf6c1ffce18179295adbf3bd7640 upstream. + +New created file's mode is not masked with umask, and this makes umask not +work for ocfs2 volume. + +Fixes: 702e5bc ("ocfs2: use generic posix ACL infrastructure") +Signed-off-by: Junxiao Bi +Cc: Gang He +Cc: Mark Fasheh +Cc: Joel Becker +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + fs/ocfs2/namei.c | 2 ++ + 1 file changed, 2 insertions(+) + +--- a/fs/ocfs2/namei.c ++++ b/fs/ocfs2/namei.c +@@ -365,6 +365,8 @@ static int ocfs2_mknod(struct inode *dir + mlog_errno(status); + goto leave; + } ++ /* update inode->i_mode after mask with "umask". 
*/ ++ inode->i_mode = mode; + + handle = ocfs2_start_trans(osb, ocfs2_mknod_credits(osb->sb, + S_ISDIR(mode), diff --git a/queue-4.1/rbd-don-t-put-snap_context-twice-in-rbd_queue_workfn.patch b/queue-4.1/rbd-don-t-put-snap_context-twice-in-rbd_queue_workfn.patch new file mode 100644 index 00000000000..8c1adf83a26 --- /dev/null +++ b/queue-4.1/rbd-don-t-put-snap_context-twice-in-rbd_queue_workfn.patch @@ -0,0 +1,34 @@ +From 70b16db86f564977df074072143284aec2cb1162 Mon Sep 17 00:00:00 2001 +From: Ilya Dryomov +Date: Fri, 27 Nov 2015 19:23:24 +0100 +Subject: rbd: don't put snap_context twice in rbd_queue_workfn() + +From: Ilya Dryomov + +commit 70b16db86f564977df074072143284aec2cb1162 upstream. + +Commit 4e752f0ab0e8 ("rbd: access snapshot context and mapping size +safely") moved ceph_get_snap_context() out of rbd_img_request_create() +and into rbd_queue_workfn(), adding a ceph_put_snap_context() to the +error path in rbd_queue_workfn(). However, rbd_img_request_create() +consumes a ref on snapc, so calling ceph_put_snap_context() after +a successful rbd_img_request_create() leads to an extra put. Fix it. 
+ +Signed-off-by: Ilya Dryomov +Reviewed-by: Josh Durgin +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/block/rbd.c | 1 + + 1 file changed, 1 insertion(+) + +--- a/drivers/block/rbd.c ++++ b/drivers/block/rbd.c +@@ -3417,6 +3417,7 @@ static void rbd_queue_workfn(struct work + goto err_rq; + } + img_request->rq = rq; ++ snapc = NULL; /* img_request consumes a ref */ + + if (op_type == OBJ_OP_DISCARD) + result = rbd_img_request_fill(img_request, OBJ_REQUEST_NODATA, diff --git a/queue-4.1/series b/queue-4.1/series index b0de9e5252c..75282c391af 100644 --- a/queue-4.1/series +++ b/queue-4.1/series @@ -24,3 +24,22 @@ ipv6-add-complete-rcu-protection-around-np-opt.patch net-neighbour-fix-crash-at-dumping-device-agnostic-proxy-entries.patch ipv6-sctp-implement-sctp_v6_destroy_sock.patch net_sched-fix-qdisc_tree_decrease_qlen-races.patch +btrfs-check-unsupported-filters-in-balance-arguments.patch +btrfs-fix-file-corruption-and-data-loss-after-cloning-inline-extents.patch +btrfs-fix-truncation-of-compressed-and-inlined-extents.patch +btrfs-fix-race-leading-to-incorrect-item-deletion-when-dropping-extents.patch +btrfs-fix-race-leading-to-bug_on-when-running-delalloc-for-nodatacow.patch +btrfs-fix-race-when-listing-an-inode-s-xattrs.patch +rbd-don-t-put-snap_context-twice-in-rbd_queue_workfn.patch +ext4-crypto-fix-memory-leak-in-ext4_bio_write_page.patch +ext4-fix-potential-use-after-free-in-__ext4_journal_stop.patch +ext4-jbd2-ensure-entering-into-panic-after-recording-an-error-in-superblock.patch +firewire-ohci-fix-jmicron-jmb38x-it-context-discovery.patch +nfsd-serialize-state-seqid-morphing-operations.patch +nfsd-eliminate-sending-duplicate-and-repeated-delegations.patch +debugfs-fix-refcount-imbalance-in-start_creating.patch +nfs4-start-callback_ident-at-idr-1.patch +nfs-if-we-have-no-valid-attrs-then-don-t-declare-the-attribute-cache-valid.patch +ocfs2-fix-umask-ignored-issue.patch +ceph-fix-message-length-computation.patch 
+alsa-hda-hdmi-apply-skylake-fix-ups-to-broxton-display-codec.patch